"""
Chat functionality handler for HF-Inferoxy AI Hub.
Handles chat completion requests with streaming responses.
"""

import os

from huggingface_hub import InferenceClient
from huggingface_hub.errors import HfHubHTTPError

from hf_token_utils import get_proxy_token, report_token_status
from utils import (
    validate_proxy_key,
    parse_model_and_provider,
    format_error_message,
)


def chat_respond(
    message,
    history: list[dict[str, str]],
    system_message,
    model_name,
    max_tokens,
    temperature,
    top_p,
):
"""
Chat completion function using HF-Inferoxy token management.
"""
# Validate proxy API key
is_valid, error_msg = validate_proxy_key()
if not is_valid:
yield error_msg
return
proxy_api_key = os.getenv("PROXY_KEY")
try:
# Get token from HF-Inferoxy proxy server
print(f"π Chat: Requesting token from proxy...")
token, token_id = get_proxy_token(api_key=proxy_api_key)
print(f"β
Chat: Got token: {token_id}")

        # Parse model name and provider if specified
        model, provider = parse_model_and_provider(model_name)
        provider_label = provider if provider else "auto"
        print(f"🤖 Chat: Using model='{model}', provider='{provider_label}'")

        # Prepare messages first
        messages = [{"role": "system", "content": system_message}]
        messages.extend(history)
        messages.append({"role": "user", "content": message})
        print(f"💬 Chat: Prepared {len(messages)} messages, creating client...")

        # Create client with provider (auto if none specified) and always pass model
        client = InferenceClient(
            provider=provider_label,
            api_key=token,
        )
        print("🚀 Chat: Client created, starting inference...")

        chat_completion_kwargs = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "stream": True,
            "temperature": temperature,
            "top_p": top_p,
        }

        response = ""
        print("📡 Chat: Making streaming request...")
        stream = client.chat_completion(**chat_completion_kwargs)
        print("🌊 Chat: Got stream, starting to iterate...")

        # Each streamed chunk carries an incremental delta; accumulate it and
        # yield the full response so far
        for chunk in stream:
            choices = chunk.choices
            token_content = ""
            if len(choices) and choices[0].delta.content:
                token_content = choices[0].delta.content
            response += token_content
            yield response

        # Report successful token usage
        report_token_status(token_id, "success", api_key=proxy_api_key)

    except HfHubHTTPError as e:
        # Report HF Hub errors
        if "token_id" in locals():
            report_token_status(token_id, "error", str(e), api_key=proxy_api_key)
        yield format_error_message("HuggingFace API Error", str(e))
    except Exception as e:
        # Report other errors
        if "token_id" in locals():
            report_token_status(token_id, "error", str(e), api_key=proxy_api_key)
        yield format_error_message("Unexpected Error", str(e))
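

# Hypothetical usage sketch: chat_respond is a generator that yields the
# accumulated response text after every streamed chunk. Assuming PROXY_KEY is
# set and the HF-Inferoxy proxy is reachable, a caller could consume it like
# this (the model id below is only illustrative):
#
#     for partial in chat_respond(
#         message="Hello!",
#         history=[],
#         system_message="You are a helpful assistant.",
#         model_name="meta-llama/Llama-3.1-8B-Instruct",
#         max_tokens=128,
#         temperature=0.7,
#         top_p=0.95,
#     ):
#         print(partial)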


def handle_chat_submit(message, history, system_msg, model_name, max_tokens, temperature, top_p):
    """
    Handle chat submission and manage conversation history with streaming.
    """
    if not message.strip():
        yield history, ""
        return

    # Add user message to history
    history = history + [{"role": "user", "content": message}]

    # Generate response with streaming
    response_generator = chat_respond(
        message,
        history[:-1],  # Don't include the current message in history for the function
        system_msg,
        model_name,
        max_tokens,
        temperature,
        top_p,
    )

    # Stream the assistant response token by token
    assistant_response = ""
    for partial_response in response_generator:
        assistant_response = partial_response
        # Update history with the current partial response and yield it
        current_history = history + [{"role": "assistant", "content": assistant_response}]
        yield current_history, ""
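

# Hedged sketch of how these handlers might be wired into a Gradio Blocks UI.
# The Space's actual app.py may differ; the component labels, defaults, and the
# example model id below are assumptions, not part of this module.
if __name__ == "__main__":
    import gradio as gr

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox(label="Message")
        system_msg = gr.Textbox(label="System message", value="You are a helpful assistant.")
        model_name = gr.Textbox(label="Model", value="meta-llama/Llama-3.1-8B-Instruct")
        max_tokens = gr.Slider(1, 4096, value=512, step=1, label="Max tokens")
        temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="Top-p")

        # handle_chat_submit is a generator, so Gradio streams each yielded
        # (history, "") pair: the chatbot updates and the textbox is cleared.
        msg.submit(
            handle_chat_submit,
            inputs=[msg, chatbot, system_msg, model_name, max_tokens, temperature, top_p],
            outputs=[chatbot, msg],
        )

    demo.launch()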