| """ | |
| Chat functionality handler for HF-Inferoxy AI Hub. | |
| Handles chat completion requests with streaming responses. | |
| """ | |
| import os | |
| from huggingface_hub import InferenceClient | |
| from huggingface_hub.errors import HfHubHTTPError | |
| from hf_token_utils import get_proxy_token, report_token_status | |
| from utils import ( | |
| validate_proxy_key, | |
| parse_model_and_provider, | |
| format_error_message | |
| ) | |
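
# Contracts assumed from usage in this module (hf_token_utils is the source of
# truth): get_proxy_token(api_key=...) returns a (token, token_id) pair leased
# from the HF-Inferoxy proxy, and report_token_status(token_id, "success" | "error",
# <detail>, api_key=...) reports the outcome back so the proxy can rotate or
# quarantine tokens.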


def chat_respond(
    message,
    history: list[dict[str, str]],
    system_message,
    model_name,
    max_tokens,
    temperature,
    top_p,
):
    """
    Chat completion function using HF-Inferoxy token management.

    Yields progressively longer partial responses so the caller can stream.
    """
    # Validate proxy API key
    is_valid, error_msg = validate_proxy_key()
    if not is_valid:
        yield error_msg
        return

    proxy_api_key = os.getenv("PROXY_KEY")

    try:
        # Get token from HF-Inferoxy proxy server
        print("🔑 Chat: Requesting token from proxy...")
        token, token_id = get_proxy_token(api_key=proxy_api_key)
        print(f"✅ Chat: Got token: {token_id}")

        # Parse model name and provider if specified
        model, provider = parse_model_and_provider(model_name)
        print(f"🤖 Chat: Using model='{model}', provider='{provider if provider else 'auto'}'")

        # Prepare messages first
        messages = [{"role": "system", "content": system_message}]
        messages.extend(history)
        messages.append({"role": "user", "content": message})
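        # Resulting shape (illustrative):
        # [{"role": "system", "content": "..."},
        #  ...prior user/assistant turns from `history`...,
        #  {"role": "user", "content": "..."}]  # the current message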
| print(f"π¬ Chat: Prepared {len(messages)} messages, creating client...") | |
| # Create client with provider (auto if none specified) and always pass model | |
| client = InferenceClient( | |
| provider=provider if provider else "auto", | |
| api_key=token | |
| ) | |
| print(f"π Chat: Client created, starting inference...") | |
| chat_completion_kwargs = { | |
| "model": model, | |
| "messages": messages, | |
| "max_tokens": max_tokens, | |
| "stream": True, | |
| "temperature": temperature, | |
| "top_p": top_p, | |
| } | |
| response = "" | |
| print(f"π‘ Chat: Making streaming request...") | |
| stream = client.chat_completion(**chat_completion_kwargs) | |
| print(f"π Chat: Got stream, starting to iterate...") | |
| for message in stream: | |
| choices = message.choices | |
| token_content = "" | |
| if len(choices) and choices[0].delta.content: | |
| token_content = choices[0].delta.content | |
| response += token_content | |
| yield response | |
| # Report successful token usage | |
| report_token_status(token_id, "success", api_key=proxy_api_key) | |
    except HfHubHTTPError as e:
        # Report HF Hub errors
        if 'token_id' in locals():
            report_token_status(token_id, "error", str(e), api_key=proxy_api_key)
        yield format_error_message("HuggingFace API Error", str(e))
    except Exception as e:
        # Report other errors
        if 'token_id' in locals():
            report_token_status(token_id, "error", str(e), api_key=proxy_api_key)
        yield format_error_message("Unexpected Error", str(e))


def handle_chat_submit(message, history, system_msg, model_name, max_tokens, temperature, top_p):
    """
    Handle chat submission and manage conversation history with streaming.
    """
    if not message.strip():
        yield history, ""
        return

    # Add user message to history
    history = history + [{"role": "user", "content": message}]

    # Generate response with streaming
    response_generator = chat_respond(
        message,
        history[:-1],  # Don't include the current message in history for the function
        system_msg,
        model_name,
        max_tokens,
        temperature,
        top_p,
    )

    # Stream the assistant response token by token
    assistant_response = ""
    for partial_response in response_generator:
        assistant_response = partial_response
        # Update history with the current partial response and yield it
        current_history = history + [{"role": "assistant", "content": assistant_response}]
        yield current_history, ""
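

if __name__ == "__main__":
    # Minimal local smoke test: a sketch, assuming `gradio` is installed,
    # PROXY_KEY is set, and an HF-Inferoxy proxy is reachable. The component
    # layout and the default model below are illustrative, not this repo's
    # actual app wiring.
    import gradio as gr

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox(label="Message")
        system_msg = gr.Textbox(value="You are a helpful assistant.", label="System message")
        model_name = gr.Textbox(value="openai/gpt-oss-20b", label="Model")  # illustrative default
        max_tokens = gr.Slider(1, 4096, value=512, step=1, label="Max tokens")
        temperature = gr.Slider(0.0, 2.0, value=0.7, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")

        # handle_chat_submit is a generator, so Gradio streams each yielded
        # (history, "") pair into the chatbot and clears the textbox.
        msg.submit(
            handle_chat_submit,
            inputs=[msg, chatbot, system_msg, model_name, max_tokens, temperature, top_p],
            outputs=[chatbot, msg],
        )

    demo.launch()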