""" Chat generation service """ import torch from typing import Tuple from .model_service import model_service from ..config import AVAILABLE_MODELS class ChatService: @staticmethod def generate_response( prompt: str, model_name: str, system_prompt: str = None, temperature: float = 0.7, max_new_tokens: int = 1024 ) -> Tuple[str, str, str, bool]: """ Generate chat response Returns: (thinking_content, final_content, model_used, supports_thinking) """ if not model_service.is_model_loaded(model_name): raise ValueError(f"Model {model_name} is not loaded") # Get model and tokenizer model_data = model_service.models_cache[model_name] model = model_data["model"] tokenizer = model_data["tokenizer"] model_info = AVAILABLE_MODELS[model_name] # Build the prompt messages = [] if system_prompt: messages.append({"role": "system", "content": system_prompt}) messages.append({"role": "user", "content": prompt}) # Apply chat template formatted_prompt = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) # Tokenize inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device) # Generate with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=max_new_tokens, temperature=temperature, do_sample=True, pad_token_id=tokenizer.eos_token_id ) # Decode generated_tokens = outputs[0][inputs['input_ids'].shape[1]:] generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True) # Parse thinking vs final content for thinking models thinking_content = "" final_content = generated_text if model_info["supports_thinking"] and "" in generated_text: parts = generated_text.split("") if len(parts) > 1: thinking_part = parts[1] if "" in thinking_part: thinking_content = thinking_part.split("")[0].strip() remaining = thinking_part.split("", 1)[1] if "" in thinking_part else "" final_content = remaining.strip() return ( thinking_content, final_content, model_name, model_info["supports_thinking"] ) # Global chat service instance chat_service = ChatService()