Spaces:

Eliot0110
/

Travel_Assistant

Sleeping

App Files Files Community

Eliot0110 committed on Aug 6

Commit

794c23a

1 Parent(s): 86c5051

improve: voice-to-text and classifier model

Browse files

Files changed (2) hide show

modules/ai_model.py +77 -45
modules/intent_classifier.py +1 -2

modules/ai_model.py CHANGED Viewed

@@ -108,6 +108,58 @@ class AIModel:
             return "audio"
         return "text"
     def format_input(self, input_type: str, raw_input: str) -> Tuple[str, Union[str, Image.Image, None]]:
@@ -133,9 +185,22 @@ class AIModel:
                 return "text", None, f"图片加载失败，请检查路径或URL。"
         elif input_type == "audio":
-            log.warning("⚠️ 音频处理功能暂未实现")
-            return "text", None, "抱歉，音频输入功能正在开发中。请使用文字描述您的需求。"
         else:  # text
             return input_type, None, raw_input
@@ -143,36 +208,14 @@ class AIModel:
     def run_inference(self, input_type: str, formatted_input: Union[str, Image.Image], prompt: str,temperature: float = 0.5) -> str:
         try:
-            # 截断过长的 prompt
-            if len(prompt) > 500:
-                prompt = prompt[:500] + "..."
-            # 准备输入 (处理图片或文本)
-            if input_type == "image" and isinstance(formatted_input, Image.Image):
-                image_token = getattr(self.processor.tokenizer, 'image_token', '<image>')
-                if image_token not in prompt:
-                    prompt = f"{image_token}\n{prompt}"
-                inputs = self.processor(
-                    text=prompt,
-                    images=formatted_input,
-                    return_tensors="pt"
-                ).to(self.model.device, dtype=torch.bfloat16)
-            else:
-                inputs = self.processor(
-                    text=prompt,
-                    return_tensors="pt"
-                ).to(self.model.device, dtype=torch.bfloat16)
-            if hasattr(inputs, 'input_ids') and inputs.input_ids.shape[-1] > 512:
-                log.warning(f"⚠️ 截断过长输入: {inputs.input_ids.shape[-1]} -> 512")
-                inputs.input_ids = inputs.input_ids[:, :512]
-                if hasattr(inputs, 'attention_mask'):
-                    inputs.attention_mask = inputs.attention_mask[:, :512]
             with torch.inference_mode():
                 generation_args = {
-                    "max_new_tokens": 512,
                     "pad_token_id": self.processor.tokenizer.eos_token_id,
                     "use_cache": True
                 }
@@ -218,7 +261,7 @@ class AIModel:
         full_prompt = "\n".join([msg.get("content", "") for msg in messages])
-        temperature = kwargs.get("temperature", 0.7)
         if kwargs.get("response_format", {}).get("type") == "json_object":
             # 在 prompt 末尾添加指令，强制模型输出 JSON
@@ -236,16 +279,8 @@ class AIModel:
         )
-    def _build_limited_prompt(self, processed_text: str, context: str = "") -> str:
-        """构建长度受限的prompt - 新增辅助方法"""
-        # 限制输入长度
-        if len(processed_text) > 200:
-            processed_text = processed_text[:200] + "..."
-        if context and len(context) > 300:
-            context = context[:300] + "..."
-        # 保持你原有的prompt结构
         if context:
             return (
                 f"你是一个专业的旅游助手。请基于以下背景信息，用中文友好地回答用户的问题。\n\n"
@@ -274,13 +309,10 @@ class AIModel:
             input_type, formatted_data, processed_text = self.format_input(input_type, user_input)
             # 3. 构建prompt - 使用你的原有结构
-            prompt = self._build_limited_prompt(processed_text, context)
             # 4. 执行推理
-            if input_type == "image" and formatted_data is not None:
-                return self.run_inference("image", formatted_data, prompt)
-            else:
-                return self.run_inference("text", processed_text, prompt)
         except Exception as e:
             log.error(f"❌ 生成回复时发生错误: {e}", exc_info=True)

             return "audio"
         return "text"
+    def transcribe_audio(self, audio_path: str) -> str:
+        """
+        Transcribe an audio file to text via the Hugging Face Inference API.
+
+        The raw file bytes are POSTed to the hosted ``openai/whisper-large``
+        endpoint with a 60-second timeout. The API token is read from the
+        ``Assitant_tocken`` environment variable; the spelling is kept as-is
+        because it must match the configured variable name
+        (NOTE(review): confirm against the deployment settings).
+
+        Args:
+            audio_path: Path of the audio file to upload.
+
+        Returns:
+            The transcribed text with surrounding whitespace stripped.
+
+        Raises:
+            FileNotFoundError: If ``audio_path`` does not exist.
+            RuntimeError: On timeout, network/HTTP failure, a model that is
+                still loading, or a response that carries no ``text`` field.
+        """
+        API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large"  # consider the newer v3 model
+        # Only attach an Authorization header when a token is configured;
+        # otherwise the header would literally read "Bearer None".
+        hf_token = os.getenv("Assitant_tocken")
+        headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}
+        if not hf_token:
+            log.warning("⚠️ 未设置环境变量 Assitant_tocken，将不带认证信息请求 HF API。")
+        # Fail early with a precise error when the file is missing.
+        if not os.path.exists(audio_path):
+            log.error(f"❌ 音频文件不存在: {audio_path}")
+            raise FileNotFoundError(f"指定的音频文件路径不存在: {audio_path}")
+        try:
+            with open(audio_path, "rb") as f:
+                # Stream the file as the request body; 60 s leaves room for
+                # slow uploads and inference.
+                log.info("🎤 正在向 HF API 发送音频数据... (超时设置为60秒)")
+                response = requests.post(API_URL, headers=headers, data=f, timeout=60)
+            # A model that is still loading is reported as HTTP 503 with a
+            # top-level "estimated_time"; inspect it before raise_for_status()
+            # turns the response into a generic HTTPError.
+            if response.status_code == 503:
+                try:
+                    loading_info = response.json()
+                except ValueError:
+                    loading_info = None
+                wait_seconds = loading_info.get("estimated_time") if isinstance(loading_info, dict) else None
+                if isinstance(wait_seconds, (int, float)):
+                    log.error(f"❌ 转录失败，API 返回: {loading_info.get('error')}")
+                    raise RuntimeError(f"模型正在加载中，请稍后重试。预计等待时间: {wait_seconds:.1f}秒")
+            # Any other non-2xx status becomes an HTTPError.
+            response.raise_for_status()
+            result = response.json()
+            log.info("✅ HF API 响应成功。")
+            if isinstance(result, dict) and "text" in result:
+                return result["text"].strip()
+            # No transcription in the payload: surface whatever error detail
+            # the API provided (the payload may not even be a dict).
+            error_message = "未知的 API 错误结构。"
+            wait_seconds = None
+            if isinstance(result, dict):
+                error_message = result.get("error", error_message)
+                wait_seconds = result.get("estimated_time")
+            if isinstance(error_message, dict):
+                wait_seconds = error_message.get("estimated_time", wait_seconds)
+            log.error(f"❌ 转录失败，API 返回: {error_message}")
+            if isinstance(wait_seconds, (int, float)):
+                raise RuntimeError(f"模型正在加载中，请稍后重试。预计等待时间: {wait_seconds:.1f}秒")
+            raise RuntimeError(f"转录失败: {error_message}")
+        except requests.exceptions.Timeout as e:
+            log.error("❌ 请求超时！API 未在60秒内响应。")
+            raise RuntimeError("语音识别服务请求超时，请稍后再试。") from e
+        except requests.exceptions.RequestException as e:
+            log.error(f"❌ 网络请求失败: {e}")
+            raise RuntimeError(f"无法连接到语音识别服务: {e}") from e
+        except RuntimeError:
+            # Raised above with a user-facing message and already logged;
+            # do not re-log it as an "unknown" error.
+            raise
+        except Exception as e:
+            # Anything else (e.g. file read errors): log with traceback and
+            # re-raise without resetting the traceback.
+            log.error(f"❌ 处理音频时发生未知错误: {e}", exc_info=True)
+            raise
     def format_input(self, input_type: str, raw_input: str) -> Tuple[str, Union[str, Image.Image, None]]:
                 return "text", None, f"图片加载失败，请检查路径或URL。"
         elif input_type == "audio":
+            try:
+                # --- 音频处理核心 ---
+                # 假设: 您的类中有一个方法 `transcribe_audio` 用于语音转文字。
+                # 您需要自行实现这个方法, 例如通过调用 Whisper, FunASR 或其他 ASR 服务。
+                # 它接收音频文件路径 (raw_input) 并返回转写的文本字符串。
+                log.info(f"🎤 开始处理音频文件: {raw_input}")
+                transcribed_text = self.transcribe_audio(raw_input)
+                log.info(f"✅ 音频转写成功: '{transcribed_text[:50]}...'")
+                # 注意：处理成功后，我们将 input_type 转为 "text"，
+                # 因为音频内容已变为文本，后续流程可以统一处理。
+                return "text", None, transcribed_text
+            except Exception as e:
+                log.error(f"❌ 音频处理失败: {e}", exc_info=True)
+                return "text", None, f"音频处理失败，请检查文件或稍后再试。"
         else:  # text
             return input_type, None, raw_input
     def run_inference(self, input_type: str, formatted_input: Union[str, Image.Image], prompt: str,temperature: float = 0.5) -> str:
         try:
+            inputs = self.processor(
+                text=prompt,
+                return_tensors="pt"
+            ).to(self.model.device, dtype=torch.bfloat16)
             with torch.inference_mode():
                 generation_args = {
+                    "max_new_tokens": 1024,
                     "pad_token_id": self.processor.tokenizer.eos_token_id,
                     "use_cache": True
                 }
         full_prompt = "\n".join([msg.get("content", "") for msg in messages])
+        temperature = kwargs.get("temperature", 0.6)
         if kwargs.get("response_format", {}).get("type") == "json_object":
             # 在 prompt 末尾添加指令，强制模型输出 JSON
         )
+    def _build_prompt(self, processed_text: str, context: str = "") -> str:
         if context:
             return (
                 f"你是一个专业的旅游助手。请基于以下背景信息，用中文友好地回答用户的问题。\n\n"
             input_type, formatted_data, processed_text = self.format_input(input_type, user_input)
             # 3. 构建prompt - 使用你的原有结构
+            prompt = self._build_prompt(processed_text, context)
             # 4. 执行推理
+            return self.run_inference("text", formatted_data, prompt)
         except Exception as e:
             log.error(f"❌ 生成回复时发生错误: {e}", exc_info=True)

modules/intent_classifier.py CHANGED Viewed

@@ -43,7 +43,7 @@ PROVIDING_TRAVEL_INFO > INQUIRY > GREETING > OTHER
 - 用户输入: "你好，我想去东京玩" -> 分类: PROVIDING_TRAVEL_INFO
 - 用户输入: "Hi, 巴黎有什么推荐的吗？" -> 分类: INQUIRY
 - 用户输入: "周末愉快！" -> 分类: GREETING
-- 用户输入: "我们预算不多，大概3000元，目的地是成都。" -> 分类: PROVIDING_TRAVEL_INFO
 - 用户输入: "你好在吗" -> 分类: GREETING
 - 用户输入: "随便聊聊" -> 分类: OTHER
@@ -65,7 +65,6 @@ PROVIDING_TRAVEL_INFO > INQUIRY > GREETING > OTHER
         try:
             response = self.ai_model.chat_completion(
-                model="gpt-3.5-turbo",
                 messages=[{"role": "user", "content": prompt}],
                 temperature=0.0,
                 max_tokens=10

 - 用户输入: "你好，我想去东京玩" -> 分类: PROVIDING_TRAVEL_INFO
 - 用户输入: "Hi, 巴黎有什么推荐的吗？" -> 分类: INQUIRY
 - 用户输入: "周末愉快！" -> 分类: GREETING
+- 用户输入: "我们预算不多，大概3000元，目的地是柏林。" -> 分类: PROVIDING_TRAVEL_INFO
 - 用户输入: "你好在吗" -> 分类: GREETING
 - 用户输入: "随便聊聊" -> 分类: OTHER
         try:
             response = self.ai_model.chat_completion(
                 messages=[{"role": "user", "content": prompt}],
                 temperature=0.0,
                 max_tokens=10