Spaces:

Eliot0110
/

Travel_Assistant

Sleeping

App Files Files Community

Eliot0110 committed on Aug 4

Commit

6c0d50f

1 Parent(s): 632df2f

fix : fix the inference

Browse files

Files changed (1) hide show

modules/ai_model.py +98 -82

modules/ai_model.py CHANGED Viewed

@@ -69,7 +69,6 @@ class AIModel:
         try:
             log.info(f"正在加载模型: {self.model_name}")
-            # 先进行认证并获取token
             token = self._authenticate_hf()
             if not token:
@@ -78,7 +77,6 @@ class AIModel:
                 self.processor = None
                 return
-            # 设置缓存目录
             cache_dir = "/app/.cache/huggingface"
             self.model = Gemma3nForConditionalGeneration.from_pretrained(
@@ -105,150 +103,168 @@ class AIModel:
             self.processor = None
     def is_available(self) -> bool:
-        """检查模型是否可用"""
         return self.model is not None and self.processor is not None
     def detect_input_type(self, input_data: str) -> str:
-        """检测输入类型：图片/音频/文字"""
-        if isinstance(input_data, str):
-            # 检查是否为图片URL或路径
-            if (input_data.startswith(("http://", "https://")) and
-                any(input_data.lower().endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp"])):
-                return "image"
-            elif input_data.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp")):
-                return "image"
-            # 检查是否为音频URL或路径
-            elif (input_data.startswith(("http://", "https://")) and
-                  any(input_data.lower().endswith(ext) for ext in [".wav", ".mp3", ".m4a", ".ogg"])):
-                return "audio"
-            elif input_data.endswith((".wav", ".mp3", ".m4a", ".ogg")):
-                return "audio"
-            # 检查是否为base64编码的图片
-            elif input_data.startswith("data:image/"):
-                return "image"
         return "text"
     def format_input(self, input_type: str, raw_input: str) -> Tuple[str, Union[str, Image.Image, None]]:
-        """格式化输入数据"""
-        formatted_data = None
-        processed_text = raw_input
         if input_type == "image":
             try:
                 if raw_input.startswith("data:image/"):
-                    # 处理base64编码的图片
                     header, encoded = raw_input.split(",", 1)
                     image_data = base64.b64decode(encoded)
                     image = Image.open(BytesIO(image_data)).convert("RGB")
                 elif raw_input.startswith(("http://", "https://")):
-                    # 处理图片URL
                     response = requests.get(raw_input, timeout=10)
                     response.raise_for_status()
                     image = Image.open(BytesIO(response.content)).convert("RGB")
                 else:
-                    # 处理本地图片路径
-                    image = Image.open(raw_input).convert("RGB")
-                formatted_data = image
-                processed_text = "请描述这张图片，并基于图片内容提供旅游建议。"
                 log.info("✅ 图片加载成功")
             except Exception as e:
                 log.error(f"❌ 图片加载失败: {e}")
-                return "text", f"图片加载失败，请检查图片路径或URL。原始输入: {raw_input}"
         elif input_type == "audio":
-            # 音频处理逻辑（如果需要的话，目前先返回提示）
-            log.warning("⚠️ 音频处理功能暂未实现")
-            processed_text = "抱歉，音频输入功能正在开发中。请使用文字描述您的需求。"
-        elif input_type == "text":
-            # 文字输入直接使用
-            formatted_data = None
-            processed_text = raw_input
-        return input_type, formatted_data, processed_text
     def run_inference(self, input_type: str, formatted_input: Union[str, Image.Image], prompt: str) -> str:
-        """执行模型推理"""
         try:
             if input_type == "image" and isinstance(formatted_input, Image.Image):
-                # 图片输入处理
-                image_token = self.processor.tokenizer.image_token
                 if image_token not in prompt:
                     prompt = f"{image_token}\n{prompt}"
                 inputs = self.processor(
-                    text=prompt,
-                    images=formatted_input,
                     return_tensors="pt"
                 ).to(self.model.device, dtype=torch.bfloat16)
             else:
-                # 纯文本输入处理
                 inputs = self.processor(
-                    text=prompt,
                     return_tensors="pt"
                 ).to(self.model.device, dtype=torch.bfloat16)
-            # 生成响应
             with torch.inference_mode():
                 outputs = self.model.generate(
-                    **inputs,
-                    max_new_tokens=512,
                     do_sample=True,
                     temperature=0.7,
                     top_p=0.9,
-                    pad_token_id=self.processor.tokenizer.eos_token_id
                 )
-            # 解码输出
             decoded = self.processor.tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-            # 清理输出，移除输入的prompt部分
             if prompt in decoded:
                 decoded = decoded.replace(prompt, "").strip()
-            return decoded
         except Exception as e:
             log.error(f"❌ 模型推理失败: {e}", exc_info=True)
-            return "抱歉，我在处理您的请求时遇到了技术问题，请稍后再试。"
     def generate(self, user_input: str, context: str = "") -> str:
-        """主要的生成方法 - 支持多模态输入"""
         if not self.is_available():
             return "抱歉，AI 模型当前不可用，请稍后再试。"
         try:
             # 1. 检测输入类型
             input_type = self.detect_input_type(user_input)
             log.info(f"检测到输入类型: {input_type}")
             # 2. 格式化输入
             input_type, formatted_data, processed_text = self.format_input(input_type, user_input)
-            # 3. 构建prompt
-            if context:
-                prompt = (
-                    f"你是一个专业的旅游助手。请基于以下背景信息，用中文友好地回答用户的问题。\n\n"
-                    f"--- 背景信息 ---\n{context}\n\n"
-                    f"--- 用户问题 ---\n{processed_text}\n\n"
-                    f"请提供专业、实用的旅游建议："
-                )
-            else:
-                prompt = (
-                    f"你是一个专业的旅游助手。请用中文友好地回答用户的问题。\n\n"
-                    f"用户问题：{processed_text}\n\n"
-                    f"请提供专业、实用的旅游建议："
-                )
             # 4. 执行推理
             if input_type == "image" and formatted_data is not None:
                 return self.run_inference("image", formatted_data, prompt)
             else:
                 return self.run_inference("text", processed_text, prompt)
         except Exception as e:
             log.error(f"❌ 生成回复时发生错误: {e}", exc_info=True)
             return "抱歉，我在思考时遇到了点麻烦，请稍后再试。"

         try:
             log.info(f"正在加载模型: {self.model_name}")
             token = self._authenticate_hf()
             if not token:
                 self.processor = None
                 return
             cache_dir = "/app/.cache/huggingface"
             self.model = Gemma3nForConditionalGeneration.from_pretrained(
             self.processor = None
     def is_available(self) -> bool:
         return self.model is not None and self.processor is not None
     def detect_input_type(self, input_data: str) -> str:
+        if not isinstance(input_data, str):
+            return "text"
+        image_extensions = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp"]
+        if (input_data.startswith(("http://", "https://")) and
+            any(input_data.lower().endswith(ext) for ext in image_extensions)):
+            return "image"
+        elif any(input_data.endswith(ext) for ext in image_extensions):
+            return "image"
+        elif input_data.startswith("data:image/"):
+            return "image"
+        audio_extensions = [".wav", ".mp3", ".m4a", ".ogg", ".flac"]
+        if (input_data.startswith(("http://", "https://")) and
+            any(input_data.lower().endswith(ext) for ext in audio_extensions)):
+            return "audio"
+        elif any(input_data.endswith(ext) for ext in audio_extensions):
+            return "audio"
         return "text"
     def format_input(self, input_type: str, raw_input: str) -> Tuple[str, Union[str, Image.Image, None]]:
         if input_type == "image":
             try:
                 if raw_input.startswith("data:image/"):
                     header, encoded = raw_input.split(",", 1)
                     image_data = base64.b64decode(encoded)
                     image = Image.open(BytesIO(image_data)).convert("RGB")
                 elif raw_input.startswith(("http://", "https://")):
                     response = requests.get(raw_input, timeout=10)
                     response.raise_for_status()
                     image = Image.open(BytesIO(response.content)).convert("RGB")
                 else:
+                    image = Image.open(raw_input).convert("RGB")
                 log.info("✅ 图片加载成功")
+                return input_type, image, "请描述这张图片，并基于图片内容提供旅游建议。"
             except Exception as e:
                 log.error(f"❌ 图片加载失败: {e}")
+                return "text", None, f"图片加载失败，请检查路径或URL。"
         elif input_type == "audio":
+            log.warning("⚠️ 音频处理功能暂未实现")
+            return "text", None, "抱歉，音频输入功能正在开发中。请使用文字描述您的需求。"
+        else:  # text
+            return input_type, None, raw_input
     def run_inference(self, input_type: str, formatted_input: Union[str, Image.Image], prompt: str) -> str:
         try:
+            if len(prompt) > 500:
+                prompt = prompt[:500] + "..."
             if input_type == "image" and isinstance(formatted_input, Image.Image):
+                image_token = getattr(self.processor.tokenizer, 'image_token', '<image>')
                 if image_token not in prompt:
                     prompt = f"{image_token}\n{prompt}"
                 inputs = self.processor(
+                    text=prompt,
+                    images=formatted_input,
                     return_tensors="pt"
                 ).to(self.model.device, dtype=torch.bfloat16)
             else:
                 inputs = self.processor(
+                    text=prompt,
                     return_tensors="pt"
                 ).to(self.model.device, dtype=torch.bfloat16)
+            if hasattr(inputs, 'input_ids') and inputs.input_ids.shape[-1] > 512:
+                log.warning(f"⚠️ 截断过长输入: {inputs.input_ids.shape[-1]} -> 512")
+                inputs.input_ids = inputs.input_ids[:, :512]
+                if hasattr(inputs, 'attention_mask'):
+                    inputs.attention_mask = inputs.attention_mask[:, :512]
             with torch.inference_mode():
                 outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=256,
                     do_sample=True,
                     temperature=0.7,
                     top_p=0.9,
+                    pad_token_id=self.processor.tokenizer.eos_token_id,
+                    use_cache=True
                 )
             decoded = self.processor.tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+        # 移除prompt部分
             if prompt in decoded:
                 decoded = decoded.replace(prompt, "").strip()
+            return decoded if decoded else "我理解了您的问题，请告诉我更多具体信息。"
+        except RuntimeError as e:
+            if "shape" in str(e):
+                log.error(f"❌ Tensor形状错误: {e}")
+                return "输入处理遇到问题，请尝试简化您的问题。"
+            raise e
         except Exception as e:
             log.error(f"❌ 模型推理失败: {e}", exc_info=True)
+            return "抱歉，处理您的请求时遇到技术问题。"
+    def _build_limited_prompt(self, processed_text: str, context: str = "") -> str:
+        """构建长度受限的prompt - 新增辅助方法"""
+        # 限制输入长度
+        if len(processed_text) > 200:
+            processed_text = processed_text[:200] + "..."
+        if context and len(context) > 300:
+            context = context[:300] + "..."
+        # 保持你原有的prompt结构
+        if context:
+            return (
+                f"你是一个专业的旅游助手。请基于以下背景信息，用中文友好地回答用户的问题。\n\n"
+                f"--- 背景信息 ---\n{context}\n\n"
+                f"--- 用户问题 ---\n{processed_text}\n\n"
+                f"请提供专业、实用的旅游建议："
+            )
+        else:
+            return (
+                f"你是一个专业的旅游助手。请用中文友好地回答用户的问题。\n\n"
+                f"用户问题：{processed_text}\n\n"
+                f"请提供专业、实用的旅游建议："
+            )
     def generate(self, user_input: str, context: str = "") -> str:
+        """主要的生成方法 - 保持原有逻辑"""
         if not self.is_available():
             return "抱歉，AI 模型当前不可用，请稍后再试。"
         try:
             # 1. 检测输入类型
             input_type = self.detect_input_type(user_input)
             log.info(f"检测到输入类型: {input_type}")
             # 2. 格式化输入
             input_type, formatted_data, processed_text = self.format_input(input_type, user_input)
+            # 3. 构建prompt - 使用你的原有结构
+            prompt = self._build_limited_prompt(processed_text, context)
             # 4. 执行推理
             if input_type == "image" and formatted_data is not None:
                 return self.run_inference("image", formatted_data, prompt)
             else:
                 return self.run_inference("text", processed_text, prompt)
         except Exception as e:
             log.error(f"❌ 生成回复时发生错误: {e}", exc_info=True)
             return "抱歉，我在思考时遇到了点麻烦，请稍后再试。"