Eliot0110 committed on
Commit
b713501
·
1 Parent(s): 3327cc4

improve: reverse to old version

Browse files
modules/info_extractor.py CHANGED
@@ -4,7 +4,6 @@ from utils.logger import log
4
  import jieba
5
  from typing import List, Tuple
6
  import copy
7
-
8
  class InfoExtractor:
9
  def __init__(self):
10
 
@@ -341,6 +340,7 @@ class InfoExtractor:
341
  existing_info[key] = value
342
  return existing_info
343
 
 
344
  def _tokenize_message(self, text: str) -> list:
345
  """智能分词,支持中英文混合"""
346
 
@@ -754,129 +754,189 @@ class InfoExtractor:
754
  return result
755
 
756
  def _extract_budget_from_tokens(self, tokens: list) -> dict:
757
-
758
  result = {}
759
- text = "".join(tokens).lower().strip()
760
-
761
- # --- 1. 统一提取金额和货币 ---
762
- # 按优先级排列正则表达式,越精确的模式越靠前
763
- # 模式捕获: (金额数字, 乘数单位[千/万/k/w], 货币单位[元/欧/usd...])
764
- patterns = [
765
- {'regex': r'¥\s*(\d+\.?\d*)', 'groups': {'amount': 1}, 'currency': 'RMB'},
766
- {'regex': r'€\s*(\d+\.?\d*)', 'groups': {'amount': 1}, 'currency': 'EUR'},
767
- {'regex': r'\$\s*(\d+\.?\d*)', 'groups': {'amount': 1}, 'currency': 'USD'},
768
- {'regex': r'£\s*(\d+\.?\d*)', 'groups': {'amount': 1}, 'currency': 'GBP'},
769
- {'regex': r'(usd|rmb|eur|gbp|chf|jpy)\s*(\d+\.?\d*)\s*(百|hundred|千|k|thousand|万|w)?',
770
- 'groups': {'currency': 1, 'amount': 2, 'multiplier': 3}},
771
- {'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(人民币|元|块|块钱|rmb)',
772
- 'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'RMB'},
773
- {'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(欧元|欧|euros?|eur)',
774
- 'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'EUR'},
775
- {'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(美元|dollars?|dollar|usd)',
776
- 'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'USD'},
777
- {'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(英镑|pounds?|pound|gbp)',
778
- 'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'GBP'},
779
- {'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(日元|yen|jpy)',
780
- 'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'JPY'},
781
- {'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(瑞郎|瑞士法郎|chf)',
782
- 'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'CHF'},
783
-
784
- {'regex': r'(\d+\.?\d+)\s*(十|百|hundred|千|k|thousand|万|w)',
785
- 'groups': {'amount': 1, 'multiplier': 2}, 'context_needed': True},
786
- {'regex': r'(\d+\.?\d+)',
787
- 'groups': {'amount': 1}, 'context_needed': True},
788
- ]
789
-
790
- amount_found = False
791
- for p in patterns:
792
- match = re.search(p['regex'], text)
793
- if match:
794
- # 检查是否是纯数字模式,是的话需要上下文
795
- if p.get('context_needed', False):
796
- budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', 'budget', 'cost']
797
- if not any(indicator in text for indicator in budget_indicators):
798
- continue # 如果没有上下文,则跳过纯数字匹配
799
-
800
- groups = match.groups()
801
-
802
- # 提取金额
803
- amount_group_index = p['groups']['amount'] - 1
804
- amount = float(groups[amount_group_index])
805
-
806
- # 确定乘数
807
- multiplier = 1
808
- multiplier_token = ''
809
- if 'multiplier' in p['groups']:
810
- multiplier_group_index = groups[p['groups']['multiplier']-1]
811
- if multiplier_group_index < len(groups) and groups[multiplier_group_index]:
812
- multiplier_token = groups[multiplier_group_index]
813
- if '十' in multiplier_token:
814
- multiplier = 10
815
- elif '百' in multiplier_token or 'hundred' in multiplier_token:
816
- multiplier = 100
817
- elif '千' in multiplier_token or 'k' in multiplier_token or 'thousand' in multiplier_token:
818
- multiplier = 1000
819
- elif '万' in multiplier_token or 'w' in multiplier_token:
820
- multiplier = 10000
821
 
822
- final_amount = amount * multiplier
823
- result['amount'] = int(final_amount)
824
-
825
- # 确定货币
826
- currency_token = ''
827
- if p.get('currency'):
828
- result['currency'] = p['currency']
829
- elif 'currency_str' in p['groups']:
830
- currency_group_index = p['groups']['currency_str'] - 1
831
- currency_token = groups[currency_group_index]
832
- currency_map = {'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF', 'jpy': 'JPY'}
833
- if currency_token in currency_map:
834
- result['currency'] = currency_map[currency_token]
835
-
836
- amount_found = True
837
- break
838
-
839
- # 2. 查找预算类型(此部分逻辑与金额完全无关)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840
  budget_type_keywords = {
841
- 'economy': ['经济', '便宜', '省钱', '实惠', '节省', '穷游', '学生', '背包客', '预算有限', '性价比', 'budget', 'cheap'],
842
- 'comfortable': ['舒适', '中等', '适中', '标准', '普通', '中档', '合理', 'comfortable', 'standard'],
843
- 'luxury': ['豪华', '奢华', '高端', '顶级', '精品', '不差钱', '任性', '土豪', '五星', 'luxury', 'premium']
 
 
 
 
 
 
 
 
 
 
 
844
  }
845
-
846
- # 查找最能代表预算类型的关键词
847
- found_type_keyword = ""
848
- found_type = ""
849
  for token in tokens:
850
  token_lower = token.lower()
851
  for budget_type, keywords in budget_type_keywords.items():
852
- for keyword in keywords:
853
- if keyword in token_lower:
854
- # 优先选择更长的、更具体的关键词作为描述
855
- if len(keyword) > len(found_type_keyword):
856
- found_type_keyword = keyword
857
- found_type = budget_type
858
-
859
- if found_type:
860
- result["type"] = found_type
861
- result["description"] = found_type_keyword # 使用找到的最匹配的关键词作为描述
862
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863
  if not result.get("amount"):
864
- chinese_money_mapping = {
865
- '一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000,
866
- '一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
867
- '六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
868
- '一百': 100, '二百': 200, '三百': 300, '四百': 400, '五百': 500,
869
- '六百': 600, '七百': 700, '八百': 800, '九百': 900,
870
- '十': 10, '二十': 20, '三十': 30, '四十': 40, '五十': 50,
871
- '六十': 60, '七十': 70, '八十': 80, '九十': 90
872
- }
873
-
874
- sorted_keys = sorted(chinese_money_mapping.keys(), key=len, reverse=True)
875
- for name in sorted_keys:
876
- if name in text:
877
- result['amount'] = chinese_money_mapping[name]
878
  break
879
-
880
  return result
881
 
882
  # 保持向后兼容的验证方法
 
4
  import jieba
5
  from typing import List, Tuple
6
  import copy
 
7
  class InfoExtractor:
8
  def __init__(self):
9
 
 
340
  existing_info[key] = value
341
  return existing_info
342
 
343
+
344
  def _tokenize_message(self, text: str) -> list:
345
  """智能分词,支持中英文混合"""
346
 
 
754
  return result
755
 
756
  def _extract_budget_from_tokens(self, tokens: list) -> dict:
757
+ """从tokens中提取预算信息"""
758
  result = {}
759
+
760
+ # 1. 查找金额
761
+ for i, token in enumerate(tokens):
762
+ amount = None
763
+ currency = "RMB" # 默认货币
764
+
765
+ # 处理包含货币的token "2000欧", "5000元"
766
+ currency_patterns = [
767
+ (r'(\d+(?:\.\d+)?)欧(?:元)?', 'EUR'),
768
+ (r'(\d+(?:\.\d+)?)元', 'RMB'),
769
+ (r'(\d+(?:\.\d+)?)块(?:钱)?', 'RMB'),
770
+ (r'(\d+(?:\.\d+)?)人民币', 'RMB'),
771
+ (r'(\d+(?:\.\d+)?)美元', 'USD'),
772
+ (r'(\d+(?:\.\d+)?)英镑', 'GBP'),
773
+ (r'(\d+(?:\.\d+)?)(?:士)?法郎', 'CHF'),
774
+ (r'(\d+(?:\.\d+)?)日元', 'JPY'),
775
+ (r'(\d+(?:\.\d+)?)韩元', 'KRW'),
776
+ (r'¥(\d+(?:\.\d+)?)', 'RMB'),
777
+ (r'(\d+(?:\.\d+)?)', 'EUR'),
778
+ (r'\$(\d+(?:\.\d+)?)', 'USD'),
779
+ (r'£(\d+(?:\.\d+)?)', 'GBP'),
780
+ (r'(\d+(?:\.\d+)?)rmb', 'RMB'),
781
+ (r'(\d+(?:\.\d+)?)usd', 'USD'),
782
+ (r'(\d+(?:\.\d+)?)eur', 'EUR'),
783
+ (r'(\d+(?:\.\d+)?)gbp', 'GBP'),
784
+ (r'(\d+(?:\.\d+)?)chf', 'CHF'),
785
+ ]
786
+
787
+ for pattern, curr in currency_patterns:
788
+ match = re.search(pattern, token.lower())
789
+ if match:
790
+ amount = float(match.group(1))
791
+ currency = curr
792
+ break
793
+
794
+ # 处理纯数字token(需要查看上下文)
795
+ if not amount and re.match(r'^\d+(?:\.\d+)?$', token):
796
+ number = float(token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
797
 
798
+ # 检查前面的token是否有预算相关词汇
799
+ budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', '总共', '一共', 'budget', 'cost', 'spend']
800
+ has_budget_context = False
801
+
802
+ if i > 0 and tokens[i-1] in budget_indicators:
803
+ has_budget_context = True
804
+ elif i > 1 and tokens[i-2] in budget_indicators:
805
+ has_budget_context = True
806
+
807
+ # 检查后面是否有货币单位
808
+ if i < len(tokens) - 1:
809
+ next_token = tokens[i + 1].lower()
810
+ currency_units = {
811
+ '元': 'RMB', '块': 'RMB', '钱': 'RMB', '人民币': 'RMB',
812
+ '欧': 'EUR', '欧元': 'EUR', '美元': 'USD', '英镑': 'GBP',
813
+ '瑞郎': 'CHF', '日元': 'JPY', '韩元': 'KRW',
814
+ 'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF'
815
+ }
816
+
817
+ if next_token in currency_units:
818
+ amount = number
819
+ currency = currency_units[next_token]
820
+ has_budget_context = True
821
+
822
+ # 如果有预算上下文但没有明确货币单位,根据数字大小推断
823
+ if has_budget_context and not amount:
824
+ if number < 100: # 可能是欧元或美元
825
+ # 查看是否有欧洲城市上下文
826
+ has_european_context = any(self._normalize_city_name(t) for t in tokens)
827
+ if has_european_context:
828
+ currency = 'EUR'
829
+ else:
830
+ currency = 'USD'
831
+ else:
832
+ currency = 'RMB' # 大数字更可能是人民币
833
+ amount = number
834
+
835
+ # 处理万、千等单位
836
+ if amount:
837
+ # 检查是否有万、千修饰符
838
+ if i > 0:
839
+ prev_token = tokens[i-1]
840
+ if '万' in prev_token or 'w' in prev_token.lower():
841
+ amount *= 10000
842
+ elif '千' in prev_token or 'k' in prev_token.lower():
843
+ amount *= 1000
844
+ elif i < len(tokens) - 1:
845
+ next_token = tokens[i+1]
846
+ if '万' in next_token or 'w' in next_token.lower():
847
+ amount *= 10000
848
+ elif '千' in next_token or 'k' in next_token.lower():
849
+ amount *= 1000
850
+
851
+ if amount > 0:
852
+ result["amount"] = int(amount)
853
+ result["currency"] = currency
854
+ break
855
+
856
+ # 2. 查找预算类型
857
  budget_type_keywords = {
858
+ 'economy': [
859
+ '经济', '便宜', '省钱', '实惠', '节省', '穷游', '学生', '青年',
860
+ '预算有限', '钱不多', '不贵', '划算', '性价比', '背包客',
861
+ '简单', '基础', '低成本', '节约', 'budget', 'cheap', 'economy', 'affordable'
862
+ ],
863
+ 'comfortable': [
864
+ '舒适', '中等', '适中', '一般', '标准', '普通', '正常', '常规',
865
+ '中档', '中级', '合理', '平均', '中间档次', 'comfortable', 'standard', 'moderate'
866
+ ],
867
+ 'luxury': [
868
+ '豪华', '奢华', '高端', '顶级', '精品', '奢侈', '贵族', '皇家',
869
+ '贵一点', '不差钱', '任性', '土豪', '有钱', '五星', 'VIP',
870
+ 'luxury', 'premium', 'high-end', 'expensive', 'fancy'
871
+ ]
872
  }
873
+
 
 
 
874
  for token in tokens:
875
  token_lower = token.lower()
876
  for budget_type, keywords in budget_type_keywords.items():
877
+ if any(keyword in token_lower for keyword in keywords):
878
+ result["type"] = budget_type
879
+
880
+ # 找到第一个匹配的关键词作为描述
881
+ for keyword in keywords:
882
+ if keyword in token_lower:
883
+ result["description"] = keyword if len(keyword) > 2 else token
884
+ break
885
+ break
886
+ if result.get("type"):
887
+ break
888
+
889
+ # 3. 如果有金额但没有类型,根据金额推断类型
890
+ if result.get("amount") and not result.get("type"):
891
+ amount = result["amount"]
892
+ currency = result.get("currency", "RMB")
893
+
894
+ # 根据欧洲旅行成本设置阈值
895
+ if currency == "EUR":
896
+ if amount < 1500: # 总预算
897
+ result["type"] = "economy"
898
+ result["description"] = "经济预算"
899
+ elif amount < 4000:
900
+ result["type"] = "comfortable"
901
+ result["description"] = "舒适预算"
902
+ else:
903
+ result["type"] = "luxury"
904
+ result["description"] = "豪华预算"
905
+ elif currency == "USD":
906
+ if amount < 2000:
907
+ result["type"] = "economy"
908
+ result["description"] = "经济预算"
909
+ elif amount < 5000:
910
+ result["type"] = "comfortable"
911
+ result["description"] = "舒适预算"
912
+ else:
913
+ result["type"] = "luxury"
914
+ result["description"] = "豪华预算"
915
+ elif currency == "RMB":
916
+ if amount < 8000:
917
+ result["type"] = "economy"
918
+ result["description"] = "经济预算"
919
+ elif amount < 20000:
920
+ result["type"] = "comfortable"
921
+ result["description"] = "舒适预算"
922
+ else:
923
+ result["type"] = "luxury"
924
+ result["description"] = "豪华预算"
925
+
926
+ # 4. 处理中文数字金额
927
+ chinese_money_mapping = {
928
+ '一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
929
+ '六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
930
+ '一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000
931
+ }
932
+
933
  if not result.get("amount"):
934
+ for token in tokens:
935
+ if token in chinese_money_mapping:
936
+ result["amount"] = chinese_money_mapping[token]
937
+ result["currency"] = "RMB"
 
 
 
 
 
 
 
 
 
 
938
  break
939
+
940
  return result
941
 
942
  # 保持向后兼容的验证方法
modules/response_generator.py CHANGED
@@ -110,62 +110,16 @@ class ResponseGenerator:
110
  return "抱歉,我在处理您的请求时遇到了问题,请稍后再试。"
111
 
112
  def _generate_vivid_acknowledgement(self, extracted_info: dict, session_state: SessionManager) -> str:
113
- """
114
- 【更新版本】根据最新提取的信息,生成一句生动的、非模板化的确认语。
115
- 此版本已重构,以处理更复杂的预算逻辑,并实现主动询问。
116
- """
117
- # --- 目的地确认 ---
118
  if "destination" in extracted_info and extracted_info["destination"]:
119
- dest_info = extracted_info["destination"]
120
- dest_name = dest_info.get('name')
121
-
122
- if not dest_name: return "" # 如果没有提取到有效名称,则不作回复
123
 
124
- # 优先使用预设的城市描述,让回复更生动
125
  if dest_name in self.city_descriptions:
126
  feature = random.choice(self.city_descriptions[dest_name])
127
  return f"{dest_name}!一个绝佳的选择,那可是著名的'{feature}'。目的地已为您记录。"
128
  else:
129
- # 修复了原代码中从列表获取国家信息的bug
130
- dest_country = dest_info.get('country')
131
- if dest_country:
132
- return f"好的,目的地已确认为 {dest_country} 的 {dest_name}!一个充满魅力的地方。"
133
- else:
134
- return f"好的,目的地 {dest_name} 已记录!听起来是个很棒的地方。"
135
-
136
- # --- 旅行时长确认 ---
137
- if "duration" in extracted_info and extracted_info["duration"]:
138
- # 使用 .get() 增加代码健壮性
139
- duration_description = extracted_info["duration"].get('description', '一段美好的时光')
140
- return f"了解,{duration_description}的行程,时间很充裕,可以深度体验了!"
141
-
142
- # --- 预算确认(核心修改部分) ---
143
- if "budget" in extracted_info and extracted_info["budget"]:
144
- budget_dict = extracted_info["budget"]
145
- amount = budget_dict.get("amount")
146
- currency = budget_dict.get("currency")
147
- budget_type_desc = budget_dict.get("description") # 例如:“穷游”
148
-
149
- # 场景一:用户提供了金额,但没说货币单位 -> 主动追问货币
150
- if amount and not currency:
151
- return f"收到,您的预算是 {amount}。请问这是以什么货币计算的呢?(例如:人民币、欧元、美元)"
152
-
153
- # 场景二:用户提供了完整的金额和货币 -> 优雅地确认
154
- if amount and currency:
155
- if budget_type_desc:
156
- # 例如,用户说:“我预算5000元,想穷游”
157
- return f"好的,您「{budget_type_desc}」的预算({amount} {currency})已为您记录,我会为您规划性价比最高的方案。"
158
- else:
159
- # 例如,用户说:“我预算5000元”
160
- return f"好的,预算 {amount} {currency} 已为您记录,我会为您规划性价比最高的方案。"
161
-
162
- # 场景三:用户只提了预算类型,没说金额 -> 只确认风格
163
- if budget_type_desc and not amount:
164
- # 例如,用户说:“我这次想穷游”
165
- return f"了解,您偏爱「{budget_type_desc}」的旅行方式,我会按这个风格为您规划。"
166
-
167
- # 如果没有提取到任何新信息,返回空字符串
168
- return ""
169
 
170
 
171
  def _get_dynamic_next_question(self, session_state: SessionManager) -> str:
@@ -175,7 +129,7 @@ class ResponseGenerator:
175
  if not session_state.get('duration'):
176
  return "计划玩几天呢?"
177
  if not session_state.get('budget'):
178
- return "您的旅行预算大概是多少?"
179
  return "" # 所有信息都已收集
180
 
181
 
 
110
  return "抱歉,我在处理您的请求时遇到了问题,请稍后再试。"
111
 
112
  def _generate_vivid_acknowledgement(self, extracted_info: dict, session_state: SessionManager) -> str:
113
+
 
 
 
 
114
  if "destination" in extracted_info and extracted_info["destination"]:
115
+ dest_name = extracted_info["destination"]['name']
 
 
 
116
 
 
117
  if dest_name in self.city_descriptions:
118
  feature = random.choice(self.city_descriptions[dest_name])
119
  return f"{dest_name}!一个绝佳的选择,那可是著名的'{feature}'。目的地已为您记录。"
120
  else:
121
+ dest_country = extracted_info["destination"][0]['country']
122
+ return f"好的,目的地已确认为 {dest_country} {dest_name}!一个充满魅力的地方。"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
 
125
  def _get_dynamic_next_question(self, session_state: SessionManager) -> str:
 
129
  if not session_state.get('duration'):
130
  return "计划玩几天呢?"
131
  if not session_state.get('budget'):
132
+ return "您的旅行预算大概是多少?您可以金额+币种的格式输入,例如:5000元人民币 或 800 eur"
133
  return "" # 所有信息都已收集
134
 
135