Spaces:
Paused
Paused
| # app/utils/district_matcher.py | |
| """ | |
| District Matcher - Noto'g'ri yozilgan tuman nomlarini topish | |
| Fuzzy matching ishlatiladi | |
| """ | |
| import logging | |
| from typing import Optional | |
| from difflib import SequenceMatcher | |
logger = logging.getLogger(__name__)

# Tashkent districts, keyed by district ID. Each value lists the accepted
# spellings (canonical form first, then common misspellings and
# transliterations). Entries are lowercase because inputs are lowercased
# before comparison. Every spelling appears once per list: a repeated entry
# would only cost an extra comparison and can never change the match result.
DISTRICT_VARIANTS = {
    "chilonzor": [
        "chilonzor", "chilanazor", "chillonzor", "chilanzor", "chilinzor",
        "chilanzar", "chilinzar", "chilonzar", "chilanzur",
    ],
    "yunusobod": [
        "yunusobod", "yunusabad", "yunusabod", "iunusobod",
        "yunus obod", "yunus abad", "yunusabat", "iunusabad",
    ],
    "mirzo_ulugbek": [
        "mirzo ulugbek", "mirzo ulug'bek", "mirzo ulugʻbek", "mirza ulugbek",
        # NOTE(review): "ttg" may be a typo for another abbreviation — confirm.
        "ttg", "mirzo ulug bek", "mirza ulug'bek", "ulugbek", "ulug'bek",
    ],
    "shayxontohur": [
        "shayxontohur", "shayxontoxur", "shayhontohur", "shayxantoxur",
        "sayxontohur", "sheyhontoxur", "shayxon tohur", "shayxon toxur",
        "shayx tohur", "shayx toxur",
    ],
    "yakkasaroy": [
        "yakkasaroy", "yakkasaray", "yakasaroy", "yakkosaroy", "iakkasaroy",
        "yakka saroy", "yakka saray", "yakkasarai",
    ],
    "mirobod": [
        "mirobod", "mirabod", "mirobad",
        "mir obod", "mir abad", "mirabat",
    ],
    "yashnobod": [
        "yashnobod", "yashnabad", "yeshnobod", "yashnabod", "yashnobad",
        "yash nobod", "yash nabad", "yashnabat",
    ],
    "sergeli": [
        "sergeli", "sergili", "sirgeli", "sergeley", "sirgili",
        "sergel", "sergil",
    ],
    "bektemir": [
        "bektemir", "bektemar", "bektimir", "bektamir", "bektemur",
        "bek temir", "bek tamir", "bektamur",
    ],
    "uchtepa": [
        "uchtepa", "uchtepe", "uchtipi", "uchtepo",
        "uch tepa", "uch tepe", "uchtipa",
    ],
    "olmazor": [
        "olmazor", "almazor", "olmozor",
        "olma zor", "alma zor", "olmazar",
    ],
    "yangihayot": [
        "yangihayot", "yangihayat", "yangi hayot", "yangixayot", "yangihoyot",
        "yangi xayot", "yangi hayat",
    ],
}
def normalize_text(text: str) -> str:
    """
    Normalize free-form text before district matching.

    The text is lowercased, runs of whitespace are collapsed to single
    spaces (leading/trailing whitespace is dropped), and every occurrence
    of " tumani" / " tuman" is removed.

    Args:
        text: Raw input text.

    Returns:
        The normalized text, or "" when the input is empty/falsy.
    """
    if not text:
        return ""

    # split() with no arguments discards leading/trailing whitespace and
    # treats any whitespace run as one separator.
    cleaned = " ".join(text.lower().split())

    # Longer form first, so " tumani" does not leave a dangling "i" behind.
    for district_word in (" tumani", " tuman"):
        cleaned = cleaned.replace(district_word, "")

    return cleaned
def similarity_score(str1: str, str2: str) -> float:
    """
    Measure how similar two strings are.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The difflib SequenceMatcher ratio of the two strings, between
        0.0 and 1.0 (1.0 means the strings are identical).
    """
    matcher = SequenceMatcher(a=str1, b=str2)
    return matcher.ratio()
def find_district_fuzzy(user_text: str, threshold: float = 0.5) -> Optional[str]:
    """
    Resolve a possibly misspelled district name to a district ID.

    The input is normalized with normalize_text() and compared against
    every spelling in DISTRICT_VARIANTS. The score of a spelling is the
    highest of:

    1. the fuzzy similarity returned by similarity_score();
    2. 0.85 when one string contains the other;
    3. 0.75 when either string starts with the first four characters
       of the other.

    The district owning the best-scoring spelling wins. Ties keep the
    district that comes first in DISTRICT_VARIANTS, because only a strictly
    higher score replaces the current best.

    Args:
        user_text: Text typed by the user (e.g. "chillonzor", "yunusabad").
        threshold: Minimum score (0.0 - 1.0) required to accept a match.

    Returns:
        The district ID (e.g. "chilonzor"), or None when the input is
        empty, shorter than 3 characters after normalization, scores below
        the threshold, or an unexpected error occurs during matching.
    """
    try:
        if not user_text:
            return None

        normalized_input = normalize_text(user_text)
        logger.info(
            "🏙️ Tuman qidirilmoqda: '%s' → '%s'", user_text, normalized_input
        )

        if len(normalized_input) < 3:
            logger.warning("⚠️ Matn juda qisqa")
            return None

        # Loop-invariant: the input prefix used by the prefix bonus.
        input_prefix = normalized_input[:4]

        best_match = None
        best_score = 0.0

        for district_id, variants in DISTRICT_VARIANTS.items():
            for variant in variants:
                # 1. Plain fuzzy similarity.
                score = similarity_score(normalized_input, variant)

                # 2. Containment bonus.
                if normalized_input in variant or variant in normalized_input:
                    score = max(score, 0.85)

                # 3. Shared-prefix bonus.
                if variant.startswith(input_prefix) or normalized_input.startswith(variant[:4]):
                    score = max(score, 0.75)

                if score > best_score:
                    best_score = score
                    best_match = district_id

            # An exact match cannot be beaten (scores never exceed 1.0 and
            # only a strictly higher score replaces the best), so stop here.
            if best_score >= 1.0:
                break

        if best_score >= threshold:
            logger.info("✅ Tuman topildi: '%s' (score: %.2f)", best_match, best_score)
            return best_match

        logger.warning(
            "⚠️ Tuman topilmadi (best score: %.2f < %s)", best_score, threshold
        )
        return None

    except Exception as e:
        # Best-effort lookup: never propagate to the caller, but keep the
        # traceback in the log so the failure can be diagnosed.
        logger.exception("❌ District matching xatoligi: %s", e)
        return None
def get_district_display_name(district_id: str) -> str:
    """
    Translate a district ID into its full display name.

    Args:
        district_id: District ID, e.g. "chilonzor".

    Returns:
        The display name (e.g. "Chilonzor tumani"). An ID that is not in
        the table is returned unchanged.
    """
    display_names = {
        "chilonzor": "Chilonzor tumani",
        "yunusobod": "Yunusobod tumani",
        "mirzo_ulugbek": "Mirzo Ulug'bek tumani",
        "shayxontohur": "Shayxontohur tumani",
        "yakkasaroy": "Yakkasaroy tumani",
        "mirobod": "Mirobod tumani",
        "yashnobod": "Yashnobod tumani",
        "sergeli": "Sergeli tumani",
        "bektemir": "Bektemir tumani",
        "uchtepa": "Uchtepa tumani",
        "olmazor": "Olmazor tumani",
        "yangihayot": "Yangihayot tumani",
    }
    # Unknown IDs fall through unchanged rather than raising.
    return display_names[district_id] if district_id in display_names else district_id
def list_all_districts_text() -> str:
    """
    Return all district names as one comma-separated string (for the AI).

    Returns:
        "Chilonzor, Yunusobod, Mirzo Ulug'bek, ..."
    """
    names = (
        "Chilonzor",
        "Yunusobod",
        "Mirzo Ulug'bek",
        "Shayxontohur",
        "Yakkasaroy",
        "Mirobod",
        "Yashnobod",
        "Sergeli",
        "Bektemir",
        "Uchtepa",
        "Olmazor",
        "Yangihayot",
    )
    return ", ".join(names)