# Extracted from a web file viewer (Space status: Paused; file size: 6,521 bytes; revision 98b6d67).
# app/utils/district_matcher.py
"""
District Matcher - Noto'g'ri yozilgan tuman nomlarini topish
Fuzzy matching ishlatiladi
"""
import logging
from typing import Optional
from difflib import SequenceMatcher
logger = logging.getLogger(__name__)
# Tashkent districts, each with its known spellings and misspellings.
# Maps a district ID to the list of lower-case variants that the fuzzy matcher
# compares user input against. Every variant appears once per district:
# a repeated entry can never change the best match and only costs an extra
# comparison.
DISTRICT_VARIANTS = {
    "chilonzor": [
        "chilonzor", "chilanazor", "chillonzor", "chilanzor", "chilinzor",
        "chilanzar", "chilinzar", "chilonzar", "chilanzur",
    ],
    "yunusobod": [
        "yunusobod", "yunusabad", "yunusabod", "iunusobod",
        "yunus obod", "yunus abad", "yunusabat", "iunusabad",
    ],
    "mirzo_ulugbek": [
        "mirzo ulugbek", "mirzo ulug'bek", "mirzo ulugʻbek", "mirza ulugbek",
        # NOTE(review): "ttg" looks like an abbreviation/alias for this
        # district — confirm its meaning with the data owner.
        "ttg", "mirzo ulug bek", "mirza ulug'bek", "ulugbek", "ulug'bek",
    ],
    "shayxontohur": [
        "shayxontohur", "shayxontoxur", "shayhontohur", "shayxantoxur",
        "sayxontohur", "sheyhontoxur", "shayxon tohur", "shayxon toxur",
        "shayx tohur", "shayx toxur",
    ],
    "yakkasaroy": [
        "yakkasaroy", "yakkasaray", "yakasaroy", "yakkosaroy", "iakkasaroy",
        "yakka saroy", "yakka saray", "yakkasarai",
    ],
    "mirobod": [
        "mirobod", "mirabod", "mirobad",
        "mir obod", "mir abad", "mirabat",
    ],
    "yashnobod": [
        "yashnobod", "yashnabad", "yeshnobod", "yashnabod", "yashnobad",
        "yash nobod", "yash nabad", "yashnabat",
    ],
    "sergeli": [
        "sergeli", "sergili", "sirgeli", "sergeley", "sirgili",
        "sergel", "sergil",
    ],
    "bektemir": [
        "bektemir", "bektemar", "bektimir", "bektamir", "bektemur",
        "bek temir", "bek tamir", "bektamur",
    ],
    "uchtepa": [
        "uchtepa", "uchtepe", "uchtipi", "uchtepo",
        "uch tepa", "uch tepe", "uchtipa",
    ],
    "olmazor": [
        "olmazor", "almazor", "olmozor",
        "olma zor", "alma zor", "olmazar",
    ],
    "yangihayot": [
        "yangihayot", "yangihayat", "yangi hayot", "yangixayot", "yangihoyot",
        "yangi xayot", "yangi hayat",
    ],
}
def normalize_text(text: str) -> str:
    """
    Normalize raw user text before district matching.

    Lower-cases the text, trims it, collapses every whitespace run into a
    single space, and removes each occurrence of the district suffix
    " tumani" / " tuman" (including its leading space).

    Args:
        text: Raw input text. Falsy values (empty string, None) are accepted.
    Returns:
        The normalized text, or "" when the input is falsy.
    """
    if not text:
        return ""

    # split() without arguments discards leading/trailing whitespace and
    # treats any whitespace run as a single separator.
    cleaned = " ".join(text.lower().split())

    # Longer suffix first, so " tumani" never leaves a dangling "i" behind.
    for suffix in (" tumani", " tuman"):
        cleaned = cleaned.replace(suffix, "")
    return cleaned
def similarity_score(str1: str, str2: str) -> float:
    """
    Measure how similar two strings are.

    Args:
        str1: First string.
        str2: Second string.
    Returns:
        The difflib SequenceMatcher ratio, a float in the range 0.0 - 1.0
        (1.0 means the strings are identical).
    """
    matcher = SequenceMatcher(isjunk=None, a=str1, b=str2)
    return matcher.ratio()
def find_district_fuzzy(user_text: str, threshold: float = 0.5) -> Optional[str]:
    """
    Resolve a possibly misspelled district name to its district ID.

    The input is normalized with normalize_text() and compared against every
    variant in DISTRICT_VARIANTS. Each variant's score is the highest of:
      1. the fuzzy similarity from similarity_score();
      2. 0.85 when either string contains the other;
      3. 0.75 when either string starts with the other's first four
         characters.
    The district owning the best-scoring variant is returned if that score
    reaches the threshold.

    NOTE(review): ties keep the first district found, so a fragment shared by
    several districts (e.g. "obod") resolves to whichever district is iterated
    first — confirm this is acceptable to callers.

    Args:
        user_text: Text typed by the user (e.g. "chillonzor" or "yunusabad").
        threshold: Minimum score required to accept a match (0.5 = 50%).
    Returns:
        The district ID (e.g. "chilonzor"), or None when the input is empty,
        shorter than three characters after normalization, below the
        threshold, or when an unexpected error occurs.
    """
    try:
        if not user_text:
            return None

        normalized_input = normalize_text(user_text)
        logger.info("🏙️ Tuman qidirilmoqda: '%s' → '%s'", user_text, normalized_input)

        if len(normalized_input) < 3:
            logger.warning("⚠️ Matn juda qisqa")
            return None

        # Loop-invariant: the input prefix used by the word-start bonus.
        input_prefix = normalized_input[:4]

        best_match = None
        best_score = 0.0
        for district_id, variants in DISTRICT_VARIANTS.items():
            for variant in variants:
                # 1. Full fuzzy match.
                score = similarity_score(normalized_input, variant)

                # 2. Substring bonus.
                if normalized_input in variant or variant in normalized_input:
                    score = max(score, 0.85)

                # 3. Word-start bonus.
                if variant.startswith(input_prefix) or normalized_input.startswith(variant[:4]):
                    score = max(score, 0.75)

                # Strict '>' keeps the earliest district on ties.
                if score > best_score:
                    best_score = score
                    best_match = district_id

            # A perfect score cannot be beaten (later candidates would need
            # a strictly higher score), so the search can stop here.
            if best_score >= 1.0:
                break

        if best_score >= threshold:
            logger.info("✅ Tuman topildi: '%s' (score: %.2f)", best_match, best_score)
            return best_match

        logger.warning("⚠️ Tuman topilmadi (best score: %.2f < %s)", best_score, threshold)
        return None
    except Exception as e:
        # Deliberate best-effort: matching must never crash the caller.
        # logger.exception keeps the traceback that logger.error discarded.
        logger.exception("❌ District matching xatoligi: %s", e)
        return None
def get_district_display_name(district_id: str) -> str:
    """
    Translate a district ID into its full display name.

    Args:
        district_id: District ID such as "chilonzor".
    Returns:
        The display name (e.g. "Chilonzor tumani"). An ID that is not in the
        table is returned unchanged.
    """
    display_names = {
        "chilonzor": "Chilonzor tumani",
        "yunusobod": "Yunusobod tumani",
        "mirzo_ulugbek": "Mirzo Ulug'bek tumani",
        "shayxontohur": "Shayxontohur tumani",
        "yakkasaroy": "Yakkasaroy tumani",
        "mirobod": "Mirobod tumani",
        "yashnobod": "Yashnobod tumani",
        "sergeli": "Sergeli tumani",
        "bektemir": "Bektemir tumani",
        "uchtepa": "Uchtepa tumani",
        "olmazor": "Olmazor tumani",
        "yangihayot": "Yangihayot tumani",
    }
    try:
        return display_names[district_id]
    except KeyError:
        # Unknown IDs fall through untouched.
        return district_id
def list_all_districts_text() -> str:
    """
    Return the names of all districts as one comma-separated string.

    The original note says this text is meant to be handed to the AI.

    Returns:
        "Chilonzor, Yunusobod, Mirzo Ulug'bek, ..."
    """
    districts = (
        "Chilonzor", "Yunusobod", "Mirzo Ulug'bek", "Shayxontohur",
        "Yakkasaroy", "Mirobod", "Yashnobod", "Sergeli",
        "Bektemir", "Uchtepa", "Olmazor", "Yangihayot",
    )
    return ", ".join(districts)