# Help_Me_3 / app / utils / mahalla_matcher.py
# giyos1212's picture
# Upload 72 files
# 98b6d67 verified
# app/utils/mahalla_matcher.py - IMPROVED (THRESHOLD 0.35)
"""
Mahalla Matcher - find mahalla (neighbourhood) names that were typed incorrectly.
Fuzzy matching + substring matching + word matching
"""
import logging
from typing import Optional, List
from difflib import SequenceMatcher
from app.services.location_validator import get_mahallas_by_district
logger = logging.getLogger(__name__)
def normalize_mahalla_text(text: str) -> str:
    """
    Normalize a mahalla (neighbourhood) name so two spellings can be compared.

    Steps, in order: lowercase, collapse whitespace, remove the words
    "mahallasi" / "mahalla", trim surrounding punctuation, then strip at most
    one Uzbek case/plural suffix from the last word.

    Args:
        text: Raw text; may be empty or None.

    Returns:
        The normalized text, or "" when the input is empty.
    """
    if not text:
        return ""

    # Lowercase and collapse every run of whitespace into a single space.
    text = " ".join(text.lower().split())

    # Remove the generic words "mahallasi" / "mahalla" (with and without a
    # leading space so no double space is left behind).
    text = text.replace(' mahallasi', '').replace(' mahalla', '')
    text = text.replace('mahallasi', '').replace('mahalla', '')

    # Trim punctuation BEFORE suffix stripping; otherwise "chilonzorga." ends
    # with "." and its "ga" suffix is never recognised.
    text = text.strip('.,!? ')

    # Uzbek case/plural suffixes, ordered longest first. With a short-first
    # order "ni"/"ga"/"da"/"dan" would always win over "larni"/"larga"/
    # "larda"/"lardan" and leave a dangling "lar".
    suffixes = (
        "dagina", "lardan",
        "gacha", "larni", "larga", "larda",
        "ning", "niki", "dagi",
        "dan", "dek", "lar",
        "ni", "ga", "da",
    )

    words = text.split()
    if words:
        last_word = words[-1]
        for suffix in suffixes:
            # Keep very short stems intact: at least 3 characters must remain.
            if last_word.endswith(suffix) and len(last_word) > len(suffix) + 2:
                words[-1] = last_word[:-len(suffix)]
                break
        text = " ".join(words)

    return text.strip('.,!? ').strip()
def similarity_score(str1: str, str2: str) -> float:
    """
    Character-level similarity of two strings.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The SequenceMatcher ratio in the range 0.0 - 1.0 (1.0 = identical).
    """
    matcher = SequenceMatcher(isjunk=None, a=str1, b=str2)
    return matcher.ratio()
def word_similarity(str1: str, str2: str) -> float:
    """
    Word-level overlap of two texts (size of the shared word set divided by
    the size of the combined word set).

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        Overlap score in the range 0.0 - 1.0; 0.0 when either text has no words.
    """
    tokens_a = set(str1.split())
    tokens_b = set(str2.split())

    # With no words on either side there is nothing to compare.
    if not (tokens_a and tokens_b):
        return 0.0

    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined)
def find_mahalla_fuzzy(district_name: str, user_text: str, threshold: float = 0.35) -> Optional[str]:
    """
    Find the mahalla of a district that best matches a possibly misspelled name.

    Every candidate is scored with four signals and the highest one is kept:
    the SequenceMatcher ratio, substring containment (0.9 when the input is
    inside the candidate, 0.85 for the reverse), word-set overlap, and an
    identical first word (0.7).

    Args:
        district_name: District name passed to get_mahallas_by_district.
        user_text: Text entered by the user (e.g. "katta chilonzor").
        threshold: Minimum final score (0.0 - 1.0) required to accept a match.

    Returns:
        The best matching mahalla name exactly as supplied by
        get_mahallas_by_district, or None when the input is empty or too
        short, the district has no mahallas, no candidate reaches the
        threshold, or an unexpected error occurs (the error is logged).
    """
    try:
        if not user_text or not district_name:
            return None

        normalized_input = normalize_mahalla_text(user_text)
        logger.info(f"🏘️ Mahalla qidirilmoqda: '{user_text}' → '{normalized_input}' ({district_name})")

        if len(normalized_input) < 2:
            logger.warning("⚠️ Matn juda qisqa")
            return None

        mahallas = get_mahallas_by_district(district_name)
        if not mahallas:
            logger.warning(f"⚠️ {district_name} uchun mahallalar topilmadi")
            return None

        # Loop-invariant values are computed once.
        input_words = normalized_input.split()
        debug_enabled = logger.isEnabledFor(logging.DEBUG)

        best_match = None
        best_score = 0.0
        scoring_details = []  # only filled when DEBUG logging is active

        for mahalla in mahallas:
            normalized_mahalla = normalize_mahalla_text(mahalla)
            if not normalized_mahalla:
                # "" is a substring of every string, so an empty candidate
                # would otherwise collect the 0.85 containment bonus and
                # could be returned as a (false) match.
                continue

            # 1. Whole-string fuzzy match.
            fuzzy_score = similarity_score(normalized_input, normalized_mahalla)

            # 2. Substring containment (large bonus).
            substring_score = 0.0
            if normalized_input in normalized_mahalla:
                substring_score = 0.9
                logger.debug(f" ✓ Substring (input in mahalla): '{normalized_input}' in '{normalized_mahalla}'")
            elif normalized_mahalla in normalized_input:
                substring_score = 0.85
                logger.debug(f" ✓ Substring (mahalla in input): '{normalized_mahalla}' in '{normalized_input}'")

            # 3. Word overlap.
            word_score = word_similarity(normalized_input, normalized_mahalla)

            # 4. Identical first word.
            mahalla_words = normalized_mahalla.split()
            first_word_score = 0.0
            if input_words and mahalla_words and input_words[0] == mahalla_words[0]:
                first_word_score = 0.7
                logger.debug(f" ✓ First word match: '{input_words[0]}'")

            # 5. The final score is the strongest of the four signals.
            final_score = max(fuzzy_score, substring_score, word_score, first_word_score)

            if debug_enabled:
                scoring_details.append({
                    "mahalla": mahalla,
                    "fuzzy": fuzzy_score,
                    "substring": substring_score,
                    "word": word_score,
                    "first_word": first_word_score,
                    "final": final_score,
                })

            # Strict ">" keeps the first candidate when scores tie.
            if final_score > best_score:
                best_score = final_score
                best_match = mahalla

        # Debug aid: show the three best candidates with their sub-scores.
        if debug_enabled:
            scoring_details.sort(key=lambda x: x['final'], reverse=True)
            logger.debug(" Top 3 matches:")
            for i, detail in enumerate(scoring_details[:3], 1):
                logger.debug(f" {i}. {detail['mahalla']}: {detail['final']:.2f} "
                             f"(fuzzy={detail['fuzzy']:.2f}, sub={detail['substring']:.2f}, "
                             f"word={detail['word']:.2f}, first={detail['first_word']:.2f})")

        if best_score >= threshold:
            logger.info(f"✅ Mahalla topildi: '{best_match}' (score: {best_score:.2f})")
            return best_match

        logger.warning(f"⚠️ Mahalla topilmadi (best score: {best_score:.2f} < {threshold})")
        return None
    except Exception as e:
        # Best-effort lookup: callers receive None instead of an exception.
        logger.error(f"❌ Mahalla matching xatoligi: {e}", exc_info=True)
        return None
def get_mahalla_display_name(mahalla_name: str) -> str:
    """
    Return the mahalla name in its full display form.

    Args:
        mahalla_name: e.g. "Beltepa" or "Beltepa mahallasi".

    Returns:
        The name followed by " mahallasi" unless it already contains that word
        (case-insensitive); "" for empty input.
    """
    if not mahalla_name:
        return ""

    already_suffixed = 'mahallasi' in mahalla_name.lower()
    return mahalla_name if already_suffixed else f"{mahalla_name} mahallasi"
def suggest_mahallas(district_name: str, user_text: str, top_n: int = 3) -> List[tuple]:
    """
    Suggest the mahallas of a district that are most similar to the user text.

    Each candidate's score is the maximum of the SequenceMatcher ratio, a 0.9
    bonus when the normalized input is contained in the normalized name, and
    the word-set overlap.

    Args:
        district_name: District name passed to get_mahallas_by_district.
        user_text: Text entered by the user.
        top_n: Maximum number of suggestions to return.

    Returns:
        [(mahalla_name, score), ...] sorted best first; an empty list when any
        argument is empty, top_n is not positive, the text normalizes to
        nothing, the district has no mahallas, or an error occurs.
    """
    # Nothing to rank without text, a district or a positive result count.
    if not user_text or not district_name or top_n <= 0:
        return []

    try:
        normalized_input = normalize_mahalla_text(user_text)
        if not normalized_input:
            # "" is a substring of every name: without this guard every
            # mahalla would receive the 0.9 substring score.
            return []

        mahallas = get_mahallas_by_district(district_name)
        if not mahallas:
            return []

        results = []
        for mahalla in mahallas:
            normalized_mahalla = normalize_mahalla_text(mahalla)

            fuzzy = similarity_score(normalized_input, normalized_mahalla)
            substring = 0.9 if normalized_input in normalized_mahalla else 0.0
            word = word_similarity(normalized_input, normalized_mahalla)

            results.append((mahalla, max(fuzzy, substring, word)))

        # Highest score first; the stable sort keeps source order among ties.
        results.sort(key=lambda x: x[1], reverse=True)
        return results[:top_n]
    except Exception as e:
        # Best-effort: log with traceback and return no suggestions.
        logger.error(f"❌ Mahalla tavsiyalari xatoligi: {e}", exc_info=True)
        return []
# ==================== TESTING HELPER ====================
def test_mahalla_matching(district_name: str, test_inputs: List[str]):
    """
    Manual helper that logs how each test input is matched in a district.

    For every input the result of find_mahalla_fuzzy (threshold 0.35) is
    logged; when nothing matches, the top 3 results of suggest_mahallas are
    logged instead.

    Args:
        district_name: District to test against.
        test_inputs: Input strings to try.
    """
    separator = '=' * 60

    logger.info(f"\n{separator}")
    logger.info(f"TEST: {district_name}")
    logger.info(separator)

    for test_input in test_inputs:
        logger.info(f"\nTest input: '{test_input}'")
        result = find_mahalla_fuzzy(district_name, test_input, threshold=0.35)
        if result:
            logger.info(f" ✅ MATCH: {result}")
            continue

        logger.warning(" ❌ NO MATCH")
        # No match: show the closest candidates to help tune the matcher.
        suggestions = suggest_mahallas(district_name, test_input, top_n=3)
        if suggestions:
            logger.info(" 💡 Suggestions:")
            for mahalla, score in suggestions:
                logger.info(f" - {mahalla} ({score:.2f})")

    logger.info(f"{separator}\n")
# ==================== SELF-TEST WHEN RUN AS A SCRIPT (DEBUG MODE) ====================
if __name__ == "__main__":
    # Manual smoke test: run the matcher against a few sample districts with
    # DEBUG logging enabled so the per-candidate scores are visible.
    logging.basicConfig(level=logging.DEBUG)

    sample_runs = {
        "Chilonzor tumani": ["katta chilonzor", "beltepa", "beshqozon", "qorgon"],
        "Bektemir tumani": ["abay", "bektemir", "binokor"],
        "Shayxontohur tumani": ["kamolon", "shayx"],
    }

    for district, inputs in sample_runs.items():
        test_mahalla_matching(district, inputs)