# app/utils/mahalla_matcher.py - IMPROVED (THRESHOLD 0.35)
"""
Mahalla Matcher - find mahalla (neighbourhood) names that were typed incorrectly.

Combines fuzzy matching, substring matching and word-level matching.
"""

import logging
from typing import Dict, Optional, List
from difflib import SequenceMatcher

from app.services.location_validator import get_mahallas_by_district

logger = logging.getLogger(__name__)

# Characters stripped from both ends of the text during normalization.
_PUNCTUATION = '.,!? '

# Uzbek case/plural suffixes, ordered longest first so that compound
# suffixes ("larni", "lardan", ...) are tried before the short suffixes
# they end with ("ni", "dan", ...).
_CASE_SUFFIXES = (
    "lardan", "dagina",
    "larni", "larga", "larda", "gacha",
    "ning", "niki", "dagi",
    "dan", "dek", "lar",
    "ni", "ga", "da",
)

# Fixed scores awarded by the non-fuzzy matching strategies.
_INPUT_IN_MAHALLA_SCORE = 0.9    # user input is a substring of the mahalla name
_MAHALLA_IN_INPUT_SCORE = 0.85   # mahalla name is a substring of the user input
_FIRST_WORD_SCORE = 0.7          # first words are identical


def normalize_mahalla_text(text: str) -> str:
    """
    Normalize a mahalla name for comparison.

    Lower-cases the text, collapses whitespace, removes the words
    "mahallasi"/"mahalla", strips surrounding punctuation and removes one
    Uzbek case/plural suffix from the last word.

    Args:
        text: Original text (may be empty or None)

    Returns:
        Normalized text ("" for empty input)
    """
    if not text:
        return ""

    # Lower-case and collapse runs of whitespace into single spaces
    text = " ".join(text.lower().split())

    # Drop the words "mahallasi" / "mahalla"
    text = text.replace(' mahallasi', '').replace(' mahalla', '')
    text = text.replace('mahallasi', '').replace('mahalla', '')

    # Strip punctuation BEFORE suffix removal, otherwise a trailing "."
    # or "," hides the suffix of the last word ("chilonzorda.").
    text = text.strip(_PUNCTUATION)

    words = text.split()
    if words:
        last_word = words[-1]
        for suffix in _CASE_SUFFIXES:
            # Keep very short stems intact: at least 3 characters must remain
            if last_word.endswith(suffix) and len(last_word) > len(suffix) + 2:
                words[-1] = last_word[:-len(suffix)]
                break
        text = " ".join(words)

    return text.strip(_PUNCTUATION).strip()


def similarity_score(str1: str, str2: str) -> float:
    """
    Similarity between two strings (0.0 - 1.0).

    Args:
        str1: First string
        str2: Second string

    Returns:
        SequenceMatcher ratio (1.0 = identical)
    """
    return SequenceMatcher(None, str1, str2).ratio()


def word_similarity(str1: str, str2: str) -> float:
    """
    Word-level similarity: size of the shared word set divided by the size
    of the combined word set.

    Args:
        str1: First text
        str2: Second text

    Returns:
        Word overlap score (0.0 - 1.0); 0.0 if either text has no words
    """
    words1 = set(str1.split())
    words2 = set(str2.split())

    if not words1 or not words2:
        return 0.0

    return len(words1 & words2) / len(words1 | words2)


def _score_candidate(normalized_input: str, normalized_mahalla: str) -> Dict[str, float]:
    """
    Score one normalized mahalla name against the normalized user input.

    Shared by find_mahalla_fuzzy and suggest_mahallas so both rank
    candidates identically.

    Args:
        normalized_input: Output of normalize_mahalla_text for the user text
        normalized_mahalla: Output of normalize_mahalla_text for the candidate

    Returns:
        Dict with keys "fuzzy", "substring", "word", "first_word" and
        "final" (the maximum of the other four)
    """
    # An empty string is a substring of every string, so without this guard
    # an empty side would earn a bogus substring score.
    if not normalized_input or not normalized_mahalla:
        return {"fuzzy": 0.0, "substring": 0.0, "word": 0.0,
                "first_word": 0.0, "final": 0.0}

    # 1. Whole-string fuzzy match
    fuzzy = similarity_score(normalized_input, normalized_mahalla)

    # 2. Substring match (large bonus)
    if normalized_input in normalized_mahalla:
        substring = _INPUT_IN_MAHALLA_SCORE
    elif normalized_mahalla in normalized_input:
        substring = _MAHALLA_IN_INPUT_SCORE
    else:
        substring = 0.0

    # 3. Word overlap
    word = word_similarity(normalized_input, normalized_mahalla)

    # 4. First-word match
    input_words = normalized_input.split()
    mahalla_words = normalized_mahalla.split()
    first_word = 0.0
    if input_words and mahalla_words and input_words[0] == mahalla_words[0]:
        first_word = _FIRST_WORD_SCORE

    # 5. Final score: the best of all strategies
    return {
        "fuzzy": fuzzy,
        "substring": substring,
        "word": word,
        "first_word": first_word,
        "final": max(fuzzy, substring, word, first_word),
    }


def find_mahalla_fuzzy(district_name: str, user_text: str, threshold: float = 0.35) -> Optional[str]:
    """
    Find the mahalla that best matches a possibly misspelled name.

    Args:
        district_name: District name
        user_text: Text entered by the user
            (e.g. "katta chilonzor" or "besh qorgon")
        threshold: Minimum accepted score (0.35 = 35%)

    Returns:
        The matching mahalla name exactly as returned by
        get_mahallas_by_district, or None when nothing reaches the
        threshold, the input is too short, or an error occurs
        (errors are logged, not raised)
    """
    try:
        if not user_text or not district_name:
            return None

        normalized_input = normalize_mahalla_text(user_text)

        logger.info(f"🏘️ Mahalla qidirilmoqda: '{user_text}' → '{normalized_input}' ({district_name})")

        if len(normalized_input) < 2:
            logger.warning("⚠️ Matn juda qisqa")
            return None

        mahallas = get_mahallas_by_district(district_name)

        if not mahallas:
            logger.warning(f"⚠️ {district_name} uchun mahallalar topilmadi")
            return None

        best_match = None
        best_score = 0.0
        scoring_details = []

        for mahalla in mahallas:
            normalized_mahalla = normalize_mahalla_text(mahalla)
            scores = _score_candidate(normalized_input, normalized_mahalla)

            if scores["substring"] == _INPUT_IN_MAHALLA_SCORE:
                logger.debug(f" ✓ Substring (input in mahalla): '{normalized_input}' in '{normalized_mahalla}'")
            elif scores["substring"] == _MAHALLA_IN_INPUT_SCORE:
                logger.debug(f" ✓ Substring (mahalla in input): '{normalized_mahalla}' in '{normalized_input}'")

            if scores["first_word"]:
                logger.debug(f" ✓ First word match: '{normalized_input.split()[0]}'")

            scoring_details.append({"mahalla": mahalla, **scores})

            # Strict ">" keeps the first candidate on ties
            if scores["final"] > best_score:
                best_score = scores["final"]
                best_match = mahalla

        # The top-3 listing is purely diagnostic; skip the sort unless
        # debug logging is actually enabled.
        if logger.isEnabledFor(logging.DEBUG):
            scoring_details.sort(key=lambda x: x['final'], reverse=True)
            logger.debug(" Top 3 matches:")
            for i, detail in enumerate(scoring_details[:3], 1):
                logger.debug(f" {i}. {detail['mahalla']}: {detail['final']:.2f} "
                             f"(fuzzy={detail['fuzzy']:.2f}, sub={detail['substring']:.2f}, "
                             f"word={detail['word']:.2f}, first={detail['first_word']:.2f})")

        # best_match stays None when every candidate scored 0.0; never
        # report that as a successful match, even with threshold <= 0.
        if best_match is not None and best_score >= threshold:
            logger.info(f"✅ Mahalla topildi: '{best_match}' (score: {best_score:.2f})")
            return best_match

        logger.warning(f"⚠️ Mahalla topilmadi (best score: {best_score:.2f} < {threshold})")
        return None

    except Exception as e:
        # Deliberate best-effort boundary: matching must never crash the caller
        logger.error(f"❌ Mahalla matching xatoligi: {e}", exc_info=True)
        return None


def get_mahalla_display_name(mahalla_name: str) -> str:
    """
    Return the mahalla name in its full display form.

    Args:
        mahalla_name: "Beltepa" or "Beltepa mahallasi"

    Returns:
        "Beltepa mahallasi" ("" for empty input)
    """
    if not mahalla_name:
        return ""

    if 'mahallasi' not in mahalla_name.lower():
        return f"{mahalla_name} mahallasi"

    return mahalla_name


def suggest_mahallas(district_name: str, user_text: str, top_n: int = 3) -> List[tuple]:
    """
    Suggest the mahallas most similar to the user's text.

    Candidates are scored with the same rules as find_mahalla_fuzzy.

    Args:
        district_name: District name
        user_text: Text entered by the user
        top_n: Maximum number of suggestions to return

    Returns:
        [(mahalla_name, score), ...] ordered best first; [] when the
        normalized input is empty, the district has no mahallas, or an
        error occurs (errors are logged, not raised)
    """
    try:
        normalized_input = normalize_mahalla_text(user_text)
        if not normalized_input:
            # Nothing to compare against; an empty input would otherwise
            # count as a substring of every mahalla name.
            return []

        mahallas = get_mahallas_by_district(district_name)

        if not mahallas:
            return []

        results = [
            (mahalla,
             _score_candidate(normalized_input, normalize_mahalla_text(mahalla))["final"])
            for mahalla in mahallas
        ]

        # Sort by score, best first
        results.sort(key=lambda x: x[1], reverse=True)

        # Clamp so a negative top_n yields [] instead of slicing from the end
        return results[:max(top_n, 0)]

    except Exception as e:
        logger.error(f"❌ Mahalla tavsiyalari xatoligi: {e}", exc_info=True)
        return []


# ==================== TESTING HELPER ====================

def test_mahalla_matching(district_name: str, test_inputs: List[str]):
    """
    Manual helper that logs the match (or the suggestions) for each input.

    Args:
        district_name: District to test against
        test_inputs: List of input strings to try
    """
    logger.info(f"\n{'='*60}")
    logger.info(f"TEST: {district_name}")
    logger.info(f"{'='*60}")

    for test_input in test_inputs:
        logger.info(f"\nTest input: '{test_input}'")

        result = find_mahalla_fuzzy(district_name, test_input, threshold=0.35)

        if result:
            logger.info(f" ✅ MATCH: {result}")
        else:
            logger.warning(" ❌ NO MATCH")

            suggestions = suggest_mahallas(district_name, test_input, top_n=3)
            if suggestions:
                logger.info(" 💡 Suggestions:")
                for mahalla, score in suggestions:
                    logger.info(f" - {mahalla} ({score:.2f})")

    logger.info(f"{'='*60}\n")


# ==================== MANUAL TEST WHEN RUN AS A SCRIPT ====================

if __name__ == "__main__":
    # Only for manual testing
    logging.basicConfig(level=logging.DEBUG)

    test_cases = [
        ("Chilonzor tumani", ["katta chilonzor", "beltepa", "beshqozon", "qorgon"]),
        ("Bektemir tumani", ["abay", "bektemir", "binokor"]),
        ("Shayxontohur tumani", ["kamolon", "shayx"]),
    ]

    for district, inputs in test_cases:
        test_mahalla_matching(district, inputs)