Spaces:
Paused
Paused
| # app/utils/mahalla_matcher.py - YAXSHILANGAN (THRESHOLD 0.35) | |
| """ | |
| Mahalla Matcher - Noto'g'ri yozilgan mahalla nomlarini topish | |
| Fuzzy matching + substring + word matching | |
| """ | |
| import logging | |
| from typing import Optional, List | |
| from difflib import SequenceMatcher | |
| from app.services.location_validator import get_mahallas_by_district | |
| logger = logging.getLogger(__name__) | |
def normalize_mahalla_text(text: str) -> str:
    """
    Normalize a mahalla name so differently written variants compare equal.

    Steps, in order: lowercase and collapse whitespace, remove the words
    "mahallasi" / "mahalla", strip surrounding punctuation, then remove at
    most one Uzbek case/plural suffix from the last word.

    Args:
        text: Raw text (user input or a mahalla name).

    Returns:
        The normalized text, or "" when ``text`` is empty or None.
    """
    if not text:
        return ""

    # Lowercase and collapse every whitespace run into a single space.
    text = " ".join(text.lower().split())

    # Remove the generic word itself; the space-prefixed forms go first so no
    # stray separator is left behind.
    text = text.replace(' mahallasi', '').replace(' mahalla', '')
    text = text.replace('mahallasi', '').replace('mahalla', '')

    # Punctuation must go before suffix stripping, otherwise "beltepaga."
    # never matches the "ga" suffix.
    text = text.strip('.,!? ')

    # Longest suffix first: "larni" has to be tried before "ni", otherwise
    # only the short tail is removed and "lar" is left on the word.
    suffixes = sorted(
        (
            "ni", "ga", "da", "dan", "ning", "niki",
            "dagi", "dagina", "gacha", "dek",
            "lar", "larni", "larga", "larda", "lardan",
        ),
        key=len,
        reverse=True,
    )

    words = text.split()
    if words:
        last_word = words[-1]
        for suffix in suffixes:
            # Keep very short words intact: at least 3 characters must remain.
            if last_word.endswith(suffix) and len(last_word) > len(suffix) + 2:
                words[-1] = last_word[:-len(suffix)]
                break

    # Strip again in case removing the suffix exposed punctuation.
    return " ".join(words).strip('.,!? ')
def similarity_score(str1: str, str2: str) -> float:
    """
    Character-level similarity of two strings.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The ``SequenceMatcher`` ratio in the range 0.0 - 1.0
        (1.0 means the strings are identical).
    """
    matcher = SequenceMatcher(isjunk=None, a=str1, b=str2)
    return matcher.ratio()
def word_similarity(str1: str, str2: str) -> float:
    """
    Word-level overlap of two texts.

    Each text is split on whitespace into a set of distinct words; the score
    is the number of shared words divided by the number of distinct words
    across both texts.

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        Overlap score in the range 0.0 - 1.0; 0.0 when either text has no words.
    """
    tokens_a, tokens_b = set(str1.split()), set(str2.split())

    # Nothing to compare when one side contributes no words.
    if not (tokens_a and tokens_b):
        return 0.0

    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined)
def find_mahalla_fuzzy(district_name: str, user_text: str, threshold: float = 0.35) -> Optional[str]:
    """
    Find the mahalla of a district that best matches possibly misspelled text.

    Every candidate is scored with four signals and the highest one is used:
    the SequenceMatcher ratio, a substring bonus (0.9 when the input is inside
    the candidate, 0.85 for the reverse), the word overlap, and a first-word
    bonus (0.7). Candidates with equal final scores are ranked by their
    SequenceMatcher ratio, so the closest spelling wins instead of whichever
    candidate happens to come first in the list.

    Args:
        district_name: District name passed to ``get_mahallas_by_district``.
        user_text: Text typed by the user (e.g. "katta chilonzor").
        threshold: Minimum final score (0.0 - 1.0) required to accept a match.

    Returns:
        The matching mahalla exactly as it appears in the district list, or
        None when the input is empty or too short, the district has no
        mahallas, no candidate reaches the threshold, or an error occurs.
    """
    try:
        if not user_text or not district_name:
            return None

        normalized_input = normalize_mahalla_text(user_text)
        logger.info(f"ποΈ Mahalla qidirilmoqda: '{user_text}' β '{normalized_input}' ({district_name})")

        if len(normalized_input) < 2:
            logger.warning("β οΈ Matn juda qisqa")
            return None

        # presumably a list of mahalla name strings - confirm against
        # app.services.location_validator
        mahallas = get_mahallas_by_district(district_name)
        if not mahallas:
            logger.warning(f"β οΈ {district_name} uchun mahallalar topilmadi")
            return None

        # Loop invariants: split the input once and check the log level once.
        input_words = normalized_input.split()
        debug_enabled = logger.isEnabledFor(logging.DEBUG)

        best_match = None
        best_score = 0.0
        best_fuzzy = 0.0
        scoring_details = []

        for mahalla in mahallas:
            normalized_mahalla = normalize_mahalla_text(mahalla)

            # An empty candidate is a substring of every input and would
            # otherwise collect the 0.85 bonus below.
            if not normalized_mahalla:
                continue

            # 1. Whole-string fuzzy match.
            fuzzy_score = similarity_score(normalized_input, normalized_mahalla)

            # 2. Substring match (large bonus).
            substring_score = 0.0
            if normalized_input in normalized_mahalla:
                substring_score = 0.9
                logger.debug(f" β Substring (input in mahalla): '{normalized_input}' in '{normalized_mahalla}'")
            elif normalized_mahalla in normalized_input:
                substring_score = 0.85
                logger.debug(f" β Substring (mahalla in input): '{normalized_mahalla}' in '{normalized_input}'")

            # 3. Word overlap.
            word_score = word_similarity(normalized_input, normalized_mahalla)

            # 4. First-word match.
            mahalla_words = normalized_mahalla.split()
            first_word_score = 0.0
            if input_words and mahalla_words and input_words[0] == mahalla_words[0]:
                first_word_score = 0.7
                logger.debug(f" β First word match: '{input_words[0]}'")

            # 5. Final score is the strongest individual signal.
            final_score = max(fuzzy_score, substring_score, word_score, first_word_score)

            if debug_enabled:
                scoring_details.append({
                    "mahalla": mahalla,
                    "fuzzy": fuzzy_score,
                    "substring": substring_score,
                    "word": word_score,
                    "first_word": first_word_score,
                    "final": final_score
                })

            # The fixed bonuses make ties common; the fuzzy ratio breaks them.
            if (final_score, fuzzy_score) > (best_score, best_fuzzy):
                best_score = final_score
                best_fuzzy = fuzzy_score
                best_match = mahalla

        # Top-3 breakdown is only assembled when it will actually be emitted.
        if debug_enabled:
            scoring_details.sort(key=lambda x: x['final'], reverse=True)
            logger.debug(f" Top 3 matches:")
            for i, detail in enumerate(scoring_details[:3], 1):
                logger.debug(f" {i}. {detail['mahalla']}: {detail['final']:.2f} "
                             f"(fuzzy={detail['fuzzy']:.2f}, sub={detail['substring']:.2f}, "
                             f"word={detail['word']:.2f}, first={detail['first_word']:.2f})")

        # Threshold check; never report success without an actual candidate.
        if best_match is not None and best_score >= threshold:
            logger.info(f"β Mahalla topildi: '{best_match}' (score: {best_score:.2f})")
            return best_match

        logger.warning(f"β οΈ Mahalla topilmadi (best score: {best_score:.2f} < {threshold})")
        return None

    except Exception as e:
        # Best-effort lookup: failures are logged with traceback and reported
        # to the caller as "no match".
        logger.error(f"β Mahalla matching xatoligi: {e}", exc_info=True)
        return None
def get_mahalla_display_name(mahalla_name: str) -> str:
    """
    Return the mahalla name in its full display form.

    Args:
        mahalla_name: e.g. "Beltepa" or "Beltepa mahallasi".

    Returns:
        The name followed by " mahallasi" unless it already contains that word
        (checked case-insensitively); "" for an empty name.
    """
    if not mahalla_name:
        return ""

    already_suffixed = 'mahallasi' in mahalla_name.lower()
    return mahalla_name if already_suffixed else f"{mahalla_name} mahallasi"
def suggest_mahallas(district_name: str, user_text: str, top_n: int = 3) -> List[tuple]:
    """
    Suggest the mahallas of a district that are most similar to the user text.

    Each candidate is scored with the maximum of the SequenceMatcher ratio, a
    0.9 bonus when the normalized input is a substring of the normalized
    candidate, and the word overlap.

    Args:
        district_name: District name passed to ``get_mahallas_by_district``.
        user_text: Text typed by the user.
        top_n: Maximum number of suggestions to return.

    Returns:
        ``[(mahalla_name, score), ...]`` ordered from best to worst. Empty when
        ``top_n`` is not positive, the input normalizes to nothing, the
        district has no mahallas, or an error occurs.
    """
    try:
        # A negative top_n would otherwise slice from the wrong end.
        if top_n <= 0:
            return []

        normalized_input = normalize_mahalla_text(user_text)

        # "" is a substring of every name, so an empty input would hand the
        # 0.9 bonus to every mahalla and return arbitrary suggestions.
        if not normalized_input:
            return []

        mahallas = get_mahallas_by_district(district_name)
        if not mahallas:
            return []

        results = []
        for mahalla in mahallas:
            normalized_mahalla = normalize_mahalla_text(mahalla)

            fuzzy = similarity_score(normalized_input, normalized_mahalla)
            substring = 0.9 if normalized_input in normalized_mahalla else 0.0
            word = word_similarity(normalized_input, normalized_mahalla)

            results.append((mahalla, max(fuzzy, substring, word)))

        # Order by score, best first.
        results.sort(key=lambda x: x[1], reverse=True)
        return results[:top_n]

    except Exception as e:
        # Best-effort: log with traceback, as find_mahalla_fuzzy does.
        logger.error(f"β Mahalla tavsiyalari xatoligi: {e}", exc_info=True)
        return []
| # ==================== TESTING HELPER ==================== | |
def test_mahalla_matching(district_name: str, test_inputs: List[str]):
    """
    Manual helper that logs how each test input is matched in a district.

    For every input the match returned by ``find_mahalla_fuzzy`` is logged;
    when there is none, the top 3 results of ``suggest_mahallas`` are logged
    instead.

    Args:
        district_name: District to test against.
        test_inputs: Inputs to run through the matcher.
    """
    logger.info(f"\n{'='*60}")
    logger.info(f"TEST: {district_name}")
    logger.info(f"{'='*60}")

    for candidate in test_inputs:
        logger.info(f"\nTest input: '{candidate}'")
        matched = find_mahalla_fuzzy(district_name, candidate, threshold=0.35)

        if matched:
            logger.info(f" β MATCH: {matched}")
            continue

        # No match: show the closest candidates, if there are any.
        logger.warning(" β NO MATCH")
        ranked = suggest_mahallas(district_name, candidate, top_n=3)
        if not ranked:
            continue

        logger.info(" π‘ Suggestions:")
        for name, score in ranked:
            logger.info(f" - {name} ({score:.2f})")

    logger.info(f"{'='*60}\n")
# ==================== MANUAL SMOKE TEST (runs only when executed as a script) ====================
if __name__ == "__main__":
    # Show the matcher's debug output while trying the sample inputs.
    logging.basicConfig(level=logging.DEBUG)

    # District -> sample user inputs; insertion order is the execution order.
    smoke_cases = {
        "Chilonzor tumani": ["katta chilonzor", "beltepa", "beshqozon", "qorgon"],
        "Bektemir tumani": ["abay", "bektemir", "binokor"],
        "Shayxontohur tumani": ["kamolon", "shayx"],
    }

    for district, inputs in smoke_cases.items():
        test_mahalla_matching(district, inputs)