import re
from typing import List, Set

try:
    from underthesea import word_tokenize

    UNDERTHESEA_AVAILABLE = True
except ImportError:
    UNDERTHESEA_AVAILABLE = False
    print("[WARNING] underthesea not available, falling back to basic tokenization")


class VietnameseTextProcessor:
    """Vietnamese text processing utilities for ViettelPay knowledge base"""

    def __init__(self):
        # Keywords by document type
        self.keyword_mappings = {
            "error": "lỗi, error code, mã lỗi, sự cố, problem, thất bại, failed, hệ thống, system, maintenance, bảo trì, nâng cấp, upgrade",
            "procedure": "hướng dẫn, guide, instruction, bước, step, quy trình, process, nạp cước, topup, recharge, mua, buy, purchase, chọn, select, bấm, click",
            "definition": "định nghĩa, definition, nghĩa là, meaning, khái niệm, concept, giải thích, explain",
            "policy": "quy định, policy, rule, chính sách, regulation, hủy, cancel, phí, fee, chiết khấu, discount",
            "reference": "bảng, table, danh sách, list, thông tin, information, chi tiết, detail",
        }

        # Vietnamese stop words
        self.vietnamese_stop_words = self._load_vietnamese_stop_words()

        # Keep important domain terms even if they appear in stop words
        self.domain_important_terms = {
            "lỗi", "error", "mã", "code", "bước", "step",
            "hướng", "dẫn", "guide", "thanh", "toán", "payment",
            "nạp", "cước", "topup", "mua", "buy",
            "viettel", "viettelpay", "app", "ứng", "dụng", "mobile",
            "thẻ", "card", "tiền", "money",
            "rút", "withdraw", "chuyển", "transfer",
        }

    def _load_vietnamese_stop_words(self) -> Set[str]:
        """Load Vietnamese stop words"""
        # Common Vietnamese stop words
        stop_words = {
            "và", "của", "có", "là", "được", "các", "một", "này",
            "cho", "với", "trong", "từ", "tại", "về", "như", "sau",
            "trước", "khi", "nếu", "để", "đã", "sẽ", "đang", "bị",
            "bởi", "theo", "những", "nhưng", "mà", "thì", "cũng", "hay",
            "hoặc", "nên", "phải", "rất", "lại", "chỉ", "đó", "đây",
            "kia", "nào", "ai", "gì", "sao", "đâu", "bao", "nhiều",
            "lắm", "hơn", "nhất", "cả", "tất", "mọi", "toàn", "chưa",
            "không", "chẳng", "vẫn", "còn", "đều", "cùng", "nhau", "riêng",
            "luôn", "ngay", "liền", "thêm", "nữa", "lần", "cuối", "đầu",
            "giữa", "ngoài", "trên", "dưới", "bên", "cạnh", "gần", "xa",
            "cao", "thấp",
        }

        # Add English stop words that might appear
        english_stops = {
            "the", "a", "an", "and", "or", "but", "in", "on",
            "at", "to", "for", "of", "with", "by", "is", "are",
            "was", "were", "be", "been", "have", "has", "had", "do",
            "does", "did", "will", "would", "could", "should", "may", "might",
            "can", "this", "that", "these", "those",
        }

        return stop_words.union(english_stops)

    def vietnamese_tokenize(self, text: str) -> List[str]:
        """Vietnamese word tokenization using underthesea, with a basic fallback"""
        if not text:
            return []

        if UNDERTHESEA_AVAILABLE:
            try:
                # Use underthesea for proper Vietnamese word segmentation;
                # format="text" joins multi-syllable words with underscores
                tokenized_text = word_tokenize(text, format="text")
                return tokenized_text.split()
            except Exception as e:
                print(
                    f"[WARNING] underthesea tokenization failed: {e}, falling back to basic"
                )

        # Fallback: plain whitespace tokenization (no compound-word handling)
        tokens = text.split()
        return [token.strip() for token in tokens if token.strip()]
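
    # Illustrative usage (hedged, output depends on the underthesea version):
    #   VietnameseTextProcessor().vietnamese_tokenize("nạp cước điện thoại")
    # would return word-segmented tokens such as ["nạp", "cước", "điện_thoại"],
    # with multi-syllable words joined by underscores; without underthesea it
    # degrades to a plain whitespace split.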

    def remove_stop_words(self, tokens: List[str]) -> List[str]:
        """Remove Vietnamese stop words while preserving domain terms"""
        filtered_tokens = []
        for token in tokens:
            # Always keep domain-important terms
            if token.lower() in self.domain_important_terms:
                filtered_tokens.append(token)
            # Keep numbers and error codes (e.g., "606", "W02", "BL2");
            # match on the uppercased token since input may be lowercased
            elif re.match(r"^\d+$", token) or re.match(r"^[A-Z]+\d+$", token.upper()):
                filtered_tokens.append(token)
            # Remove stop words
            elif token.lower() not in self.vietnamese_stop_words:
                filtered_tokens.append(token)

        return filtered_tokens
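
    # Illustrative example: remove_stop_words(["lỗi", "606", "và", "nạp", "của"])
    # keeps "lỗi" and "nạp" (domain terms) and "606" (numeric code), while
    # dropping the stop words "và" and "của".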

    def normalize_text_for_bm25(self, text: str) -> str:
        """Enhanced Vietnamese normalization for BM25"""
        if not text:
            return ""

        # Basic normalization
        normalized = text.lower().strip()

        # Vietnamese tokenization
        tokens = self.vietnamese_tokenize(normalized)

        # Remove stop words but keep domain terms
        tokens = self.remove_stop_words(tokens)

        # Filter out very short tokens (but keep numbers and error codes)
        tokens = [
            token
            for token in tokens
            if len(token) >= 2
            or token.isdigit()
            or re.match(r"^[A-Z]+\d+$", token.upper())
        ]

        # Join back into a single normalized string
        return " ".join(tokens)
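
    # Illustrative example: normalize_text_for_bm25("Hướng dẫn nạp cước bị lỗi 606")
    # lowercases, word-segments, strips the stop word "bị", and keeps the code
    # "606", yielding a compact keyword string such as "hướng dẫn nạp cước lỗi 606"
    # (exact form depends on whether underthesea joins compounds with underscores).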

    def bm25_tokenizer(self, text: str) -> List[str]:
        """Tokenize text for BM25 indexing.

        Applies the same pipeline as normalize_text_for_bm25 (lowercasing,
        Vietnamese tokenization, stop-word removal, short-token filtering)
        but returns the token list instead of a joined string.
        """
        return self.normalize_text_for_bm25(text).split()
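
    # Typical use (assumption: the rank_bm25 package, which this module does
    # not import): tokenize each document to build the corpus, e.g.
    #   corpus_tokens = [processor.bm25_tokenizer(doc) for doc in docs]
    #   bm25 = rank_bm25.BM25Okapi(corpus_tokens)
    #   scores = bm25.get_scores(processor.bm25_tokenizer(query))
    # Any BM25 implementation that accepts pre-tokenized documents works the
    # same way.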

    def enhance_for_bm25(
        self,
        content: str,
        doc_type: str,
        additional_keywords: str = "",
    ) -> str:
        """Enhanced content processing for BM25 with Vietnamese preprocessing"""
        # Only use document-type specific keywords (no generic base keywords)
        type_specific_keywords = self.keyword_mappings.get(doc_type, "")

        enhanced_content = f"""
        {type_specific_keywords} {additional_keywords}

        {content}
        """

        return self.normalize_text_for_bm25(enhanced_content)
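
    # Illustrative example: enhance_for_bm25(chunk_text, "error") prepends the
    # error keyword list ("lỗi, error code, mã lỗi, ...") to the chunk before
    # normalization, so BM25 can match error-style queries against the chunk
    # even when the chunk itself never uses those exact words.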

    def extract_error_code_variations(self, error_code: str) -> str:
        """Generate variations of error codes for better BM25 matching"""
        if not error_code:
            return ""

        variations = [error_code]

        # Add common Vietnamese/English phrasings
        if error_code.isdigit():
            # Numeric codes like "606"
            variations.extend(
                [
                    f"lỗi {error_code}",
                    f"error {error_code}",
                    f"mã {error_code}",
                    f"code {error_code}",
                    f"mã lỗi {error_code}",
                ]
            )
        else:
            # Alphanumeric codes like "W02", "BL2"
            variations.extend(
                [
                    f"lỗi {error_code}",
                    f"error {error_code}",
                    f"mã lỗi {error_code}",
                    f"code {error_code}",
                ]
            )

        return " ".join(variations)
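
    # Worked example: extract_error_code_variations("606") returns
    # "606 lỗi 606 error 606 mã 606 code 606 mã lỗi 606", giving BM25 several
    # phrasings a user might type when asking about that error.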

    def extract_steps_keywords(self, guide_text: str) -> str:
        """Extract step-related keywords from procedure text"""
        if not guide_text:
            return ""

        # Find step markers such as "bước 1", "Bước 2", or the shorthand "b1";
        # the leading \b keeps the short form from matching inside other words
        steps = re.findall(r"\b(?:bước|b)\s*\d+", guide_text, re.IGNORECASE)
        step_keywords = " ".join(steps)

        # Add common procedure keywords
        procedure_keywords = (
            "step bước instruction hướng dẫn guide quy trình process thao tác action"
        )

        return f"{step_keywords} {procedure_keywords}"
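
    # Worked example: extract_steps_keywords("Bước 1: mở app, bước 2: chọn dịch vụ")
    # finds the markers "Bước 1" and "bước 2" and returns them followed by the
    # generic procedure keywords.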

    def clean_column_name(self, column_name: str) -> str:
        """Clean column names by removing extra whitespace and newlines"""
        if not column_name:
            return ""

        # Collapse runs of whitespace (including newlines) into single spaces
        cleaned = re.sub(r"\s+", " ", column_name.strip())
        return cleaned
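

# Minimal usage sketch (assumptions: underthesea may or may not be installed;
# the sample strings are illustrative, not taken from the real ViettelPay
# knowledge base).
if __name__ == "__main__":
    processor = VietnameseTextProcessor()

    # Tokenize and normalize a sample support query
    query = "Hướng dẫn nạp cước bị lỗi 606 trên ứng dụng"
    print("tokens:", processor.bm25_tokenizer(query))
    print("normalized:", processor.normalize_text_for_bm25(query))

    # Expand an error code into searchable variations
    print("variations:", processor.extract_error_code_variations("W02"))

    # Enhance a document chunk with type-specific keywords before indexing
    chunk = "Khi gặp mã W02, giao dịch thất bại do hệ thống bảo trì."
    print("enhanced:", processor.enhance_for_bm25(chunk, "error"))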