Spaces:
Sleeping
Sleeping
| # Autocomplete backend — builds, loads, and queries the bigram index | |
| import os | |
| import pickle | |
| from collections import Counter | |
| # Location of the pickled bigram counts: ../data/bigrams.pkl relative to this module's directory | |
| BIGRAMS_PATH = os.path.join(os.path.dirname(__file__), "../data/bigrams.pkl") | |
| # Module-level cache of bigram counts; stays None until load_bigrams() fills it on first use | |
| _bigram_counts = None | |
| # Build bigrams index from subtitle blocks | |
def build_bigrams_index(blocks: list[dict], out_path: str = BIGRAMS_PATH, min_count: int = 2):
    """
    Build a bigram frequency table from preprocessed blocks and pickle it to disk.

    The "text" value of every block is lower-cased, all texts are joined with
    single spaces, and the result is split on whitespace. Each pair of adjacent
    tokens becomes one key of the form "first second". Because the texts are
    joined before tokenizing, the last token of one block is also paired with
    the first token of the following block.

    Args:
        blocks: Dicts read via ``.get("text")``; a missing or falsy "text"
            contributes no tokens.
        out_path: Destination file for the pickled ``Counter``. Missing parent
            directories are created.
        min_count: Bigrams seen fewer than this many times are dropped.
            A value of 1 or less keeps every bigram.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    all_text = " ".join((b.get("text") or "").lower() for b in blocks)
    tokens = all_text.split()

    # Count the pairs straight from the generator; there is no need to hold a
    # second list with one joined string per token.
    counts = Counter(" ".join(pair) for pair in zip(tokens, tokens[1:]))

    if min_count > 1:
        counts = Counter({k: v for k, v in counts.items() if v >= min_count})

    # A bare filename has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    parent_dir = os.path.dirname(out_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(out_path, "wb") as f:
        pickle.dump(counts, f)
| # Lazy loader for bigrams.pkl into memory | |
def load_bigrams():
    """
    Fill the module-level bigram cache from BIGRAMS_PATH on first use.

    The file is read only while the cache is still ``None``; every later call
    reuses the object already in memory. When the file does not exist the
    cache is set to an empty ``Counter``, which is then kept for the rest of
    the process as well.

    Returns:
        The cached bigram counts.

    Raises:
        OSError: If the file exists but cannot be opened or read.
    """
    global _bigram_counts
    if _bigram_counts is None:
        # SECURITY: unpickling can execute arbitrary code. This is only safe
        # as long as BIGRAMS_PATH points at a file this application produced.
        try:
            # Open directly instead of testing os.path.exists() first: the
            # file could vanish between the check and the open.
            with open(BIGRAMS_PATH, "rb") as f:
                _bigram_counts = pickle.load(f)
        except FileNotFoundError:
            _bigram_counts = Counter()
    return _bigram_counts
| # Suggestion function | |
def get_suggestions(term: str, *, limit: int = 10) -> list[str]:
    """
    Return the most frequent bigrams that start with the given term.

    Args:
        term: Text typed so far. It is lower-cased, surrounding whitespace is
            removed, and any run of whitespace between words is collapsed to
            one space before matching.
        limit: Maximum number of suggestions to return (default 10). Values
            of zero or less yield an empty list.

    Returns:
        Matching bigram strings ordered by descending count, with ties broken
        alphabetically. Empty when ``term`` is empty or only whitespace.
    """
    if not term or not term.strip():
        return []

    load_bigrams()

    # Index keys are two tokens joined by exactly one space, so a query such
    # as "good   mor" or "good\tmor" must be collapsed the same way or it can
    # never match anything.
    prefix = " ".join(term.lower().split())

    matches = [bg for bg in _bigram_counts if bg.startswith(prefix)]
    matches.sort(key=lambda bg: (-_bigram_counts[bg], bg))
    return matches[:max(limit, 0)]