# Embedding + autocomplete index builder — creates FAISS vector index and bigram index

import os

import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer

from config import (
    META_CSV,
    INDEX_DIR,
    FAISS_PATH,
    EMBEDDING_MODEL,
    VIDEO_METADATA,
)

# Autocomplete index builder
from autocomplete import build_bigrams_index, BIGRAMS_PATH


# Build FAISS embedding index + bigram autocomplete index
def build_embedding_index(subtitle_blocks: list[dict]):
    # Guard against the NaN floats pandas produces for empty CSV cells:
    # NaN is truthy, so `s.get("text") or ""` alone would let it through
    # and crash the encoder.
    texts = [s["text"] if isinstance(s.get("text"), str) else "" for s in subtitle_blocks]
    if not texts:
        raise ValueError("No texts found in subtitle blocks. Did you generate metadata.csv?")

    model = SentenceTransformer(EMBEDDING_MODEL)
    vectors = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    vectors = np.asarray(vectors, dtype=np.float32)

    # Exact (brute-force) L2 index; dimensionality comes from the model output.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    os.makedirs(INDEX_DIR, exist_ok=True)
    faiss.write_index(index, os.fspath(FAISS_PATH))

    # Build bigrams for autocomplete
    build_bigrams_index(subtitle_blocks, out_path=BIGRAMS_PATH, min_count=2)


# Load subtitle blocks from CSV and attach friendly video titles
def load_blocks_from_csv(csv_path) -> list[dict]:
    df = pd.read_csv(csv_path)
    records = df.to_dict("records")
    for r in records:
        vid = r.get("video_id")
        # Reverse-lookup the friendly key whose metadata entry matches this video id.
        friendly_key = next((k for k, v in VIDEO_METADATA.items() if v["id"] == vid), None)
        if friendly_key:
            r["video_title"] = VIDEO_METADATA[friendly_key]["title"]
        else:
            r["video_title"] = "Unknown Video"
    return records


# Build FAISS + autocomplete indexes
if __name__ == "__main__":
    if not META_CSV.exists():
        raise FileNotFoundError(
            f"metadata.csv not found at {META_CSV}. Run clean_subtitles.py first to generate it."
        )
    blocks = load_blocks_from_csv(META_CSV)
    build_embedding_index(blocks)
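

# --- Optional sanity check: a minimal sketch, not part of the build pipeline.
# It assumes the index written above and the same EMBEDDING_MODEL; the
# `sanity_check` name, `query` argument, and k=3 default are illustrative
# additions, not part of the original module.
def sanity_check(query: str, blocks: list[dict], k: int = 3) -> None:
    """Embed `query`, search the freshly written FAISS index, print the top hits."""
    model = SentenceTransformer(EMBEDDING_MODEL)
    qvec = model.encode([query], convert_to_numpy=True).astype(np.float32)
    index = faiss.read_index(os.fspath(FAISS_PATH))
    distances, ids = index.search(qvec, k)
    for dist, i in zip(distances[0], ids[0]):
        if i < 0:  # FAISS pads with -1 when fewer than k results exist
            continue
        b = blocks[i]
        print(f"{dist:.3f}  [{b.get('video_title')}] {str(b.get('text'))[:80]}")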