HeTalksInMaths committed
Commit 78682b6 · 1 Parent(s): d97cc93

Update to 26K question database with batch indexing

- Load MMLU-Pro validation + test splits (12K questions)
- Batch indexing for stability (1K per batch; see the sketch below)
- Fixed stats sampling to use all questions
- Dynamic question count display
- ~10-15 min first launch, then instant

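For context, here is a minimal, self-contained sketch of the batching pattern this commit introduces in app.py. The 1,000-question batch size and the `db.index_questions(batch)` call come from the diff below; the `Question` stand-in type, the `index_in_batches` helper, and the demo data are illustrative only, not part of the repo.

    # Sketch: index a long list in fixed-size slices so one failure or memory
    # spike does not take down the whole build. Assumption: the indexing call
    # accepts any list slice, as db.index_questions does in the diff below.
    from dataclasses import dataclass

    @dataclass
    class Question:                  # stand-in for BenchmarkQuestion in the diff
        question_id: str
        question_text: str

    def index_in_batches(index_fn, questions, batch_size=1000):
        total_batches = (len(questions) + batch_size - 1) // batch_size
        for i in range(0, len(questions), batch_size):
            batch = questions[i:i + batch_size]
            print(f"Indexing batch {i // batch_size + 1}/{total_batches} ({len(batch)} questions)")
            index_fn(batch)

    if __name__ == "__main__":
        demo = [Question(f"q_{i}", f"question {i}") for i in range(2500)]
        index_in_batches(lambda batch: None, demo)   # swap the lambda for db.index_questions
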
Files changed (2)
  1. app.py +81 -13
  2. benchmark_vector_db.py +3 -3
app.py CHANGED
@@ -25,19 +25,85 @@ db = BenchmarkVectorDB(
     embedding_model="all-MiniLM-L6-v2"
 )
 
-# Build database if not exists (first launch on Hugging Face)
-if db.collection.count() == 0:
-    logger.info("Database is empty - building from scratch...")
-    logger.info("This will take 3-5 minutes on first launch.")
-    db.build_database(
-        load_gpqa=True,
-        load_mmlu_pro=True,
-        load_math=True,
-        max_samples_per_dataset=1000
-    )
-    logger.info("✓ Database build complete!")
+# Build expanded database if not exists (first launch on Hugging Face)
+current_count = db.collection.count()
+
+if current_count == 0:
+    logger.info("Database is empty - building expanded database from scratch...")
+    logger.info("This will take ~10-15 minutes on first launch (building 26K+ questions).")
+
+    # Load MMLU-Pro test split for comprehensive coverage
+    try:
+        from datasets import load_dataset
+        from benchmark_vector_db import BenchmarkQuestion
+
+        # Load MMLU-Pro validation + test splits
+        logger.info("Loading MMLU-Pro validation split...")
+        val_dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="validation")
+        logger.info(f" Loaded {len(val_dataset)} validation questions")
+
+        logger.info("Loading MMLU-Pro test split...")
+        test_dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="test")
+        logger.info(f" Loaded {len(test_dataset)} test questions")
+
+        all_questions = []
+
+        # Process validation split
+        for idx, item in enumerate(val_dataset):
+            question = BenchmarkQuestion(
+                question_id=f"mmlu_pro_val_{idx}",
+                source_benchmark="MMLU_Pro",
+                domain=item.get('category', 'unknown').lower(),
+                question_text=item['question'],
+                correct_answer=item['answer'],
+                choices=item.get('options', []),
+                success_rate=0.45,
+                difficulty_score=0.55,
+                difficulty_label="Hard",
+                num_models_tested=0
+            )
+            all_questions.append(question)
+
+        # Process test split
+        for idx, item in enumerate(test_dataset):
+            question = BenchmarkQuestion(
+                question_id=f"mmlu_pro_test_{idx}",
+                source_benchmark="MMLU_Pro",
+                domain=item.get('category', 'unknown').lower(),
+                question_text=item['question'],
+                correct_answer=item['answer'],
+                choices=item.get('options', []),
+                success_rate=0.45,
+                difficulty_score=0.55,
+                difficulty_label="Hard",
+                num_models_tested=0
+            )
+            all_questions.append(question)
+
+        logger.info(f"Total questions to index: {len(all_questions)}")
+
+        # Index in batches of 1000 for stability
+        batch_size = 1000
+        for i in range(0, len(all_questions), batch_size):
+            batch = all_questions[i:i + batch_size]
+            batch_num = i // batch_size + 1
+            total_batches = (len(all_questions) + batch_size - 1) // batch_size
+            logger.info(f"Indexing batch {batch_num}/{total_batches} ({len(batch)} questions)...")
+            db.index_questions(batch)
+
+        logger.info(f"✓ Database build complete! Indexed {len(all_questions)} questions")
+
+    except Exception as e:
+        logger.error(f"Failed to build expanded database: {e}")
+        logger.info("Falling back to standard build...")
+        db.build_database(
+            load_gpqa=False,  # Skip GPQA (requires auth)
+            load_mmlu_pro=True,
+            load_math=False,  # Skip MATH (dataset path issues)
+            max_samples_per_dataset=1000
+        )
 else:
-    logger.info(f"✓ Loaded existing database with {db.collection.count()} questions")
+    logger.info(f"✓ Loaded existing database with {current_count:,} questions")
 
 def analyze_prompt(prompt: str, k: int = 5) -> str:
     """
@@ -75,7 +141,9 @@ def analyze_prompt(prompt: str, k: int = 5) -> str:
         output.append(f" - Similarity: {q['similarity']:.3f}")
         output.append("")
 
-    output.append(f"*Analyzed using {k} most similar questions from 14,042 benchmark questions*")
+    # Get current database size
+    total_questions = db.collection.count()
+    output.append(f"*Analyzed using {k} most similar questions from {total_questions:,} benchmark questions*")
 
     return "\n".join(output)
 
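The hard-coded "14,042 benchmark questions" footer is replaced by a live `db.collection.count()` lookup formatted with Python's thousands separator, so the display tracks whatever is actually indexed. A quick illustration of that formatting (the count value here is made up; at runtime it comes from the collection):

    # The "," format spec inserts thousands separators into the f-string.
    total_questions = 26074   # hypothetical count after the expanded build
    print(f"*Analyzed using 5 most similar questions from {total_questions:,} benchmark questions*")
    # -> *Analyzed using 5 most similar questions from 26,074 benchmark questions*
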
benchmark_vector_db.py CHANGED
@@ -568,9 +568,9 @@ class BenchmarkVectorDB:
         if count == 0:
             return {"total_questions": 0, "message": "No questions indexed yet"}
 
-        # Get sample to compute statistics (ChromaDB doesn't have aggregate functions)
-        sample_size = min(1000, count)
-        sample = self.collection.get(limit=sample_size, include=["metadatas"])
+        # Get ALL questions for accurate stats (not just sample of 1000)
+        logger.info(f"Computing statistics from all {count} questions...")
+        sample = self.collection.get(limit=count, include=["metadatas"])
 
         domains = defaultdict(int)
         sources = defaultdict(int)
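
With the full collection now pulled for statistics, the surrounding code (the `domains`/`sources` counters visible in the context lines) tallies per-domain and per-source counts from the returned metadatas. A rough sketch of that aggregation under stated assumptions: the `summarize` helper is hypothetical, and the "domain" / "source_benchmark" metadata keys are inferred from the BenchmarkQuestion fields in app.py's diff, not confirmed by this file.

    from collections import defaultdict

    def summarize(metadatas):
        # Tally per-domain and per-source counts from the metadatas returned
        # by collection.get(); key names are assumptions, see lead-in above.
        domains = defaultdict(int)
        sources = defaultdict(int)
        for meta in metadatas:
            domains[meta.get("domain", "unknown")] += 1
            sources[meta.get("source_benchmark", "unknown")] += 1
        return {"by_domain": dict(domains), "by_source": dict(sources)}

    # usage (sketch): stats = summarize(sample["metadatas"])
    print(summarize([{"domain": "math", "source_benchmark": "MMLU_Pro"},
                     {"domain": "law", "source_benchmark": "MMLU_Pro"}]))

Fetching every record instead of a 1,000-row sample makes the reported percentages exact; for ~26K small metadata dicts the extra memory is modest, though it would be worth revisiting if the collection grew by another order of magnitude.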