HeTalksInMaths committed
Commit 29ce16b · Parent: 78682b6

Reduce to 5K questions for fast HF build


- Build now takes ~3-5 min instead of timing out
- Samples 5K from MMLU-Pro test split
- Still covers all 14 domains (see the coverage check below)
- Note: Full 26K available locally
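
The 14-domain coverage claim is probabilistic: the 5K sample is drawn uniformly at random, so nothing in the code enforces it. A minimal sketch of a post-sampling check, assuming the same TIGER-Lab/MMLU-Pro dataset and `category` field that app.py uses below (names and the 5000 count mirror the diff):

    # Sketch: confirm a random 5K sample still touches every MMLU-Pro category.
    import random
    from datasets import load_dataset

    test_dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="test")
    all_domains = set(test_dataset["category"])  # 14 domains in MMLU-Pro

    indices = random.sample(range(len(test_dataset)), 5000)
    sampled_domains = set(test_dataset.select(indices)["category"])

    missing = all_domains - sampled_domains
    print(f"Covered {len(sampled_domains)}/{len(all_domains)} domains"
          + (f"; missing: {sorted(missing)}" if missing else ""))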

Files changed (1)
  app.py  +24 -35
app.py CHANGED
@@ -25,46 +25,34 @@ db = BenchmarkVectorDB(
     embedding_model="all-MiniLM-L6-v2"
 )
 
-# Build expanded database if not exists (first launch on Hugging Face)
+# Build database if not exists (first launch on Hugging Face)
+# Start with a manageable size to avoid build timeout
 current_count = db.collection.count()
 
 if current_count == 0:
-    logger.info("Database is empty - building expanded database from scratch...")
-    logger.info("This will take ~10-15 minutes on first launch (building 26K+ questions).")
+    logger.info("Database is empty - building database...")
+    logger.info("Building 5K questions to stay within build time limits.")
 
-    # Load MMLU-Pro test split for comprehensive coverage
     try:
         from datasets import load_dataset
         from benchmark_vector_db import BenchmarkQuestion
 
-        # Load MMLU-Pro validation + test splits
-        logger.info("Loading MMLU-Pro validation split...")
-        val_dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="validation")
-        logger.info(f"  Loaded {len(val_dataset)} validation questions")
-
-        logger.info("Loading MMLU-Pro test split...")
+        # Load MMLU-Pro test split (sample 5K for fast build)
+        logger.info("Loading MMLU-Pro test split (5K sample)...")
         test_dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="test")
-        logger.info(f"  Loaded {len(test_dataset)} test questions")
+        logger.info(f"  Dataset has {len(test_dataset)} questions total")
 
-        all_questions = []
+        # Sample 5000 questions for fast initial build
+        import random
+        total_questions = len(test_dataset)
+        if total_questions > 5000:
+            indices = random.sample(range(total_questions), 5000)
+            test_dataset = test_dataset.select(indices)
+            logger.info(f"  Sampled 5000 questions for initial build")
 
-        # Process validation split
-        for idx, item in enumerate(val_dataset):
-            question = BenchmarkQuestion(
-                question_id=f"mmlu_pro_val_{idx}",
-                source_benchmark="MMLU_Pro",
-                domain=item.get('category', 'unknown').lower(),
-                question_text=item['question'],
-                correct_answer=item['answer'],
-                choices=item.get('options', []),
-                success_rate=0.45,
-                difficulty_score=0.55,
-                difficulty_label="Hard",
-                num_models_tested=0
-            )
-            all_questions.append(question)
+        all_questions = []
 
-        # Process test split
+        # Process questions
         for idx, item in enumerate(test_dataset):
             question = BenchmarkQuestion(
                 question_id=f"mmlu_pro_test_{idx}",
@@ -80,26 +68,27 @@ if current_count == 0:
             )
             all_questions.append(question)
 
-        logger.info(f"Total questions to index: {len(all_questions)}")
+        logger.info(f"Indexing {len(all_questions)} questions...")
 
-        # Index in batches of 1000 for stability
+        # Index in batches of 1000
         batch_size = 1000
         for i in range(0, len(all_questions), batch_size):
             batch = all_questions[i:i + batch_size]
             batch_num = i // batch_size + 1
             total_batches = (len(all_questions) + batch_size - 1) // batch_size
-            logger.info(f"Indexing batch {batch_num}/{total_batches} ({len(batch)} questions)...")
+            logger.info(f"  Batch {batch_num}/{total_batches}...")
             db.index_questions(batch)
 
         logger.info(f"✓ Database build complete! Indexed {len(all_questions)} questions")
+        logger.info("Note: This is a 5K subset. Full 26K database available locally.")
 
     except Exception as e:
-        logger.error(f"Failed to build expanded database: {e}")
-        logger.info("Falling back to standard build...")
+        logger.error(f"Failed to build database: {e}")
+        logger.info("Falling back to minimal build...")
         db.build_database(
-            load_gpqa=False,  # Skip GPQA (requires auth)
+            load_gpqa=False,
            load_mmlu_pro=True,
-            load_math=False,  # Skip MATH (dataset path issues)
+            load_math=False,
             max_samples_per_dataset=1000
         )
 else:
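
One design note on the sampling step above: `random.sample` runs unseeded, so every cold start that rebuilds the database indexes a different 5K subset (and the positional question_ids shift with it). If reproducible rebuilds matter, seeding is a small change; a hypothetical variant of the sampling lines, with an illustrative seed:

    # Hypothetical seeded variant for reproducible rebuilds (seed value is illustrative)
    rng = random.Random(42)
    indices = rng.sample(range(total_questions), 5000)
    test_dataset = test_dataset.select(indices)

Batching stays at 1000 questions per index_questions call, which the removed comment attributed to stability; only the total volume changes in this commit.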