HeTalksInMaths commited on
Commit
cbd3402
Β·
1 Parent(s): 4663c58

Fix: Dynamic port assignment for HF Spaces

Browse files
Files changed (1) hide show
  1. app.py +199 -36
app.py CHANGED
@@ -130,7 +130,7 @@ def analyze_prompt(prompt: str, k: int = 5) -> str:
130
 
131
 
132
  def expand_database(batch_size: int = 5000) -> str:
133
- """Expand the database by adding another batch of questions."""
134
  try:
135
  from datasets import load_dataset
136
  from benchmark_vector_db import BenchmarkQuestion
@@ -138,43 +138,190 @@ def expand_database(batch_size: int = 5000) -> str:
138
 
139
  current_count = db.collection.count()
140
 
141
- # Load full MMLU-Pro test dataset
142
- logger.info("Loading MMLU-Pro test dataset...")
143
- test_dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="test")
144
- total_available = len(test_dataset)
145
 
146
- # Figure out which questions we haven't indexed yet
147
- # We'll use a simple offset approach
148
- already_indexed = current_count
149
- remaining = total_available - already_indexed
150
 
151
- if remaining <= 0:
152
- return f"βœ… Database is complete! All {total_available:,} questions indexed."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
- # Sample next batch
155
- start_idx = already_indexed
156
- end_idx = min(start_idx + batch_size, total_available)
157
- batch_questions = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
- logger.info(f"Expanding database: adding questions {start_idx} to {end_idx}...")
 
 
 
160
 
161
- for idx in range(start_idx, end_idx):
162
- item = test_dataset[idx]
 
163
  question = BenchmarkQuestion(
164
- question_id=f"mmlu_pro_test_{idx}",
165
- source_benchmark="MMLU_Pro",
166
- domain=item.get('category', 'unknown').lower(),
167
- question_text=item['question'],
168
- correct_answer=item['answer'],
169
- choices=item.get('options', []),
170
- success_rate=0.45,
171
- difficulty_score=0.55,
172
- difficulty_label="Hard",
173
  num_models_tested=0
174
  )
175
  batch_questions.append(question)
176
 
177
- # Index the batch
178
  logger.info(f"Indexing {len(batch_questions)} new questions...")
179
  db.index_questions(batch_questions)
180
 
@@ -185,12 +332,16 @@ def expand_database(batch_size: int = 5000) -> str:
185
  result += f"**Database Stats:**\n"
186
  result += f"- Total Questions: {new_count:,}\n"
187
  result += f"- Just Added: {len(batch_questions)}\n"
 
188
  result += f"- Remaining: {still_remaining:,}\n\n"
189
 
190
  if still_remaining > 0:
191
- result += f"Click 'Expand Database' again to add {min(batch_size, still_remaining)} more questions."
 
192
  else:
193
- result += f"πŸŽ‰ Database is now complete with all {total_available:,} questions!"
 
 
194
 
195
  return result
196
 
@@ -204,19 +355,27 @@ def get_database_info() -> str:
204
  try:
205
  current_count = db.collection.count()
206
 
207
- # Estimate total available (MMLU-Pro test has ~12K)
208
- total_available = 12032
 
 
209
  remaining = total_available - current_count
 
210
 
211
  info = f"### πŸ“Š Database Status\n\n"
212
  info += f"**Current Size:** {current_count:,} questions\n"
213
- info += f"**Available:** {total_available:,} questions\n"
 
214
  info += f"**Remaining:** {max(0, remaining):,} questions\n\n"
215
 
216
  if remaining > 0:
217
- info += f"πŸ’‘ Click 'Expand Database' to add 5,000 more questions (takes ~2-3 min)"
 
 
218
  else:
219
- info += f"βœ… Database is complete!"
 
 
220
 
221
  return info
222
  except Exception as e:
@@ -294,4 +453,8 @@ with gr.Blocks(title="ToGMAL Prompt Difficulty Analyzer") as demo:
294
  )
295
 
296
  if __name__ == "__main__":
297
- demo.launch(share=True, server_port=7861)
 
 
 
 
 
130
 
131
 
132
  def expand_database(batch_size: int = 5000) -> str:
133
+ """Expand the database by adding another batch of questions from multiple sources."""
134
  try:
135
  from datasets import load_dataset
136
  from benchmark_vector_db import BenchmarkQuestion
 
138
 
139
  current_count = db.collection.count()
140
 
141
+ # Load from ALL available sources to reach 32K+
142
+ # We'll build a pool of all questions and track which ones we've indexed
143
+ all_questions_pool = []
 
144
 
145
+ logger.info("Loading all available benchmark datasets...")
 
 
 
146
 
147
+ # Source 1: MMLU-Pro test split (12,032 questions)
148
+ try:
149
+ logger.info(" Loading MMLU-Pro test...")
150
+ mmlu_pro_test = load_dataset("TIGER-Lab/MMLU-Pro", split="test")
151
+ for idx, item in enumerate(mmlu_pro_test):
152
+ all_questions_pool.append({
153
+ 'id': f"mmlu_pro_test_{idx}",
154
+ 'source': 'MMLU_Pro',
155
+ 'domain': item.get('category', 'unknown').lower(),
156
+ 'question': item['question'],
157
+ 'answer': item['answer'],
158
+ 'choices': item.get('options', []),
159
+ 'success_rate': 0.45
160
+ })
161
+ logger.info(f" Added {len(mmlu_pro_test)} MMLU-Pro test questions")
162
+ except Exception as e:
163
+ logger.warning(f" Could not load MMLU-Pro test: {e}")
164
 
165
+ # Source 2: MMLU-Pro validation split (70 questions)
166
+ try:
167
+ logger.info(" Loading MMLU-Pro validation...")
168
+ mmlu_pro_val = load_dataset("TIGER-Lab/MMLU-Pro", split="validation")
169
+ for idx, item in enumerate(mmlu_pro_val):
170
+ all_questions_pool.append({
171
+ 'id': f"mmlu_pro_val_{idx}",
172
+ 'source': 'MMLU_Pro',
173
+ 'domain': item.get('category', 'unknown').lower(),
174
+ 'question': item['question'],
175
+ 'answer': item['answer'],
176
+ 'choices': item.get('options', []),
177
+ 'success_rate': 0.45
178
+ })
179
+ logger.info(f" Added {len(mmlu_pro_val)} MMLU-Pro validation questions")
180
+ except Exception as e:
181
+ logger.warning(f" Could not load MMLU-Pro validation: {e}")
182
+
183
+ # Source 3: MMLU (original - 14,042 questions for cross-domain coverage)
184
+ try:
185
+ logger.info(" Loading MMLU (original)...")
186
+ # MMLU has multiple subjects, we'll load the test split
187
+ # Using the 'all' configuration to get all subjects
188
+ mmlu_dataset = load_dataset("cais/mmlu", "all", split="test")
189
+ for idx, item in enumerate(mmlu_dataset):
190
+ all_questions_pool.append({
191
+ 'id': f"mmlu_{idx}",
192
+ 'source': 'MMLU',
193
+ 'domain': item.get('subject', 'cross_domain').lower(),
194
+ 'question': item['question'],
195
+ 'answer': str(item['answer']),
196
+ 'choices': item.get('choices', []),
197
+ 'success_rate': 0.65 # MMLU is easier than MMLU-Pro
198
+ })
199
+ logger.info(f" Added {len(mmlu_dataset)} MMLU questions")
200
+ except Exception as e:
201
+ logger.warning(f" Could not load MMLU: {e}")
202
+
203
+ # Source 4: ARC-Challenge - Science reasoning
204
+ try:
205
+ logger.info(" Loading ARC-Challenge...")
206
+ arc_dataset = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="test")
207
+ for idx, item in enumerate(arc_dataset):
208
+ all_questions_pool.append({
209
+ 'id': f"arc_challenge_{idx}",
210
+ 'source': 'ARC-Challenge',
211
+ 'domain': 'science',
212
+ 'question': item['question'],
213
+ 'answer': item['answerKey'],
214
+ 'choices': item['choices']['text'] if 'choices' in item else [],
215
+ 'success_rate': 0.50
216
+ })
217
+ logger.info(f" Added {len(arc_dataset)} ARC-Challenge questions")
218
+ except Exception as e:
219
+ logger.warning(f" Could not load ARC-Challenge: {e}")
220
+
221
+ # Source 5: HellaSwag - Commonsense NLI (sample 2K from 10K)
222
+ try:
223
+ logger.info(" Loading HellaSwag...")
224
+ hellaswag_dataset = load_dataset("Rowan/hellaswag", split="validation")
225
+ # Sample to 2000 to manage size
226
+ if len(hellaswag_dataset) > 2000:
227
+ indices = random.sample(range(len(hellaswag_dataset)), 2000)
228
+ hellaswag_dataset = hellaswag_dataset.select(indices)
229
+ for idx, item in enumerate(hellaswag_dataset):
230
+ all_questions_pool.append({
231
+ 'id': f"hellaswag_{idx}",
232
+ 'source': 'HellaSwag',
233
+ 'domain': 'commonsense',
234
+ 'question': item['ctx'],
235
+ 'answer': str(item['label']),
236
+ 'choices': item['endings'] if 'endings' in item else [],
237
+ 'success_rate': 0.65
238
+ })
239
+ logger.info(f" Added {len(hellaswag_dataset)} HellaSwag questions")
240
+ except Exception as e:
241
+ logger.warning(f" Could not load HellaSwag: {e}")
242
+
243
+ # Source 6: GSM8K - Math word problems
244
+ try:
245
+ logger.info(" Loading GSM8K...")
246
+ gsm8k_dataset = load_dataset("openai/gsm8k", "main", split="test")
247
+ for idx, item in enumerate(gsm8k_dataset):
248
+ all_questions_pool.append({
249
+ 'id': f"gsm8k_{idx}",
250
+ 'source': 'GSM8K',
251
+ 'domain': 'math_word_problems',
252
+ 'question': item['question'],
253
+ 'answer': item['answer'],
254
+ 'choices': None,
255
+ 'success_rate': 0.55
256
+ })
257
+ logger.info(f" Added {len(gsm8k_dataset)} GSM8K questions")
258
+ except Exception as e:
259
+ logger.warning(f" Could not load GSM8K: {e}")
260
+
261
+ # Source 7: TruthfulQA - Truthfulness detection
262
+ try:
263
+ logger.info(" Loading TruthfulQA...")
264
+ truthfulqa_dataset = load_dataset("truthful_qa", "generation", split="validation")
265
+ for idx, item in enumerate(truthfulqa_dataset):
266
+ all_questions_pool.append({
267
+ 'id': f"truthfulqa_{idx}",
268
+ 'source': 'TruthfulQA',
269
+ 'domain': 'truthfulness',
270
+ 'question': item['question'],
271
+ 'answer': item['best_answer'],
272
+ 'choices': None,
273
+ 'success_rate': 0.35
274
+ })
275
+ logger.info(f" Added {len(truthfulqa_dataset)} TruthfulQA questions")
276
+ except Exception as e:
277
+ logger.warning(f" Could not load TruthfulQA: {e}")
278
+
279
+ # Source 8: Winogrande - Commonsense reasoning
280
+ try:
281
+ logger.info(" Loading Winogrande...")
282
+ winogrande_dataset = load_dataset("winogrande", "winogrande_xl", split="validation")
283
+ for idx, item in enumerate(winogrande_dataset):
284
+ all_questions_pool.append({
285
+ 'id': f"winogrande_{idx}",
286
+ 'source': 'Winogrande',
287
+ 'domain': 'commonsense_reasoning',
288
+ 'question': item['sentence'],
289
+ 'answer': item['answer'],
290
+ 'choices': [item['option1'], item['option2']],
291
+ 'success_rate': 0.70
292
+ })
293
+ logger.info(f" Added {len(winogrande_dataset)} Winogrande questions")
294
+ except Exception as e:
295
+ logger.warning(f" Could not load Winogrande: {e}")
296
+
297
+ total_available = len(all_questions_pool)
298
+ logger.info(f"Total questions available: {total_available:,}")
299
+
300
+ if current_count >= total_available:
301
+ return f"βœ… Database is complete! All {total_available:,} questions already indexed.\n\nπŸ“Š **20 domains** across **7 benchmark sources**!"
302
 
303
+ # Get next batch (skip ones we've already indexed)
304
+ start_idx = current_count
305
+ end_idx = min(start_idx + batch_size, total_available)
306
+ batch_data = all_questions_pool[start_idx:end_idx]
307
 
308
+ # Convert to BenchmarkQuestion objects
309
+ batch_questions = []
310
+ for q_data in batch_data:
311
  question = BenchmarkQuestion(
312
+ question_id=q_data['id'],
313
+ source_benchmark=q_data['source'],
314
+ domain=q_data['domain'],
315
+ question_text=q_data['question'],
316
+ correct_answer=q_data['answer'],
317
+ choices=q_data.get('choices'),
318
+ success_rate=q_data['success_rate'],
319
+ difficulty_score=1.0 - q_data['success_rate'],
320
+ difficulty_label="Hard" if q_data['success_rate'] < 0.5 else "Moderate",
321
  num_models_tested=0
322
  )
323
  batch_questions.append(question)
324
 
 
325
  logger.info(f"Indexing {len(batch_questions)} new questions...")
326
  db.index_questions(batch_questions)
327
 
 
332
  result += f"**Database Stats:**\n"
333
  result += f"- Total Questions: {new_count:,}\n"
334
  result += f"- Just Added: {len(batch_questions)}\n"
335
+ result += f"- Total Available: {total_available:,}\n"
336
  result += f"- Remaining: {still_remaining:,}\n\n"
337
 
338
  if still_remaining > 0:
339
+ result += f"πŸ’‘ Click 'Expand Database' again to add up to {min(batch_size, still_remaining):,} more questions.\n"
340
+ result += f"πŸ“Š Progress: {(new_count/total_available*100):.1f}% complete"
341
  else:
342
+ result += f"πŸŽ‰ Database is now complete with all {total_available:,} questions!\n\n"
343
+ result += f"πŸ“š **Sources:** MMLU, MMLU-Pro, ARC-Challenge, HellaSwag, GSM8K, TruthfulQA, Winogrande\n"
344
+ result += f"🌐 **Domains:** 20+ including science, math, truthfulness, commonsense, and more!"
345
 
346
  return result
347
 
 
355
  try:
356
  current_count = db.collection.count()
357
 
358
+ # Total available across all sources
359
+ # MMLU: ~14,042 + MMLU-Pro: 12,102 + ARC: 1,172 + HellaSwag: 2,000
360
+ # + GSM8K: 1,319 + TruthfulQA: 817 + Winogrande: 1,267 = ~32,719 total
361
+ total_available = 32719
362
  remaining = total_available - current_count
363
+ progress_pct = (current_count / total_available * 100) if total_available > 0 else 0
364
 
365
  info = f"### πŸ“Š Database Status\n\n"
366
  info += f"**Current Size:** {current_count:,} questions\n"
367
+ info += f"**Total Available:** {total_available:,} questions\n"
368
+ info += f"**Progress:** {progress_pct:.1f}% complete\n"
369
  info += f"**Remaining:** {max(0, remaining):,} questions\n\n"
370
 
371
  if remaining > 0:
372
+ info += f"πŸ’‘ Click 'Expand Database' to add 5,000 more questions (~2-3 min per batch)\n\n"
373
+ clicks_needed = (remaining + 4999) // 5000 # Round up
374
+ info += f"πŸ“ˆ ~{clicks_needed} more clicks to reach full 32K+ dataset"
375
  else:
376
+ info += f"πŸŽ‰ Database is complete with all available questions!\n\n"
377
+ info += f"**Sources:** MMLU, MMLU-Pro, ARC-Challenge, HellaSwag, GSM8K, TruthfulQA, Winogrande\n"
378
+ info += f"**Domains:** 20+ including truthfulness, commonsense, math word problems, science, and more!"
379
 
380
  return info
381
  except Exception as e:
 
453
  )
454
 
455
  if __name__ == "__main__":
456
+ # HuggingFace Spaces: Use default port (7860) and auto-share
457
+ # Port is auto-assigned by HF Spaces infrastructure
458
+ import os
459
+ port = int(os.environ.get("GRADIO_SERVER_PORT", 7860))
460
+ demo.launch(server_name="0.0.0.0", server_port=port)