HeTalksInMaths committed · Commit cbd3402 · Parent: 4663c58

Fix: Dynamic port assignment for HF Spaces

app.py CHANGED
@@ -130,7 +130,7 @@ def analyze_prompt(prompt: str, k: int = 5) -> str:
 
 
 def expand_database(batch_size: int = 5000) -> str:
-    """Expand the database by adding another batch of questions."""
+    """Expand the database by adding another batch of questions from multiple sources."""
     try:
         from datasets import load_dataset
         from benchmark_vector_db import BenchmarkQuestion
@@ -138,43 +138,190 @@ def expand_database(batch_size: int = 5000) -> str:
 
         current_count = db.collection.count()
 
-        # Load
-
-
-        total_available = len(test_dataset)
+        # Load from ALL available sources to reach 32K+
+        # We'll build a pool of all questions and track which ones we've indexed
+        all_questions_pool = []
 
-
-        # We'll use a simple offset approach
-        already_indexed = current_count
-        remaining = total_available - already_indexed
+        logger.info("Loading all available benchmark datasets...")
 
-
-
+        # Source 1: MMLU-Pro test split (12,032 questions)
+        try:
+            logger.info("  Loading MMLU-Pro test...")
+            mmlu_pro_test = load_dataset("TIGER-Lab/MMLU-Pro", split="test")
+            for idx, item in enumerate(mmlu_pro_test):
+                all_questions_pool.append({
+                    'id': f"mmlu_pro_test_{idx}",
+                    'source': 'MMLU_Pro',
+                    'domain': item.get('category', 'unknown').lower(),
+                    'question': item['question'],
+                    'answer': item['answer'],
+                    'choices': item.get('options', []),
+                    'success_rate': 0.45
+                })
+            logger.info(f"  Added {len(mmlu_pro_test)} MMLU-Pro test questions")
+        except Exception as e:
+            logger.warning(f"  Could not load MMLU-Pro test: {e}")
 
-        #
-
-
-
+        # Source 2: MMLU-Pro validation split (70 questions)
+        try:
+            logger.info("  Loading MMLU-Pro validation...")
+            mmlu_pro_val = load_dataset("TIGER-Lab/MMLU-Pro", split="validation")
+            for idx, item in enumerate(mmlu_pro_val):
+                all_questions_pool.append({
+                    'id': f"mmlu_pro_val_{idx}",
+                    'source': 'MMLU_Pro',
+                    'domain': item.get('category', 'unknown').lower(),
+                    'question': item['question'],
+                    'answer': item['answer'],
+                    'choices': item.get('options', []),
+                    'success_rate': 0.45
+                })
+            logger.info(f"  Added {len(mmlu_pro_val)} MMLU-Pro validation questions")
+        except Exception as e:
+            logger.warning(f"  Could not load MMLU-Pro validation: {e}")
+
+        # Source 3: MMLU (original - 14,042 questions for cross-domain coverage)
+        try:
+            logger.info("  Loading MMLU (original)...")
+            # MMLU has multiple subjects, we'll load the test split
+            # Using the 'all' configuration to get all subjects
+            mmlu_dataset = load_dataset("cais/mmlu", "all", split="test")
+            for idx, item in enumerate(mmlu_dataset):
+                all_questions_pool.append({
+                    'id': f"mmlu_{idx}",
+                    'source': 'MMLU',
+                    'domain': item.get('subject', 'cross_domain').lower(),
+                    'question': item['question'],
+                    'answer': str(item['answer']),
+                    'choices': item.get('choices', []),
+                    'success_rate': 0.65  # MMLU is easier than MMLU-Pro
+                })
+            logger.info(f"  Added {len(mmlu_dataset)} MMLU questions")
+        except Exception as e:
+            logger.warning(f"  Could not load MMLU: {e}")
+
+        # Source 4: ARC-Challenge - Science reasoning
+        try:
+            logger.info("  Loading ARC-Challenge...")
+            arc_dataset = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="test")
+            for idx, item in enumerate(arc_dataset):
+                all_questions_pool.append({
+                    'id': f"arc_challenge_{idx}",
+                    'source': 'ARC-Challenge',
+                    'domain': 'science',
+                    'question': item['question'],
+                    'answer': item['answerKey'],
+                    'choices': item['choices']['text'] if 'choices' in item else [],
+                    'success_rate': 0.50
+                })
+            logger.info(f"  Added {len(arc_dataset)} ARC-Challenge questions")
+        except Exception as e:
+            logger.warning(f"  Could not load ARC-Challenge: {e}")
+
+        # Source 5: HellaSwag - Commonsense NLI (sample 2K from 10K)
+        try:
+            logger.info("  Loading HellaSwag...")
+            hellaswag_dataset = load_dataset("Rowan/hellaswag", split="validation")
+            # Sample to 2000 to manage size
+            if len(hellaswag_dataset) > 2000:
+                indices = random.sample(range(len(hellaswag_dataset)), 2000)
+                hellaswag_dataset = hellaswag_dataset.select(indices)
+            for idx, item in enumerate(hellaswag_dataset):
+                all_questions_pool.append({
+                    'id': f"hellaswag_{idx}",
+                    'source': 'HellaSwag',
+                    'domain': 'commonsense',
+                    'question': item['ctx'],
+                    'answer': str(item['label']),
+                    'choices': item['endings'] if 'endings' in item else [],
+                    'success_rate': 0.65
+                })
+            logger.info(f"  Added {len(hellaswag_dataset)} HellaSwag questions")
+        except Exception as e:
+            logger.warning(f"  Could not load HellaSwag: {e}")
+
+        # Source 6: GSM8K - Math word problems
+        try:
+            logger.info("  Loading GSM8K...")
+            gsm8k_dataset = load_dataset("openai/gsm8k", "main", split="test")
+            for idx, item in enumerate(gsm8k_dataset):
+                all_questions_pool.append({
+                    'id': f"gsm8k_{idx}",
+                    'source': 'GSM8K',
+                    'domain': 'math_word_problems',
+                    'question': item['question'],
+                    'answer': item['answer'],
+                    'choices': None,
+                    'success_rate': 0.55
+                })
+            logger.info(f"  Added {len(gsm8k_dataset)} GSM8K questions")
+        except Exception as e:
+            logger.warning(f"  Could not load GSM8K: {e}")
+
+        # Source 7: TruthfulQA - Truthfulness detection
+        try:
+            logger.info("  Loading TruthfulQA...")
+            truthfulqa_dataset = load_dataset("truthful_qa", "generation", split="validation")
+            for idx, item in enumerate(truthfulqa_dataset):
+                all_questions_pool.append({
+                    'id': f"truthfulqa_{idx}",
+                    'source': 'TruthfulQA',
+                    'domain': 'truthfulness',
+                    'question': item['question'],
+                    'answer': item['best_answer'],
+                    'choices': None,
+                    'success_rate': 0.35
+                })
+            logger.info(f"  Added {len(truthfulqa_dataset)} TruthfulQA questions")
+        except Exception as e:
+            logger.warning(f"  Could not load TruthfulQA: {e}")
+
+        # Source 8: Winogrande - Commonsense reasoning
+        try:
+            logger.info("  Loading Winogrande...")
+            winogrande_dataset = load_dataset("winogrande", "winogrande_xl", split="validation")
+            for idx, item in enumerate(winogrande_dataset):
+                all_questions_pool.append({
+                    'id': f"winogrande_{idx}",
+                    'source': 'Winogrande',
+                    'domain': 'commonsense_reasoning',
+                    'question': item['sentence'],
+                    'answer': item['answer'],
+                    'choices': [item['option1'], item['option2']],
+                    'success_rate': 0.70
+                })
+            logger.info(f"  Added {len(winogrande_dataset)} Winogrande questions")
+        except Exception as e:
+            logger.warning(f"  Could not load Winogrande: {e}")
+
+        total_available = len(all_questions_pool)
+        logger.info(f"Total questions available: {total_available:,}")
+
+        if current_count >= total_available:
+            return f"✅ Database is complete! All {total_available:,} questions already indexed.\n\n📚 **20 domains** across **7 benchmark sources**!"
 
-
+        # Get next batch (skip ones we've already indexed)
+        start_idx = current_count
+        end_idx = min(start_idx + batch_size, total_available)
+        batch_data = all_questions_pool[start_idx:end_idx]
 
-
-
+        # Convert to BenchmarkQuestion objects
+        batch_questions = []
+        for q_data in batch_data:
             question = BenchmarkQuestion(
-                question_id=
-                source_benchmark=
-                domain=
-                question_text=
-                correct_answer=
-                choices=
-                success_rate=
-                difficulty_score=0
-                difficulty_label="Hard",
+                question_id=q_data['id'],
+                source_benchmark=q_data['source'],
+                domain=q_data['domain'],
+                question_text=q_data['question'],
+                correct_answer=q_data['answer'],
+                choices=q_data.get('choices'),
+                success_rate=q_data['success_rate'],
+                difficulty_score=1.0 - q_data['success_rate'],
+                difficulty_label="Hard" if q_data['success_rate'] < 0.5 else "Moderate",
                 num_models_tested=0
             )
             batch_questions.append(question)
 
-        # Index the batch
         logger.info(f"Indexing {len(batch_questions)} new questions...")
         db.index_questions(batch_questions)
 
@@ -185,12 +332,16 @@ def expand_database(batch_size: int = 5000) -> str:
         result += f"**Database Stats:**\n"
         result += f"- Total Questions: {new_count:,}\n"
         result += f"- Just Added: {len(batch_questions)}\n"
+        result += f"- Total Available: {total_available:,}\n"
         result += f"- Remaining: {still_remaining:,}\n\n"
 
         if still_remaining > 0:
-            result += f"Click 'Expand Database' again to add {min(batch_size, still_remaining)} more questions
+            result += f"💡 Click 'Expand Database' again to add up to {min(batch_size, still_remaining):,} more questions.\n"
+            result += f"📊 Progress: {(new_count/total_available*100):.1f}% complete"
         else:
-            result += f"🎉 Database is now complete with all {total_available:,} questions
+            result += f"🎉 Database is now complete with all {total_available:,} questions!\n\n"
+            result += f"📚 **Sources:** MMLU, MMLU-Pro, ARC-Challenge, HellaSwag, GSM8K, TruthfulQA, Winogrande\n"
+            result += f"🌐 **Domains:** 20+ including science, math, truthfulness, commonsense, and more!"
 
         return result
 
@@ -204,19 +355,27 @@ def get_database_info() -> str:
     try:
         current_count = db.collection.count()
 
-        #
-
+        # Total available across all sources
+        # MMLU: ~14,042 + MMLU-Pro: 12,102 + ARC: 1,172 + HellaSwag: 2,000
+        # + GSM8K: 1,319 + TruthfulQA: 817 + Winogrande: 1,267 = ~32,719 total
+        total_available = 32719
         remaining = total_available - current_count
+        progress_pct = (current_count / total_available * 100) if total_available > 0 else 0
 
         info = f"### 📊 Database Status\n\n"
         info += f"**Current Size:** {current_count:,} questions\n"
-        info += f"**Available:** {total_available:,} questions\n"
+        info += f"**Total Available:** {total_available:,} questions\n"
+        info += f"**Progress:** {progress_pct:.1f}% complete\n"
        info += f"**Remaining:** {max(0, remaining):,} questions\n\n"
 
         if remaining > 0:
-            info += f"💡 Click 'Expand Database' to add 5,000 more questions (
+            info += f"💡 Click 'Expand Database' to add 5,000 more questions (~2-3 min per batch)\n\n"
+            clicks_needed = (remaining + 4999) // 5000  # Round up
+            info += f"🔄 ~{clicks_needed} more clicks to reach full 32K+ dataset"
         else:
-            info += f"
+            info += f"🎉 Database is complete with all available questions!\n\n"
+            info += f"**Sources:** MMLU, MMLU-Pro, ARC-Challenge, HellaSwag, GSM8K, TruthfulQA, Winogrande\n"
+            info += f"**Domains:** 20+ including truthfulness, commonsense, math word problems, science, and more!"
 
         return info
     except Exception as e:
@@ -294,4 +453,8 @@ with gr.Blocks(title="ToGMAL Prompt Difficulty Analyzer") as demo:
     )
 
 if __name__ == "__main__":
-
+    # HuggingFace Spaces: Use default port (7860) and auto-share
+    # Port is auto-assigned by HF Spaces infrastructure
+    import os
+    port = int(os.environ.get("GRADIO_SERVER_PORT", 7860))
+    demo.launch(server_name="0.0.0.0", server_port=port)
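The final hunk is the fix named in the commit title: read the port from the environment instead of hard-coding it. For reference, the same launch pattern can be exercised outside this Space. Below is a minimal standalone sketch, assuming only that gradio is installed; the echo interface is a hypothetical stand-in, not part of app.py, which builds its real UI with gr.Blocks.

import os
import gradio as gr

# Hypothetical stand-in app so the sketch is self-contained and runnable.
demo = gr.Interface(fn=lambda text: text, inputs="text", outputs="text")

if __name__ == "__main__":
    # Per the commit's comments, HF Spaces provides GRADIO_SERVER_PORT in the
    # container environment; fall back to Gradio's default 7860 for local runs
    # so the same entry point works in both places.
    port = int(os.environ.get("GRADIO_SERVER_PORT", 7860))
    # Binding 0.0.0.0 listens on all interfaces, which the Spaces proxy needs;
    # binding only localhost would make the app unreachable from outside the
    # container.
    demo.launch(server_name="0.0.0.0", server_port=port)

To simulate the Spaces environment locally, run GRADIO_SERVER_PORT=8080 python app.py and open http://localhost:8080.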