Spaces:
Sleeping
π Add comprehensive answer validation interface for accuracy monitoring
Browse files
**Restored Missing Validation Column:**
- Added "Correct Answer" column displaying expected answers from validation data
- Added "Match" column with visual indicators:
  - ✅ = Exact match (case-insensitive)
  - 🟡 = Partial match (substring matching)
  - ❌ = No match or error
**Enhanced Validation Features:**
- Loads validation data from gaia_validation_metadata.jsonl
- Real-time answer comparison during processing
- Detailed match logging for performance analysis
- Graceful fallback when validation data unavailable
**Interface Improvements:**
- Updated results table label to "Detailed Question Results with Validation"
- Added validation legend to user instructions
- Applied to both root and deployment app versions consistently
**Technical Details:**
- Case-insensitive string matching for robustness
- Substring matching for partial credit detection
- Supports multiple validation file locations
- JSON parsing with error handling
This restores the validation functionality from commit 7724e0ec51435b950a6ea341fee67a3fce051261
and enables real-time accuracy monitoring during evaluation runs.
🔧 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <[email protected]>
- app.py +47 -4
- app/app.py +47 -4
|
@@ -297,7 +297,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 297 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
|
| 298 |
print(f"π Agent code available at: {agent_code}")
|
| 299 |
|
| 300 |
-
# 2. Fetch Questions
|
| 301 |
print(f"π₯ Fetching questions from: {questions_url}")
|
| 302 |
try:
|
| 303 |
response = requests.get(questions_url, timeout=15)
|
|
@@ -316,6 +316,28 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 316 |
except Exception as e:
|
| 317 |
print(f"β Unexpected error fetching questions: {e}")
|
| 318 |
return f"An unexpected error occurred fetching questions: {e}", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
|
| 320 |
# 3. Run Advanced GAIA Agent
|
| 321 |
results_log = []
|
|
@@ -338,21 +360,39 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 338 |
submitted_answer = agent(question_text)
|
| 339 |
question_time = time.time() - question_start
|
| 340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 342 |
results_log.append({
|
| 343 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
| 344 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 345 |
"Submitted Answer": submitted_answer,
|
|
|
|
|
|
|
| 346 |
"Processing Time (s)": f"{question_time:.2f}"
|
| 347 |
})
|
| 348 |
-
print(f"β
Completed in {question_time:.2f}s")
|
| 349 |
|
| 350 |
except Exception as e:
|
| 351 |
print(f"β Error running agent on task {task_id}: {e}")
|
|
|
|
| 352 |
results_log.append({
|
| 353 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
| 354 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 355 |
"Submitted Answer": f"AGENT ERROR: {e}",
|
|
|
|
|
|
|
| 356 |
"Processing Time (s)": "Error"
|
| 357 |
})
|
| 358 |
|
|
@@ -459,7 +499,10 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
|
|
| 459 |
|
| 460 |
1. **Login**: Use the Hugging Face login button below
|
| 461 |
2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
|
| 462 |
-
3. **Results**: View detailed results
|
|
|
|
|
|
|
|
|
|
| 463 |
|
| 464 |
---
|
| 465 |
|
|
@@ -489,7 +532,7 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
|
|
| 489 |
)
|
| 490 |
|
| 491 |
results_table = gr.DataFrame(
|
| 492 |
-
label="π Detailed Question Results",
|
| 493 |
wrap=True,
|
| 494 |
interactive=False
|
| 495 |
)
|
|
|
|
| 297 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
|
| 298 |
print(f"π Agent code available at: {agent_code}")
|
| 299 |
|
| 300 |
+
# 2. Fetch Questions and Load Validation Data
|
| 301 |
print(f"π₯ Fetching questions from: {questions_url}")
|
| 302 |
try:
|
| 303 |
response = requests.get(questions_url, timeout=15)
|
|
|
|
| 316 |
except Exception as e:
|
| 317 |
print(f"β Unexpected error fetching questions: {e}")
|
| 318 |
return f"An unexpected error occurred fetching questions: {e}", None
|
| 319 |
+
|
| 320 |
+
# Load validation data for correct answers
|
| 321 |
+
validation_data = {}
|
| 322 |
+
validation_files = [
|
| 323 |
+
"/home/user/gaia_validation_metadata.jsonl",
|
| 324 |
+
"/home/user/app/gaia_validation_metadata.jsonl"
|
| 325 |
+
]
|
| 326 |
+
|
| 327 |
+
for validation_file in validation_files:
|
| 328 |
+
try:
|
| 329 |
+
if os.path.exists(validation_file):
|
| 330 |
+
print(f"π Loading validation data from: {validation_file}")
|
| 331 |
+
with open(validation_file, 'r') as f:
|
| 332 |
+
for line in f:
|
| 333 |
+
if line.strip():
|
| 334 |
+
entry = json.loads(line.strip())
|
| 335 |
+
validation_data[entry['task_id']] = entry.get('Final answer', 'N/A')
|
| 336 |
+
print(f"β
Loaded validation data for {len(validation_data)} questions")
|
| 337 |
+
break
|
| 338 |
+
except Exception as e:
|
| 339 |
+
print(f"β οΈ Could not load validation data from {validation_file}: {e}")
|
| 340 |
+
continue
|
| 341 |
|
| 342 |
# 3. Run Advanced GAIA Agent
|
| 343 |
results_log = []
|
|
|
|
| 360 |
submitted_answer = agent(question_text)
|
| 361 |
question_time = time.time() - question_start
|
| 362 |
|
| 363 |
+
# Get correct answer for validation
|
| 364 |
+
correct_answer = validation_data.get(task_id, "N/A")
|
| 365 |
+
|
| 366 |
+
# Check if submitted answer matches correct answer (case-insensitive, trimmed)
|
| 367 |
+
is_correct = "β"
|
| 368 |
+
if correct_answer != "N/A":
|
| 369 |
+
submitted_clean = str(submitted_answer).strip().lower()
|
| 370 |
+
correct_clean = str(correct_answer).strip().lower()
|
| 371 |
+
if submitted_clean == correct_clean:
|
| 372 |
+
is_correct = "β
"
|
| 373 |
+
elif submitted_clean in correct_clean or correct_clean in submitted_clean:
|
| 374 |
+
is_correct = "π‘" # Partial match
|
| 375 |
+
|
| 376 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 377 |
results_log.append({
|
| 378 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
| 379 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 380 |
"Submitted Answer": submitted_answer,
|
| 381 |
+
"Correct Answer": correct_answer,
|
| 382 |
+
"Match": is_correct,
|
| 383 |
"Processing Time (s)": f"{question_time:.2f}"
|
| 384 |
})
|
| 385 |
+
print(f"β
Completed in {question_time:.2f}s - Match: {is_correct}")
|
| 386 |
|
| 387 |
except Exception as e:
|
| 388 |
print(f"β Error running agent on task {task_id}: {e}")
|
| 389 |
+
correct_answer = validation_data.get(task_id, "N/A")
|
| 390 |
results_log.append({
|
| 391 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
| 392 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 393 |
"Submitted Answer": f"AGENT ERROR: {e}",
|
| 394 |
+
"Correct Answer": correct_answer,
|
| 395 |
+
"Match": "β",
|
| 396 |
"Processing Time (s)": "Error"
|
| 397 |
})
|
| 398 |
|
|
|
|
| 499 |
|
| 500 |
1. **Login**: Use the Hugging Face login button below
|
| 501 |
2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
|
| 502 |
+
3. **Results**: View detailed results with validation against correct answers
|
| 503 |
+
- β
= Exact match
|
| 504 |
+
- π‘ = Partial match
|
| 505 |
+
- β = No match
|
| 506 |
|
| 507 |
---
|
| 508 |
|
|
|
|
| 532 |
)
|
| 533 |
|
| 534 |
results_table = gr.DataFrame(
|
| 535 |
+
label="π Detailed Question Results with Validation",
|
| 536 |
wrap=True,
|
| 537 |
interactive=False
|
| 538 |
)
|
|
@@ -297,7 +297,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 297 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
|
| 298 |
print(f"π Agent code available at: {agent_code}")
|
| 299 |
|
| 300 |
-
# 2. Fetch Questions
|
| 301 |
print(f"π₯ Fetching questions from: {questions_url}")
|
| 302 |
try:
|
| 303 |
response = requests.get(questions_url, timeout=15)
|
|
@@ -316,6 +316,28 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 316 |
except Exception as e:
|
| 317 |
print(f"β Unexpected error fetching questions: {e}")
|
| 318 |
return f"An unexpected error occurred fetching questions: {e}", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
|
| 320 |
# 3. Run Advanced GAIA Agent
|
| 321 |
results_log = []
|
|
@@ -338,21 +360,39 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 338 |
submitted_answer = agent(question_text)
|
| 339 |
question_time = time.time() - question_start
|
| 340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 342 |
results_log.append({
|
| 343 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
| 344 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 345 |
"Submitted Answer": submitted_answer,
|
|
|
|
|
|
|
| 346 |
"Processing Time (s)": f"{question_time:.2f}"
|
| 347 |
})
|
| 348 |
-
print(f"β
Completed in {question_time:.2f}s")
|
| 349 |
|
| 350 |
except Exception as e:
|
| 351 |
print(f"β Error running agent on task {task_id}: {e}")
|
|
|
|
| 352 |
results_log.append({
|
| 353 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
| 354 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 355 |
"Submitted Answer": f"AGENT ERROR: {e}",
|
|
|
|
|
|
|
| 356 |
"Processing Time (s)": "Error"
|
| 357 |
})
|
| 358 |
|
|
@@ -459,7 +499,10 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
|
|
| 459 |
|
| 460 |
1. **Login**: Use the Hugging Face login button below
|
| 461 |
2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
|
| 462 |
-
3. **Results**: View detailed results
|
|
|
|
|
|
|
|
|
|
| 463 |
|
| 464 |
---
|
| 465 |
|
|
@@ -489,7 +532,7 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
|
|
| 489 |
)
|
| 490 |
|
| 491 |
results_table = gr.DataFrame(
|
| 492 |
-
label="π Detailed Question Results",
|
| 493 |
wrap=True,
|
| 494 |
interactive=False
|
| 495 |
)
|
|
|
|
| 297 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
|
| 298 |
print(f"π Agent code available at: {agent_code}")
|
| 299 |
|
| 300 |
+
# 2. Fetch Questions and Load Validation Data
|
| 301 |
print(f"π₯ Fetching questions from: {questions_url}")
|
| 302 |
try:
|
| 303 |
response = requests.get(questions_url, timeout=15)
|
|
|
|
| 316 |
except Exception as e:
|
| 317 |
print(f"β Unexpected error fetching questions: {e}")
|
| 318 |
return f"An unexpected error occurred fetching questions: {e}", None
|
| 319 |
+
|
| 320 |
+
# Load validation data for correct answers
|
| 321 |
+
validation_data = {}
|
| 322 |
+
validation_files = [
|
| 323 |
+
"/home/user/gaia_validation_metadata.jsonl",
|
| 324 |
+
"/home/user/app/gaia_validation_metadata.jsonl"
|
| 325 |
+
]
|
| 326 |
+
|
| 327 |
+
for validation_file in validation_files:
|
| 328 |
+
try:
|
| 329 |
+
if os.path.exists(validation_file):
|
| 330 |
+
print(f"π Loading validation data from: {validation_file}")
|
| 331 |
+
with open(validation_file, 'r') as f:
|
| 332 |
+
for line in f:
|
| 333 |
+
if line.strip():
|
| 334 |
+
entry = json.loads(line.strip())
|
| 335 |
+
validation_data[entry['task_id']] = entry.get('Final answer', 'N/A')
|
| 336 |
+
print(f"β
Loaded validation data for {len(validation_data)} questions")
|
| 337 |
+
break
|
| 338 |
+
except Exception as e:
|
| 339 |
+
print(f"β οΈ Could not load validation data from {validation_file}: {e}")
|
| 340 |
+
continue
|
| 341 |
|
| 342 |
# 3. Run Advanced GAIA Agent
|
| 343 |
results_log = []
|
|
|
|
| 360 |
submitted_answer = agent(question_text)
|
| 361 |
question_time = time.time() - question_start
|
| 362 |
|
| 363 |
+
# Get correct answer for validation
|
| 364 |
+
correct_answer = validation_data.get(task_id, "N/A")
|
| 365 |
+
|
| 366 |
+
# Check if submitted answer matches correct answer (case-insensitive, trimmed)
|
| 367 |
+
is_correct = "β"
|
| 368 |
+
if correct_answer != "N/A":
|
| 369 |
+
submitted_clean = str(submitted_answer).strip().lower()
|
| 370 |
+
correct_clean = str(correct_answer).strip().lower()
|
| 371 |
+
if submitted_clean == correct_clean:
|
| 372 |
+
is_correct = "β
"
|
| 373 |
+
elif submitted_clean in correct_clean or correct_clean in submitted_clean:
|
| 374 |
+
is_correct = "π‘" # Partial match
|
| 375 |
+
|
| 376 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 377 |
results_log.append({
|
| 378 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
| 379 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 380 |
"Submitted Answer": submitted_answer,
|
| 381 |
+
"Correct Answer": correct_answer,
|
| 382 |
+
"Match": is_correct,
|
| 383 |
"Processing Time (s)": f"{question_time:.2f}"
|
| 384 |
})
|
| 385 |
+
print(f"β
Completed in {question_time:.2f}s - Match: {is_correct}")
|
| 386 |
|
| 387 |
except Exception as e:
|
| 388 |
print(f"β Error running agent on task {task_id}: {e}")
|
| 389 |
+
correct_answer = validation_data.get(task_id, "N/A")
|
| 390 |
results_log.append({
|
| 391 |
"Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
|
| 392 |
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 393 |
"Submitted Answer": f"AGENT ERROR: {e}",
|
| 394 |
+
"Correct Answer": correct_answer,
|
| 395 |
+
"Match": "β",
|
| 396 |
"Processing Time (s)": "Error"
|
| 397 |
})
|
| 398 |
|
|
|
|
| 499 |
|
| 500 |
1. **Login**: Use the Hugging Face login button below
|
| 501 |
2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
|
| 502 |
+
3. **Results**: View detailed results with validation against correct answers
|
| 503 |
+
- β
= Exact match
|
| 504 |
+
- π‘ = Partial match
|
| 505 |
+
- β = No match
|
| 506 |
|
| 507 |
---
|
| 508 |
|
|
|
|
| 532 |
)
|
| 533 |
|
| 534 |
results_table = gr.DataFrame(
|
| 535 |
+
label="π Detailed Question Results with Validation",
|
| 536 |
wrap=True,
|
| 537 |
interactive=False
|
| 538 |
)
|