Final_Assignment

Sleeping

GAIA Developer Claude committed on Jun 14

Commit

e09f605

1 Parent(s): aebabc5

📊 Add comprehensive answer validation interface for accuracy monitoring

**Restored Missing Validation Column:**
- Added "Correct Answer" column displaying expected answers from validation data
- Added "Match" column with visual indicators:
  - ✅ = Exact match (case-insensitive)
  - 🟡 = Partial match (substring matching)
  - ❌ = No match or error (also shown when no validation answer is available)

**Enhanced Validation Features:**
- Loads validation data from gaia_validation_metadata.jsonl
- Real-time answer comparison during processing
- Detailed match logging for performance analysis
- Graceful fallback when validation data is unavailable

**Interface Improvements:**
- Updated results table label to "Detailed Question Results with Validation"
- Added validation legend to user instructions
- Applied to both root and deployment app versions consistently

**Technical Details:**
- Case-insensitive string matching for robustness
- Substring matching for partial credit detection
- Supports multiple validation file locations
- JSON parsing with error handling

This restores the validation functionality from commit 7724e0ec51435b950a6ea341fee67a3fce051261
and enables real-time accuracy monitoring during evaluation runs.

🔧 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show

app.py +47 -4
app/app.py +47 -4

app.py CHANGED Viewed

@@ -297,7 +297,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
     print(f"📋 Agent code available at: {agent_code}")
-    # 2. Fetch Questions
     print(f"📥 Fetching questions from: {questions_url}")
     try:
         response = requests.get(questions_url, timeout=15)
@@ -316,6 +316,28 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     except Exception as e:
         print(f"❌ Unexpected error fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
     # 3. Run Advanced GAIA Agent
     results_log = []
@@ -338,21 +360,39 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             submitted_answer = agent(question_text)
             question_time = time.time() - question_start
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({
                 "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
                 "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                 "Submitted Answer": submitted_answer,
                 "Processing Time (s)": f"{question_time:.2f}"
             })
-            print(f"✅ Completed in {question_time:.2f}s")
         except Exception as e:
             print(f"❌ Error running agent on task {task_id}: {e}")
             results_log.append({
                 "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
                 "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                 "Submitted Answer": f"AGENT ERROR: {e}",
                 "Processing Time (s)": "Error"
             })
@@ -459,7 +499,10 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
         1. **Login**: Use the Hugging Face login button below
         2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
-        3. **Results**: View detailed results and performance metrics
         ---
@@ -489,7 +532,7 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
     )
     results_table = gr.DataFrame(
-        label="📋 Detailed Question Results",
         wrap=True,
         interactive=False
     )

     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
     print(f"📋 Agent code available at: {agent_code}")
+    # 2. Fetch Questions and Load Validation Data
     print(f"📥 Fetching questions from: {questions_url}")
     try:
         response = requests.get(questions_url, timeout=15)
     except Exception as e:
         print(f"❌ Unexpected error fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
+    # Load validation data for correct answers
+    validation_data = {}
+    validation_files = [
+        "/home/user/gaia_validation_metadata.jsonl",
+        "/home/user/app/gaia_validation_metadata.jsonl"
+    ]
+    for validation_file in validation_files:
+        try:
+            if os.path.exists(validation_file):
+                print(f"📋 Loading validation data from: {validation_file}")
+                with open(validation_file, 'r') as f:
+                    for line in f:
+                        if line.strip():
+                            entry = json.loads(line.strip())
+                            validation_data[entry['task_id']] = entry.get('Final answer', 'N/A')
+                print(f"✅ Loaded validation data for {len(validation_data)} questions")
+                break
+        except Exception as e:
+            print(f"⚠️ Could not load validation data from {validation_file}: {e}")
+            continue
     # 3. Run Advanced GAIA Agent
     results_log = []
             submitted_answer = agent(question_text)
             question_time = time.time() - question_start
+            # Get correct answer for validation
+            correct_answer = validation_data.get(task_id, "N/A")
+            # Check if submitted answer matches correct answer (case-insensitive, trimmed)
+            is_correct = "❌"
+            if correct_answer != "N/A":
+                submitted_clean = str(submitted_answer).strip().lower()
+                correct_clean = str(correct_answer).strip().lower()
+                if submitted_clean == correct_clean:
+                    is_correct = "✅"
+                elif submitted_clean in correct_clean or correct_clean in submitted_clean:
+                    is_correct = "🟡"  # Partial match
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({
                 "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
                 "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                 "Submitted Answer": submitted_answer,
+                "Correct Answer": correct_answer,
+                "Match": is_correct,
                 "Processing Time (s)": f"{question_time:.2f}"
             })
+            print(f"✅ Completed in {question_time:.2f}s - Match: {is_correct}")
         except Exception as e:
             print(f"❌ Error running agent on task {task_id}: {e}")
+            correct_answer = validation_data.get(task_id, "N/A")
             results_log.append({
                 "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
                 "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                 "Submitted Answer": f"AGENT ERROR: {e}",
+                "Correct Answer": correct_answer,
+                "Match": "❌",
                 "Processing Time (s)": "Error"
             })
         1. **Login**: Use the Hugging Face login button below
         2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
+        3. **Results**: View detailed results with validation against correct answers
+           - ✅ = Exact match
+           - 🟡 = Partial match
+           - ❌ = No match
         ---
     )
     results_table = gr.DataFrame(
+        label="📋 Detailed Question Results with Validation",
         wrap=True,
         interactive=False
     )

app/app.py CHANGED Viewed

@@ -297,7 +297,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
     print(f"📋 Agent code available at: {agent_code}")
-    # 2. Fetch Questions
     print(f"📥 Fetching questions from: {questions_url}")
     try:
         response = requests.get(questions_url, timeout=15)
@@ -316,6 +316,28 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     except Exception as e:
         print(f"❌ Unexpected error fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
     # 3. Run Advanced GAIA Agent
     results_log = []
@@ -338,21 +360,39 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             submitted_answer = agent(question_text)
             question_time = time.time() - question_start
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({
                 "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
                 "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                 "Submitted Answer": submitted_answer,
                 "Processing Time (s)": f"{question_time:.2f}"
             })
-            print(f"✅ Completed in {question_time:.2f}s")
         except Exception as e:
             print(f"❌ Error running agent on task {task_id}: {e}")
             results_log.append({
                 "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
                 "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                 "Submitted Answer": f"AGENT ERROR: {e}",
                 "Processing Time (s)": "Error"
             })
@@ -459,7 +499,10 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
         1. **Login**: Use the Hugging Face login button below
         2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
-        3. **Results**: View detailed results and performance metrics
         ---
@@ -489,7 +532,7 @@ with gr.Blocks(title="Advanced GAIA Agent Evaluation", theme=gr.themes.Soft()) a
     )
     results_table = gr.DataFrame(
-        label="📋 Detailed Question Results",
         wrap=True,
         interactive=False
     )

     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://github.com/your-repo"
     print(f"📋 Agent code available at: {agent_code}")
+    # 2. Fetch Questions and Load Validation Data
     print(f"📥 Fetching questions from: {questions_url}")
     try:
         response = requests.get(questions_url, timeout=15)
     except Exception as e:
         print(f"❌ Unexpected error fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
+    # Load validation data for correct answers
+    validation_data = {}
+    validation_files = [
+        "/home/user/gaia_validation_metadata.jsonl",
+        "/home/user/app/gaia_validation_metadata.jsonl"
+    ]
+    for validation_file in validation_files:
+        try:
+            if os.path.exists(validation_file):
+                print(f"📋 Loading validation data from: {validation_file}")
+                with open(validation_file, 'r') as f:
+                    for line in f:
+                        if line.strip():
+                            entry = json.loads(line.strip())
+                            validation_data[entry['task_id']] = entry.get('Final answer', 'N/A')
+                print(f"✅ Loaded validation data for {len(validation_data)} questions")
+                break
+        except Exception as e:
+            print(f"⚠️ Could not load validation data from {validation_file}: {e}")
+            continue
     # 3. Run Advanced GAIA Agent
     results_log = []
             submitted_answer = agent(question_text)
             question_time = time.time() - question_start
+            # Get correct answer for validation
+            correct_answer = validation_data.get(task_id, "N/A")
+            # Check if submitted answer matches correct answer (case-insensitive, trimmed)
+            is_correct = "❌"
+            if correct_answer != "N/A":
+                submitted_clean = str(submitted_answer).strip().lower()
+                correct_clean = str(correct_answer).strip().lower()
+                if submitted_clean == correct_clean:
+                    is_correct = "✅"
+                elif submitted_clean in correct_clean or correct_clean in submitted_clean:
+                    is_correct = "🟡"  # Partial match
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({
                 "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
                 "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                 "Submitted Answer": submitted_answer,
+                "Correct Answer": correct_answer,
+                "Match": is_correct,
                 "Processing Time (s)": f"{question_time:.2f}"
             })
+            print(f"✅ Completed in {question_time:.2f}s - Match: {is_correct}")
         except Exception as e:
             print(f"❌ Error running agent on task {task_id}: {e}")
+            correct_answer = validation_data.get(task_id, "N/A")
             results_log.append({
                 "Task ID": task_id[:12] + "..." if len(task_id) > 12 else task_id,
                 "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                 "Submitted Answer": f"AGENT ERROR: {e}",
+                "Correct Answer": correct_answer,
+                "Match": "❌",
                 "Processing Time (s)": "Error"
             })
         1. **Login**: Use the Hugging Face login button below
         2. **Submit**: Click "Run Advanced GAIA Agent" to process all questions
+        3. **Results**: View detailed results with validation against correct answers
+           - ✅ = Exact match
+           - 🟡 = Partial match
+           - ❌ = No match
         ---
     )
     results_table = gr.DataFrame(
+        label="📋 Detailed Question Results with Validation",
         wrap=True,
         interactive=False
     )