Maharshi Gor committed
Commit e494d40 · 1 Parent(s): bfbc762

Leaderboard, metrics, and docs.

README.md CHANGED
@@ -1,11 +1,17 @@
  ---
- title: Grounded Qa Leaderboard
- emoji: 👻
- colorFrom: gray
  colorTo: indigo
  sdk: gradio
- sdk_version: 4.27.0
  app_file: app.py
  pinned: false
  license: mit
  ---

  ---
+ title: QANTA 2025 Leaderboard
+ emoji: 🎖️
+ colorFrom: red
  colorTo: indigo
  sdk: gradio
+ sdk_version: 5.29.0
  app_file: app.py
  pinned: false
  license: mit
+ short_description: 'Leaderboard for QANTA 2025: Human-AI Cooperative Trivia'
  ---
+
+ # QANTA 2025 Leaderboard
+
+ This is the leaderboard for QANTA 2025: Human-AI Cooperative Trivia.
+
app.py CHANGED
@@ -1,51 +1,124 @@
  import gradio as gr
  from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download

  from src.about import (
      INTRODUCTION_TEXT,
      TITLE,
  )
  from src.display.css_html_js import custom_css
- from src.display.utils import (
-     AutoEvalColumn,
-     fields,
  )
- from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_new_leaderboard_df


  def restart_space():
      API.restart_space(repo_id=REPO_ID)

  try:
      print(EVAL_RESULTS_PATH)
      snapshot_download(
-         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
      )
  except Exception:
      restart_space()

- original_df = get_new_leaderboard_df(EVAL_RESULTS_PATH)
- leaderboard_df = original_df.copy()

- demo = gr.Blocks(css=custom_css)
- with demo:
      gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 System", elem_id="llm-benchmark-tab-table", id=0):
-             leaderboard_table = gr.components.Dataframe(
-                 value=[leaderboard_df.iloc[idx] for idx in range(len(leaderboard_df))],
-                 headers=[c.name for c in fields(AutoEvalColumn)],
-                 datatype=[c.type for c in fields(AutoEvalColumn)],
-                 elem_id="leaderboard-table",
-                 interactive=False,
-                 visible=True,
-             )
-
- scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
- scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
 
  import gradio as gr
+ import pandas as pd
  from apscheduler.schedulers.background import BackgroundScheduler
+ from gradio_leaderboard import Leaderboard
  from huggingface_hub import snapshot_download
+ from loguru import logger

  from src.about import (
      INTRODUCTION_TEXT,
      TITLE,
  )
  from src.display.css_html_js import custom_css
+ from src.envs import (
+     API,
+     COMPETITION_URL,
+     EVAL_RESULTS_PATH,
+     EVAL_SPLITS,
+     LEADERBOARD_REFRESH_INTERVAL,
+     REGISTRATION_URL,
+     REPO_ID,
+     RESULTS_REPO,
+     SUBMISSION_URL,
+     TOKEN,
  )
+ from src.hf_dataset_utils import download_dataset_snapshot
+ from src.populate import (
+     fetch_bonus_leaderboard,
+     fetch_tossup_leaderboard,
+ )
+
+
+ # Load metrics manual content
+ def load_metrics_manual():
+     try:
+         with open("metrics_manual.md", "r") as f:
+             return f.read()
+     except Exception as e:
+         logger.error(f"Error loading metrics manual: {e}")
+         return "# Metrics Manual\n\nCould not load metrics manual content."


  def restart_space():
      API.restart_space(repo_id=REPO_ID)

+
  try:
      print(EVAL_RESULTS_PATH)
      snapshot_download(
+         repo_id=RESULTS_REPO,
+         local_dir=EVAL_RESULTS_PATH,
+         repo_type="dataset",
+         tqdm_class=None,
+         etag_timeout=30,
+         token=TOKEN,
      )
  except Exception:
      restart_space()


+ def refresh_leaderboard(split: str = "tiny_eval", style: bool = True):
+     download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
+     tossup_df = fetch_tossup_leaderboard(split, style)
+     bonus_df = fetch_bonus_leaderboard(split, style)
+     return tossup_df, bonus_df
+
+
+ def create_leaderboard_interface(app, split: str = "tiny_eval"):
+     leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
+     refresh_btn = gr.Button("🔄 Refresh")
+
+     tossup_df, bonus_df = refresh_leaderboard(split, style=False)
+
+     gr.Markdown("## 🛎️ Tossup Round Leaderboard")
+     logger.info(f"Tossup dataframe columns: {tossup_df.columns}")
+     tossup_leaderboard = Leaderboard(
+         value=tossup_df,
+         search_columns=["Submission"],
+         datatype=["str", "number", "number", "number", "number", "number"],
+         elem_id="tossup-table",
+         interactive=False, # Ensure it's not interactive
+     )
+
+     gr.Markdown("")
+
+     gr.Markdown("## 🤔 Bonus Round Leaderboard")
+     logger.info(f"Bonus dataframe columns: {bonus_df.columns}")
+     bonus_leaderboard = Leaderboard(
+         value=bonus_df,
+         search_columns=["Submission"],
+         datatype=["str", "number", "number"],
+         elem_id="bonus-table",
+         interactive=False, # Ensure it's not interactive
+     )
+
+     gr.on(
+         triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
+         fn=refresh_leaderboard,
+         inputs=[gr.State(split)],
+         outputs=[tossup_leaderboard, bonus_leaderboard],
+     )
+
+
+ with gr.Blocks(css=custom_css) as demo:
      gr.HTML(TITLE)
+     gr.Markdown(
+         f"## 📋 Register [here]({REGISTRATION_URL}) to participate in our [Human-AI Cooperative Trivia Competition]({COMPETITION_URL}).\n"
+         f"## 🎲 Create and submit your quizbowl AI agents at our [submission site]({SUBMISSION_URL}).",
+         elem_classes="welcome-text",
+     )

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         for i, (name, split) in enumerate(EVAL_SPLITS.items()):
+             with gr.TabItem(f"🏅 {name}", elem_id="llm-benchmark-tab-table", id=i):
+                 leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
+                 create_leaderboard_interface(demo, split)
+
+         # Add the Metrics Guide tab
+         with gr.TabItem("📊 Metrics Guide", elem_id="metrics-guide-tab"):
+             gr.Markdown(load_metrics_manual())
+
+ # scheduler = BackgroundScheduler()
+ # scheduler.add_job(restart_space, "interval", seconds=1800)
+ # scheduler.start()
  demo.queue(default_concurrency_limit=40).launch()
 
metrics_manual.md ADDED
@@ -0,0 +1,34 @@
+ # QANTA 2025 Leaderboard Metrics Manual
+
+ This document explains the metrics displayed on the QANTA 2025 Human-AI Cooperative QA competition leaderboard.
+
+ ## Tossup Round Metrics
+
+ Tossup rounds measure an AI system's ability to answer questions as they're being read:
+
+ | Metric | Description |
+ |--------|-------------|
+ | **Submission** | The username and model name of the submission (format: `username/model_name`) |
+ | **Avg Score ⬆️** | Average points scored per tossup question: 10 points is the maximum per question, -5 points for an incorrect buzz, and 0 for no buzz. Positive scores (green) indicate good performance, while negative scores (red) indicate penalties for incorrect answers. |
+ | **Buzz Accuracy** | Percentage of correct answers when the model decides to buzz in. Displayed as a percentage (e.g., 65.0%). |
+ | **Buzz Position** | Average (token) position in the question when the model decides to answer. Lower values indicate earlier buzzing. |
+ | **Win Rate w/ Humans** | Percentage of times the model successfully answers questions when competing with human players. |
+
+ ## Bonus Round Metrics
+
+ Bonus rounds test an AI system's ability to answer multi-part questions:
+
+ | Metric | Description |
+ |--------|-------------|
+ | **Submission** | The username and model name of the submission (format: `username/model_name`) |
+ | **Question Accuracy** | Percentage of bonus questions where all parts were answered correctly. |
+ | **Part Accuracy** | Percentage of individual bonus question parts answered correctly across all questions. |
+
+ ## Understanding the Competition
+
+ QANTA (Question Answering is Not a Trivial Activity) is a competition for building AI systems that can answer quiz bowl questions. Quiz bowl is a trivia competition format with:
+
+ 1. **Tossup questions**: Paragraph-length clues read in sequence where players can buzz in at any point to answer
+ 2. **Bonus questions**: Multi-part questions that test depth of knowledge in related areas
+
+ The leaderboard tracks how well AI models perform on both question types across different evaluation datasets.
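
Editor's note: a minimal sketch, not part of this commit, of how the tossup "Avg Score ⬆️" defined above could be computed from per-question outcomes. The outcome records and their `buzzed`/`correct` fields are hypothetical; only the scoring rule (+10 correct buzz, -5 incorrect buzz, 0 no buzz) comes from the manual.

```python
def average_tossup_score(outcomes: list[dict]) -> float:
    """Score each tossup as +10 (correct buzz), -5 (incorrect buzz), or 0 (no buzz)."""
    def score(outcome: dict) -> int:
        if not outcome["buzzed"]:
            return 0
        return 10 if outcome["correct"] else -5

    # Average over all tossups; an empty list scores 0.0.
    return sum(score(o) for o in outcomes) / len(outcomes) if outcomes else 0.0


outcomes = [
    {"buzzed": True, "correct": True},    # +10
    {"buzzed": True, "correct": False},   # -5
    {"buzzed": False, "correct": False},  # 0
]
print(average_tossup_score(outcomes))  # (10 - 5 + 0) / 3 ≈ 1.67
```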
requirements.txt CHANGED
@@ -1,11 +1,13 @@
  APScheduler==3.10.1
  black==23.11.0
  click==8.1.3
- datasets==2.14.5
- gradio==4.4.0
  gradio_client==0.7.0
  huggingface-hub>=0.18.0
- numpy==1.24.2
- pandas==2.0.0
  python-dateutil==2.8.2
- requests==2.28.2

  APScheduler==3.10.1
  black==23.11.0
  click==8.1.3
+ datasets>=3.0.0
+ gradio>=5.0.0
  gradio_client==0.7.0
  huggingface-hub>=0.18.0
+ numpy<2.0.0
+ pandas>=2.0.0
  python-dateutil==2.8.2
+ requests==2.28.2
+ gradio_leaderboard
+ loguru
src/__init__.py ADDED
File without changes
src/about.py CHANGED
@@ -1,5 +1,5 @@
  # Your leaderboard name
- TITLE = """<h1 align="center" id="space-title">Adversarial Calibration QA Leaderboard</h1>"""

  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """

  # Your leaderboard name
+ TITLE = """<h1 align="center" id="space-title">QANTA 2025: Human-AI Cooperative QA Leaderboard</h1>"""

  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """
src/display/css_html_js.py CHANGED
@@ -46,6 +46,10 @@ table th:first-child {
      white-space: nowrap;
  }

  .tab-buttons button {
      font-size: 20px;
  }

      white-space: nowrap;
  }

+ .table td .cell-wrap span {
+     white-space: pre;
+ }
+
  .tab-buttons button {
      font-size: 20px;
  }
src/envs.py CHANGED
@@ -4,19 +4,50 @@ from huggingface_hub import HfApi

  # Info to change for your repository
  # ----------------------------------
- TOKEN = os.environ.get("TOKEN") # A read/write token for your org

- OWNER = "umdclip" # Change to your org - don't forget to create a results and request dataset, with the correct format!
  # ----------------------------------

- REPO_ID = f"{OWNER}/grounded_qa_leaderboard"
- RESULTS_REPO = f"{OWNER}/model-results"

  # If you setup a cache later, just change HF_HOME
- CACHE_PATH=os.getenv("HF_HOME", ".")

  # Local caches
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

  API = HfApi(token=TOKEN)


  # Info to change for your repository
  # ----------------------------------
+ TOKEN = os.environ.get("TOKEN") # A read/write token for your org
+

  # ----------------------------------

+ OWNER = "qanta-challenge"
+
+ REPO_ID = f"{OWNER}/quizbowl-submission"
+ QUEUE_REPO = f"{OWNER}/advcal-requests"
+ RESULTS_REPO = f"{OWNER}/advcal-results"
+ LLM_CACHE_REPO = f"{OWNER}/advcal-llm-cache"
+ USERS_REPO = f"{OWNER}/registered-users"
+ EVAL_SPLITS = {"Week 0": "tiny_eval"}
+
+ # Important Links
+ QANTA_WEBSITE_URL = "https://sites.google.com/view/qanta/home"
+ COMPETITION_URL = "https://sites.google.com/view/qanta/2025-competition"
+ DOCS_REPO_URL = "https://github.com/qanta-challenge/QANTA25"
+ DOCS_URL = DOCS_REPO_URL + "/tree/main"
+ GITHUB_ISSUES_URL = DOCS_REPO_URL + "/issues"
+
+ CONTACT_EMAIL = "[email protected]"
+ DISCORD_URL = "https://discord.gg/ChmDVatJ6Y"
+ REGISTRATION_URL = "https://huggingface.co/spaces/qanta-challenge/register"
+ SUBMISSION_URL = "https://huggingface.co/spaces/qanta-challenge/quizbowl-submission"
+ EXAMPLES_PATH = "examples"
+
+
+ # ----------------------------------

  # If you setup a cache later, just change HF_HOME
+ CACHE_PATH = os.getenv("HF_HOME", ".")

  # Local caches
+ LLM_CACHE_PATH = os.path.join(CACHE_PATH, "llm-cache")
+ USERS_PATH = os.path.join(CACHE_PATH, "registered-users")
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

+
+ LLM_CACHE_REFRESH_INTERVAL = 600 # seconds (30 minutes)
+ SERVER_RESTART_INTERVAL = 2 * 24 * 60 * 60 # seconds (2 days)
+ LEADERBOARD_REFRESH_INTERVAL = 600 # seconds (10 minutes)
+
  API = HfApi(token=TOKEN)
src/hf_dataset_utils.py ADDED
@@ -0,0 +1,161 @@
+ from huggingface_hub import HfApi, snapshot_download
+ from loguru import logger
+
+ api = HfApi()
+
+
+ def download_dataset_snapshot(repo_id, local_dir):
+     try:
+         logger.info(f"Downloading dataset snapshot from {repo_id} to {local_dir}")
+         snapshot_download(
+             repo_id=repo_id,
+             local_dir=local_dir,
+             repo_type="dataset",
+             tqdm_class=None,
+         )
+     except Exception as e:
+         logger.error(f"Error downloading dataset snapshot from {repo_id} to {local_dir}: {e}. Restarting space.")
+         api.restart_space(repo_id=repo_id)
+
+
+ def remove_files_from_dataset_repo(repo_id: str, path_patterns: list[str], commit_message: str = "Remove files"):
+     """
+     Remove files or directories matching specified patterns from a Hugging Face dataset repository.
+
+     Args:
+         repo_id: The ID of the dataset repository (e.g., "username/dataset-name")
+         path_patterns: List of file or directory path patterns to remove
+         commit_message: Message for the commit that removes the files
+     """
+     import fnmatch
+     import os
+
+     # Get all files in the repository
+     repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
+
+     # Find files matching the patterns
+     files_to_remove = []
+     for pattern in path_patterns:
+         matching_files = fnmatch.filter(repo_files, pattern)
+         files_to_remove.extend(matching_files)
+
+     # Delete each matching file
+     for path in files_to_remove:
+         try:
+             api.delete_file(
+                 path_in_repo=path, repo_id=repo_id, repo_type="dataset", commit_message=f"{commit_message}: {path}"
+             )
+             print(f"Successfully removed {path} from {repo_id}")
+         except Exception as e:
+             print(f"Error removing {path}: {e}")
+
+
+ def update_dataset_info_readme(
+     repo_id: str,
+     dataset_info: dict,
+     license_id: str = None,
+     commit_message: str = "Update dataset_info in README.md",
+ ):
+     """
+     Update the dataset_info section in the README.md file of a Hugging Face dataset repository.
+
+     Args:
+         repo_id: The ID of the dataset repository (e.g., "username/dataset-name")
+         dataset_info: Dictionary containing dataset information to include in the README
+         license_id: Optional license identifier (e.g., "mit", "cc-by-4.0")
+         commit_message: Message for the commit
+
+     Example dataset_info structure:
+     {
+         "features": [
+             {"name": "text", "dtype": "string"},
+             {"name": "label", "dtype": "int64"}
+         ],
+         "splits": [
+             {"name": "train", "num_examples": 10000, "num_bytes": 1000000},
+             {"name": "test", "num_examples": 1000, "num_bytes": 100000}
+         ],
+         "download_size": 1200000,
+         "dataset_size": 1100000,
+         "configs": [
+             {
+                 "config_name": "default",
+                 "data_files": [
+                     {"split": "train", "path": "data/train.csv"},
+                     {"split": "test", "path": "data/test.csv"}
+                 ]
+             }
+         ]
+     }
+     """
+     import re
+
+     import yaml
+     from huggingface_hub import HfApi
+
+     api = HfApi()
+
+     # Check if README.md exists
+     try:
+         readme_content = api.hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="README.md", token=None)
+         with open(readme_content, "r", encoding="utf-8") as f:
+             content = f.read()
+     except Exception:
+         # Create a new README.md if it doesn't exist
+         content = ""
+
+     # Parse existing YAML front matter if it exists
+     yaml_block = None
+     yaml_match = re.search(r"---\s*\n(.*?)\n\s*---", content, re.DOTALL)
+
+     if yaml_match:
+         yaml_text = yaml_match.group(1)
+         try:
+             yaml_block = yaml.safe_load(yaml_text)
+         except Exception as e:
+             print(f"Error parsing existing YAML front matter: {e}")
+             yaml_block = {}
+     else:
+         yaml_block = {}
+
+     # Update or add dataset_info and license
+     if dataset_info:
+         yaml_block["dataset_info"] = dataset_info
+
+     if license_id:
+         yaml_block["license"] = license_id
+
+     # Generate new YAML front matter
+     new_yaml = yaml.dump(yaml_block, sort_keys=False, default_flow_style=False)
+     new_yaml_block = f"---\n{new_yaml}---\n"
+
+     # Replace existing YAML front matter or add it at the beginning
+     if yaml_match:
+         new_content = content[: yaml_match.start()] + new_yaml_block + content[yaml_match.end() :]
+     else:
+         new_content = new_yaml_block + content
+
+     # Create a temporary file with the new content
+     import tempfile
+
+     with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md") as temp_file:
+         temp_file.write(new_content)
+         temp_path = temp_file.name
+
+     # Upload the updated README.md
+     try:
+         api.upload_file(
+             path_or_fileobj=temp_path,
+             path_in_repo="README.md",
+             repo_id=repo_id,
+             repo_type="dataset",
+             commit_message=commit_message,
+         )
+         print(f"Successfully updated README.md in {repo_id}")
+     except Exception as e:
+         print(f"Error updating README.md: {e}")
+
+     # Clean up temporary file
+     import os
+
+     os.unlink(temp_path)
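
Editor's note: a hypothetical usage sketch of the helpers added above, not part of this commit. The repository IDs, local path, glob pattern, and `dataset_info` payload are placeholders, and the calls assume a write-capable Hugging Face token is configured in the environment.

```python
from src.hf_dataset_utils import (
    download_dataset_snapshot,
    remove_files_from_dataset_repo,
    update_dataset_info_readme,
)

# Mirror the results dataset into a local cache directory.
download_dataset_snapshot("your-org/your-results-repo", "./eval-results")

# Prune result files matched by a glob pattern.
remove_files_from_dataset_repo(
    "your-org/your-results-repo",
    path_patterns=["tossup/tiny_eval/*.json"],
    commit_message="Remove outdated results",
)

# Refresh the dataset card's YAML front matter.
update_dataset_info_readme(
    "your-org/your-results-repo",
    dataset_info={"configs": [{"config_name": "default", "data_files": "**/*.json"}]},
    license_id="mit",
)
```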
src/populate.py CHANGED
@@ -1,31 +1,146 @@
  import json
  import os

  import pandas as pd

- def get_new_leaderboard_df(results_path: str) -> pd.DataFrame:
-     model_result_filepaths = []
-     for root, _, files in os.walk(results_path):
-         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
              continue
          for file in files:
-             model_result_filepaths.append(os.path.join(root, file))
-
-     eval_results = {
-         'model': [],
-         'buzz_accuracy': [],
-         'win_rate_human': [],
-         'win_rate_model': []
-     }
-     for model_result_filepath in model_result_filepaths:
-         with open(model_result_filepath, "r") as fin:
-             model_result = json.load(fin)
-         model_id = model_result["model_id"]
-         buzz_accuracy = model_result["buzz_accuracy"]
-         win_rate_human = model_result["win_rate_human"]
-         win_rate_model = model_result["win_rate_model"]
-         eval_results['model'].append(model_id)
-         eval_results['buzz_accuracy'].append(buzz_accuracy)
-         eval_results['win_rate_human'].append(win_rate_human)
-         eval_results['win_rate_model'].append(win_rate_model)
-     return pd.DataFrame(eval_results)

+ # This file is kept for reference only and is not used in the enhanced implementation
+ # The actual implementation is in enhanced_leaderboard.py
+
  import json
  import os

  import pandas as pd
+ from loguru import logger
+
+ from src.envs import EVAL_RESULTS_PATH, RESULTS_REPO
+ from src.hf_dataset_utils import download_dataset_snapshot

+
+ def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
+     model_results = []
+     dirpath = os.path.join(repo_dir, competition_type, eval_split)
+     for root, _, files in os.walk(dirpath):
+         if len(files) == 0 or not all(f.endswith(".json") for f in files):
              continue
          for file in files:
+             filepath = os.path.join(root, file)
+             try:
+                 with open(filepath, "r") as fp:
+                     result = json.load(fp)
+                 model_results.append(result)
+             except Exception as e:
+                 logger.error(f"Error loading model result from {filepath}: {e}")
+                 continue
+
+     return model_results
+
+
+ def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+     model_results = fetch_model_results(repo_dir, "tossup", eval_split)
+
+     eval_results = []
+     for result in model_results:
+         try:
+             metrics = result["metrics"]
+             username = result["username"]
+             model_name = result["model_name"]
+             buzz_accuracy = metrics["buzz_accuracy"]
+
+             row = {
+                 "Submission": f"{username}/{model_name}",
+                 "Avg Score ⬆️": metrics["tossup_score"],
+                 "Buzz Accuracy": buzz_accuracy,
+                 "Buzz Position": metrics["buzz_position"],
+             }
+             if "human_win_rate" in metrics:
+                 row["Win Rate w/ Humans"] = metrics["human_win_rate"]
+                 # row["Win Rate w/ Humans (Aggressive)"] = metrics["human_win_rate_strict"]
+             else:
+                 row["Win Rate w/ Humans"] = None
+                 # row["Win Rate w/ Humans (Aggressive)"] = None
+             eval_results.append(row)
+         except Exception as e:
+             logger.error(f"Error processing model result '{username}/{model_name}': {e}")
+             continue
+
+     df = pd.DataFrame(
+         eval_results,
+         columns=[
+             "Submission",
+             "Avg Score ⬆️",
+             "Buzz Accuracy",
+             "Buzz Position",
+             "Win Rate w/ Humans",
+             # "Win Rate w/ Humans (Aggressive)",
+         ],
+     )
+     df.sort_values(by="Avg Score ⬆️", ascending=False, inplace=True)
+     return df
+
+
+ def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+     model_results = fetch_model_results(repo_dir, "bonus", eval_split)
+
+     eval_results = []
+     for result in model_results:
+         try:
+             metrics = result["metrics"]
+             username = result["username"]
+             model_name = result["model_name"]
+
+             row = {
+                 "Submission": f"{username}/{model_name}",
+                 "Question Accuracy": metrics["question_accuracy"],
+                 "Part Accuracy": metrics["part_accuracy"],
+             }
+             eval_results.append(row)
+         except Exception as e:
+             logger.error(f"Error processing model result '{username}/{model_name}': {e}")
+             continue
+
+     df = pd.DataFrame(
+         eval_results,
+         columns=["Submission", "Question Accuracy", "Part Accuracy"],
+     )
+     df.sort_values(by="Question Accuracy", ascending=False, inplace=True)
+     return df
+
+
+ def fetch_tossup_leaderboard(split: str = "tiny_eval", style: bool = True):
+     df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split)
+
+     def colour_pos_neg(v):
+         """Return a CSS rule for the cell that called the function."""
+         if pd.isna(v): # keep NaNs unstyled
+             return ""
+         return "color: green;" if v > 0 else "color: red;"
+
+     # Apply formatting and styling
+     styled_df = df.style.format(
+         {
+             "Avg Score ⬆️": "{:5.2f}",
+             "Buzz Accuracy": "{:>6.1%}",
+             "Buzz Position": "{:>6.1f}",
+             "Win Rate w/ Humans": "{:>6.1%}",
+             # "Win Rate w/ Humans (Aggressive)": "{:>6.1%}",
+         }
+     ).map(colour_pos_neg, subset=["Avg Score ⬆️"])
+
+     return styled_df if style else df
+
+
+ def fetch_bonus_leaderboard(split: str = "tiny_eval", style: bool = True):
+     df = get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, split)
+
+     # Apply formatting and styling
+     styled_df = df.style.format(
+         {
+             "Question Accuracy": "{:>6.1%}",
+             "Part Accuracy": "{:>6.1%}",
+         }
+     )
+
+     return styled_df if style else df
+
+
+ # TODO: Implement this once we have the proxy server running.
+ def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
+     # Merge the two dataframes on the 'Submission' column
+     merged_df = pd.merge(tossup_df, bonus_df, on="Submission", how="outer")
+
+     # Calculate the overall score as a weighted average