Maharshi Gor committed
Commit e494d40 · 1 Parent(s): bfbc762

Leaderboard, metrics, and docs.

README.md CHANGED
@@ -1,11 +1,17 @@
  ---
- title: Grounded Qa Leaderboard
- emoji: 👻
- colorFrom: gray
  colorTo: indigo
  sdk: gradio
- sdk_version: 4.27.0
  app_file: app.py
  pinned: false
  license: mit
  ---

  ---
+ title: QANTA 2025 Leaderboard
+ emoji: 🎖️
+ colorFrom: red
  colorTo: indigo
  sdk: gradio
+ sdk_version: 5.29.0
  app_file: app.py
  pinned: false
  license: mit
+ short_description: 'Leaderboard for QANTA 2025: Human-AI Cooperative Trivia'
  ---
+
+ # QANTA 2025 Leaderboard
+
+ This is the leaderboard for QANTA 2025: Human-AI Cooperative Trivia.
+
app.py CHANGED
@@ -1,51 +1,124 @@
  import gradio as gr
  from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download

  from src.about import (
      INTRODUCTION_TEXT,
      TITLE,
  )
  from src.display.css_html_js import custom_css
- from src.display.utils import (
-     AutoEvalColumn,
-     fields,
  )
- from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_new_leaderboard_df


  def restart_space():
      API.restart_space(repo_id=REPO_ID)

  try:
      print(EVAL_RESULTS_PATH)
      snapshot_download(
-         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
      )
  except Exception:
      restart_space()

- original_df = get_new_leaderboard_df(EVAL_RESULTS_PATH)
- leaderboard_df = original_df.copy()

- demo = gr.Blocks(css=custom_css)
- with demo:
      gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 System", elem_id="llm-benchmark-tab-table", id=0):
-             leaderboard_table = gr.components.Dataframe(
-                 value=[leaderboard_df.iloc[idx] for idx in range(len(leaderboard_df))],
-                 headers=[c.name for c in fields(AutoEvalColumn)],
-                 datatype=[c.type for c in fields(AutoEvalColumn)],
-                 elem_id="leaderboard-table",
-                 interactive=False,
-                 visible=True,
-             )
-
- scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
- scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
 
  import gradio as gr
+ import pandas as pd
  from apscheduler.schedulers.background import BackgroundScheduler
+ from gradio_leaderboard import Leaderboard
  from huggingface_hub import snapshot_download
+ from loguru import logger

  from src.about import (
      INTRODUCTION_TEXT,
      TITLE,
  )
  from src.display.css_html_js import custom_css
+ from src.envs import (
+     API,
+     COMPETITION_URL,
+     EVAL_RESULTS_PATH,
+     EVAL_SPLITS,
+     LEADERBOARD_REFRESH_INTERVAL,
+     REGISTRATION_URL,
+     REPO_ID,
+     RESULTS_REPO,
+     SUBMISSION_URL,
+     TOKEN,
  )
+ from src.hf_dataset_utils import download_dataset_snapshot
+ from src.populate import (
+     fetch_bonus_leaderboard,
+     fetch_tossup_leaderboard,
+ )
+
+
+ # Load metrics manual content
+ def load_metrics_manual():
+     try:
+         with open("metrics_manual.md", "r") as f:
+             return f.read()
+     except Exception as e:
+         logger.error(f"Error loading metrics manual: {e}")
+         return "# Metrics Manual\n\nCould not load metrics manual content."


  def restart_space():
      API.restart_space(repo_id=REPO_ID)

+
  try:
      print(EVAL_RESULTS_PATH)
      snapshot_download(
+         repo_id=RESULTS_REPO,
+         local_dir=EVAL_RESULTS_PATH,
+         repo_type="dataset",
+         tqdm_class=None,
+         etag_timeout=30,
+         token=TOKEN,
      )
  except Exception:
      restart_space()


+ def refresh_leaderboard(split: str = "tiny_eval", style: bool = True):
+     download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
+     tossup_df = fetch_tossup_leaderboard(split, style)
+     bonus_df = fetch_bonus_leaderboard(split, style)
+     return tossup_df, bonus_df
+
+
+ def create_leaderboard_interface(app, split: str = "tiny_eval"):
+     leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
+     refresh_btn = gr.Button("🔄 Refresh")
+
+     tossup_df, bonus_df = refresh_leaderboard(split, style=False)
+
+     gr.Markdown("## 🛎️ Tossup Round Leaderboard")
+     logger.info(f"Tossup dataframe columns: {tossup_df.columns}")
+     tossup_leaderboard = Leaderboard(
+         value=tossup_df,
+         search_columns=["Submission"],
+         datatype=["str", "number", "number", "number", "number", "number"],
+         elem_id="tossup-table",
+         interactive=False, # Ensure it's not interactive
+     )
+
+     gr.Markdown("")
+
+     gr.Markdown("## 🤔 Bonus Round Leaderboard")
+     logger.info(f"Bonus dataframe columns: {bonus_df.columns}")
+     bonus_leaderboard = Leaderboard(
+         value=bonus_df,
+         search_columns=["Submission"],
+         datatype=["str", "number", "number"],
+         elem_id="bonus-table",
+         interactive=False, # Ensure it's not interactive
+     )
+
+     gr.on(
+         triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
+         fn=refresh_leaderboard,
+         inputs=[gr.State(split)],
+         outputs=[tossup_leaderboard, bonus_leaderboard],
+     )
+
+
+ with gr.Blocks(css=custom_css) as demo:
      gr.HTML(TITLE)
+     gr.Markdown(
+         f"## 📋 Register [here]({REGISTRATION_URL}) to participate in our [Human-AI Cooperative Trivia Competition]({COMPETITION_URL}).\n"
+         f"## 🎲 Create and submit your quizbowl AI agents at our [submission site]({SUBMISSION_URL}).",
+         elem_classes="welcome-text",
+     )

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         for i, (name, split) in enumerate(EVAL_SPLITS.items()):
+             with gr.TabItem(f"🏅 {name}", elem_id="llm-benchmark-tab-table", id=i):
+                 leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
+                 create_leaderboard_interface(demo, split)
+
+         # Add the Metrics Guide tab
+         with gr.TabItem("📊 Metrics Guide", elem_id="metrics-guide-tab"):
+             gr.Markdown(load_metrics_manual())
+
+ # scheduler = BackgroundScheduler()
+ # scheduler.add_job(restart_space, "interval", seconds=1800)
+ # scheduler.start()
  demo.queue(default_concurrency_limit=40).launch()
 
metrics_manual.md ADDED
@@ -0,0 +1,34 @@
+ # QANTA 2025 Leaderboard Metrics Manual
+
+ This document explains the metrics displayed on the QANTA 2025 Human-AI Cooperative QA competition leaderboard.
+
+ ## Tossup Round Metrics
+
+ Tossup rounds measure an AI system's ability to answer questions as they're being read:
+
+ | Metric | Description |
+ |--------|-------------|
+ | **Submission** | The username and model name of the submission (format: `username/model_name`) |
+ | **Avg Score ⬆️** | Average points scored per tossup question: 10 points is the maximum per question, -5 points for an incorrect buzz, and 0 for no buzz. Positive scores (green) indicate good performance, while negative scores (red) indicate penalties for incorrect answers. |
+ | **Buzz Accuracy** | Percentage of correct answers when the model decides to buzz in. Displayed as a percentage (e.g., 65.0%). |
+ | **Buzz Position** | Average (token) position in the question when the model decides to answer. Lower values indicate earlier buzzing. |
+ | **Win Rate w/ Humans** | Percentage of times the model successfully answers questions when competing with human players. |
+
+ ## Bonus Round Metrics
+
+ Bonus rounds test an AI system's ability to answer multi-part questions:
+
+ | Metric | Description |
+ |--------|-------------|
+ | **Submission** | The username and model name of the submission (format: `username/model_name`) |
+ | **Question Accuracy** | Percentage of bonus questions where all parts were answered correctly. |
+ | **Part Accuracy** | Percentage of individual bonus question parts answered correctly across all questions. |
+
+ ## Understanding the Competition
+
+ QANTA (Question Answering is Not a Trivial Activity) is a competition for building AI systems that can answer quiz bowl questions. Quiz bowl is a trivia competition format with:
+
+ 1. **Tossup questions**: Paragraph-length clues read in sequence where players can buzz in at any point to answer
+ 2. **Bonus questions**: Multi-part questions that test depth of knowledge in related areas
+
+ The leaderboard tracks how well AI models perform on both question types across different evaluation datasets.
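
Editor's note: a minimal sketch, not part of this commit, of how the tossup "Avg Score ⬆️" defined above could be computed from per-question outcomes. The outcome records and their `buzzed`/`correct` fields are hypothetical; only the scoring rule (+10 correct buzz, -5 incorrect buzz, 0 no buzz) comes from the manual.

```python
def average_tossup_score(outcomes: list[dict]) -> float:
    """Score each tossup as +10 (correct buzz), -5 (incorrect buzz), or 0 (no buzz)."""
    def score(outcome: dict) -> int:
        if not outcome["buzzed"]:
            return 0
        return 10 if outcome["correct"] else -5

    # Average over all tossups; an empty list scores 0.0.
    return sum(score(o) for o in outcomes) / len(outcomes) if outcomes else 0.0


outcomes = [
    {"buzzed": True, "correct": True},    # +10
    {"buzzed": True, "correct": False},   # -5
    {"buzzed": False, "correct": False},  # 0
]
print(average_tossup_score(outcomes))  # (10 - 5 + 0) / 3 ≈ 1.67
```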
requirements.txt CHANGED
@@ -1,11 +1,13 @@
  APScheduler==3.10.1
  black==23.11.0
  click==8.1.3
- datasets==2.14.5
- gradio==4.4.0
  gradio_client==0.7.0
  huggingface-hub>=0.18.0
- numpy==1.24.2
- pandas==2.0.0
  python-dateutil==2.8.2
- requests==2.28.2

  APScheduler==3.10.1
  black==23.11.0
  click==8.1.3
+ datasets>=3.0.0
+ gradio>=5.0.0
  gradio_client==0.7.0
  huggingface-hub>=0.18.0
+ numpy<2.0.0
+ pandas>=2.0.0
  python-dateutil==2.8.2
+ requests==2.28.2
+ gradio_leaderboard
+ loguru
src/__init__.py ADDED
File without changes
src/about.py CHANGED
@@ -1,5 +1,5 @@
  # Your leaderboard name
- TITLE = """<h1 align="center" id="space-title">Adversarial Calibration QA Leaderboard</h1>"""

  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """

  # Your leaderboard name
+ TITLE = """<h1 align="center" id="space-title">QANTA 2025: Human-AI Cooperative QA Leaderboard</h1>"""

  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """
src/display/css_html_js.py CHANGED
@@ -46,6 +46,10 @@ table th:first-child {
      white-space: nowrap;
  }

  .tab-buttons button {
      font-size: 20px;
  }

      white-space: nowrap;
  }

+ .table td .cell-wrap span {
+     white-space: pre;
+ }
+
  .tab-buttons button {
      font-size: 20px;
  }
src/envs.py CHANGED
@@ -4,19 +4,50 @@ from huggingface_hub import HfApi

  # Info to change for your repository
  # ----------------------------------
- TOKEN = os.environ.get("TOKEN") # A read/write token for your org

- OWNER = "umdclip" # Change to your org - don't forget to create a results and request dataset, with the correct format!
  # ----------------------------------

- REPO_ID = f"{OWNER}/grounded_qa_leaderboard"
- RESULTS_REPO = f"{OWNER}/model-results"

  # If you setup a cache later, just change HF_HOME
- CACHE_PATH=os.getenv("HF_HOME", ".")

  # Local caches
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

  API = HfApi(token=TOKEN)


  # Info to change for your repository
  # ----------------------------------
+ TOKEN = os.environ.get("TOKEN") # A read/write token for your org
+

  # ----------------------------------

+ OWNER = "qanta-challenge"
+
+ REPO_ID = f"{OWNER}/quizbowl-submission"
+ QUEUE_REPO = f"{OWNER}/advcal-requests"
+ RESULTS_REPO = f"{OWNER}/advcal-results"
+ LLM_CACHE_REPO = f"{OWNER}/advcal-llm-cache"
+ USERS_REPO = f"{OWNER}/registered-users"
+ EVAL_SPLITS = {"Week 0": "tiny_eval"}
+
+ # Important Links
+ QANTA_WEBSITE_URL = "https://sites.google.com/view/qanta/home"
+ COMPETITION_URL = "https://sites.google.com/view/qanta/2025-competition"
+ DOCS_REPO_URL = "https://github.com/qanta-challenge/QANTA25"
+ DOCS_URL = DOCS_REPO_URL + "/tree/main"
+ GITHUB_ISSUES_URL = DOCS_REPO_URL + "/issues"
+
+ CONTACT_EMAIL = "[email protected]"
+ DISCORD_URL = "https://discord.gg/ChmDVatJ6Y"
+ REGISTRATION_URL = "https://huggingface.co/spaces/qanta-challenge/register"
+ SUBMISSION_URL = "https://huggingface.co/spaces/qanta-challenge/quizbowl-submission"
+ EXAMPLES_PATH = "examples"
+
+
+ # ----------------------------------

  # If you setup a cache later, just change HF_HOME
+ CACHE_PATH = os.getenv("HF_HOME", ".")

  # Local caches
+ LLM_CACHE_PATH = os.path.join(CACHE_PATH, "llm-cache")
+ USERS_PATH = os.path.join(CACHE_PATH, "registered-users")
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

+
+ LLM_CACHE_REFRESH_INTERVAL = 600 # seconds (30 minutes)
+ SERVER_RESTART_INTERVAL = 2 * 24 * 60 * 60 # seconds (2 days)
+ LEADERBOARD_REFRESH_INTERVAL = 600 # seconds (10 minutes)
+
  API = HfApi(token=TOKEN)
src/hf_dataset_utils.py ADDED
@@ -0,0 +1,161 @@
+ from huggingface_hub import HfApi, snapshot_download
+ from loguru import logger
+
+ api = HfApi()
+
+
+ def download_dataset_snapshot(repo_id, local_dir):
+     try:
+         logger.info(f"Downloading dataset snapshot from {repo_id} to {local_dir}")
+         snapshot_download(
+             repo_id=repo_id,
+             local_dir=local_dir,
+             repo_type="dataset",
+             tqdm_class=None,
+         )
+     except Exception as e:
+         logger.error(f"Error downloading dataset snapshot from {repo_id} to {local_dir}: {e}. Restarting space.")
+         api.restart_space(repo_id=repo_id)
+
+
+ def remove_files_from_dataset_repo(repo_id: str, path_patterns: list[str], commit_message: str = "Remove files"):
+     """
+     Remove files or directories matching specified patterns from a Hugging Face dataset repository.
+
+     Args:
+         repo_id: The ID of the dataset repository (e.g., "username/dataset-name")
+         path_patterns: List of file or directory path patterns to remove
+         commit_message: Message for the commit that removes the files
+     """
+     import fnmatch
+     import os
+
+     # Get all files in the repository
+     repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
+
+     # Find files matching the patterns
+     files_to_remove = []
+     for pattern in path_patterns:
+         matching_files = fnmatch.filter(repo_files, pattern)
+         files_to_remove.extend(matching_files)
+
+     # Delete each matching file
+     for path in files_to_remove:
+         try:
+             api.delete_file(
+                 path_in_repo=path, repo_id=repo_id, repo_type="dataset", commit_message=f"{commit_message}: {path}"
+             )
+             print(f"Successfully removed {path} from {repo_id}")
+         except Exception as e:
+             print(f"Error removing {path}: {e}")
+
+
+ def update_dataset_info_readme(
+     repo_id: str,
+     dataset_info: dict,
+     license_id: str = None,
+     commit_message: str = "Update dataset_info in README.md",
+ ):
+     """
+     Update the dataset_info section in the README.md file of a Hugging Face dataset repository.
+
+     Args:
+         repo_id: The ID of the dataset repository (e.g., "username/dataset-name")
+         dataset_info: Dictionary containing dataset information to include in the README
+         license_id: Optional license identifier (e.g., "mit", "cc-by-4.0")
+         commit_message: Message for the commit
+
+     Example dataset_info structure:
+     {
+         "features": [
+             {"name": "text", "dtype": "string"},
+             {"name": "label", "dtype": "int64"}
+         ],
+         "splits": [
+             {"name": "train", "num_examples": 10000, "num_bytes": 1000000},
+             {"name": "test", "num_examples": 1000, "num_bytes": 100000}
+         ],
+         "download_size": 1200000,
+         "dataset_size": 1100000,
+         "configs": [
+             {
+                 "config_name": "default",
+                 "data_files": [
+                     {"split": "train", "path": "data/train.csv"},
+                     {"split": "test", "path": "data/test.csv"}
+                 ]
+             }
+         ]
+     }
+     """
+     import re
+
+     import yaml
+     from huggingface_hub import HfApi
+
+     api = HfApi()
+
+     # Check if README.md exists
+     try:
+         readme_content = api.hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="README.md", token=None)
+         with open(readme_content, "r", encoding="utf-8") as f:
+             content = f.read()
+     except Exception:
+         # Create a new README.md if it doesn't exist
+         content = ""
+
+     # Parse existing YAML front matter if it exists
+     yaml_block = None
+     yaml_match = re.search(r"---\s*\n(.*?)\n\s*---", content, re.DOTALL)
+
+     if yaml_match:
+         yaml_text = yaml_match.group(1)
+         try:
+             yaml_block = yaml.safe_load(yaml_text)
+         except Exception as e:
+             print(f"Error parsing existing YAML front matter: {e}")
+             yaml_block = {}
+     else:
+         yaml_block = {}
+
+     # Update or add dataset_info and license
+     if dataset_info:
+         yaml_block["dataset_info"] = dataset_info
+
+     if license_id:
+         yaml_block["license"] = license_id
+
+     # Generate new YAML front matter
+     new_yaml = yaml.dump(yaml_block, sort_keys=False, default_flow_style=False)
+     new_yaml_block = f"---\n{new_yaml}---\n"
+
+     # Replace existing YAML front matter or add it at the beginning
+     if yaml_match:
+         new_content = content[: yaml_match.start()] + new_yaml_block + content[yaml_match.end() :]
+     else:
+         new_content = new_yaml_block + content
+
+     # Create a temporary file with the new content
+     import tempfile
+
+     with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md") as temp_file:
+         temp_file.write(new_content)
+         temp_path = temp_file.name
+
+     # Upload the updated README.md
+     try:
+         api.upload_file(
+             path_or_fileobj=temp_path,
+             path_in_repo="README.md",
+             repo_id=repo_id,
+             repo_type="dataset",
+             commit_message=commit_message,
+         )
+         print(f"Successfully updated README.md in {repo_id}")
+     except Exception as e:
+         print(f"Error updating README.md: {e}")
+
+     # Clean up temporary file
+     import os
+
+     os.unlink(temp_path)
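
Editor's note: a hypothetical usage sketch of the helpers added above, not part of this commit. The repository IDs, local path, glob pattern, and `dataset_info` payload are placeholders, and the calls assume a write-capable Hugging Face token is configured in the environment.

```python
from src.hf_dataset_utils import (
    download_dataset_snapshot,
    remove_files_from_dataset_repo,
    update_dataset_info_readme,
)

# Mirror the results dataset into a local cache directory.
download_dataset_snapshot("your-org/your-results-repo", "./eval-results")

# Prune result files matched by a glob pattern.
remove_files_from_dataset_repo(
    "your-org/your-results-repo",
    path_patterns=["tossup/tiny_eval/*.json"],
    commit_message="Remove outdated results",
)

# Refresh the dataset card's YAML front matter.
update_dataset_info_readme(
    "your-org/your-results-repo",
    dataset_info={"configs": [{"config_name": "default", "data_files": "**/*.json"}]},
    license_id="mit",
)
```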
src/populate.py CHANGED
@@ -1,31 +1,146 @@
  import json
  import os

  import pandas as pd

- def get_new_leaderboard_df(results_path: str) -> pd.DataFrame:
-     model_result_filepaths = []
-     for root, _, files in os.walk(results_path):
-         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
              continue
          for file in files:
-             model_result_filepaths.append(os.path.join(root, file))
-
-     eval_results = {
-         'model': [],
-         'buzz_accuracy': [],
-         'win_rate_human': [],
-         'win_rate_model': []
-     }
-     for model_result_filepath in model_result_filepaths:
-         with open(model_result_filepath, "r") as fin:
-             model_result = json.load(fin)
-         model_id = model_result["model_id"]
-         buzz_accuracy = model_result["buzz_accuracy"]
-         win_rate_human = model_result["win_rate_human"]
-         win_rate_model = model_result["win_rate_model"]
-         eval_results['model'].append(model_id)
-         eval_results['buzz_accuracy'].append(buzz_accuracy)
-         eval_results['win_rate_human'].append(win_rate_human)
-         eval_results['win_rate_model'].append(win_rate_model)
-     return pd.DataFrame(eval_results)

+ # This file is kept for reference only and is not used in the enhanced implementation
+ # The actual implementation is in enhanced_leaderboard.py
+
  import json
  import os

  import pandas as pd
+ from loguru import logger
+
+ from src.envs import EVAL_RESULTS_PATH, RESULTS_REPO
+ from src.hf_dataset_utils import download_dataset_snapshot

+
+ def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
+     model_results = []
+     dirpath = os.path.join(repo_dir, competition_type, eval_split)
+     for root, _, files in os.walk(dirpath):
+         if len(files) == 0 or not all(f.endswith(".json") for f in files):
              continue
          for file in files:
+             filepath = os.path.join(root, file)
+             try:
+                 with open(filepath, "r") as fp:
+                     result = json.load(fp)
+                 model_results.append(result)
+             except Exception as e:
+                 logger.error(f"Error loading model result from {filepath}: {e}")
+                 continue
+
+     return model_results
+
+
+ def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+     model_results = fetch_model_results(repo_dir, "tossup", eval_split)
+
+     eval_results = []
+     for result in model_results:
+         try:
+             metrics = result["metrics"]
+             username = result["username"]
+             model_name = result["model_name"]
+             buzz_accuracy = metrics["buzz_accuracy"]
+
+             row = {
+                 "Submission": f"{username}/{model_name}",
+                 "Avg Score ⬆️": metrics["tossup_score"],
+                 "Buzz Accuracy": buzz_accuracy,
+                 "Buzz Position": metrics["buzz_position"],
+             }
+             if "human_win_rate" in metrics:
+                 row["Win Rate w/ Humans"] = metrics["human_win_rate"]
+                 # row["Win Rate w/ Humans (Aggressive)"] = metrics["human_win_rate_strict"]
+             else:
+                 row["Win Rate w/ Humans"] = None
+                 # row["Win Rate w/ Humans (Aggressive)"] = None
+             eval_results.append(row)
+         except Exception as e:
+             logger.error(f"Error processing model result '{username}/{model_name}': {e}")
+             continue
+
+     df = pd.DataFrame(
+         eval_results,
+         columns=[
+             "Submission",
+             "Avg Score ⬆️",
+             "Buzz Accuracy",
+             "Buzz Position",
+             "Win Rate w/ Humans",
+             # "Win Rate w/ Humans (Aggressive)",
+         ],
+     )
+     df.sort_values(by="Avg Score ⬆️", ascending=False, inplace=True)
+     return df
+
+
+ def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+     model_results = fetch_model_results(repo_dir, "bonus", eval_split)
+
+     eval_results = []
+     for result in model_results:
+         try:
+             metrics = result["metrics"]
+             username = result["username"]
+             model_name = result["model_name"]
+
+             row = {
+                 "Submission": f"{username}/{model_name}",
+                 "Question Accuracy": metrics["question_accuracy"],
+                 "Part Accuracy": metrics["part_accuracy"],
+             }
+             eval_results.append(row)
+         except Exception as e:
+             logger.error(f"Error processing model result '{username}/{model_name}': {e}")
+             continue
+
+     df = pd.DataFrame(
+         eval_results,
+         columns=["Submission", "Question Accuracy", "Part Accuracy"],
+     )
+     df.sort_values(by="Question Accuracy", ascending=False, inplace=True)
+     return df
+
+
+ def fetch_tossup_leaderboard(split: str = "tiny_eval", style: bool = True):
+     df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split)
+
+     def colour_pos_neg(v):
+         """Return a CSS rule for the cell that called the function."""
+         if pd.isna(v): # keep NaNs unstyled
+             return ""
+         return "color: green;" if v > 0 else "color: red;"
+
+     # Apply formatting and styling
+     styled_df = df.style.format(
+         {
+             "Avg Score ⬆️": "{:5.2f}",
+             "Buzz Accuracy": "{:>6.1%}",
+             "Buzz Position": "{:>6.1f}",
+             "Win Rate w/ Humans": "{:>6.1%}",
+             # "Win Rate w/ Humans (Aggressive)": "{:>6.1%}",
+         }
+     ).map(colour_pos_neg, subset=["Avg Score ⬆️"])
+
+     return styled_df if style else df
+
+
+ def fetch_bonus_leaderboard(split: str = "tiny_eval", style: bool = True):
+     df = get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, split)
+
+     # Apply formatting and styling
+     styled_df = df.style.format(
+         {
+             "Question Accuracy": "{:>6.1%}",
+             "Part Accuracy": "{:>6.1%}",
+         }
+     )
+
+     return styled_df if style else df
+
+
+ # TODO: Implement this once we have the proxy server running.
+ def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
+     # Merge the two dataframes on the 'Submission' column
+     merged_df = pd.merge(tossup_df, bonus_df, on="Submission", how="outer")
+
+     # Calculate the overall score as a weighted average