Spaces: Running

Maharshi Gor committed · Commit 4a9e506 · Parent(s): 85c36d8

Leaderboard UI upgrade and Week deadline update

Browse files:
- app.py                      +62 -26
- src/display/css_html_js.py   +4  -0
- src/envs.py                  +4  -0
- src/hf_dataset_utils.py      +1  -2
- src/populate.py             +131 -42
app.py
CHANGED

@@ -1,3 +1,6 @@
+import sys
+from datetime import datetime
+
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -13,6 +16,7 @@ from src.display.css_html_js import custom_css
 from src.envs import (
     API,
     COMPETITION_URL,
+    CUTOFF_DATES,
     EVAL_RESULTS_PATH,
     EVAL_SPLITS,
     LEADERBOARD_REFRESH_INTERVAL,
@@ -29,6 +33,9 @@ from src.populate import (
     fetch_tossup_leaderboard,
 )
 
+logger.remove()
+logger.add(sys.stderr, level="INFO", backtrace=True, diagnose=False)
+
 
 # Load metrics manual content
 def load_metrics_manual():
@@ -58,75 +65,104 @@ except Exception:
     restart_space()
 
 
-def refresh_leaderboard(split: str = "tiny_eval", style: bool = True):
+def refresh_leaderboard(
+    split: str = "tiny_eval",
+    style: bool = True,
+    date: datetime.date = None,
+    profile: gr.OAuthProfile = None,
+):
     download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
-    tossup_df = fetch_tossup_leaderboard(split, style)
-    bonus_df = fetch_bonus_leaderboard(split, style)
-    overall_df = fetch_overall_leaderboard(split, style)
+    try:
+        username = profile and profile.username
+    except Exception:
+        # If the user is not logged in, profile will be None
+        username = None
+    tossup_df = fetch_tossup_leaderboard(split, style, date, username)
+    bonus_df = fetch_bonus_leaderboard(split, style, date, username)
+    overall_df = fetch_overall_leaderboard(split, style, date, username)
     return tossup_df, bonus_df, overall_df
 
 
-def create_leaderboard_interface(app, split: str = "tiny_eval"):
+def create_leaderboard_interface(app, refresh_btn, split: str = "tiny_eval", date: datetime.date = None):
     leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
-    refresh_btn = gr.Button("🔄 Refresh")
 
-    tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=False)
+    tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=False, date=date)
 
-    logger.info(f"Tossup dataframe columns: {tossup_df.columns}")
-    tossup_leaderboard = Leaderboard(
+    tossup_leaderboard = gr.Dataframe(
         value=tossup_df,
+        show_search=True,
+        label=" 🛎️ Tossup Round Leaderboard",
+        show_label=True,
         datatype=["str", "number", "number", "number", "number"],
         elem_id="tossup-table",
         interactive=False,  # Ensure it's not interactive
     )
 
-    gr.Markdown("")
-
-    gr.Markdown("## 🤔 Bonus Round Leaderboard")
     logger.info(f"Bonus dataframe columns: {bonus_df.columns}")
-    bonus_leaderboard = Leaderboard(
+    bonus_leaderboard = gr.Dataframe(
         value=bonus_df,
+        show_search=True,
+        label=" 🧐 Bonus Round Leaderboard",
+        show_label=True,
         datatype=["str", "number", "number", "number", "number", "number", "number"],
         elem_id="bonus-table",
         interactive=False,  # Ensure it's not interactive
     )
 
-    overall_leaderboard = Leaderboard(
+    overall_leaderboard = gr.Dataframe(
         value=overall_df,
+        show_search=True,
+        label=" 🥇 Overall Leaderboard",
+        show_label=True,
         datatype=["str", "str", "str", "number", "number", "number", "number", "number"],
     )
 
     gr.on(
         triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
         fn=refresh_leaderboard,
-        inputs=[gr.State(split)],
+        inputs=[gr.State(split), gr.State(True), gr.State(date)],
        outputs=[tossup_leaderboard, bonus_leaderboard, overall_leaderboard],
     )
 
 
 with gr.Blocks(css=custom_css) as demo:
     gr.HTML(TITLE)
+    with gr.Row():
+        with gr.Column(scale=5):
+            gr.Markdown(
+                f"## 📋 Register [here]({REGISTRATION_URL}) to participate in our [Human-AI Cooperative Trivia Competition]({COMPETITION_URL}).\n"
+                f"## 🎲 Create and submit your quizbowl AI agents at our [submission site]({SUBMISSION_URL}).",
+                elem_classes="welcome-text",
+            )
+            logged_note = gr.Markdown(
+                "## 👉 **Note:** <span style='background-color: lightblue; padding: 10px; margin:4px'>Rows in blue with **(*)**</span> are your submissions past the cutoff date and are only visible to you.",
+                visible=False,
+            )
+
+        with gr.Column(scale=2):
+            beautify_date = datetime.strptime(CUTOFF_DATES["Week 2"], "%Y-%m-%d").strftime("%B %d, %Y")
+            gr.Markdown(f"## 📅 Next Cutoff Date: <span style='color:crimson'>{beautify_date}</span>")
+            gr.LoginButton("Login to privately view your scores on past weeks.")
+            refresh_btn = gr.Button("🔄 Refresh")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         for i, (name, split) in enumerate(EVAL_SPLITS.items()):
             with gr.TabItem(f"🏅 {name}", elem_id="llm-benchmark-tab-table", id=i):
                 leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
-                create_leaderboard_interface(demo, split)
+                cutoff_date = CUTOFF_DATES[name]
+                date = datetime.strptime(cutoff_date, "%Y-%m-%d").date()
+                create_leaderboard_interface(demo, refresh_btn, split, date)
 
         # Add the Metrics Guide tab
         with gr.TabItem("📊 Metrics Guide", elem_id="metrics-guide-tab"):
             gr.Markdown(load_metrics_manual())
 
+    def check_user_logged_in(x: gr.OAuthProfile):
+        return gr.update(visible=x is not None)
+
+    demo.load(check_user_logged_in, outputs=[logged_note])
+
+
 # scheduler = BackgroundScheduler()
 # scheduler.add_job(restart_space, "interval", seconds=1800)
 # scheduler.start()
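The refresh wiring above fans three triggers (timer tick, manual button, page load) into one callback, with gr.State pinning the per-tab arguments. A minimal standalone sketch of the same pattern follows; everything except the gradio API itself is illustrative:

import gradio as gr
import pandas as pd


def fetch_scores(split: str):
    # Stand-in for refresh_leaderboard(); a real app would re-download results here.
    return pd.DataFrame({"Submission": ["alice/model"], "Score": [0.5]})


with gr.Blocks() as demo:
    refresh_btn = gr.Button("🔄 Refresh")
    timer = gr.Timer(60)  # seconds, playing the role of LEADERBOARD_REFRESH_INTERVAL
    table = gr.Dataframe(value=fetch_scores("tiny_eval"), interactive=False)
    gr.on(
        triggers=[timer.tick, refresh_btn.click, demo.load],  # any of the three re-fetches
        fn=fetch_scores,
        inputs=[gr.State("tiny_eval")],  # gr.State freezes the split for this table
        outputs=[table],
    )

demo.launch()

Note that the commit hoists refresh_btn out of create_leaderboard_interface into the shared header row, so one button now drives the gr.on listeners of every tab.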
src/display/css_html_js.py
CHANGED

@@ -46,6 +46,10 @@ table th:first-child {
     white-space: nowrap;
 }
 
+.header-row .label p {
+    font-size: 20px !important;
+}
+
 .table td .cell-wrap span {
     white-space: pre;
 }
src/envs.py
CHANGED

@@ -16,7 +16,11 @@ QUEUE_REPO = f"{OWNER}/advcal-requests"
 RESULTS_REPO = f"{OWNER}/advcal-results"
 LLM_CACHE_REPO = f"{OWNER}/advcal-llm-cache"
 USERS_REPO = f"{OWNER}/registered-users"
+
+ADMIN_USERS = ["mgor"]
 EVAL_SPLITS = {"Week 1": "w1_eval", "Week 0": "tiny_eval"}
+CUTOFF_DATES = {"Week 1": "2025-05-30", "Week 0": "2025-05-23", "Week 2": "2025-06-07"}
+
 
 # Important Links
 QANTA_WEBSITE_URL = "https://sites.google.com/view/qanta/home"
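CUTOFF_DATES feeds two call sites in app.py: the header banner (re-formatted for display) and the per-tab filter (parsed to a date for comparison). A quick sketch of that round-trip:

from datetime import datetime

CUTOFF_DATES = {"Week 1": "2025-05-30", "Week 0": "2025-05-23", "Week 2": "2025-06-07"}

cutoff = datetime.strptime(CUTOFF_DATES["Week 2"], "%Y-%m-%d")
print(cutoff.strftime("%B %d, %Y"))  # "June 07, 2025" -- the banner string
print(cutoff.date())                 # datetime.date(2025, 6, 7) -- compared against submission dates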
src/hf_dataset_utils.py
CHANGED

@@ -14,8 +14,7 @@ def download_dataset_snapshot(repo_id, local_dir):
             tqdm_class=None,
         )
     except Exception as e:
-        logger.error(f"Error downloading dataset snapshot from {repo_id} to {local_dir}: {e}")
-        api.restart_space(repo_id=repo_id)
+        logger.error(f"Error downloading dataset snapshot from {repo_id} to {local_dir}: {e}")
 
 
 def remove_files_from_dataset_repo(repo_id: str, path_patterns: list[str], commit_message: str = "Remove files"):
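From the context lines, download_dataset_snapshot appears to wrap huggingface_hub's snapshot_download and, after this commit, logs failures instead of restarting the Space. A sketch of the full function under that assumption (the arguments outside the hunk are guessed):

from huggingface_hub import snapshot_download
from loguru import logger


def download_dataset_snapshot(repo_id, local_dir):
    try:
        snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",  # assumed; only tqdm_class=None is visible in the hunk
            local_dir=local_dir,
            tqdm_class=None,  # suppress the per-file progress bars
        )
    except Exception as e:
        logger.error(f"Error downloading dataset snapshot from {repo_id} to {local_dir}: {e}")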
src/populate.py
CHANGED

@@ -1,13 +1,14 @@
 # This file is kept for reference only and is not used in the enhanced implementation
 # The actual implementation is in enhanced_leaderboard.py
 
+import datetime
 import json
 import os
 
 import pandas as pd
 from loguru import logger
 
-from src.envs import EVAL_RESULTS_PATH
+from src.envs import ADMIN_USERS, EVAL_RESULTS_PATH
 
 
 def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
@@ -29,7 +30,27 @@ def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
     return model_results
 
 
-def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+def get_submission_date(result: dict) -> datetime.date:
+    submission_id = result["id"]
+    datetime_str = submission_id.split("__")[-3]
+    # str format is YYYYMMDD_HHMMSS in UTC. Convert to eastern time date
+    datetime_obj = datetime.datetime.strptime(datetime_str, "%Y%m%d_%H%M%S")
+    return datetime_obj.astimezone(datetime.timezone(datetime.timedelta(hours=-5))).date()
+
+
+def qualify_for_private_observation(username: str, logged_in_username: str | None) -> bool:
+    if not logged_in_username:
+        return False
+    if logged_in_username in ADMIN_USERS:
+        return True
+    if logged_in_username == username:
+        return True
+    return False
+
+
+def get_tossups_leaderboard_df(
+    repo_dir: str, eval_split: str, cutoff_date: datetime.date = None, logged_in_username: str = None
+) -> pd.DataFrame:
     model_results = fetch_model_results(repo_dir, "tossup", eval_split)
 
     eval_results = []
@@ -38,9 +59,14 @@ def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
         metrics = result["metrics"]
         username = result["username"]
         model_name = result["model_name"]
+        submission_name = f"{username}/{model_name}"
+        if cutoff_date and cutoff_date < get_submission_date(result):
+            if not qualify_for_private_observation(username, logged_in_username):
+                continue
+            submission_name = f"{username}/{model_name} (*)"
 
         row = {
-            "Submission": f"{username}/{model_name}",
+            "Submission": submission_name,
             "Expected Score ⬆️": metrics["expected_score"],
             "Buzz Precision": metrics["buzz_accuracy"],
             "Buzz Frequency": metrics["buzz_frequency"],
@@ -67,7 +93,9 @@ def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
     return df
 
 
-def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+def get_bonuses_leaderboard_df(
+    repo_dir: str, eval_split: str, cutoff_date: datetime.date = None, logged_in_username: str = None
+) -> pd.DataFrame:
     model_results = fetch_model_results(repo_dir, "bonus", eval_split)
 
     eval_results = []
@@ -76,9 +104,14 @@ def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
         metrics = result["metrics"]
         username = result["username"]
         model_name = result["model_name"]
+        submission_name = f"{username}/{model_name}"
+        if cutoff_date and cutoff_date < get_submission_date(result):
+            if not qualify_for_private_observation(username, logged_in_username):
+                continue
+            submission_name = f"{username}/{model_name} (*)"
 
         row = {
-            "Submission": f"{username}/{model_name}",
+            "Submission": submission_name,
             "Effect ⬆️": metrics["effectiveness"],
             "Part Acc": metrics["part_accuracy"],
             "Question Acc": metrics["question_accuracy"],
@@ -94,7 +127,7 @@ def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
         eval_results,
         columns=["Submission", "Effect ⬆️", "Part Acc", "Question Acc", "Calibration", "Adoption"],
     )
-    df.sort_values(by="Effect ⬆️", ascending=False, inplace=True)
+    df.sort_values(by=["Effect ⬆️", "Question Acc", "Part Acc"], ascending=False, inplace=True)
     return df
 
 
@@ -105,36 +138,68 @@ def colour_pos_neg(v):
     return "color: green;" if v > 0 else "color: red;"
 
 
-def fetch_tossup_leaderboard(split: str = "tiny_eval", style: bool = True):
-    df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split)
+# Helper function to bold the highest value in a column
+def bold_max(s):
+    is_max = s == s.max()
+    return ["font-weight: bold" if v else "" for v in is_max]
+
+
+def highlight_private_row(row):
+    return ["background-color: lightblue" if row["Submission"].endswith("(*)") else "" for _ in row]
+
+
+def fetch_tossup_leaderboard(
+    split: str = "tiny_eval", style: bool = True, date: datetime.date = None, username: str = None
+):
+    df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)
 
     # Apply formatting and styling
+    styled_df = (
+        df.style.format(
+            {
+                "Expected Score ⬆️": "{:6.3f}",
+                "Buzz Precision": "{:>6.1%}",
+                "Buzz Position": "{:>6.1f}",
+                "Buzz Frequency": "{:>6.1%}",
+                "Win Rate w/ Humans": "{:>6.1%}",
+            }
+        )
+        .map(colour_pos_neg, subset=["Expected Score ⬆️"])
+        .apply(highlight_private_row, axis=1)
+        .apply(
+            bold_max,
+            subset=["Expected Score ⬆️", "Buzz Precision", "Buzz Position", "Win Rate w/ Humans"],
+            axis=0,
+        )
+    )
 
     return styled_df if style else df
 
 
-def fetch_bonus_leaderboard(split: str = "tiny_eval", style: bool = True):
-    df = get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, split)
+def fetch_bonus_leaderboard(
+    split: str = "tiny_eval", style: bool = True, date: datetime.date = None, username: str = None
+):
+    df = get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)
 
     # Apply formatting and styling
+    styled_df = (
+        df.style.format(
+            {
+                "Question Acc": "{:>6.1%}",
+                "Part Acc": "{:>6.1%}",
+                "Effect ⬆️": "{:6.3f}",
+                "Calibration": "{:>6.1%}",
+                "Adoption": "{:>6.1%}",
+            }
+        )
+        .map(colour_pos_neg, subset=["Effect ⬆️"])
+        .apply(highlight_private_row, axis=1)
+        .apply(
+            bold_max,
+            subset=["Effect ⬆️", "Question Acc", "Part Acc", "Calibration", "Adoption"],
+            axis=0,
+        )
+    )
 
     return styled_df if style else df
 
@@ -143,7 +208,10 @@ def fetch_bonus_leaderboard(split: str = "tiny_eval", style: bool = True):
 def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
     # Helper to extract username from 'Submission' (format: username/model_name)
     def extract_username(submission: str) -> str:
-        return submission.split("/", 1)[0] if "/" in submission else submission
+        username = submission.split("/", 1)[0] if "/" in submission else submission
+        if submission.endswith(" (*)"):
+            username = username + " (*)"
+        return username
 
     # Add username columns
     tossup_df = tossup_df.copy()
@@ -189,21 +257,42 @@ def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame)
     return leaderboard.reset_index(drop=True)
 
 
-def fetch_overall_leaderboard(split: str = "tiny_eval", style: bool = True):
-    bonus_df = fetch_bonus_leaderboard(split, style=False)
-    tossup_df = fetch_tossup_leaderboard(split, style=False)
+def highlight_overall_row(row):
+    return ["background-color: lightblue" if row["Username"].endswith("(*)") else "" for _ in row]
+
+
+def fetch_overall_leaderboard(
+    split: str = "tiny_eval", style: bool = True, date: datetime.date = None, username: str = None
+):
+    bonus_df = fetch_bonus_leaderboard(split, style=False, date=date, username=username)
+    tossup_df = fetch_tossup_leaderboard(split, style=False, date=date, username=username)
     overall_df = create_overall_leaderboard(tossup_df, bonus_df)
 
     # Apply formatting and styling
+    styled_df = (
+        overall_df.style.format(
+            {
+                "Overall Score ⬆️": "{:6.3f}",
+                "Expected Score (Tossup) ⬆️": "{:6.3f}",
+                "Effect (Bonus) ⬆️": "{:6.3f}",
+                "Part Acc (Bonus)": "{:>6.1%}",
+                "Adoption (Bonus)": "{:>6.1%}",
+            },
+            na_rep="-",
+        )
+        .map(colour_pos_neg, subset=["Overall Score ⬆️"])
+        .apply(highlight_overall_row, axis=1)
+        .apply(
+            bold_max,
+            subset=[
+                "Overall Score ⬆️",
+                "Expected Score (Tossup) ⬆️",
+                "Effect (Bonus) ⬆️",
+                "Part Acc (Bonus)",
+                "Adoption (Bonus)",
+            ],
+            axis=0,
+        )
+    )
 
     return styled_df if style else overall_df
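The three fetch_* functions share one Styler recipe: number formats, green/red colouring on the headline metric, a full-row lightblue highlight for past-cutoff "(*)" submissions, and bold on each column's maximum. A toy run of the same chain (pandas >= 2.1 for Styler.map; column set trimmed):

import pandas as pd

df = pd.DataFrame(
    {
        "Submission": ["alice/model", "bob/model (*)"],
        "Expected Score ⬆️": [0.412, -0.130],
        "Buzz Precision": [0.91, 0.62],
    }
)


def colour_pos_neg(v):
    return "color: green;" if v > 0 else "color: red;"


def bold_max(s):
    return ["font-weight: bold" if v else "" for v in (s == s.max())]


def highlight_private_row(row):
    return ["background-color: lightblue" if row["Submission"].endswith("(*)") else "" for _ in row]


styled = (
    df.style.format({"Expected Score ⬆️": "{:6.3f}", "Buzz Precision": "{:>6.1%}"})
    .map(colour_pos_neg, subset=["Expected Score ⬆️"])  # cell-level colour
    .apply(highlight_private_row, axis=1)  # row-level highlight
    .apply(bold_max, subset=["Expected Score ⬆️", "Buzz Precision"], axis=0)  # column-level bold
)
print(styled.to_html())  # the app hands the Styler itself to gr.Dataframe, which renders this styling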