# leaderboard/src/populate.py
# This file is kept for reference only and is not used in the enhanced implementation
# The actual implementation is in enhanced_leaderboard.py
import datetime
import json
import os
import pandas as pd
from loguru import logger
from src.envs import ADMIN_USERS, EVAL_RESULTS_PATH
def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
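    """Collect every result JSON under <repo_dir>/<competition_type>/<eval_split>/.

    Directories containing any non-JSON file are skipped entirely; files that
    fail to parse are logged and ignored.
    """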
model_results = []
dirpath = os.path.join(repo_dir, competition_type, eval_split)
for root, _, files in os.walk(dirpath):
if len(files) == 0 or not all(f.endswith(".json") for f in files):
continue
for file in files:
filepath = os.path.join(root, file)
try:
with open(filepath, "r") as fp:
result = json.load(fp)
model_results.append(result)
except Exception as e:
logger.error(f"Error loading model result from {filepath}: {e}")
continue
return model_results
def get_submission_date(result: dict) -> datetime.date:
submission_id = result["id"]
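    # The id is assumed to look like "user__model__20240131_235959__tiny_eval__0"
    # (hypothetical example); the timestamp is the third "__"-separated field
    # from the end.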
datetime_str = submission_id.split("__")[-3]
    # The timestamp is formatted as YYYYMMDD_HHMMSS in UTC. Attach the UTC
    # timezone explicitly: strptime yields a naive datetime, which astimezone()
    # would otherwise interpret in the server's local timezone.
    datetime_obj = datetime.datetime.strptime(datetime_str, "%Y%m%d_%H%M%S").replace(tzinfo=datetime.timezone.utc)
    # Eastern time is approximated as a fixed UTC-5 offset (no DST handling).
    return datetime_obj.astimezone(datetime.timezone(datetime.timedelta(hours=-5))).date()
def qualify_for_private_observation(username: str, logged_in_username: str | None) -> bool:
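    """Return True if the viewer may see a private (post-deadline) submission:
    admins see everything, and logged-in users see their own."""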
if not logged_in_username:
return False
if logged_in_username in ADMIN_USERS:
return True
if logged_in_username == username:
return True
return False
def get_tossups_leaderboard_df(
    repo_dir: str, eval_split: str, cutoff_date: datetime.date | None = None, logged_in_username: str | None = None
) -> pd.DataFrame:
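    """Build the tossup leaderboard as a DataFrame sorted by expected score.

    Submissions dated after cutoff_date are hidden unless the viewer is an admin
    or the submission's owner, in which case they appear with a "(*)" suffix.
    """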
model_results = fetch_model_results(repo_dir, "tossup", eval_split)
eval_results = []
for result in model_results:
try:
metrics = result["metrics"]
username = result["username"]
model_name = result["model_name"]
submission_name = f"{username}/{model_name}"
if cutoff_date and cutoff_date < get_submission_date(result):
if not qualify_for_private_observation(username, logged_in_username):
continue
submission_name = f"{username}/{model_name} (*)"
row = {
"Submission": submission_name,
"Expected Score ⬆️": metrics["expected_score"],
"Buzz Precision": metrics["buzz_accuracy"],
"Buzz Frequency": metrics["buzz_frequency"],
"Buzz Position": metrics["buzz_position"],
"Win Rate w/ Humans": metrics.get("human_win_rate", None),
}
eval_results.append(row)
        except Exception as e:
            # Use .get(): the exception may have fired before username/model_name
            # were bound, in which case referencing them would raise a NameError.
            logger.error(f"Error processing model result '{result.get('username')}/{result.get('model_name')}': {e}")
            continue
df = pd.DataFrame(
eval_results,
columns=[
"Submission",
"Expected Score ⬆️",
"Buzz Precision",
"Buzz Frequency",
"Buzz Position",
"Win Rate w/ Humans",
],
)
df.sort_values(by="Expected Score ⬆️", ascending=False, inplace=True)
return df
def get_bonuses_leaderboard_df(
    repo_dir: str, eval_split: str, cutoff_date: datetime.date | None = None, logged_in_username: str | None = None
) -> pd.DataFrame:
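    """Build the bonus leaderboard, sorted by effectiveness, then question and
    part accuracy. Post-cutoff submissions follow the same visibility rules as
    the tossup board: hidden from other users, marked "(*)" for admins and the
    owner."""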
model_results = fetch_model_results(repo_dir, "bonus", eval_split)
eval_results = []
for result in model_results:
try:
metrics = result["metrics"]
username = result["username"]
model_name = result["model_name"]
submission_name = f"{username}/{model_name}"
if cutoff_date and cutoff_date < get_submission_date(result):
if not qualify_for_private_observation(username, logged_in_username):
continue
submission_name = f"{username}/{model_name} (*)"
row = {
"Submission": submission_name,
"Effect ⬆️": metrics["effectiveness"],
"Part Acc": metrics["part_accuracy"],
"Question Acc": metrics["question_accuracy"],
"Calibration": metrics["calibration"],
"Adoption": metrics["adoption"],
}
eval_results.append(row)
        except Exception as e:
            # Use .get(): the exception may have fired before username/model_name
            # were bound, in which case referencing them would raise a NameError.
            logger.error(f"Error processing model result '{result.get('username')}/{result.get('model_name')}': {e}")
            continue
df = pd.DataFrame(
eval_results,
columns=["Submission", "Effect ⬆️", "Part Acc", "Question Acc", "Calibration", "Adoption"],
)
df.sort_values(by=["Effect ⬆️", "Question Acc", "Part Acc"], ascending=False, inplace=True)
return df
def colour_pos_neg(v):
"""Return a CSS rule for the cell that called the function."""
if pd.isna(v): # keep NaNs unstyled
return ""
return "color: green;" if v > 0 else "color: red;"
# Helper function to bold the highest value in a column
def bold_max(s):
is_max = s == s.max()
return ["font-weight: bold" if v else "" for v in is_max]
def highlight_private_row(row):
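    """Shade an entire row lightblue when its submission is private (suffixed with "(*)")."""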
return ["background-color: lightblue" if row["Submission"].endswith("(*)") else "" for _ in row]
def fetch_tossup_leaderboard(
split: str = "tiny_eval", style: bool = True, date: datetime.date = None, username: str = None
):
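    """Load the tossup leaderboard; when style=True, return a pandas Styler with
    score/percentage formatting, green/red colouring of the expected score,
    private-row highlighting, and per-column bolding of the best value."""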
df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)
# Apply formatting and styling
styled_df = (
df.style.format(
{
"Expected Score ⬆️": "{:6.3f}",
"Buzz Precision": "{:>6.1%}",
"Buzz Position": "{:>6.1f}",
"Buzz Frequency": "{:>6.1%}",
"Win Rate w/ Humans": "{:>6.1%}",
}
)
.map(colour_pos_neg, subset=["Expected Score ⬆️"])
.apply(highlight_private_row, axis=1)
.apply(
bold_max,
subset=["Expected Score ⬆️", "Buzz Precision", "Buzz Position", "Win Rate w/ Humans"],
axis=0,
)
)
return styled_df if style else df
def fetch_bonus_leaderboard(
split: str = "tiny_eval", style: bool = True, date: datetime.date = None, username: str = None
):
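    """Load the bonus leaderboard; when style=True, apply the same formatting,
    colouring, highlighting, and column-max bolding as the tossup board."""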
df = get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)
# Apply formatting and styling
styled_df = (
df.style.format(
{
"Question Acc": "{:>6.1%}",
"Part Acc": "{:>6.1%}",
"Effect ⬆️": "{:6.3f}",
"Calibration": "{:>6.1%}",
"Adoption": "{:>6.1%}",
}
)
.map(colour_pos_neg, subset=["Effect ⬆️"])
.apply(highlight_private_row, axis=1)
.apply(
bold_max,
subset=["Effect ⬆️", "Question Acc", "Part Acc", "Calibration", "Adoption"],
axis=0,
)
)
return styled_df if style else df
# TODO: Implement this once we have the proxy server running.
def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
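    """Combine each user's best tossup and best bonus submission into one board.

    The overall score is the sum of the best tossup "Expected Score" and the
    best bonus "Effect", treating a missing entry for either question type as 0.
    """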
# Helper to extract username from 'Submission' (format: username/model_name)
def extract_username(submission: str) -> str:
username = submission.split("/", 1)[0] if "/" in submission else submission
if submission.endswith(" (*)"):
username = username + " (*)"
return username
# Add username columns
tossup_df = tossup_df.copy()
tossup_df["Username"] = tossup_df["Submission"].apply(extract_username)
bonus_df = bonus_df.copy()
bonus_df["Username"] = bonus_df["Submission"].apply(extract_username)
# Pick best tossup per user (highest Expected Score ⬆️)
tossup_best = tossup_df.sort_values("Expected Score ⬆️", ascending=False).drop_duplicates("Username")
tossup_best = tossup_best.set_index("Username")
# Pick best bonus per user (highest Effect ⬆️)
bonus_best = bonus_df.sort_values("Effect ⬆️", ascending=False).drop_duplicates("Username")
bonus_best = bonus_best.set_index("Username")
# Merge on Username (outer join to include users who have only one type)
merged = pd.merge(
tossup_best,
bonus_best,
left_index=True,
right_index=True,
how="outer",
suffixes=("_tossup", "_bonus"),
)
# Compose a summary row per user
# Columns: Username, Tossup Submission, Bonus Submission, all metrics from both
leaderboard = pd.DataFrame(
{
"Username": merged.index,
"Tossup Submission": merged["Submission_tossup"].str.split("/").str[1],
"Bonus Submission": merged["Submission_bonus"].str.split("/").str[1],
"Overall Score ⬆️": merged[["Expected Score ⬆️", "Effect ⬆️"]].fillna(0).sum(axis=1),
"Expected Score (Tossup) ⬆️": merged["Expected Score ⬆️"],
"Effect (Bonus) ⬆️": merged["Effect ⬆️"],
"Part Acc (Bonus)": merged["Part Acc"],
"Adoption (Bonus)": merged["Adoption"],
}
)
leaderboard = leaderboard.sort_values("Overall Score ⬆️", ascending=False)
return leaderboard.reset_index(drop=True)
def highlight_overall_row(row):
return ["background-color: lightblue" if row["Username"].endswith("(*)") else "" for _ in row]
def fetch_overall_leaderboard(
split: str = "tiny_eval", style: bool = True, date: datetime.date = None, username: str = None
):
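    """Build and optionally style the combined leaderboard from the unstyled
    tossup and bonus boards for the given split, date cutoff, and viewer."""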
bonus_df = fetch_bonus_leaderboard(split, style=False, date=date, username=username)
tossup_df = fetch_tossup_leaderboard(split, style=False, date=date, username=username)
overall_df = create_overall_leaderboard(tossup_df, bonus_df)
# Apply formatting and styling
styled_df = (
overall_df.style.format(
{
"Overall Score ⬆️": "{:6.3f}",
"Expected Score (Tossup) ⬆️": "{:6.3f}",
"Effect (Bonus) ⬆️": "{:6.3f}",
"Part Acc (Bonus)": "{:>6.1%}",
"Adoption (Bonus)": "{:>6.1%}",
},
na_rep="-",
)
.map(colour_pos_neg, subset=["Overall Score ⬆️"])
.apply(highlight_overall_row, axis=1)
.apply(
bold_max,
subset=[
"Overall Score ⬆️",
"Expected Score (Tossup) ⬆️",
"Effect (Bonus) ⬆️",
"Part Acc (Bonus)",
"Adoption (Bonus)",
],
axis=0,
)
)
return styled_df if style else overall_df
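
# A minimal usage sketch, not part of the original module: assumes
# EVAL_RESULTS_PATH points at a local sync of the results repo and that a
# "tiny_eval" split exists.
if __name__ == "__main__":
    # style=False returns plain DataFrames instead of pandas Stylers.
    print(fetch_tossup_leaderboard(split="tiny_eval", style=False).head())
    print(fetch_bonus_leaderboard(split="tiny_eval", style=False).head())
    print(fetch_overall_leaderboard(split="tiny_eval", style=False).head())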