# This file is kept for reference only and is not used in the enhanced implementation.
# The actual implementation is in enhanced_leaderboard.py.
import datetime
import json
import os

import pandas as pd
from loguru import logger

from src.envs import ADMIN_USERS, EVAL_RESULTS_PATH

def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
    """Load every JSON result file for the given competition type and eval split."""
    model_results = []
    dirpath = os.path.join(repo_dir, competition_type, eval_split)
    for root, _, files in os.walk(dirpath):
        if len(files) == 0 or not all(f.endswith(".json") for f in files):
            continue
        for file in files:
            filepath = os.path.join(root, file)
            try:
                with open(filepath, "r") as fp:
                    result = json.load(fp)
                model_results.append(result)
            except Exception as e:
                logger.error(f"Error loading model result from {filepath}: {e}")
                continue
    return model_results
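
# Results are expected under <repo_dir>/<competition_type>/<eval_split>/, one JSON file per
# submission. The exact nesting below that prefix is an assumption, e.g.:
#   eval_results/tossup/tiny_eval/<username>/<submission_id>.json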

def get_submission_date(result: dict) -> datetime.date:
    """Parse the submission timestamp out of the result id and return its US Eastern date."""
    submission_id = result["id"]
    datetime_str = submission_id.split("__")[-3]
    # The timestamp is formatted as YYYYMMDD_HHMMSS in UTC; attach the UTC timezone explicitly
    # before converting to a US Eastern (UTC-5) date.
    datetime_obj = datetime.datetime.strptime(datetime_str, "%Y%m%d_%H%M%S").replace(tzinfo=datetime.timezone.utc)
    return datetime_obj.astimezone(datetime.timezone(datetime.timedelta(hours=-5))).date()
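    # Illustrative example (the exact id layout is an assumption): for an id whose third-from-last
    # "__"-separated segment is "20240131_235959", e.g.
    # "alice__my-model__20240131_235959__tiny_eval__tossup", the returned Eastern date is 2024-01-31.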

def qualify_for_private_observation(username: str, logged_in_username: str | None) -> bool:
    """Return True if the logged-in user may view private (post-cutoff) submissions of `username`."""
    if not logged_in_username:
        return False
    if logged_in_username in ADMIN_USERS:
        return True
    if logged_in_username == username:
        return True
    return False

def get_tossups_leaderboard_df(
    repo_dir: str,
    eval_split: str,
    cutoff_date: datetime.date | None = None,
    logged_in_username: str | None = None,
) -> pd.DataFrame:
    model_results = fetch_model_results(repo_dir, "tossup", eval_split)
    eval_results = []
    for result in model_results:
        # Read the identifying fields outside the try block so the error message below
        # can always reference them.
        username = result.get("username", "<unknown>")
        model_name = result.get("model_name", "<unknown>")
        try:
            metrics = result["metrics"]
            submission_name = f"{username}/{model_name}"
            if cutoff_date and cutoff_date < get_submission_date(result):
                # Submissions made after the cutoff are private: hide them unless the viewer
                # is an admin or the owner, and mark them with "(*)".
                if not qualify_for_private_observation(username, logged_in_username):
                    continue
                submission_name = f"{username}/{model_name} (*)"
            row = {
                "Submission": submission_name,
                "Expected Score ⬆️": metrics["expected_score"],
                "Buzz Precision": metrics["buzz_accuracy"],
                "Buzz Frequency": metrics["buzz_frequency"],
                "Buzz Position": metrics["buzz_position"],
                "Win Rate w/ Humans": metrics.get("human_win_rate", None),
            }
            eval_results.append(row)
        except Exception as e:
            logger.error(f"Error processing model result '{username}/{model_name}': {e}")
            continue
    df = pd.DataFrame(
        eval_results,
        columns=[
            "Submission",
            "Expected Score ⬆️",
            "Buzz Precision",
            "Buzz Frequency",
            "Buzz Position",
            "Win Rate w/ Humans",
        ],
    )
    df.sort_values(by="Expected Score ⬆️", ascending=False, inplace=True)
    return df

def get_bonuses_leaderboard_df(
    repo_dir: str,
    eval_split: str,
    cutoff_date: datetime.date | None = None,
    logged_in_username: str | None = None,
) -> pd.DataFrame:
    model_results = fetch_model_results(repo_dir, "bonus", eval_split)
    eval_results = []
    for result in model_results:
        # Read the identifying fields outside the try block so the error message below
        # can always reference them.
        username = result.get("username", "<unknown>")
        model_name = result.get("model_name", "<unknown>")
        try:
            metrics = result["metrics"]
            submission_name = f"{username}/{model_name}"
            if cutoff_date and cutoff_date < get_submission_date(result):
                if not qualify_for_private_observation(username, logged_in_username):
                    continue
                submission_name = f"{username}/{model_name} (*)"
            row = {
                "Submission": submission_name,
                "Effect ⬆️": metrics["effectiveness"],
                "Part Acc": metrics["part_accuracy"],
                "Question Acc": metrics["question_accuracy"],
                "Calibration": metrics["calibration"],
                "Adoption": metrics["adoption"],
            }
            eval_results.append(row)
        except Exception as e:
            logger.error(f"Error processing model result '{username}/{model_name}': {e}")
            continue
    df = pd.DataFrame(
        eval_results,
        columns=["Submission", "Effect ⬆️", "Part Acc", "Question Acc", "Calibration", "Adoption"],
    )
    df.sort_values(by=["Effect ⬆️", "Question Acc", "Part Acc"], ascending=False, inplace=True)
    return df

def colour_pos_neg(v):
    """Return a CSS colour rule for a single cell: green for positive values, red otherwise."""
    if pd.isna(v):  # keep NaNs unstyled
        return ""
    return "color: green;" if v > 0 else "color: red;"


# Helper function to bold the highest value in a column
def bold_max(s):
    is_max = s == s.max()
    return ["font-weight: bold" if v else "" for v in is_max]


# Highlight rows whose submission is marked "(*)", i.e. private post-cutoff submissions.
def highlight_private_row(row):
    return ["background-color: lightblue" if row["Submission"].endswith("(*)") else "" for _ in row]

def fetch_tossup_leaderboard(
    split: str = "tiny_eval", style: bool = True, date: datetime.date | None = None, username: str | None = None
):
    df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)

    # Apply formatting and styling
    styled_df = (
        df.style.format(
            {
                "Expected Score ⬆️": "{:6.3f}",
                "Buzz Precision": "{:>6.1%}",
                "Buzz Position": "{:>6.1f}",
                "Buzz Frequency": "{:>6.1%}",
                "Win Rate w/ Humans": "{:>6.1%}",
            }
        )
        .map(colour_pos_neg, subset=["Expected Score ⬆️"])
        .apply(highlight_private_row, axis=1)
        .apply(
            bold_max,
            subset=["Expected Score ⬆️", "Buzz Precision", "Buzz Position", "Win Rate w/ Humans"],
            axis=0,
        )
    )
    return styled_df if style else df

def fetch_bonus_leaderboard(
    split: str = "tiny_eval", style: bool = True, date: datetime.date | None = None, username: str | None = None
):
    df = get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)

    # Apply formatting and styling
    styled_df = (
        df.style.format(
            {
                "Question Acc": "{:>6.1%}",
                "Part Acc": "{:>6.1%}",
                "Effect ⬆️": "{:6.3f}",
                "Calibration": "{:>6.1%}",
                "Adoption": "{:>6.1%}",
            }
        )
        .map(colour_pos_neg, subset=["Effect ⬆️"])
        .apply(highlight_private_row, axis=1)
        .apply(
            bold_max,
            subset=["Effect ⬆️", "Question Acc", "Part Acc", "Calibration", "Adoption"],
            axis=0,
        )
    )
    return styled_df if style else df

# TODO: Implement this once we have the proxy server running.
def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
    # Helper to extract username from 'Submission' (format: username/model_name)
    def extract_username(submission: str) -> str:
        username = submission.split("/", 1)[0] if "/" in submission else submission
        if submission.endswith(" (*)"):
            username = username + " (*)"
        return username
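
    # The helper above maps, for example (names are hypothetical):
    #   "alice/awesome-model"     -> "alice"
    #   "alice/awesome-model (*)" -> "alice (*)"   (private, post-cutoff submission)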
    # Add username columns
    tossup_df = tossup_df.copy()
    tossup_df["Username"] = tossup_df["Submission"].apply(extract_username)
    bonus_df = bonus_df.copy()
    bonus_df["Username"] = bonus_df["Submission"].apply(extract_username)

    # Pick best tossup per user (highest Expected Score ⬆️)
    tossup_best = tossup_df.sort_values("Expected Score ⬆️", ascending=False).drop_duplicates("Username")
    tossup_best = tossup_best.set_index("Username")

    # Pick best bonus per user (highest Effect ⬆️)
    bonus_best = bonus_df.sort_values("Effect ⬆️", ascending=False).drop_duplicates("Username")
    bonus_best = bonus_best.set_index("Username")

    # Merge on Username (outer join to include users who have only one submission type)
    merged = pd.merge(
        tossup_best,
        bonus_best,
        left_index=True,
        right_index=True,
        how="outer",
        suffixes=("_tossup", "_bonus"),
    )

    # Compose a summary row per user.
    # Columns: Username, Tossup Submission, Bonus Submission, and the key metrics from both.
    leaderboard = pd.DataFrame(
        {
            "Username": merged.index,
            "Tossup Submission": merged["Submission_tossup"].str.split("/").str[1],
            "Bonus Submission": merged["Submission_bonus"].str.split("/").str[1],
            "Overall Score ⬆️": merged[["Expected Score ⬆️", "Effect ⬆️"]].fillna(0).sum(axis=1),
            "Expected Score (Tossup) ⬆️": merged["Expected Score ⬆️"],
            "Effect (Bonus) ⬆️": merged["Effect ⬆️"],
            "Part Acc (Bonus)": merged["Part Acc"],
            "Adoption (Bonus)": merged["Adoption"],
        }
    )
    leaderboard = leaderboard.sort_values("Overall Score ⬆️", ascending=False)
    return leaderboard.reset_index(drop=True)

def highlight_overall_row(row):
    return ["background-color: lightblue" if row["Username"].endswith("(*)") else "" for _ in row]

def fetch_overall_leaderboard(
    split: str = "tiny_eval", style: bool = True, date: datetime.date | None = None, username: str | None = None
):
    bonus_df = fetch_bonus_leaderboard(split, style=False, date=date, username=username)
    tossup_df = fetch_tossup_leaderboard(split, style=False, date=date, username=username)
    overall_df = create_overall_leaderboard(tossup_df, bonus_df)

    # Apply formatting and styling
    styled_df = (
        overall_df.style.format(
            {
                "Overall Score ⬆️": "{:6.3f}",
                "Expected Score (Tossup) ⬆️": "{:6.3f}",
                "Effect (Bonus) ⬆️": "{:6.3f}",
                "Part Acc (Bonus)": "{:>6.1%}",
                "Adoption (Bonus)": "{:>6.1%}",
            },
            na_rep="-",
        )
        .map(colour_pos_neg, subset=["Overall Score ⬆️"])
        .apply(highlight_overall_row, axis=1)
        .apply(
            bold_max,
            subset=[
                "Overall Score ⬆️",
                "Expected Score (Tossup) ⬆️",
                "Effect (Bonus) ⬆️",
                "Part Acc (Bonus)",
                "Adoption (Bonus)",
            ],
            axis=0,
        )
    )
    return styled_df if style else overall_df
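
# Illustrative usage sketch (not part of the original module): it assumes the results repo at
# EVAL_RESULTS_PATH has been populated and simply prints the unstyled leaderboards for the
# default "tiny_eval" split.
if __name__ == "__main__":
    # style=False returns plain DataFrames instead of pandas Stylers.
    print(fetch_tossup_leaderboard(style=False))
    print(fetch_bonus_leaderboard(style=False))
    print(fetch_overall_leaderboard(style=False))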