Maharshi Gor committed
Commit 4a9e506 · 1 Parent(s): 85c36d8

Leaderboard UI upgrade and Week deadline update

Files changed (5)
  1. app.py +62 -26
  2. src/display/css_html_js.py +4 -0
  3. src/envs.py +4 -0
  4. src/hf_dataset_utils.py +1 -2
  5. src/populate.py +131 -42
app.py CHANGED
@@ -1,3 +1,6 @@
+import sys
+from datetime import datetime
+
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -13,6 +16,7 @@ from src.display.css_html_js import custom_css
 from src.envs import (
     API,
     COMPETITION_URL,
+    CUTOFF_DATES,
     EVAL_RESULTS_PATH,
     EVAL_SPLITS,
     LEADERBOARD_REFRESH_INTERVAL,
@@ -29,6 +33,9 @@ from src.populate import (
     fetch_tossup_leaderboard,
 )
 
+logger.remove()
+logger.add(sys.stderr, level="INFO", backtrace=True, diagnose=False)
+
 
 # Load metrics manual content
 def load_metrics_manual():
@@ -58,75 +65,104 @@ except Exception:
     restart_space()
 
 
-def refresh_leaderboard(split: str = "tiny_eval", style: bool = True):
+def refresh_leaderboard(
+    split: str = "tiny_eval",
+    style: bool = True,
+    date: datetime.date = None,
+    profile: gr.OAuthProfile = None,
+):
     download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
-    tossup_df = fetch_tossup_leaderboard(split, style)
-    bonus_df = fetch_bonus_leaderboard(split, style)
-    overall_df = fetch_overall_leaderboard(split, style)
+    try:
+        username = profile and profile.username
+    except Exception:
+        # If the user is not logged in, profile will be None
+        username = None
+    tossup_df = fetch_tossup_leaderboard(split, style, date, username)
+    bonus_df = fetch_bonus_leaderboard(split, style, date, username)
+    overall_df = fetch_overall_leaderboard(split, style, date, username)
     return tossup_df, bonus_df, overall_df
 
 
-def create_leaderboard_interface(app, split: str = "tiny_eval"):
+def create_leaderboard_interface(app, refresh_btn, split: str = "tiny_eval", date: datetime.date = None):
     leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
-    refresh_btn = gr.Button("🔄 Refresh")
 
-    tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=False)
+    tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=False, date=date)
 
-    gr.Markdown("## 🛎️ Tossup Round Leaderboard")
-    logger.info(f"Tossup dataframe columns: {tossup_df.columns}")
-    tossup_leaderboard = Leaderboard(
+    tossup_leaderboard = gr.Dataframe(
         value=tossup_df,
-        search_columns=["Submission"],
+        show_search=True,
+        label=" 🛎️ Tossup Round Leaderboard",
+        show_label=True,
         datatype=["str", "number", "number", "number", "number"],
         elem_id="tossup-table",
         interactive=False,  # Ensure it's not interactive
     )
 
-    gr.Markdown("")
-
-    gr.Markdown("## 🤔 Bonus Round Leaderboard")
     logger.info(f"Bonus dataframe columns: {bonus_df.columns}")
-    bonus_leaderboard = Leaderboard(
+    bonus_leaderboard = gr.Dataframe(
         value=bonus_df,
-        search_columns=["Submission"],
+        show_search=True,
+        label=" 🧐 Bonus Round Leaderboard",
+        show_label=True,
         datatype=["str", "number", "number", "number", "number", "number", "number"],
         elem_id="bonus-table",
         interactive=False,  # Ensure it's not interactive
     )
 
-    gr.Markdown("## 🥇 Overall Leaderboard")
-    overall_leaderboard = Leaderboard(
+    overall_leaderboard = gr.Dataframe(
         value=overall_df,
-        search_columns=["Username", "Tossup Submission", "Bonus Submission"],
+        show_search=True,
+        label=" 🥇 Overall Leaderboard",
+        show_label=True,
         datatype=["str", "str", "str", "number", "number", "number", "number", "number"],
     )
 
     gr.on(
         triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
         fn=refresh_leaderboard,
-        inputs=[gr.State(split)],
+        inputs=[gr.State(split), gr.State(True), gr.State(date)],
        outputs=[tossup_leaderboard, bonus_leaderboard, overall_leaderboard],
    )
 
 
 with gr.Blocks(css=custom_css) as demo:
     gr.HTML(TITLE)
-    gr.Markdown(
-        f"## 📋 Register [here]({REGISTRATION_URL}) to participate in our [Human-AI Cooperative Trivia Competition]({COMPETITION_URL}).\n"
-        f"## 🎲 Create and submit your quizbowl AI agents at our [submission site]({SUBMISSION_URL}).",
-        elem_classes="welcome-text",
-    )
+    with gr.Row():
+        with gr.Column(scale=5):
+            gr.Markdown(
+                f"## 📋 Register [here]({REGISTRATION_URL}) to participate in our [Human-AI Cooperative Trivia Competition]({COMPETITION_URL}).\n"
+                f"## 🎲 Create and submit your quizbowl AI agents at our [submission site]({SUBMISSION_URL}).",
+                elem_classes="welcome-text",
+            )
+            logged_note = gr.Markdown(
+                "## 👉 **Note:** <span style='background-color: lightblue; padding: 10px; margin:4px'>Rows in blue with **(*)**</span> are your submissions past the cutoff date and are only visible to you.",
+                visible=False,
+            )
+
+        with gr.Column(scale=2):
+            beautify_date = datetime.strptime(CUTOFF_DATES["Week 2"], "%Y-%m-%d").strftime("%B %d, %Y")
+            gr.Markdown(f"## 📅 Next Cutoff Date: &nbsp;&nbsp; <span style='color:crimson'>{beautify_date}</span>")
+            gr.LoginButton("Login to privately view your scores on past weeks.")
+            refresh_btn = gr.Button("🔄 Refresh")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         for i, (name, split) in enumerate(EVAL_SPLITS.items()):
             with gr.TabItem(f"🏅 {name}", elem_id="llm-benchmark-tab-table", id=i):
                 leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
-                create_leaderboard_interface(demo, split)
+                cutoff_date = CUTOFF_DATES[name]
+                date = datetime.strptime(cutoff_date, "%Y-%m-%d").date()
+                create_leaderboard_interface(demo, refresh_btn, split, date)
 
         # Add the Metrics Guide tab
         with gr.TabItem("📊 Metrics Guide", elem_id="metrics-guide-tab"):
             gr.Markdown(load_metrics_manual())
 
+    def check_user_logged_in(x: gr.OAuthProfile):
+        return gr.update(visible=x is not None)
+
+    demo.load(check_user_logged_in, outputs=[logged_note])
+
+
 # scheduler = BackgroundScheduler()
 # scheduler.add_job(restart_space, "interval", seconds=1800)
 # scheduler.start()
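
For background on how the new `refresh_leaderboard` signature obtains a username: Gradio injects the logged-in user's OAuth profile into any event handler that declares a parameter typed `gr.OAuthProfile` (and passes `None` for anonymous visitors), which is the same mechanism `check_user_logged_in` relies on. A minimal, self-contained sketch of that pattern (illustrative names, not part of this commit; assumes a Space with OAuth enabled):

import gradio as gr


def greet(profile: gr.OAuthProfile | None) -> str:
    # Gradio fills `profile` automatically; it is None when nobody is logged in.
    if profile is None:
        return "Not logged in."
    return f"Hello, {profile.username}!"


with gr.Blocks() as demo:
    gr.LoginButton()
    status = gr.Markdown()
    # No input component is wired for `profile`; Gradio injects it on load.
    demo.load(greet, inputs=None, outputs=[status])

demo.launch()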
src/display/css_html_js.py CHANGED
@@ -46,6 +46,10 @@ table th:first-child {
     white-space: nowrap;
 }
 
+.header-row .label p {
+    font-size: 20px !important;
+}
+
 .table td .cell-wrap span {
     white-space: pre;
 }
src/envs.py CHANGED
@@ -16,7 +16,11 @@ QUEUE_REPO = f"{OWNER}/advcal-requests"
 RESULTS_REPO = f"{OWNER}/advcal-results"
 LLM_CACHE_REPO = f"{OWNER}/advcal-llm-cache"
 USERS_REPO = f"{OWNER}/registered-users"
+
+ADMIN_USERS = ["mgor"]
 EVAL_SPLITS = {"Week 1": "w1_eval", "Week 0": "tiny_eval"}
+CUTOFF_DATES = {"Week 1": "2025-05-30", "Week 0": "2025-05-23", "Week 2": "2025-06-07"}
+
 
 # Important Links
 QANTA_WEBSITE_URL = "https://sites.google.com/view/qanta/home"
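
Since `app.py` now builds one tab per entry of `EVAL_SPLITS` and looks up `CUTOFF_DATES[name]` for each of them (plus the extra "Week 2" entry used by the banner), every split name needs a matching cutoff string in `%Y-%m-%d` form. An illustrative sanity check of that assumption, parsing the dates the same way `app.py` does:

from datetime import datetime

EVAL_SPLITS = {"Week 1": "w1_eval", "Week 0": "tiny_eval"}
CUTOFF_DATES = {"Week 1": "2025-05-30", "Week 0": "2025-05-23", "Week 2": "2025-06-07"}

for name in EVAL_SPLITS:
    # Every leaderboard tab needs a cutoff date it can parse.
    assert name in CUTOFF_DATES, f"Missing cutoff date for split {name!r}"
    print(name, "->", datetime.strptime(CUTOFF_DATES[name], "%Y-%m-%d").date())

# The banner formats the upcoming cutoff for display, e.g. "June 07, 2025":
print(datetime.strptime(CUTOFF_DATES["Week 2"], "%Y-%m-%d").strftime("%B %d, %Y"))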
src/hf_dataset_utils.py CHANGED
@@ -14,8 +14,7 @@ def download_dataset_snapshot(repo_id, local_dir):
             tqdm_class=None,
         )
     except Exception as e:
-        logger.error(f"Error downloading dataset snapshot from {repo_id} to {local_dir}: {e}. Restarting space.")
-        api.restart_space(repo_id=repo_id)
+        logger.error(f"Error downloading dataset snapshot from {repo_id} to {local_dir}: {e}")
 
 
 def remove_files_from_dataset_repo(repo_id: str, path_patterns: list[str], commit_message: str = "Remove files"):
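
For context, the surrounding helper (only its `except` branch changes in this commit) roughly has the shape below; apart from `tqdm_class=None`, which appears as a context line in the diff, the other arguments are assumptions about how the snapshot is fetched with `huggingface_hub`:

from huggingface_hub import snapshot_download
from loguru import logger


def download_dataset_snapshot(repo_id: str, local_dir: str) -> None:
    try:
        snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",  # assumption: results live in a dataset repo
            local_dir=local_dir,
            tqdm_class=None,  # matches the context line shown in the diff
        )
    except Exception as e:
        # After this commit, a failed refresh is only logged; the Space is no longer restarted.
        logger.error(f"Error downloading dataset snapshot from {repo_id} to {local_dir}: {e}")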
src/populate.py CHANGED
@@ -1,13 +1,14 @@
 # This file is kept for reference only and is not used in the enhanced implementation
 # The actual implementation is in enhanced_leaderboard.py
 
+import datetime
 import json
 import os
 
 import pandas as pd
 from loguru import logger
 
-from src.envs import EVAL_RESULTS_PATH
+from src.envs import ADMIN_USERS, EVAL_RESULTS_PATH
 
 
 def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
@@ -29,7 +30,27 @@ def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
     return model_results
 
 
-def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+def get_submission_date(result: dict) -> datetime.date:
+    submission_id = result["id"]
+    datetime_str = submission_id.split("__")[-3]
+    # str format is YYYYMMDD_HHMMSS in UTC. Convert to eastern time date
+    datetime_obj = datetime.datetime.strptime(datetime_str, "%Y%m%d_%H%M%S")
+    return datetime_obj.astimezone(datetime.timezone(datetime.timedelta(hours=-5))).date()
+
+
+def qualify_for_private_observation(username: str, logged_in_username: str | None) -> bool:
+    if not logged_in_username:
+        return False
+    if logged_in_username in ADMIN_USERS:
+        return True
+    if logged_in_username == username:
+        return True
+    return False
+
+
+def get_tossups_leaderboard_df(
+    repo_dir: str, eval_split: str, cutoff_date: datetime.date = None, logged_in_username: str = None
+) -> pd.DataFrame:
     model_results = fetch_model_results(repo_dir, "tossup", eval_split)
 
     eval_results = []
@@ -38,9 +59,14 @@ def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
         metrics = result["metrics"]
         username = result["username"]
         model_name = result["model_name"]
+        submission_name = f"{username}/{model_name}"
+        if cutoff_date and cutoff_date < get_submission_date(result):
+            if not qualify_for_private_observation(username, logged_in_username):
+                continue
+            submission_name = f"{username}/{model_name} (*)"
 
         row = {
-            "Submission": f"{username}/{model_name}",
+            "Submission": submission_name,
             "Expected Score ⬆️": metrics["expected_score"],
             "Buzz Precision": metrics["buzz_accuracy"],
             "Buzz Frequency": metrics["buzz_frequency"],
@@ -67,7 +93,9 @@ def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
     return df
 
 
-def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
+def get_bonuses_leaderboard_df(
+    repo_dir: str, eval_split: str, cutoff_date: datetime.date = None, logged_in_username: str = None
+) -> pd.DataFrame:
     model_results = fetch_model_results(repo_dir, "bonus", eval_split)
 
     eval_results = []
@@ -76,9 +104,14 @@ def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
         metrics = result["metrics"]
         username = result["username"]
         model_name = result["model_name"]
+        submission_name = f"{username}/{model_name}"
+        if cutoff_date and cutoff_date < get_submission_date(result):
+            if not qualify_for_private_observation(username, logged_in_username):
+                continue
+            submission_name = f"{username}/{model_name} (*)"
 
         row = {
-            "Submission": f"{username}/{model_name}",
+            "Submission": submission_name,
             "Effect ⬆️": metrics["effectiveness"],
             "Part Acc": metrics["part_accuracy"],
             "Question Acc": metrics["question_accuracy"],
@@ -94,7 +127,7 @@ def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
         eval_results,
         columns=["Submission", "Effect ⬆️", "Part Acc", "Question Acc", "Calibration", "Adoption"],
     )
-    df.sort_values(by="Effect ⬆️", ascending=False, inplace=True)
+    df.sort_values(by=["Effect ⬆️", "Question Acc", "Part Acc"], ascending=False, inplace=True)
     return df
 
 
@@ -105,36 +138,68 @@ def colour_pos_neg(v):
     return "color: green;" if v > 0 else "color: red;"
 
 
-def fetch_tossup_leaderboard(split: str = "tiny_eval", style: bool = True):
-    df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split)
+# Helper function to bold the highest value in a column
+def bold_max(s):
+    is_max = s == s.max()
+    return ["font-weight: bold" if v else "" for v in is_max]
+
+
+def highlight_private_row(row):
+    return ["background-color: lightblue" if row["Submission"].endswith("(*)") else "" for _ in row]
+
+
+def fetch_tossup_leaderboard(
+    split: str = "tiny_eval", style: bool = True, date: datetime.date = None, username: str = None
+):
+    df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)
 
     # Apply formatting and styling
-    styled_df = df.style.format(
-        {
-            "Expected Score ⬆️": "{:5.2f}",
-            "Buzz Precision": "{:>6.1%}",
-            "Buzz Position": "{:>6.1f}",
-            "Buzz Frequency": "{:>6.1%}",
-            "Win Rate w/ Humans": "{:>6.1%}",
-        }
-    ).map(colour_pos_neg, subset=["Expected Score ⬆️"])
+    styled_df = (
+        df.style.format(
+            {
+                "Expected Score ⬆️": "{:6.3f}",
+                "Buzz Precision": "{:>6.1%}",
+                "Buzz Position": "{:>6.1f}",
+                "Buzz Frequency": "{:>6.1%}",
+                "Win Rate w/ Humans": "{:>6.1%}",
+            }
+        )
+        .map(colour_pos_neg, subset=["Expected Score ⬆️"])
+        .apply(highlight_private_row, axis=1)
+        .apply(
+            bold_max,
+            subset=["Expected Score ⬆️", "Buzz Precision", "Buzz Position", "Win Rate w/ Humans"],
+            axis=0,
+        )
+    )
 
     return styled_df if style else df
 
 
-def fetch_bonus_leaderboard(split: str = "tiny_eval", style: bool = True):
-    df = get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, split)
+def fetch_bonus_leaderboard(
+    split: str = "tiny_eval", style: bool = True, date: datetime.date = None, username: str = None
+):
+    df = get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)
 
     # Apply formatting and styling
-    styled_df = df.style.format(
-        {
-            "Question Acc": "{:>6.1%}",
-            "Part Acc": "{:>6.1%}",
-            "Effect ⬆️": "{:5.2f}",
-            "Calibration": "{:>6.1%}",
-            "Adoption": "{:>6.1%}",
-        }
-    ).map(colour_pos_neg, subset=["Effect ⬆️"])
+    styled_df = (
+        df.style.format(
+            {
+                "Question Acc": "{:>6.1%}",
+                "Part Acc": "{:>6.1%}",
+                "Effect ⬆️": "{:6.3f}",
+                "Calibration": "{:>6.1%}",
+                "Adoption": "{:>6.1%}",
+            }
+        )
+        .map(colour_pos_neg, subset=["Effect ⬆️"])
+        .apply(highlight_private_row, axis=1)
+        .apply(
+            bold_max,
+            subset=["Effect ⬆️", "Question Acc", "Part Acc", "Calibration", "Adoption"],
+            axis=0,
+        )
+    )
 
     return styled_df if style else df
 
@@ -143,7 +208,10 @@ def fetch_bonus_leaderboard(split: str = "tiny_eval", style: bool = True):
 def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
     # Helper to extract username from 'Submission' (format: username/model_name)
     def extract_username(submission: str) -> str:
-        return submission.split("/", 1)[0] if "/" in submission else submission
+        username = submission.split("/", 1)[0] if "/" in submission else submission
+        if submission.endswith(" (*)"):
+            username = username + " (*)"
+        return username
 
     # Add username columns
     tossup_df = tossup_df.copy()
@@ -189,21 +257,42 @@ def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame)
     return leaderboard.reset_index(drop=True)
 
 
-def fetch_overall_leaderboard(split: str = "tiny_eval", style: bool = True):
-    bonus_df = fetch_bonus_leaderboard(split, style=False)
-    tossup_df = fetch_tossup_leaderboard(split, style=False)
+def highlight_overall_row(row):
+    return ["background-color: lightblue" if row["Username"].endswith("(*)") else "" for _ in row]
+
+
+def fetch_overall_leaderboard(
+    split: str = "tiny_eval", style: bool = True, date: datetime.date = None, username: str = None
+):
+    bonus_df = fetch_bonus_leaderboard(split, style=False, date=date, username=username)
+    tossup_df = fetch_tossup_leaderboard(split, style=False, date=date, username=username)
     overall_df = create_overall_leaderboard(tossup_df, bonus_df)
 
     # Apply formatting and styling
-    styled_df = overall_df.style.format(
-        {
-            "Overall Score ⬆️": "{:5.2f}",
-            "Expected Score (Tossup) ⬆️": "{:5.2f}",
-            "Effect (Bonus) ⬆️": "{:5.2f}",
-            "Part Acc (Bonus)": "{:>6.1%}",
-            "Adoption (Bonus)": "{:>6.1%}",
-        },
-        na_rep="-",
-    ).map(colour_pos_neg, subset=["Overall Score ⬆️"])
+    styled_df = (
+        overall_df.style.format(
+            {
+                "Overall Score ⬆️": "{:6.3f}",
+                "Expected Score (Tossup) ⬆️": "{:6.3f}",
+                "Effect (Bonus) ⬆️": "{:6.3f}",
+                "Part Acc (Bonus)": "{:>6.1%}",
+                "Adoption (Bonus)": "{:>6.1%}",
+            },
+            na_rep="-",
+        )
+        .map(colour_pos_neg, subset=["Overall Score ⬆️"])
+        .apply(highlight_overall_row, axis=1)
+        .apply(
+            bold_max,
+            subset=[
+                "Overall Score ⬆️",
+                "Expected Score (Tossup) ⬆️",
+                "Effect (Bonus) ⬆️",
+                "Part Acc (Bonus)",
+                "Adoption (Bonus)",
+            ],
+            axis=0,
+        )
+    )
 
     return styled_df if style else overall_df
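
The styling helpers compose through pandas' `Styler`: `.map` colours individual cells, and each `.apply` returns a list of CSS strings per row (`axis=1`) or per column (`axis=0`). A toy illustration of the same chaining used by `fetch_tossup_leaderboard` (hypothetical data; note `Styler.map` requires pandas ≥ 2.1, where it replaced `applymap`):

import pandas as pd

df = pd.DataFrame(
    {
        "Submission": ["alice/agent-1", "bob/agent-2 (*)"],
        "Expected Score ⬆️": [0.42, -0.10],
    }
)


def colour_pos_neg(v):
    return "color: green;" if v > 0 else "color: red;"


def highlight_private_row(row):
    return ["background-color: lightblue" if row["Submission"].endswith("(*)") else "" for _ in row]


def bold_max(s):
    is_max = s == s.max()
    return ["font-weight: bold" if v else "" for v in is_max]


styled = (
    df.style.format({"Expected Score ⬆️": "{:6.3f}"})
    .map(colour_pos_neg, subset=["Expected Score ⬆️"])  # per-cell text colour
    .apply(highlight_private_row, axis=1)  # per-row background for "(*)" rows
    .apply(bold_max, subset=["Expected Score ⬆️"], axis=0)  # bold the column maximum
)
print(styled.to_html())  # in the app, the Styler itself is passed as the gr.Dataframe value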