Commit 025f1f3 by Maharshi Gor · Parent: ec7e710

Cost information and model-model comparison

Files changed (4):
  1. app.py +17 -5
  2. src/display/css_html_js.py +5 -0
  3. src/envs.py +1 -1
  4. src/populate.py +68 -48
app.py CHANGED
@@ -80,31 +80,43 @@ def refresh_leaderboard(
     tossup_df = fetch_tossup_leaderboard(split, style, date, username)
     bonus_df = fetch_bonus_leaderboard(split, style, date, username)
     overall_df = fetch_overall_leaderboard(split, style, date, username)
+
     return tossup_df, bonus_df, overall_df


 def create_leaderboard_interface(app, refresh_btn, split: str = "tiny_eval", date: datetime.date = None):
     leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)

-    tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=False, date=date)
+    tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=True, date=date)

+    gr.HTML(
+        "<div style='font-size: 18px;'>"
+        "ℹ️ <b>E [Score]</b> is the <b>Expected Score</b> for a question. 🙋🏻 and 🤖 indicate the scores against just the Human and the AI players respectively.<br>"
+        "ℹ️ <b>Cost</b> is the cost in USD of executing the pipeline <b>per question prefix</b>. (Typically we have up to ~20 prefixes per tossup question.)<br>"
+        "ℹ️ <b>When does the cost matter?</b> When two models buzz at the same token, which they often do, the lighter (more cost-effective) model takes precedence.<br>"
+        "</div>"
+    )
     tossup_leaderboard = gr.Dataframe(
         value=tossup_df,
         show_search=True,
         label=" 🛎️ Tossup Round Leaderboard",
         show_label=True,
-        datatype=["str", "number", "number", "number", "number"],
+        datatype=["str", "number", "number", "number", "number", "number", "number"],
         elem_id="tossup-table",
         interactive=False,  # Ensure it's not interactive
     )

-    logger.info(f"Bonus dataframe columns: {bonus_df.columns}")
+    gr.HTML(
+        "<div style='font-size: 18px;'>"
+        "ℹ️ <b>Cost for Bonus pipeline</b> is the cost in USD of executing the pipeline <b>per bonus part</b>. (We have exactly 3 parts per bonus question.)"
+        "</div>"
+    )
     bonus_leaderboard = gr.Dataframe(
         value=bonus_df,
         show_search=True,
         label=" 🧐 Bonus Round Leaderboard",
         show_label=True,
-        datatype=["str", "number", "number", "number", "number", "number", "number"],
+        datatype=["str", "number", "number", "number", "number", "number", "number", "number", "number"],
         elem_id="bonus-table",
         interactive=False,  # Ensure it's not interactive
     )
@@ -114,7 +126,7 @@ def create_leaderboard_interface(app, refresh_btn, split: str = "tiny_eval", dat
         show_search=True,
         label=" 🥇 Overall Leaderboard",
         show_label=True,
-        datatype=["str", "str", "str", "number", "number", "number", "number", "number"],
+        datatype=["str", "str", "str", "number", "number", "number", "number", "number", "number"],
     )

     gr.on(
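For readers of the new banner: the headline E [Score] in the tossup table is the average of the two per-opponent expected scores computed in src/populate.py below. A minimal worked example with made-up numbers:

# Hypothetical values, for illustration only.
human_expected = 0.42  # metrics["expected_score"]: expected score vs. human players
ai_expected = 0.30     # elo_results.get(submission_id, 0.0): expected score vs. AI players

# The combined score displayed as "E [Score] ⬆️" weighs both opponents equally.
overall_expected = 0.5 * (human_expected + ai_expected)
print(f"E [Score] = {overall_expected:.3f}")  # -> E [Score] = 0.360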
src/display/css_html_js.py CHANGED
@@ -102,6 +102,11 @@ table th:first-child {
 #box-filter > .form{
     border: 0
 }
+
+span.multiline.text[role="button"] {
+    font-size: 16px !important;
+}
+
 """

 get_window_url_params = """
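For context, a stylesheet string like this typically takes effect when passed to Gradio at app construction; a minimal sketch, assuming gr.Blocks(css=...) wiring (this repo's actual hookup may differ):

import gradio as gr

# Minimal sketch: custom CSS is handed to Blocks via the `css` parameter.
# The selector is the one added in this commit; the rest is assumed wiring.
custom_css = """
span.multiline.text[role="button"] {
    font-size: 16px !important;
}
"""

with gr.Blocks(css=custom_css) as demo:
    gr.Dataframe(value=[["example"]], interactive=False)

# demo.launch()  # serve locally to see the rule applied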
src/envs.py CHANGED
@@ -19,7 +19,7 @@ USERS_REPO = f"{OWNER}/registered-users"

 ADMIN_USERS = ["mgor"]
 EVAL_SPLITS = {"Week 2": "w2_eval", "Week 1": "w1_eval", "Week 0": "tiny_eval"}
-CUTOFF_DATES = {"Week 1": "2025-05-30", "Week 0": "2025-05-23", "Week 2": "2025-06-07"}
+CUTOFF_DATES = {"Week 1": "2025-05-30", "Week 0": "2025-05-23", "Week 2": "2025-06-10"}


 # Important Links
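The cutoff values are ISO date strings, and get_tossups_leaderboard_df (below) already accepts a cutoff_date: datetime.date, so a plausible bridge looks like this (a sketch only; the actual filtering helper is not shown in this diff):

import datetime

CUTOFF_DATES = {"Week 1": "2025-05-30", "Week 0": "2025-05-23", "Week 2": "2025-06-10"}

# Parse the ISO string once, then compare against each submission's date.
cutoff = datetime.date.fromisoformat(CUTOFF_DATES["Week 2"])

def within_cutoff(submission_date: datetime.date) -> bool:
    # Hypothetical helper: submissions on or before the cutoff count for the week.
    return submission_date <= cutoff

print(within_cutoff(datetime.date(2025, 6, 7)))   # True
print(within_cutoff(datetime.date(2025, 6, 11)))  # False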
src/populate.py CHANGED
@@ -18,6 +18,9 @@ def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -
         if len(files) == 0 or not all(f.endswith(".json") for f in files):
             continue
         for file in files:
+            # Check if the file name is a valid submission id
+            if not file.startswith(f"{competition_type}__"):
+                continue
             filepath = os.path.join(root, file)
             try:
                 with open(filepath, "r") as fp:
@@ -30,6 +33,15 @@ def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -
     return model_results


+def fetch_tossup_elo_results(repo_dir: str, eval_split: str) -> dict:
+    elo_results = {}
+    dirpath = os.path.join(repo_dir, "tossup", eval_split)
+    filepath = os.path.join(dirpath, "elo_results.json")
+    with open(filepath, "r") as fp:
+        elo_results = json.load(fp)
+    return elo_results
+
+
 def get_submission_date(result: dict) -> datetime.date:
     submission_id = result["id"]
     datetime_str = submission_id.split("__")[-3]
@@ -52,10 +64,12 @@ def get_tossups_leaderboard_df(
     repo_dir: str, eval_split: str, cutoff_date: datetime.date = None, logged_in_username: str = None
 ) -> pd.DataFrame:
     model_results = fetch_model_results(repo_dir, "tossup", eval_split)
+    elo_results = fetch_tossup_elo_results(repo_dir, eval_split)

     eval_results = []
     for result in model_results:
         try:
+            submission_id = result["id"]
             metrics = result["metrics"]
             username = result["username"]
             model_name = result["model_name"]
@@ -64,32 +78,26 @@ def get_tossups_leaderboard_df(
             if not qualify_for_private_observation(username, logged_in_username):
                 continue
             submission_name = f"{username}/{model_name} (*)"
-
+            e_score_ai = elo_results.get(submission_id, 0.0)
+            overall_expected_score = 0.5 * (metrics["expected_score"] + e_score_ai)
             row = {
                 "Submission": submission_name,
-                "Expected Score ⬆️": metrics["expected_score"],
-                "Buzz Precision": metrics["buzz_accuracy"],
-                "Buzz Frequency": metrics["buzz_frequency"],
+                "E [Score] ⬆️": overall_expected_score,
+                "E [Score] (🙋🏻)": metrics["expected_score"],
+                "E [Score] (🤖)": e_score_ai,
+                "Cost ⬇️": result["cost"],
+                "Buzz Prec.": metrics["buzz_accuracy"],
+                "Buzz Freq.": metrics["buzz_frequency"],
                 "Buzz Position": metrics["buzz_position"],
-                "Win Rate w/ Humans": metrics.get("human_win_rate", None),
+                "Win Rate w/ 🙋🏻": metrics.get("human_win_rate", None),
             }
             eval_results.append(row)
         except Exception as e:
-            logger.error(f"Error processing model result '{username}/{model_name}': {e}")
+            logger.error(f"Error processing model result for eval_split={eval_split} '{username}/{model_name}': {e}")
             continue

-    df = pd.DataFrame(
-        eval_results,
-        columns=[
-            "Submission",
-            "Expected Score ⬆️",
-            "Buzz Precision",
-            "Buzz Frequency",
-            "Buzz Position",
-            "Win Rate w/ Humans",
-        ],
-    )
-    df.sort_values(by="Expected Score ⬆️", ascending=False, inplace=True)
+    df = pd.DataFrame(eval_results)
+    df.sort_values(by="E [Score] ⬆️", ascending=False, inplace=True)
     return df


@@ -112,6 +120,7 @@ def get_bonuses_leaderboard_df(

             row = {
                 "Submission": submission_name,
+                "Cost ⬇️": result["cost"],
                 "Effect ⬆️": metrics["effectiveness"],
                 "Part Acc": metrics["part_accuracy"],
                 "Question Acc": metrics["question_accuracy"],
@@ -120,13 +129,10 @@ def get_bonuses_leaderboard_df(
             }
             eval_results.append(row)
         except Exception as e:
-            logger.error(f"Error processing model result '{username}/{model_name}': {e}")
+            logger.exception(f"Error processing model result '{username}/{model_name}': {e}")
             continue

-    df = pd.DataFrame(
-        eval_results,
-        columns=["Submission", "Effect ⬆️", "Part Acc", "Question Acc", "Calibration", "Adoption"],
-    )
+    df = pd.DataFrame(eval_results)
     df.sort_values(by=["Effect ⬆️", "Question Acc", "Part Acc"], ascending=False, inplace=True)
     return df

@@ -135,7 +141,24 @@ def colour_pos_neg(v):
     """Return a CSS rule for the cell that called the function."""
     if pd.isna(v):  # keep NaNs unstyled
         return ""
-    return "color: green;" if v > 0 else "color: red;"
+    return "color: green;" if float(v) > 0 else "color: red;"
+
+
+def color_cost(v):
+    if pd.isna(v):
+        return ""
+    # Bucket the cost into 5 categories with darker colors
+    cost = float(v)
+    if cost < 1:
+        return "color: #006400;"  # dark green
+    elif cost < 2:
+        return "color: #00008b;"  # dark blue
+    elif cost < 3:
+        return "color: #8b8b00;"  # dark yellow
+    elif cost < 4:
+        return "color: #8b4500;"  # dark orange
+    else:
+        return "color: #8b0000;"  # dark red


 # Helper function to bold the highest value in a column
@@ -154,21 +177,22 @@ def fetch_tossup_leaderboard(
     df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)

     # Apply formatting and styling
+    percent_cols = ["Buzz Prec.", "Buzz Freq.", "Win Rate w/ 🙋🏻"]
+    float_cols = ["E [Score] ⬆️", "E [Score] (🙋🏻)", "E [Score] (🤖)", "Buzz Position"]
     styled_df = (
         df.style.format(
             {
-                "Expected Score ⬆️": "{:6.3f}",
-                "Buzz Precision": "{:>6.1%}",
-                "Buzz Position": "{:>6.1f}",
-                "Buzz Frequency": "{:>6.1%}",
-                "Win Rate w/ Humans": "{:>6.1%}",
+                **dict.fromkeys(percent_cols, "{:>6.1%}"),
+                **dict.fromkeys(float_cols, "{:6.3f}"),
+                "Cost ⬇️": "${:,.2f}",
             }
         )
-        .map(colour_pos_neg, subset=["Expected Score ⬆️"])
+        .map(colour_pos_neg, subset=["E [Score] ⬆️", "E [Score] (🤖)", "E [Score] (🙋🏻)"])
+        .map(color_cost, subset=["Cost ⬇️"])
         .apply(highlight_private_row, axis=1)
         .apply(
             bold_max,
-            subset=["Expected Score ⬆️", "Buzz Precision", "Buzz Position", "Win Rate w/ Humans"],
+            subset=[*percent_cols, *float_cols],
             axis=0,
         )
     )
@@ -190,9 +214,11 @@ def fetch_bonus_leaderboard(
                 "Effect ⬆️": "{:6.3f}",
                 "Calibration": "{:>6.1%}",
                 "Adoption": "{:>6.1%}",
+                "Cost ⬇️": "${:,.2f}",
             }
         )
         .map(colour_pos_neg, subset=["Effect ⬆️"])
+        .map(color_cost, subset=["Cost ⬇️"])
         .apply(highlight_private_row, axis=1)
         .apply(
             bold_max,
@@ -220,7 +246,7 @@ def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame)
     bonus_df["Username"] = bonus_df["Submission"].apply(extract_username)

-    # Pick best tossup per user (highest Expected Score ⬆️)
-    tossup_best = tossup_df.sort_values("Expected Score ⬆️", ascending=False).drop_duplicates("Username")
+    # Pick best tossup per user (highest E [Score] ⬆️)
+    tossup_best = tossup_df.sort_values("E [Score] ⬆️", ascending=False).drop_duplicates("Username")
     tossup_best = tossup_best.set_index("Username")

     # Pick best bonus per user (highest Effect ⬆️)
@@ -244,11 +270,11 @@ def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame)
             "Username": merged.index,
             "Tossup Submission": merged["Submission_tossup"].str.split("/").str[1],
             "Bonus Submission": merged["Submission_bonus"].str.split("/").str[1],
-            "Overall Score ⬆️": merged[["Expected Score ⬆️", "Effect ⬆️"]].fillna(0).sum(axis=1),
-            "Expected Score (Tossup) ⬆️": merged["Expected Score ⬆️"],
-            "Effect (Bonus) ⬆️": merged["Effect ⬆️"],
-            "Part Acc (Bonus)": merged["Part Acc"],
-            "Adoption (Bonus)": merged["Adoption"],
+            "Overall Score ⬆️": merged[["E [Score] ⬆️", "Effect ⬆️"]].fillna(0).sum(axis=1),
+            "Tossup Score ⬆️": merged["E [Score] ⬆️"],
+            "Bonus Effect ⬆️": merged["Effect ⬆️"],
+            "Bonus Part Acc": merged["Part Acc"],
+            "Bonus Adoption": merged["Adoption"],
         }
     )

@@ -273,10 +299,10 @@ def fetch_overall_leaderboard(
         overall_df.style.format(
             {
                 "Overall Score ⬆️": "{:6.3f}",
-                "Expected Score (Tossup) ⬆️": "{:6.3f}",
-                "Effect (Bonus) ⬆️": "{:6.3f}",
-                "Part Acc (Bonus)": "{:>6.1%}",
-                "Adoption (Bonus)": "{:>6.1%}",
+                "Tossup Score ⬆️": "{:6.3f}",
+                "Bonus Effect ⬆️": "{:6.3f}",
+                "Bonus Part Acc": "{:>6.1%}",
+                "Bonus Adoption": "{:>6.1%}",
             },
             na_rep="-",
         )
@@ -284,13 +310,7 @@ def fetch_overall_leaderboard(
         .apply(highlight_overall_row, axis=1)
         .apply(
             bold_max,
-            subset=[
-                "Overall Score ⬆️",
-                "Expected Score (Tossup) ⬆️",
-                "Effect (Bonus) ⬆️",
-                "Part Acc (Bonus)",
-                "Adoption (Bonus)",
-            ],
+            subset=["Overall Score ⬆️", "Tossup Score ⬆️", "Bonus Effect ⬆️"],
             axis=0,
         )
     )
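fetch_tossup_elo_results gives the shape of elo_results.json only implicitly: since the caller does elo_results.get(submission_id, 0.0), the file must be a JSON object mapping submission ids to expected scores against the AI field. A round-trip sketch (the key is a made-up placeholder, not the real id format):

import json

# Assumed shape: {submission_id: expected_score_vs_ai}. The id below is
# illustrative only; real ids embed the competition type, timestamp, etc.
example = {"tossup__example-submission-id": 0.21}

with open("elo_results.json", "w") as fp:
    json.dump(example, fp)

with open("elo_results.json", "r") as fp:
    elo_results = json.load(fp)

print(elo_results.get("tossup__example-submission-id", 0.0))  # 0.21
print(elo_results.get("unknown-id", 0.0))                     # 0.0 fallback, as in the diff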
 
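And a quick self-contained check of the new cost styling: the loop below is equivalent to the if/elif chain in color_cost, applied through the same Styler.map call the diff uses.

import pandas as pd

def color_cost(v):
    # Same thresholds as the diff: <$1 green, <$2 blue, <$3 yellow, <$4 orange, else red.
    if pd.isna(v):
        return ""
    cost = float(v)
    for bound, colour in [(1, "#006400"), (2, "#00008b"), (3, "#8b8b00"), (4, "#8b4500")]:
        if cost < bound:
            return f"color: {colour};"
    return "color: #8b0000;"

df = pd.DataFrame({"Cost ⬇️": [0.40, 2.40, 5.10]})
styled = df.style.format({"Cost ⬇️": "${:,.2f}"}).map(color_cost, subset=["Cost ⬇️"])
print(styled.to_html())  # cells styled #006400, #8b8b00, #8b0000 respectively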