terryyz committed on
Commit 95a116e · 1 Parent(s): 062800e
Files changed (1)
  1. elo_calculation.py +27 -13
elo_calculation.py CHANGED
@@ -12,6 +12,9 @@ from sklearn.linear_model import LogisticRegression
 import yaml
 import os
 
+# Minimum number of votes required for a model to be included in rankings
+MIN_VOTES_THRESHOLD = 100
+
 
 def load_model_metadata():
     """Load model metadata from api_config.yaml"""
@@ -42,6 +45,10 @@ def load_model_metadata():
 
 def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
     """Compute Elo ratings using Bradley-Terry Model with Maximum Likelihood Estimation"""
+
+    # Get all unique models to ensure consistent indexing
+    all_models = pd.Index(sorted(set(df["model_a"].unique()) | set(df["model_b"].unique())))
+
     ptbl_a_win = pd.pivot_table(
         df[df["winner"] == "model_a"],
         index="model_a",
@@ -49,9 +56,12 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
         aggfunc="size",
         fill_value=0,
     )
+    # Reindex to include all models
+    ptbl_a_win = ptbl_a_win.reindex(index=all_models, columns=all_models, fill_value=0)
+
     # if no tie, create a zero matrix
     if sum(df["winner"].isin(["tie", "tie (bothbad)"])) == 0:
-        ptbl_tie = pd.DataFrame(0, index=ptbl_a_win.index, columns=ptbl_a_win.columns)
+        ptbl_tie = pd.DataFrame(0, index=all_models, columns=all_models)
     else:
         ptbl_tie = pd.pivot_table(
             df[df["winner"].isin(["tie", "tie (bothbad)"])],
@@ -60,6 +70,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
             aggfunc="size",
             fill_value=0,
         )
+        ptbl_tie = ptbl_tie.reindex(index=all_models, columns=all_models, fill_value=0)
         ptbl_tie = ptbl_tie + ptbl_tie.T
 
     ptbl_b_win = pd.pivot_table(
@@ -69,6 +80,8 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
         aggfunc="size",
         fill_value=0,
     )
+    ptbl_b_win = ptbl_b_win.reindex(index=all_models, columns=all_models, fill_value=0)
+
     ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie
 
     models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)
@@ -102,8 +115,6 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
     lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
     lr.fit(X, Y, sample_weight=sample_weights)
     elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-    if "mixtral-8x7b-instruct-v0.1" in models.index:
-        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
     return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
 
 
@@ -161,8 +172,8 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
     confidence_intervals = {}  # Initialize to avoid uninitialized variable error
 
     # Check if we have sufficient data for Bradley-Terry model
-    # Since we only display models with >= 10 votes, we need enough battles
-    if len(battles_df) < 10:
+    # Since we only display models with >= MIN_VOTES_THRESHOLD votes, we need enough battles
+    if len(battles_df) < MIN_VOTES_THRESHOLD:
         # Not enough battles for reliable ranking
         all_models = set(
             battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
@@ -175,7 +186,7 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
     elo_ratings = compute_mle_elo(battles_df)
 
     # Calculate confidence intervals using bootstrap
-    if len(battles_df) >= 10:  # Only calculate CI if we have enough data
+    if len(battles_df) >= MIN_VOTES_THRESHOLD:  # Only calculate CI if we have enough data
         try:
             np.random.seed(42)
             bootstrap_df = get_bootstrap_result(
@@ -184,6 +195,11 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
 
             # Calculate 95% confidence intervals
             if not bootstrap_df.empty:
+                # Initialize CI for all models first
+                for model in elo_ratings.index:
+                    confidence_intervals[model] = 0
+
+                # Update with bootstrap results
                 for model in bootstrap_df.columns:
                     scores = bootstrap_df[model].dropna()
                     if len(scores) > 0:
@@ -192,8 +208,6 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
                         median_score = scores.median()
                         ci_margin = (upper - lower) / 2
                         confidence_intervals[model] = ci_margin
-                    else:
-                        confidence_intervals[model] = 0
             else:
                 # Fallback: no confidence intervals
                 for model in elo_ratings.index:
@@ -216,7 +230,7 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
 def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
     """
     Create ranking DataFrame with all necessary columns
-    Only includes models with at least 10 battles
+    Only includes models with at least MIN_VOTES_THRESHOLD battles
 
     Args:
         elo_ratings (pd.Series): Elo ratings for each model
@@ -225,17 +239,17 @@ def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
 
     Returns:
         pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
-        Empty DataFrame if no models have >= 10 votes
+        Empty DataFrame if no models have >= MIN_VOTES_THRESHOLD votes
     """
     # Load model metadata
     metadata = load_model_metadata()
 
     # Create ranking list with Elo ratings and confidence intervals
-    # Only include models with at least 10 battles
+    # Only include models with at least MIN_VOTES_THRESHOLD battles
    ranking_list = []
     for model in elo_ratings.index:
-        # Skip models with fewer than 10 votes
-        if vote_counts.get(model, 0) < 10:
+        # Skip models with fewer than MIN_VOTES_THRESHOLD votes
+        if vote_counts.get(model, 0) < MIN_VOTES_THRESHOLD:
             continue
 
         ci_margin = confidence_intervals.get(model, 0)
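Review note on the reindexing change: each pivot table only covers the models that actually appear in its own subset of battles, so with sparse data the a-win, b-win, and tie tables can come out with different shapes; summing them then aligns on partial indexes and fills the win matrix with NaN. The sketch below is a minimal, self-contained illustration of that failure mode and of the commit's fix, using made-up toy battles (the data and names `a`, `b`, `c` are illustrative only, not from the repository):

import pandas as pd

# Three toy battles; "c" never wins as model_a, so the raw pivot tables disagree in shape
df = pd.DataFrame(
    {
        "model_a": ["a", "a", "c"],
        "model_b": ["b", "c", "b"],
        "winner": ["model_a", "model_a", "model_b"],
    }
)

# Same pivot construction as compute_mle_elo
ptbl_a_win = pd.pivot_table(
    df[df["winner"] == "model_a"],
    index="model_a", columns="model_b", aggfunc="size", fill_value=0,
)
ptbl_b_win = pd.pivot_table(
    df[df["winner"] == "model_b"],
    index="model_a", columns="model_b", aggfunc="size", fill_value=0,
)
print(ptbl_a_win.shape, ptbl_b_win.shape)  # (1, 2) (1, 1): shapes disagree

# Without reindexing, pandas aligns the sum on partial indexes -> NaNs
bad = ptbl_a_win * 2 + ptbl_b_win.T * 2
print(bad.isna().any().any())  # True: the combined win matrix is unusable

# The commit's fix: reindex every table to the union of all models
all_models = pd.Index(sorted(set(df["model_a"]) | set(df["model_b"])))
ptbl_a_win = ptbl_a_win.reindex(index=all_models, columns=all_models, fill_value=0)
ptbl_b_win = ptbl_b_win.reindex(index=all_models, columns=all_models, fill_value=0)
ptbl_tie = pd.DataFrame(0, index=all_models, columns=all_models)

ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie
print(ptbl_win.shape, ptbl_win.isna().any().any())  # (3, 3) False

The same alignment concern motivates building the zero tie matrix over `all_models` instead of `ptbl_a_win`'s index, which would omit any model that never wins as `model_a`. Separately, note that removing the `mixtral-8x7b-instruct-v0.1` anchor means scores are no longer pinned to 1114 for that model; ratings now float around `INIT_RATING` as determined by the fit alone.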