terryyz committed on
Commit 062800e · 1 Parent(s): dd89a8c
Files changed (1)
  1. elo_calculation.py +58 -107
elo_calculation.py CHANGED
@@ -42,127 +42,77 @@ def load_model_metadata():
 
 def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
     """Compute Elo ratings using Bradley-Terry Model with Maximum Likelihood Estimation"""
+    ptbl_a_win = pd.pivot_table(
+        df[df["winner"] == "model_a"],
+        index="model_a",
+        columns="model_b",
+        aggfunc="size",
+        fill_value=0,
+    )
+    # if no tie, create a zero matrix
+    if sum(df["winner"].isin(["tie", "tie (bothbad)"])) == 0:
+        ptbl_tie = pd.DataFrame(0, index=ptbl_a_win.index, columns=ptbl_a_win.columns)
+    else:
+        ptbl_tie = pd.pivot_table(
+            df[df["winner"].isin(["tie", "tie (bothbad)"])],
+            index="model_a",
+            columns="model_b",
+            aggfunc="size",
+            fill_value=0,
+        )
+        ptbl_tie = ptbl_tie + ptbl_tie.T
+
+    ptbl_b_win = pd.pivot_table(
+        df[df["winner"] == "model_b"],
+        index="model_a",
+        columns="model_b",
+        aggfunc="size",
+        fill_value=0,
+    )
+    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie
+
+    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)
 
-    # Get all unique models
-    all_models = sorted(list(set(df["model_a"].tolist() + df["model_b"].tolist())))
-
-    # Create win matrices for each outcome type
-    # Initialize empty matrices with float dtype to avoid warnings
-    ptbl_a_win = pd.DataFrame(0.0, index=all_models, columns=all_models)
-    ptbl_b_win = pd.DataFrame(0.0, index=all_models, columns=all_models)
-    ptbl_tie = pd.DataFrame(0.0, index=all_models, columns=all_models)
-
-    # Count wins for model_a
-    model_a_wins = df[df["winner"] == "model_a"]
-    if not model_a_wins.empty:
-        a_win_counts = model_a_wins.groupby(["model_a", "model_b"]).size()
-        for (model_a, model_b), count in a_win_counts.items():
-            ptbl_a_win.loc[model_a, model_b] = count
-
-    # Count wins for model_b
-    model_b_wins = df[df["winner"] == "model_b"]
-    if not model_b_wins.empty:
-        b_win_counts = model_b_wins.groupby(["model_a", "model_b"]).size()
-        for (model_a, model_b), count in b_win_counts.items():
-            ptbl_b_win.loc[model_a, model_b] = count
-
-    # Count ties
-    ties = df[df["winner"].isin(["tie", "tie (bothbad)"])]
-    if not ties.empty:
-        tie_counts = ties.groupby(["model_a", "model_b"]).size()
-        for (model_a, model_b), count in tie_counts.items():
-            # For ties, we count 0.5 win for each model
-            ptbl_tie.loc[model_a, model_b] = count * 0.5
-            ptbl_tie.loc[model_b, model_a] = count * 0.5
-
-    models = pd.Series(np.arange(len(all_models)), index=all_models)
     p = len(models)
-
-    # Create training data for logistic regression
-    X = []
-    Y = []
+    X = np.zeros([p * (p - 1) * 2, p])
+    Y = np.zeros(p * (p - 1) * 2)
+
+    cur_row = 0
     sample_weights = []
-
-    for model_a in all_models:
-        for model_b in all_models:
-            if model_a == model_b:
+    for m_a in ptbl_win.index:
+        for m_b in ptbl_win.columns:
+            if m_a == m_b:
                 continue
-
-            # Count total games between these models
-            a_wins = ptbl_a_win.loc[model_a, model_b]
-            b_wins = ptbl_b_win.loc[model_a, model_b]
-            ties = ptbl_tie.loc[model_a, model_b]
-
-            total_games = a_wins + b_wins + ties
-            if total_games == 0:
+            # if nan skip
+            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
                 continue
-
-            # Create feature vector: difference in model strengths
-            x = np.zeros(p)
-            x[models[model_a]] = 1.0
-            x[models[model_b]] = -1.0
-
-            # Add data points for model_a wins
-            if a_wins > 0:
-                X.append(x)
-                Y.append(1)  # model_a wins
-                sample_weights.append(a_wins)
-
-            # Add data points for model_b wins (model_a loses)
-            if b_wins > 0:
-                X.append(x)  # same feature vector
-                Y.append(0)  # model_a loses
-                sample_weights.append(b_wins)
-
-            # Add data points for ties - treat as half wins for model_a
-            if ties > 0:
-                # Add ties as both wins and losses with half weight each
-                X.append(x)
-                Y.append(1)  # model_a wins (tie counted as win)
-                sample_weights.append(ties / 2)
-
-                X.append(x)
-                Y.append(0)  # model_a loses (tie counted as loss)
-                sample_weights.append(ties / 2)
-
-    if len(X) == 0 or len(set(Y)) < 2:
-        # Not enough data or no variation in outcomes
-        return pd.Series({model: INIT_RATING for model in all_models}).sort_values(ascending=False)
-
-    X = np.array(X)
-    Y = np.array(Y)
-    sample_weights = np.array(sample_weights)
-
-    # Fit logistic regression
-    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6, max_iter=1000)
+            X[cur_row, models[m_a]] = +math.log(BASE)
+            X[cur_row, models[m_b]] = -math.log(BASE)
+            Y[cur_row] = 1.0
+            sample_weights.append(ptbl_win.loc[m_a, m_b])
+
+            X[cur_row + 1, models[m_a]] = math.log(BASE)
+            X[cur_row + 1, models[m_b]] = -math.log(BASE)
+            Y[cur_row + 1] = 0.0
+            sample_weights.append(ptbl_win.loc[m_b, m_a])
+            cur_row += 2
+    X = X[:cur_row]
+    Y = Y[:cur_row]
+
+    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
     lr.fit(X, Y, sample_weight=sample_weights)
-
-    # Convert coefficients to Elo ratings
     elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-
-
+    if "mixtral-8x7b-instruct-v0.1" in models.index:
+        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
     return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
 
 
-def get_bootstrap_result(battles, func_compute_elo, num_round=1000):
+def get_bootstrap_result(battles, func_compute_elo, num_round):
     """Get bootstrap results for confidence interval calculation"""
-
     rows = []
     for i in tqdm(range(num_round), desc="bootstrap"):
-        # Bootstrap sample with replacement
-        bootstrap_sample = battles.sample(frac=1.0, replace=True)
-        try:
-            elo_result = func_compute_elo(bootstrap_sample)
-            rows.append(elo_result)
-        except Exception as e:
-            # Skip failed bootstrap samples
-            continue
-
-    if not rows:
-        return pd.DataFrame()
-
+        rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True)))
    df = pd.DataFrame(rows)
-    # Sort columns by median Elo score (descending)
    return df[df.median().sort_values(ascending=False).index]
 
 
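The rewritten compute_mle_elo fits the Bradley-Terry model as a weighted logistic regression: ptbl_win stores two points per decisive win and one point per tie for every ordered pair, each surviving pair contributes one row labelled 1 and one labelled 0 weighted by ptbl_win.loc[m_a, m_b] and ptbl_win.loc[m_b, m_a], pairs whose counts come out NaN are skipped, and coefficients are mapped to Elo via SCALE * coef + INIT_RATING (shifted so that mixtral-8x7b-instruct-v0.1 sits at 1114 when it is present). A minimal usage sketch on made-up data; the model names are hypothetical, and the snippet assumes the file's existing imports (pandas as pd, numpy as np, math, and sklearn's LogisticRegression):

    # toy battles table with the columns compute_mle_elo expects
    battles = pd.DataFrame(
        {
            "model_a": ["model-x", "model-x", "model-y", "model-y", "model-x", "model-y"],
            "model_b": ["model-y", "model-y", "model-x", "model-x", "model-y", "model-x"],
            "winner":  ["model_a", "model_a", "model_b", "model_b", "model_b", "model_a"],
        }
    )
    ratings = compute_mle_elo(battles)  # pd.Series of Elo scores, highest first
    print(ratings)

Both orderings of the pair appear in the toy data so that the pivot tables align; with sparser data, pairs whose combined counts become NaN are simply dropped from the regression.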
 
@@ -227,6 +177,7 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
     # Calculate confidence intervals using bootstrap
     if len(battles_df) >= 10:  # Only calculate CI if we have enough data
         try:
+            np.random.seed(42)
             bootstrap_df = get_bootstrap_result(
                 battles_df, compute_mle_elo, num_round=100
             )
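Seeding NumPy's global generator right before the bootstrap makes the resampling in get_bootstrap_result reproducible, since battles.sample(frac=1.0, replace=True) is called without an explicit random_state. The returned frame has one row per bootstrap round and one column per model, with columns ordered by median Elo, so downstream code can read confidence intervals off the columns. A minimal sketch; the 95% level and the quantile-based interval are illustrative assumptions, not something this file specifies:

    # bootstrap_df: one row per bootstrap round, one column per model
    lower = bootstrap_df.quantile(0.025)  # per-model 2.5th percentile Elo
    upper = bootstrap_df.quantile(0.975)  # per-model 97.5th percentile Elo
    point = bootstrap_df.median()         # medians, already used to order the columns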
 