Spaces:

bigcode
/

arena

Running

App Files Community

terryyz committed on Oct 5

Commit

95a116e

1 Parent(s): 062800e

fix

Browse files

Files changed (1) hide show

elo_calculation.py +27 -13

elo_calculation.py CHANGED Viewed

@@ -12,6 +12,9 @@ from sklearn.linear_model import LogisticRegression
 import yaml
 import os
 def load_model_metadata():
     """Load model metadata from api_config.yaml"""
@@ -42,6 +45,10 @@ def load_model_metadata():
 def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
     """Compute Elo ratings using Bradley-Terry Model with Maximum Likelihood Estimation"""
     ptbl_a_win = pd.pivot_table(
         df[df["winner"] == "model_a"],
         index="model_a",
@@ -49,9 +56,12 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
         aggfunc="size",
         fill_value=0,
     )
     # if no tie, create a zero matrix
     if sum(df["winner"].isin(["tie", "tie (bothbad)"])) == 0:
-        ptbl_tie = pd.DataFrame(0, index=ptbl_a_win.index, columns=ptbl_a_win.columns)
     else:
         ptbl_tie = pd.pivot_table(
             df[df["winner"].isin(["tie", "tie (bothbad)"])],
@@ -60,6 +70,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
             aggfunc="size",
             fill_value=0,
         )
         ptbl_tie = ptbl_tie + ptbl_tie.T
     ptbl_b_win = pd.pivot_table(
@@ -69,6 +80,8 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
         aggfunc="size",
         fill_value=0,
     )
     ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie
     models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)
@@ -102,8 +115,6 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
     lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
     lr.fit(X, Y, sample_weight=sample_weights)
     elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-    if "mixtral-8x7b-instruct-v0.1" in models.index:
-        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
     return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
@@ -161,8 +172,8 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
     confidence_intervals = {}  # Initialize to avoid uninitialized variable error
     # Check if we have sufficient data for Bradley-Terry model
-    # Since we only display models with >= 10 votes, we need enough battles
-    if len(battles_df) < 10:
         # Not enough battles for reliable ranking
         all_models = set(
             battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
@@ -175,7 +186,7 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
             elo_ratings = compute_mle_elo(battles_df)
             # Calculate confidence intervals using bootstrap
-            if len(battles_df) >= 10:  # Only calculate CI if we have enough data
                 try:
                     np.random.seed(42)
                     bootstrap_df = get_bootstrap_result(
@@ -184,6 +195,11 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
                     # Calculate 95% confidence intervals
                     if not bootstrap_df.empty:
                         for model in bootstrap_df.columns:
                             scores = bootstrap_df[model].dropna()
                             if len(scores) > 0:
@@ -192,8 +208,6 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
                                 median_score = scores.median()
                                 ci_margin = (upper - lower) / 2
                                 confidence_intervals[model] = ci_margin
-                            else:
-                                confidence_intervals[model] = 0
                     else:
                         # Fallback: no confidence intervals
                         for model in elo_ratings.index:
@@ -216,7 +230,7 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
 def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
     """
     Create ranking DataFrame with all necessary columns
-    Only includes models with at least 10 battles
     Args:
         elo_ratings (pd.Series): Elo ratings for each model
@@ -225,17 +239,17 @@ def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
     Returns:
         pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
-                     Empty DataFrame if no models have >= 10 votes
     """
     # Load model metadata
     metadata = load_model_metadata()
     # Create ranking list with Elo ratings and confidence intervals
-    # Only include models with at least 10 battles
     ranking_list = []
     for model in elo_ratings.index:
-        # Skip models with fewer than 10 votes
-        if vote_counts.get(model, 0) < 10:
             continue
         ci_margin = confidence_intervals.get(model, 0)

 import yaml
 import os
+# Minimum number of votes required for a model to be included in rankings
+MIN_VOTES_THRESHOLD = 100
 def load_model_metadata():
     """Load model metadata from api_config.yaml"""
 def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
     """Compute Elo ratings using Bradley-Terry Model with Maximum Likelihood Estimation"""
+    # Get all unique models to ensure consistent indexing
+    all_models = pd.Index(sorted(set(df["model_a"].unique()) | set(df["model_b"].unique())))
     ptbl_a_win = pd.pivot_table(
         df[df["winner"] == "model_a"],
         index="model_a",
         aggfunc="size",
         fill_value=0,
     )
+    # Reindex to include all models
+    ptbl_a_win = ptbl_a_win.reindex(index=all_models, columns=all_models, fill_value=0)
     # if no tie, create a zero matrix
     if sum(df["winner"].isin(["tie", "tie (bothbad)"])) == 0:
+        ptbl_tie = pd.DataFrame(0, index=all_models, columns=all_models)
     else:
         ptbl_tie = pd.pivot_table(
             df[df["winner"].isin(["tie", "tie (bothbad)"])],
             aggfunc="size",
             fill_value=0,
         )
+        ptbl_tie = ptbl_tie.reindex(index=all_models, columns=all_models, fill_value=0)
         ptbl_tie = ptbl_tie + ptbl_tie.T
     ptbl_b_win = pd.pivot_table(
         aggfunc="size",
         fill_value=0,
     )
+    ptbl_b_win = ptbl_b_win.reindex(index=all_models, columns=all_models, fill_value=0)
     ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie
     models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)
     lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
     lr.fit(X, Y, sample_weight=sample_weights)
     elo_scores = SCALE * lr.coef_[0] + INIT_RATING
     return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
     confidence_intervals = {}  # Initialize to avoid uninitialized variable error
     # Check if we have sufficient data for Bradley-Terry model
+    # Since we only display models with >= MIN_VOTES_THRESHOLD votes, we need enough battles
+    if len(battles_df) < MIN_VOTES_THRESHOLD:
         # Not enough battles for reliable ranking
         all_models = set(
             battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
             elo_ratings = compute_mle_elo(battles_df)
             # Calculate confidence intervals using bootstrap
+            if len(battles_df) >= MIN_VOTES_THRESHOLD:  # Only calculate CI if we have enough data
                 try:
                     np.random.seed(42)
                     bootstrap_df = get_bootstrap_result(
                     # Calculate 95% confidence intervals
                     if not bootstrap_df.empty:
+                        # Initialize CI for all models first
+                        for model in elo_ratings.index:
+                            confidence_intervals[model] = 0
+                        # Update with bootstrap results
                         for model in bootstrap_df.columns:
                             scores = bootstrap_df[model].dropna()
                             if len(scores) > 0:
                                 median_score = scores.median()
                                 ci_margin = (upper - lower) / 2
                                 confidence_intervals[model] = ci_margin
                     else:
                         # Fallback: no confidence intervals
                         for model in elo_ratings.index:
 def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
     """
     Create ranking DataFrame with all necessary columns
+    Only includes models with at least MIN_VOTES_THRESHOLD battles
     Args:
         elo_ratings (pd.Series): Elo ratings for each model
     Returns:
         pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
+                     Empty DataFrame if no models have >= MIN_VOTES_THRESHOLD votes
     """
     # Load model metadata
     metadata = load_model_metadata()
     # Create ranking list with Elo ratings and confidence intervals
+    # Only include models with at least MIN_VOTES_THRESHOLD battles
     ranking_list = []
     for model in elo_ratings.index:
+        # Skip models with fewer than MIN_VOTES_THRESHOLD votes
+        if vote_counts.get(model, 0) < MIN_VOTES_THRESHOLD:
             continue
         ci_margin = confidence_intervals.get(model, 0)