Spaces:
Sleeping
Sleeping
Commit
·
c058232
1
Parent(s):
58cc0ce
fix: aggregate Global Leaderboard by model to prevent duplicates
Browse files
app.py
CHANGED
|
@@ -56,15 +56,21 @@ class LeaderboardManager:
|
|
| 56 |
return self.leaderboard.copy()
|
| 57 |
|
| 58 |
def get_top_results(self, n: int = None) -> pd.DataFrame:
|
| 59 |
-
"""Get top N results by composite score."""
|
| 60 |
if self.leaderboard.empty:
|
| 61 |
return self.leaderboard
|
| 62 |
|
| 63 |
if n is None:
|
| 64 |
n = self.config.top_results
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
# Sort by composite score and add ranking
|
| 67 |
-
top_results = (self.leaderboard
|
| 68 |
.sort_values('composite_score', ascending=False)
|
| 69 |
.head(n)
|
| 70 |
.reset_index(drop=True))
|
|
|
|
| 56 |
return self.leaderboard.copy()
|
| 57 |
|
| 58 |
def get_top_results(self, n: int = None) -> pd.DataFrame:
|
| 59 |
+
"""Get top N results by composite score, aggregated by model."""
|
| 60 |
if self.leaderboard.empty:
|
| 61 |
return self.leaderboard
|
| 62 |
|
| 63 |
if n is None:
|
| 64 |
n = self.config.top_results
|
| 65 |
|
| 66 |
+
# Group by model and calculate averages
|
| 67 |
+
numeric_columns = ['composite_score', 'correctness_exact', 'result_match_f1', 'exec_success', 'latency_ms']
|
| 68 |
+
|
| 69 |
+
# Calculate averages for numeric columns
|
| 70 |
+
model_aggregated = self.leaderboard.groupby('model_name')[numeric_columns].mean().reset_index()
|
| 71 |
+
|
| 72 |
# Sort by composite score and add ranking
|
| 73 |
+
top_results = (model_aggregated
|
| 74 |
.sort_values('composite_score', ascending=False)
|
| 75 |
.head(n)
|
| 76 |
.reset_index(drop=True))
|