Spaces:
Running
Running
File size: 7,378 Bytes
ba99c06 dfe9318 ba99c06 9f4f2cd ba99c06 9f4f2cd dfe9318 9f4f2cd dfe9318 9f4f2cd dfe9318 9f4f2cd ba99c06 aa2b984 ba99c06 dfe9318 ba99c06 aa2b984 ba99c06 dfe9318 ba99c06 9f4f2cd ba99c06 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 |
"""
Ranking module for BigCodeArena
Handles model leaderboard functionality and data management
"""
import datetime
import logging
import os
from collections import defaultdict

import gradio as gr
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset

# Import Elo calculation utilities
from elo_calculation import (
    calculate_elo_with_confidence_intervals,
    create_ranking_dataframe,
)
# HuggingFace dataset configuration, read from the environment at import time.
HF_DATASET_NAME = os.getenv("HF_DATASET_NAME")
HF_TOKEN = os.getenv("HF_TOKEN")

# Auto-refresh period in seconds. os.getenv() always returns a string, so an
# override is converted to a number here; an unset, empty or malformed value
# falls back to the default instead of leaking a str to the timer.
_DEFAULT_REFRESH_TIME = 60 * 60 * 12  # 12 hours by default
try:
    REFRESH_TIME = float(os.getenv("REFRESH_TIME") or _DEFAULT_REFRESH_TIME)
except ValueError:
    REFRESH_TIME = float(_DEFAULT_REFRESH_TIME)

# Global ranking data cache: the most recently computed ranking table and the
# local time at which it was computed (both None until a load succeeds).
ranking_data = None
ranking_last_updated = None
# Maps the raw ``vote`` value of a dataset row to the ``winner`` label handed
# to the Elo calculation. Rows with any other vote value are skipped.
_VOTE_TO_WINNER = {
    "left": "model_a",  # Model A wins
    "right": "model_b",  # Model B wins
    "tie": "tie",
    "both_bad": "tie (bothbad)",
}


def _has_valid_code(x):
    """Return True if x is a non-empty list/array of dicts that each hold non-blank 'code'."""
    # Handle None, NaN, and other non-list/array values
    if x is None or not isinstance(x, (list, np.ndarray)) or len(x) == 0:
        return False
    return all(
        isinstance(item, dict)
        and "code" in item
        and item["code"]
        and len(str(item["code"]).strip()) > 0
        for item in x
    )


def _collect_battles(df):
    """Convert vote rows into battle records plus a per-model count of valid votes."""
    battle_data = []
    vote_counts = defaultdict(int)
    for model_a, model_b, vote in zip(df["model_a"], df["model_b"], df["vote"]):
        # Only string votes can match a known label; the isinstance guard also
        # keeps unhashable values away from the dict lookup.
        winner = _VOTE_TO_WINNER.get(vote) if isinstance(vote, str) else None
        if winner is None:
            continue  # Skip invalid votes
        battle_data.append(
            {"model_a": model_a, "model_b": model_b, "winner": winner}
        )
        # Count votes for each model
        vote_counts[model_a] += 1
        vote_counts[model_b] += 1
    return battle_data, vote_counts


def load_ranking_data(hf_token=None, force_reload=False):
    """Load and calculate ranking data from HuggingFace dataset.

    Downloads the "train" split of ``HF_DATASET_NAME``, keeps only rows where
    both ``code_a`` and ``code_b`` contain non-empty code (when those columns
    exist), converts the votes into battle records and computes the ranking
    table. On success the module-level ``ranking_data`` and
    ``ranking_last_updated`` are refreshed.

    Args:
        hf_token: Access token for the dataset; falls back to ``HF_TOKEN``.
        force_reload: Accepted for interface compatibility. Every call already
            requests ``download_mode="force_redownload"``, so this flag does
            not currently change how the dataset is fetched.

    Returns:
        The ranking DataFrame, or an empty DataFrame when the token or dataset
        name is missing, when no usable votes remain after filtering, or when
        any step fails (the failure is logged, never raised).
    """
    global ranking_data, ranking_last_updated
    try:
        # Use global token if not provided
        token = hf_token or HF_TOKEN
        if not token or not HF_DATASET_NAME:
            return pd.DataFrame()

        # NOTE(review): the forced and the regular path were identical, so they
        # are a single call here. Confirm whether the non-forced path was
        # meant to reuse the local cache.
        dataset = load_dataset(
            HF_DATASET_NAME,
            split="train",
            token=token,
            download_mode="force_redownload",
        )

        # Convert to pandas DataFrame - handle both Dataset and DatasetDict
        if hasattr(dataset, "to_pandas"):
            df = dataset.to_pandas()
        else:
            df = pd.DataFrame(dataset)
        if df.empty:
            return pd.DataFrame()

        # Filter to only include samples where both models have code in their
        # responses.
        if "code_a" in df.columns and "code_b" in df.columns:
            both_valid = (
                df["code_a"].apply(_has_valid_code)
                & df["code_b"].apply(_has_valid_code)
            )
            df = df[both_valid]
            if df.empty:
                return pd.DataFrame()

        # Convert vote format for Elo calculation and count votes
        battle_data, vote_counts = _collect_battles(df)
        battles_df = pd.DataFrame(battle_data)
        if battles_df.empty:
            return pd.DataFrame()

        # Calculate Elo ratings with confidence intervals
        elo_ratings, confidence_intervals = calculate_elo_with_confidence_intervals(
            battles_df, vote_counts
        )

        # Create ranking DataFrame
        ranking_df = create_ranking_dataframe(
            elo_ratings, confidence_intervals, vote_counts
        )
        ranking_data = ranking_df
        ranking_last_updated = datetime.datetime.now()
        return ranking_df
    except Exception:
        # Best-effort boundary for the UI: report the failure instead of
        # swallowing it silently, then fall back to an empty table.
        logging.getLogger(__name__).exception("Failed to load ranking data")
        return pd.DataFrame()
def _ranking_display_outputs(df):
    """Build the (table update, status markdown) pair for a loaded ranking table."""
    if df.empty:
        return gr.update(value=df), "**Last Updated:** Not enough data available"
    # Drop License column if it exists
    if "License" in df.columns:
        df = df.drop(columns=["License"])
    # ranking_last_updated is the module-level timestamp set by a successful
    # load; it is still None if no load has succeeded yet.
    last_update = (
        ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
        if ranking_last_updated
        else "Unknown"
    )
    return gr.update(value=df), f"**Last Updated:** {last_update}"


def update_ranking_display():
    """Update ranking display with current data.

    Returns:
        A ``(gr.update, str)`` pair: the new table value and the
        "Last Updated" markdown line.
    """
    return _ranking_display_outputs(load_ranking_data())


def force_update_ranking_display():
    """Force update ranking data from HuggingFace (for timer).

    Same as ``update_ranking_display`` but calls the loader with
    ``force_reload=True``.

    Returns:
        A ``(gr.update, str)`` pair: the new table value and the
        "Last Updated" markdown line.
    """
    return _ranking_display_outputs(load_ranking_data(force_reload=True))
def create_ranking_tab():
    """Create the ranking tab UI component.

    Builds a tab containing a heading, an explanatory note, a read-only
    rankings table, a "Last Updated" markdown line and an auto-refresh timer
    that fires every ``REFRESH_TIME`` seconds.

    Returns:
        Tuple ``(ranking_table, ranking_last_update, ranking_timer)``.
    """
    # NOTE(review): the "π" glyphs in the tab label and heading look like a
    # mis-encoded emoji; the intended character cannot be recovered from this
    # file, so they are left unchanged - confirm against the deployed UI.
    with gr.Tab("π Ranking", id="ranking"):
        gr.Markdown("## π Model Leaderboard")
        gr.Markdown(
            "\n"
            "> **Note:** This ranking table shows raw results from user votes.\n"
            "> More detailed analysis will be added manually.\n"
        )
        ranking_table = gr.Dataframe(
            headers=[
                "Rank",
                "Model",
                "Score",
                # Was the mis-encoded "Β±"; restored to the plus-minus sign.
                "95% CI (±)",
                "Votes",
                "Organization",
            ],
            datatype=[
                "number",
                "str",
                "number",
                "str",
                "number",
                "str",
            ],
            label="Model Rankings",
            interactive=False,
            wrap=True,
        )
        ranking_last_update = gr.Markdown("**Last Updated:** Not loaded yet")
        # Timer for auto-refresh every REFRESH_TIME seconds
        ranking_timer = gr.Timer(value=REFRESH_TIME, active=True)
    return ranking_table, ranking_last_update, ranking_timer
def setup_ranking_handlers(demo, ranking_table, ranking_last_update, ranking_timer):
    """Wire the ranking tab's components to their refresh callbacks.

    Args:
        demo: App object whose ``load`` event triggers the initial ranking load.
        ranking_table: Table component that receives the ranking data.
        ranking_last_update: Markdown component that receives the status line.
        ranking_timer: Timer whose ``tick`` event triggers a forced reload.

    Returns:
        Tuple ``(ranking_table, ranking_last_update)``, unchanged.
    """
    # Both callbacks write to the same two components.
    display_targets = (ranking_table, ranking_last_update)

    # Periodic refresh: every timer tick reloads with force_reload=True.
    ranking_timer.tick(
        fn=force_update_ranking_display,
        inputs=[],
        outputs=list(display_targets),
    )

    # Initial population when the app is loaded.
    demo.load(
        fn=update_ranking_display,
        inputs=[],
        outputs=list(display_targets),
    )

    return display_targets
|