Spaces:
Running
Running
| """ | |
| Ranking module for BigCodeArena | |
| Handles model leaderboard functionality and data management | |
| """ | |
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| import datetime | |
| import os | |
| from collections import defaultdict | |
| from datasets import Dataset, load_dataset | |
| # Import Elo calculation utilities | |
| from elo_calculation import ( | |
| calculate_elo_with_confidence_intervals, | |
| create_ranking_dataframe, | |
| ) | |
# HuggingFace dataset configuration
HF_DATASET_NAME = os.getenv("HF_DATASET_NAME")
HF_TOKEN = os.getenv("HF_TOKEN")

# Fallback auto-refresh interval: 12 hours, expressed in seconds.
_DEFAULT_REFRESH_TIME = 60 * 60 * 12


def _read_refresh_time(default=_DEFAULT_REFRESH_TIME):
    """Return the refresh interval in seconds from the REFRESH_TIME env var.

    Environment variables are always strings, so the raw value is parsed
    into a number here instead of being handed on as text.

    Args:
        default: Value returned when REFRESH_TIME is unset, empty, not
            numeric, or not a positive finite number.

    Returns:
        The parsed interval as a float, or ``default`` unchanged.
    """
    raw = os.getenv("REFRESH_TIME")
    if not raw:
        return default
    try:
        seconds = float(raw)
    except ValueError:
        return default
    # Zero, negative, NaN and infinite intervals are not usable as a period.
    if not 0 < seconds < float("inf"):
        return default
    return seconds


# Auto-refresh period in seconds (12 hours by default)
REFRESH_TIME = _read_refresh_time()

# Global ranking data cache
ranking_data = None
ranking_last_updated = None
# Translates the dataset's raw ``vote`` value into the ``winner`` label placed
# in the battle records; any other vote value causes the row to be skipped.
_VOTE_TO_WINNER = {
    "left": "model_a",
    "right": "model_b",
    "tie": "tie",
    "both_bad": "tie (bothbad)",
}


def _has_valid_code(x):
    """Return True if ``x`` is a non-empty list/array of dicts with non-blank 'code'."""
    # None, NaN and any other non-sequence value fail the isinstance check.
    if not isinstance(x, (list, np.ndarray)) or len(x) == 0:
        return False
    return all(
        isinstance(item, dict)
        and "code" in item
        and item["code"]
        and len(str(item["code"]).strip()) > 0
        for item in x
    )


def _collect_battles(df):
    """Build the battle records and per-model vote counts from the vote rows."""
    battles = []
    vote_counts = defaultdict(int)
    for model_a, model_b, vote in zip(df["model_a"], df["model_b"], df["vote"]):
        # Non-string votes (e.g. NaN) can never match a known label; checking
        # first also keeps unhashable values away from the dict lookup.
        winner = _VOTE_TO_WINNER.get(vote) if isinstance(vote, str) else None
        if winner is None:
            continue  # Skip invalid votes
        battles.append({"model_a": model_a, "model_b": model_b, "winner": winner})
        # Each counted battle adds one vote to both participants.
        vote_counts[model_a] += 1
        vote_counts[model_b] += 1
    return battles, vote_counts


def load_ranking_data(hf_token=None, force_reload=False):
    """Load votes from the HuggingFace dataset and compute the ranking table.

    The ``train`` split of ``HF_DATASET_NAME`` is downloaded, rows where
    either side lacks valid code are dropped (when the ``code_a``/``code_b``
    columns exist), votes are converted to battle records, and the result is
    passed to ``calculate_elo_with_confidence_intervals`` and
    ``create_ranking_dataframe``.

    On success the module-level ``ranking_data`` and ``ranking_last_updated``
    globals are updated.

    Args:
        hf_token: HuggingFace token; falls back to the ``HF_TOKEN`` global.
        force_reload: Accepted for interface compatibility. It currently has
            no effect: every call downloads with
            ``download_mode="force_redownload"``.

    Returns:
        The ranking DataFrame, or an empty DataFrame when the token or
        dataset name is missing, no usable votes remain, or any step fails.
        Failures are logged rather than raised.
    """
    global ranking_data, ranking_last_updated
    # Imported locally so this function carries its own logging dependency.
    import logging

    try:
        # Use global token if not provided
        token = hf_token or HF_TOKEN
        if not token or not HF_DATASET_NAME:
            return pd.DataFrame()

        # Always fetch from the remote, ignoring the local datasets cache.
        dataset = load_dataset(
            HF_DATASET_NAME,
            split="train",
            token=token,
            download_mode="force_redownload",
        )

        # Convert to pandas DataFrame, whatever object load_dataset returned.
        if hasattr(dataset, "to_pandas"):
            df = dataset.to_pandas()
        else:
            df = pd.DataFrame(dataset)
        if df.empty:
            return pd.DataFrame()

        # Keep only samples where both models have code in their responses.
        if "code_a" in df.columns and "code_b" in df.columns:
            both_valid = df["code_a"].apply(_has_valid_code) & df["code_b"].apply(
                _has_valid_code
            )
            df = df[both_valid]
            if df.empty:
                return pd.DataFrame()

        # Convert vote format for Elo calculation and count votes
        battle_data, vote_counts = _collect_battles(df)
        battles_df = pd.DataFrame(battle_data)
        if battles_df.empty:
            return pd.DataFrame()

        elo_ratings, confidence_intervals = calculate_elo_with_confidence_intervals(
            battles_df, vote_counts
        )
        ranking_df = create_ranking_dataframe(
            elo_ratings, confidence_intervals, vote_counts
        )

        ranking_data = ranking_df
        ranking_last_updated = datetime.datetime.now()
        return ranking_df
    except Exception:
        # Best-effort: the UI gets an empty table, but the cause is recorded
        # instead of being silently discarded.
        logging.getLogger(__name__).exception("Failed to load ranking data")
        return pd.DataFrame()
# Status line shown when there is no ranking to display.
_NO_RANKING_MESSAGE = "**Last Updated:** Not enough data available"


def _ranking_outputs(df):
    """Convert a ranking DataFrame into the (table update, status text) pair.

    Args:
        df: DataFrame returned by ``load_ranking_data``; may be empty.

    Returns:
        A tuple of ``gr.update(value=...)`` for the table and a Markdown
        string for the "Last Updated" label.
    """
    if df.empty:
        return gr.update(value=df), _NO_RANKING_MESSAGE

    # Drop License column if it exists
    if "License" in df.columns:
        df = df.drop(columns=["License"])

    if ranking_last_updated:
        last_update = ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
    else:
        last_update = "Unknown"
    return gr.update(value=df), f"**Last Updated:** {last_update}"


def update_ranking_display():
    """Update ranking display with current data.

    Returns:
        The (table update, status text) pair for the ranking widgets.
    """
    return _ranking_outputs(load_ranking_data())


def force_update_ranking_display():
    """Force update ranking data from HuggingFace (for timer).

    Same as ``update_ranking_display`` but calls ``load_ranking_data`` with
    ``force_reload=True``.

    Returns:
        The (table update, status text) pair for the ranking widgets.
    """
    return _ranking_outputs(load_ranking_data(force_reload=True))
def create_ranking_tab():
    """Create the ranking tab UI component.

    Builds a "Ranking" tab containing a heading, an explanatory note, a
    read-only ranking table, a "Last Updated" label and an auto-refresh
    timer whose period is ``REFRESH_TIME``.

    Returns:
        A tuple ``(ranking_table, ranking_last_update, ranking_timer)``.
    """
    # NOTE(review): the emoji in the tab label and heading was lost to
    # mis-encoding (only a stray "π" remained); a trophy is used as the
    # replacement - confirm against the intended icon.
    with gr.Tab("🏆 Ranking", id="ranking"):
        gr.Markdown("## 🏆 Model Leaderboard")
        gr.Markdown(
            """
            > **Note:** This ranking table shows raw results from user votes.
            > More detailed analysis will be added manually.
            """
        )
        ranking_table = gr.Dataframe(
            headers=[
                "Rank",
                "Model",
                "Score",
                "95% CI (±)",
                "Votes",
                "Organization",
            ],
            datatype=[
                "number",
                "str",
                "number",
                "str",
                "number",
                "str",
            ],
            label="Model Rankings",
            interactive=False,
            wrap=True,
        )
        ranking_last_update = gr.Markdown("**Last Updated:** Not loaded yet")
        # Timer for auto-refresh every REFRESH_TIME seconds
        ranking_timer = gr.Timer(value=REFRESH_TIME, active=True)
    return ranking_table, ranking_last_update, ranking_timer
def setup_ranking_handlers(demo, ranking_table, ranking_last_update, ranking_timer):
    """Wire the ranking widgets to their refresh callbacks.

    Registers ``force_update_ranking_display`` on every timer tick and
    ``update_ranking_display`` on app load; both write to the ranking table
    and the "Last Updated" label.

    Returns:
        The tuple ``(ranking_table, ranking_last_update)``.
    """

    def _targets():
        # A fresh list per registration, so the two handlers never share one.
        return [ranking_table, ranking_last_update]

    # Periodic refresh: each tick reloads the ranking with force_reload.
    ranking_timer.tick(
        fn=force_update_ranking_display, inputs=[], outputs=_targets()
    )

    # Initial population when the app is first loaded.
    demo.load(fn=update_ranking_display, inputs=[], outputs=_targets())

    return ranking_table, ranking_last_update