import sys
from datetime import date, datetime

import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard
from huggingface_hub import snapshot_download
from loguru import logger

from src.about import (
    INTRODUCTION_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.envs import (
    API,
    COMPETITION_URL,
    CUTOFF_DATES,
    EVAL_RESULTS_PATH,
    EVAL_SPLITS,
    LEADERBOARD_REFRESH_INTERVAL,
    REGISTRATION_URL,
    REPO_ID,
    RESULTS_REPO,
    SUBMISSION_URL,
    TOKEN,
)
from src.hf_dataset_utils import download_dataset_snapshot
from src.populate import (
    fetch_bonus_leaderboard,
    fetch_overall_leaderboard,
    fetch_tossup_leaderboard,
)
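
# Route logs to stderr at INFO level, replacing loguru's default handler.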
logger.remove()
logger.add(sys.stderr, level="INFO", backtrace=True, diagnose=False)


# Load metrics manual content
def load_metrics_manual():
    try:
        with open("metrics_manual.md", "r") as f:
            return f.read()
    except Exception as e:
        logger.error(f"Error loading metrics manual: {e}")
        return "# Metrics Manual\n\nCould not load metrics manual content."
def restart_space():
    API.restart_space(repo_id=REPO_ID)
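

# Warm the local results cache at startup; if the download fails, restart
# the Space so it retries from a clean state.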
try:
    logger.info(f"Downloading evaluation results to {EVAL_RESULTS_PATH}")
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    logger.exception("Failed to download evaluation results; restarting the Space.")
    restart_space()
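

# Pull the latest results snapshot and rebuild all three leaderboard tables.
# Gradio injects the OAuth profile for parameters annotated gr.OAuthProfile,
# so logged-in users also see their own post-cutoff submissions.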
def refresh_leaderboard(
    split: str = "tiny_eval",
    style: bool = True,
    date: date | None = None,
    profile: gr.OAuthProfile | None = None,
):
    download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
    # profile is None when the user is not logged in.
    username = profile.username if profile is not None else None
    tossup_df = fetch_tossup_leaderboard(split, style, date, username)
    bonus_df = fetch_bonus_leaderboard(split, style, date, username)
    overall_df = fetch_overall_leaderboard(split, style, date, username)
    return tossup_df, bonus_df, overall_df
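

# Build one tab's worth of UI: explanatory notes plus the tossup, bonus, and
# overall tables, wired below to refresh on a timer, the refresh button, and
# page load.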
def create_leaderboard_interface(app, refresh_btn, split: str = "tiny_eval", date: date | None = None):
    leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
    tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=True, date=date)
    gr.HTML(
        "<div style='font-size: 18px;'>"
        "ℹ️ <b>E [Score]</b> is the <b>Expected Score</b> for a question. 🙋🏻 and 🤖 indicate the scores against just the Human and the AI players, respectively.<br>"
        "ℹ️ <b>Cost</b> is the cost in USD of executing the pipeline <b>per question prefix</b>. (Typically we have up to ~20 prefixes per tossup question.)<br>"
        "ℹ️ <b>When does the cost matter?</b> When two models buzz at the same token, which they often do, the lighter (more cost-effective) model takes precedence.<br>"
        "</div>"
    )
    tossup_leaderboard = gr.Dataframe(
        value=tossup_df,
        show_search=True,
        label="🎙️ Tossup Round Leaderboard",
        show_label=True,
        datatype=["str", "number", "number", "number", "number", "number", "number"],
        elem_id="tossup-table",
        interactive=False,  # Ensure it's not interactive
    )
    gr.HTML(
        "<div style='font-size: 18px;'>"
        "ℹ️ <b>Cost for Bonus pipeline</b> is the cost in USD of executing the pipeline <b>per bonus part</b>. (We have exactly 3 parts per bonus question.)"
        "</div>"
    )
    bonus_leaderboard = gr.Dataframe(
        value=bonus_df,
        show_search=True,
        label="🧠 Bonus Round Leaderboard",
        show_label=True,
        datatype=["str", "number", "number", "number", "number", "number", "number", "number", "number"],
        elem_id="bonus-table",
        interactive=False,  # Ensure it's not interactive
    )
    overall_leaderboard = gr.Dataframe(
        value=overall_df,
        show_search=True,
        label="🥇 Overall Leaderboard",
        show_label=True,
        datatype=["str", "str", "str", "number", "number", "number", "number", "number", "number"],
    )
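    # Refresh all three tables on the periodic timer, the manual refresh
    # button, and the initial page load.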
    gr.on(
        triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
        fn=refresh_leaderboard,
        inputs=[gr.State(split), gr.State(True), gr.State(date)],
        outputs=[tossup_leaderboard, bonus_leaderboard, overall_leaderboard],
    )


with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown(
                f"## 📝 Register [here]({REGISTRATION_URL}) to participate in our [Human-AI Cooperative Trivia Competition]({COMPETITION_URL}).\n"
                f"## 🎲 Create and submit your quizbowl AI agents at our [submission site]({SUBMISSION_URL}).",
                elem_classes="welcome-text",
            )
            logged_note = gr.Markdown(
                "## 📌 **Note:** <span style='background-color: lightblue; padding: 10px; margin:4px'>Rows in blue with **(*)**</span> are your submissions past the cutoff date and are only visible to you.",
                visible=False,
            )
        with gr.Column(scale=2):
            beautify_date = datetime.strptime(CUTOFF_DATES["Week 2"], "%Y-%m-%d").strftime("%B %d, %Y")
            gr.Markdown(f"## 📅 Next Cutoff Date: <span style='color:crimson'>{beautify_date}</span>")
            gr.LoginButton("Login to privately view your scores on past weeks.")
            refresh_btn = gr.Button("🔄 Refresh")
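
    # One leaderboard tab per evaluation split, each pinned to its cutoff
    # date, plus a static Metrics Guide tab.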
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        for i, (name, split) in enumerate(EVAL_SPLITS.items()):
            with gr.TabItem(f"📊 {name}", elem_id="llm-benchmark-tab-table", id=i):
                cutoff = datetime.strptime(CUTOFF_DATES[name], "%Y-%m-%d").date()
                create_leaderboard_interface(demo, refresh_btn, split, cutoff)

        # Add the Metrics Guide tab
        with gr.TabItem("📖 Metrics Guide", elem_id="metrics-guide-tab"):
            gr.Markdown(load_metrics_manual())

    def check_user_logged_in(x: gr.OAuthProfile | None = None):
        return gr.update(visible=x is not None)

    # Show the private-rows note only for logged-in users.
    demo.load(check_user_logged_in, outputs=[logged_note])
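
# Optional watchdog: restart the Space every 30 minutes to pick up fresh
# results. Currently disabled.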
# scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "interval", seconds=1800)
# scheduler.start()

demo.queue(default_concurrency_limit=40).launch()