Pierre Andrews committed
Commit d97ec7b · 0 Parent(s)

Initial commit


> Co-authored-by: Romain Froger <[email protected]>
> Co-authored-by: Pierre Andrews <[email protected]>
> Co-authored-by: Clémentine Fourrier <[email protected]>

Files changed (6)
  1. .gitattributes +35 -0
  2. README.md +16 -0
  3. app.py +566 -0
  4. content.py +66 -0
  5. requirements.txt +6 -0
  6. utils.py +35 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,16 @@
+ ---
+ title: Gaia 2 Agents Evaluation Leaderboard
+ emoji: 🐠
+ colorFrom: red
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 5.46.0
+ app_file: app.py
+ pinned: false
+ hf_oauth: true
+ hf_oauth_scopes:
+   - email
+   - read-repos
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,566 @@
+ import datetime
+ import json
+ import os
+ from pathlib import Path
+
+ import datasets
+
+ import gradio as gr
+ import pandas as pd
+ import requests
+ from apscheduler.schedulers.background import BackgroundScheduler
+
+ # InfoStrings
+ from content import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     CONTACT_DATASET,
+     INTRODUCTION_TEXT,
+     LEADERBOARD_PATH,
+     OWNER,
+     RESULTS_DATASET,
+     SCENARIO_LIST,
+     SUBMISSION_TEXT,
+     TITLE,
+ )
+ from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
+
+ from huggingface_hub import create_repo, snapshot_download, upload_folder
+
+ from utils import api, Experiment, format_log, model_hyperlink, TOKEN
+
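+ # Contact information for every submitter, loaded once at startup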
+ contact_infos = datasets.load_dataset(
+     CONTACT_DATASET, token=TOKEN, verification_mode=datasets.VerificationMode.NO_CHECKS
+ )  # download_mode="force_redownload"
+
+
+ def get_display_name(capability: str) -> str:
+     """
+     Convert internal capability names to user-friendly display names.
+
+     Args:
+         capability: Internal capability name from the benchmark
+
+     Returns:
+         User-friendly display name for the leaderboard
+     """
+     if "noise" in capability:
+         return "noise"
+     elif "agent2agent" in capability or "a2a" in capability:
+         return "A2A"
+     else:
+         return capability
+
+
+ def cleanup(row) -> dict:
+     """
+     Transform raw evaluation data into a clean format for the leaderboard display.
+
+     Args:
+         row: Raw evaluation result row from the dataset
+
+     Returns:
+         Dictionary with cleaned and formatted data for leaderboard display
+     """
+     result = {}
+
+     # Basic model information
+     result["Model"] = row["metadata.model"]
+     result["Provider"] = row["metadata.model_provider"]
+     result["Total score (%)"] = round(row["statistics.global.macro_success_rate"], 1)
+
+     # Define the order of capability columns for consistent display
+     scenario_order = [
+         "execution",
+         "search",
+         "ambiguity",
+         "adaptability",
+         "time",
+         "mini_noise",
+         "mini_agent2agent",
+     ]
+
+     # Process each capability score with aligned formatting
+     for capability in scenario_order:
+         if capability in SCENARIO_LIST:
+             display_name = get_display_name(capability)
+
+             # Extract score and standard error
+             score = row[f"statistics.per_capability.{capability}.success_rate"]
+             sem = row[f"statistics.per_capability.{capability}.success_rate_sem"]
+
+             # Format with decimal alignment using non-breaking spaces
+             score_str = f"{score:4.1f}".replace(" ", "\u00A0")
+             sem_str = f"{sem:.1f}"  # No width formatting for SEM to avoid extra spaces
+
+             result[f"{display_name} (%)"] = f"{score_str} ± {sem_str}"
+
+     # Add metadata fields
+     result["Number of runs"] = (
+         row["statistics.global.total_runs"] / row["statistics.global.total_scenarios"]
+         if row["statistics.global.total_scenarios"] != 0
+         else 0
+     )
+     result["Submitter"] = row["metadata.organisation"]
+     result["Submission date"] = row["metadata.timestamp"][:10]
+
+     return result
+
+
+ def get_dataframe_from_results() -> pd.DataFrame:
+     """
+     Load and process evaluation results from the dataset to create a leaderboard DataFrame.
+
+     Retrieves raw evaluation data, processes it through the cleanup function,
+     and returns a sorted DataFrame ready for leaderboard display.
+
+     Returns:
+         Pandas DataFrame with processed leaderboard data, sorted by total score
+         Returns empty DataFrame if no data is available
+     """
+     split = "train"
+
+     # Load evaluation results dataset
+     try:
+         eval_results = datasets.load_dataset(
+             RESULTS_DATASET,
+             token=TOKEN,
+             verification_mode=datasets.VerificationMode.NO_CHECKS,
+         )
+     except datasets.data_files.EmptyDatasetError:
+         eval_results = datasets.DatasetDict()
+
+     # Return empty DataFrame if no data available
+     if not eval_results or split not in eval_results or len(eval_results[split]) == 0:
+         return pd.DataFrame([])
+
+     results = eval_results[split]
+     local_df = results.flatten()
+
+     # Define columns to extract from the raw data
+     metadata_columns = [
+         "metadata.model",
+         "metadata.model_provider",
+         "metadata.organisation",
+         "metadata.timestamp",
+         "metadata.url",
+     ]
+
+     global_stats_columns = [
+         "statistics.global.macro_success_rate",
+         "statistics.global.total_runs",
+         "statistics.global.total_scenarios",
+     ]
+
+     # Add per-capability statistics columns
+     capability_columns = []
+     for capability in SCENARIO_LIST:
+         capability_columns.extend(
+             [
+                 f"statistics.per_capability.{capability}.success_rate",
+                 f"statistics.per_capability.{capability}.success_rate_sem",
+             ]
+         )
+
+     # Combine all required columns
+     columns = metadata_columns + global_stats_columns + capability_columns
+
+     # Process the data: select columns, clean up, and remove original columns
+     local_df = local_df.select_columns(columns)
+     mapped_df = local_df.map(cleanup, batched=False)
+     mapped_df = mapped_df.remove_columns(columns)
+
+     # Convert to pandas DataFrame and sort by total score (highest first)
+     df = pd.DataFrame(mapped_df)
+     df = df.sort_values(by=["Total score (%)"], ascending=False)
+
+     return df
+
+
+ # At the moment, only one result set is displayed
+ eval_dataframe_val = get_dataframe_from_results()
+
+
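+ # Restart this Space so it reloads the latest results from the hub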
+ def restart_space():
+     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
+
+
+ def add_new_eval(
+     organisation: str,
+     path_to_repository: str,
+     profile: gr.OAuthProfile,
+     token: gr.OAuthToken,
+ ):
+     # ---- USER CHECKS ----
+     # Was the profile created less than two months ago?
+     user_data = requests.get(
+         f"https://huggingface.co/api/users/{profile.username}/overview"
+     )
+     creation_date = json.loads(user_data.content)["createdAt"]
+     if datetime.datetime.now() - datetime.datetime.strptime(
+         creation_date, "%Y-%m-%dT%H:%M:%S.%fZ"
+     ) < datetime.timedelta(days=60):
+         raise Exception("This account is not authorized to submit on GAIA2.")
+
+     # Users can't submit several times per day
+     contact_infos = datasets.load_dataset(
+         CONTACT_DATASET,
+         token=TOKEN,
+         verification_mode=datasets.VerificationMode.NO_CHECKS,
+     )
+     user_submission_dates = sorted(
+         row["date"]
+         for row in contact_infos["train"]
+         if row["username"] == profile.username
+     )
+     # Dates are stored as "%Y-%m-%d-%H-%M", so compare on the day prefix only
+     if user_submission_dates and user_submission_dates[-1][:10] == datetime.datetime.today().strftime("%Y-%m-%d"):
+         raise Exception("You have already submitted once today; please try again tomorrow.")
+
+     # ---- EXPERIMENT MANAGEMENT ----
+     # Download locally with HF hub
+     snapshot_path = snapshot_download(
+         repo_id=path_to_repository, token=token.token, repo_type="dataset"
+     )
+
+     # Test completeness with datasets
+     try:
+         for scenario in SCENARIO_LIST:
+             # Loading what the user provided
+             datasets.load_dataset(
+                 snapshot_path,
+                 scenario,
+                 split="test",
+                 verification_mode=datasets.VerificationMode.NO_CHECKS,
+             )
+     except Exception as e:
+         print(e)
+         raise ValueError(
+ f"We cannot load the scenario {scenario} for your dataset ({path_to_repository}). Please make sure the dataset is accessible and all subsets are there."
+         )
+
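+     # Read the precomputed statistics shipped with the submission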
+     with open(Path(snapshot_path, "computed_stats.json")) as f:
+         results = json.load(f)
+         model = results["metadata"]["model"]
+         results["metadata"]["organisation"] = organisation
+         results["metadata"]["url"] = path_to_repository
+
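+     # Append the new entry to the public results dataset, rejecting exact duplicates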
+     try:
+         ds = datasets.load_dataset(RESULTS_DATASET, split="train")
+     except datasets.data_files.EmptyDatasetError:
+         ds = datasets.Dataset.from_dict({})
+
+     if results in ds:
+ raise Exception("This precise model and results file was already submitted")
+     ds = ds.add_item(results)
+     ds.push_to_hub(RESULTS_DATASET, split="train", private=True)
+
+     experiment = Experiment(path_to_repository, organisation, model)
+
+     # Save copy to hub
+     create_repo(
+         repo_id=f"{OWNER}/{str(experiment)}",
+         repo_type="dataset",
+         token=TOKEN,
+         private=True,
+     )
+     upload_folder(
+         folder_path=snapshot_path,
+         repo_id=f"{OWNER}/{str(experiment)}",
+         repo_type="dataset",
+         token=TOKEN,
+     )
+
+     print(f"Adding new eval: {str(experiment)}")
+
+     # SAVE ALL INFO
+     contact_info = {
+         "model": experiment.model,
+         "path_to_hub": experiment.path_to_hub,
+         "path_to_hub_private_copy": f"{OWNER}/{str(experiment)}",
+         "organisation": experiment.organisation,
+         "date": experiment.cur_date,
+         "username": profile.username,
+         "mail": getattr(profile, "email", None),
+     }
+ contact_infos["test"] = contact_infos["test"].add_item(contact_info)
+     contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN)
+
+     return format_log(
+ f"Model {model} submitted by {organisation} successfully.\nPlease wait a couple minutes and refresh the leaderboard to see your score displayed."
+     )
+
+
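+ # Rebuild the leaderboard dataframe on demand (wired to the refresh button below)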
+ def refresh():
+     return get_dataframe_from_results()
+
+
+ # Custom CSS for sleek styling
+ custom_css = """
+ <style>
+ /* Global styling */
+ .gradio-container {
+     max-width: 1400px !important;
+     margin: auto;
+     padding: 20px;
+     background: linear-gradient(135deg, #f8fbff 0%, #e3f2fd 100%);
+     min-height: auto !important; /* override HF default */
+     padding-bottom: 0 !important; /* remove extra bottom padding */
+ }
+
+ html, body, #root {
+     margin: 0;
+     padding: 0;
+     height: auto !important; /* don't lock to viewport height */
+     min-height: 100%;
+     overflow-x: hidden !important;
+     overflow-y: auto !important; /* ensure vertical scroll is possible */
+     box-sizing: border-box;
+ }
+
+ /* Markdown text styling */
+ .markdown-text {
+     background: white;
+     padding: 25px;
+     border-radius: 12px;
+     box-shadow: 0 4px 20px rgba(0,0,0,0.08);
+     margin: 20px 0;
+     border-left: 4px solid #0081FB;
+     font-size: 16px;
+     line-height: 1.6;
+ }
+
+ /* Button styling */
+ .gr-button {
+     background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%) !important;
+     border: none !important;
+     border-radius: 8px !important;
+     color: white !important;
+     font-weight: 600 !important;
+     padding: 12px 24px !important;
+     transition: all 0.3s ease !important;
+     box-shadow: 0 4px 15px rgba(0, 129, 251, 0.3) !important;
+ }
+
+ .gr-button:hover {
+     transform: translateY(-2px) !important;
+     box-shadow: 0 8px 25px rgba(0, 129, 251, 0.4) !important;
+ }
+
+ /* Input fields styling */
+ .gr-textbox {
+     border-radius: 8px !important;
+     border: 2px solid #e1e5e9 !important;
+     background: white !important;
+     transition: all 0.3s ease !important;
+ }
+
+ .gr-textbox:focus {
+     border-color: #667eea !important;
+     box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
+ }
+
+ /* Accordion styling */
+ .gr-accordion {
+     background: white !important;
+     border-radius: 12px !important;
+     box-shadow: 0 4px 20px rgba(0,0,0,0.08) !important;
+     border: none !important;
+     margin: 15px 0 !important;
+ }
+
+ /* Leaderboard styling */
+ .leaderboard-container {
+     background: white !important;
+     border-radius: 15px !important;
+     box-shadow: 0 8px 32px rgba(0,0,0,0.1) !important;
+     overflow: hidden !important;
+     margin: 25px 0 !important;
+     border: none !important;
+ }
+
+ /* Remove any default Gradio gray backgrounds */
+ .gradio-container .gr-column,
+ .gradio-container .gr-row {
+     background: transparent !important;
+ }
+
+ /* Ensure leaderboard table has clean white background */
+ .leaderboard-container table,
+ .leaderboard-container .gr-table {
+     background: white !important;
+     border: none !important;
+ }
+
+ /* Submission form styling */
+ .submission-section {
+     background: white;
+     padding: 30px;
+     border-radius: 15px;
+     box-shadow: 0 6px 25px rgba(0,0,0,0.08);
+     margin: 25px 0;
+ }
+ </style>
+ """
+
+ demo = gr.Blocks(
+     # css=custom_css,
+     theme=gr.themes.Soft(
+         font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"], primary_hue="blue"
+     ),
+ )
+ with demo:
+     gr.HTML(TITLE)
+
+     with gr.Accordion("About", open=True):
+         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     # Enhanced leaderboard with custom styling
+     with gr.Column(elem_classes="leaderboard-container"):
+         # gr.HTML(
+         #     """
+         # <div style="padding: 20px 20px 0 20px;">
+         #     <h2 style="margin: 0; font-weight: 700; font-size: 1.8em;">
+         #         🏆 GAIA2 Leaderboard Rankings
+         #     </h2>
+         #     <p style="margin: 10px 0 20px 0; color: #666; font-size: 16px;">
+         #         Click on column headers to sort • Use filters to narrow results
+         #     </p>
+         # </div>
+         # """
+         # )
+
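+         # Main leaderboard table: users can toggle columns, search, and filter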
+         leaderboard_table_val = Leaderboard(
+             value=eval_dataframe_val,
+             select_columns=SelectColumns(
+                 default_selection=[
+                     "Model",
+                     "Provider",
+                     "Total score (%)",
+                     "execution (%)",
+                     "search (%)",
+                     "ambiguity (%)",
+                     "adaptability (%)",
+                     "time (%)",
+                     "noise (%)",
+                     "A2A (%)",
+                     "Submission date",
+                 ],
+                 cant_deselect=[
+                     "Model",
+                     "Provider",
+                     "Total score (%)",
+                     "Submission date",
+                 ]
+             ),
+             search_columns=["Model", "Provider", "Submitter"],
+             filter_columns=[
+                 "Provider",
+                 ColumnFilter("Model", type="dropdown", label="🔍 Select Model"),
+             ],
+         )
+
+     # Enhanced submission section
+     with gr.Column(elem_classes="submission-section"):
+         gr.HTML(
+             """
+             <h2 style="margin: 0 0 20px 0; font-weight: 700; font-size: 1.8em;">
+                 🚀 Submit Your Model
+             </h2>
+             """
+         )
+
+         with gr.Accordion("📋 How to submit", open=True):
+             gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
+
+         with gr.Row(equal_height=True):
+             with gr.Column(scale=1):
+                 gr.LoginButton(size="lg")
+             with gr.Column(scale=2):
+                 organisation_tbox = gr.Textbox(
+                     label="🏢 Organization",
+                     placeholder="Enter your organization name",
+                     container=True,
+                 )
+             with gr.Column(scale=3):
+                 dataset_tbox = gr.Textbox(
+                     label="📊 Hub Dataset Path",
+                     placeholder="username/dataset-name",
+                     container=True,
+                 )
+             with gr.Column(scale=1):
+                 submit_button = gr.Button("Submit", variant="primary", size="lg")
+             with gr.Column(scale=1):
+                 refresh_button = gr.Button("🔄 Refresh the display", variant="secondary", size="lg")
+
+         submission_result = gr.Markdown()
+
+
+     with gr.Column():
+         gr.HTML(
+             """
+             <div style="text-align: center; margin: 20px 0; display: flex; justify-content: center; gap: 50px; flex-wrap: wrap;">
+                 <!-- GitHub Button -->
+                 <a href="https://github.com/facebookresearch/meta-agents-research-environments" target="_blank"
+                    style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
+                           background: linear-gradient(135deg, #24292e 0%, #000000 100%);
+                           color: white; font-weight: 600; padding: 14px 28px;
+                           border-radius: 10px; text-decoration: none; font-size: 16px;
+                           box-shadow: 0 4px 12px rgba(0,0,0,0.3); transition: all 0.3s ease;
+                           min-width: 220px; text-align: center;">
+                     <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="white" viewBox="0 0 24 24">
+                         <path d="M12 .5C5.7.5.5 5.7.5 12c0 5.1 3.3 9.4 7.9 10.9.6.1.8-.2.8-.6v-2.1c-3.2.7-3.9-1.4-3.9-1.4-.5-1.2-1.2-1.6-1.2-1.6-1-.7.1-.7.1-.7 1.1.1 1.7 1.1 1.7 1.1 1 .1.8 1.4 2.9 1.9.3-.8.6-1.3.6-1.3-2.6-.3-5.3-1.3-5.3-5.8 0-1.3.5-2.4 1.1-3.3 0-.3-.5-1.6.1-3.2 0 0 1-.3 3.3 1.2a11.5 11.5 0 0 1 6 0c2.3-1.5 3.3-1.2 3.3-1.2.6 1.6.1 2.9.1 3.2.7.9 1.1 2 1.1 3.3 0 4.5-2.7 5.5-5.3 5.8.4.3.7 1 .7 2v3c0 .3.2.7.8.6A11.5 11.5 0 0 0 23.5 12C23.5 5.7 18.3.5 12 .5Z"/>
+                     </svg>
+                     Star ARE on GitHub ⭐
+                 </a>
+                 <!-- Blog Post -->
+                 <a href="https://ai.meta.com/research/publications/are-scaling-up-agent-environments-and-evaluations/" target="_blank"
+                    style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
+                           background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%);
+                           color: white; font-weight: 600; padding: 14px 28px;
+                           border-radius: 10px; text-decoration: none; font-size: 16px;
+                           box-shadow: 0 4px 12px rgba(0,0,0,0.25); transition: all 0.3s ease;
+                           min-width: 220px; text-align: center;">
+                     🧑‍🔬 Read the paper
+                 </a>
+                 <!-- Demo Button -->
+                 <a href="https://huggingface.co/spaces/meta-agents-research-environments/demo" target="_blank"
+                    style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
+                           background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%);
+                           color: white; font-weight: 600; padding: 14px 28px;
+                           border-radius: 10px; text-decoration: none; font-size: 16px;
+                           box-shadow: 0 4px 12px rgba(0,0,0,0.25); transition: all 0.3s ease;
+                           min-width: 220px; text-align: center;">
+                     🚀 Try the ARE Demo
+                 </a>
+             </div>
+             """
+         )
+
+     with gr.Column():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
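+     # Gradio fills in the gr.OAuthProfile and gr.OAuthToken arguments of add_new_eval from the login session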
+     submit_button.click(
+         add_new_eval,
+         [organisation_tbox, dataset_tbox],
+         submission_result,
+     )
+
+     refresh_button.click(
+         refresh,
+         inputs=[],
+         outputs=[leaderboard_table_val],
+     )
+
+
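+ # Restart the Space every hour so the leaderboard stays in sync with new submissions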
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=3600)
+ scheduler.start()
+ demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)
content.py ADDED
@@ -0,0 +1,66 @@
+ OWNER = "meta-agents-research-environments"
+ SUBMISSION_DATASET = f"{OWNER}/leaderboard_submissions_internal"
+ CONTACT_DATASET = f"{OWNER}/leaderboard_contact_info_internal"
+ RESULTS_DATASET = f"{OWNER}/leaderboard_results"
+ LEADERBOARD_PATH = f"{OWNER}/leaderboard"
+
+ TITLE = """
+ <div style="text-align: center; padding: 20px 0; background: linear-gradient(135deg, #1877f2 0%, #42a5f5 100%); border-radius: 15px; margin-bottom: 30px;">
+     <h1 style="color: white; font-size: 2em; margin: 0; font-weight: 700; text-shadow: 2px 2px 4px rgba(0,0,0,0.3);">
+         GAIA2 Leaderboard 🏆
+     </h1>
+ </div>
+ """
+
+ SCENARIO_LIST = [
+     "adaptability",
+     "mini_noise",
+     "time",
+     "execution",
+     "ambiguity",
+     "mini_agent2agent",
+     "search",
+ ]
+
+ MAX_PARALLELISM = 10
+
+ INTRODUCTION_TEXT = """
+ [**GAIA2**](https://huggingface.co/datasets/meta-agents-research-environments/gaia2) is a benchmark designed to measure general agent capabilities. Beyond traditional search and execution tasks, GAIA2 runs asynchronously, requiring agents to handle ambiguities and noise, adapt to dynamic environments, collaborate with other agents, and operate under temporal constraints. As of publication, no system dominates across the task spectrum: stronger reasoning often comes at the cost of efficiency and of the ability to complete time-sensitive tasks in due time.
+
+ GAIA2 evaluates agents across the following dimensions: **Execution** (instruction following, multi-step tool use), **Search** (information retrieval), **Ambiguity** (handling unclear or incomplete instructions), **Adaptability** (responding to dynamic environment changes), **Time** (managing temporal constraints and scheduling), **Noise** (operating effectively despite irrelevant information and random tool failures), and **Agent-to-Agent** (collaboration and coordination with other agents).
+
+ ⚠️ All scores on this page are self-reported. The associated traces are made available to the open-source community to enable deeper study of the tradeoffs between model behavior and performance on GAIA2.
+ """
+
+ SUBMISSION_TEXT = """
+ You can find the complete setup guide [here](https://facebookresearch.github.io/meta-agents-research-environments/user_guide/gaia2_evaluation.html), but the simplified instructions below are enough to get started.
+
+ First, install Meta Agents Research Environments in the Python environment of your choice (uv, conda, virtualenv, ...):
+ ```bash
+ pip install meta-agents-research-environments
+ ```
+
+ Then, run the benchmark for all configurations: adaptability, mini_noise, time, execution, ambiguity, mini_agent2agent, search.
+ Don't forget to upload all results to the hub with the `--hf-upload` option!
+
+ ```bash
+ are-benchmark gaia2-run \\
+     --hf meta-agents-research-environments/gaia2 \\
+     --hf-split validation \\
+     --hf-config CONFIGURATION \\
+     --model YOUR_MODEL \\
+     --provider YOUR_PROVIDER \\
+     --agent default \\
+     --max-concurrent-scenarios 2 \\
+     --scenario-timeout 300 \\
+     --output-dir ./monitored_test_results \\
+     --hf-upload YOUR_HUB_DATASET_TO_SAVE_RESULTS
+ ```
+
+ Add all the relevant information about your model in the README!
+
+ Finally, log in on this page, fill in the requested information, and provide the path to your submission dataset.
+ """
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r""""""
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ datasets
+ gradio
+ huggingface-hub
+ pandas
+ APScheduler
+ gradio_leaderboard
utils.py ADDED
@@ -0,0 +1,35 @@
+ import datetime
+ import os
+
+ from dataclasses import dataclass, field
+
+ from huggingface_hub import HfApi
+
+ TOKEN = os.environ.get("TOKEN", None)
+
+ api = HfApi()
+
+
+ @dataclass
+ class Experiment:
+     path_to_hub: str
+     organisation: str
+     model: str
+     # default_factory so the timestamp is taken at instance creation, not frozen at import time
+     cur_date: str = field(default_factory=lambda: datetime.datetime.today().strftime("%Y-%m-%d-%H-%M"))
+
+     def __str__(self):
+         return f"{self.organisation}_{self.model}_{self.cur_date}"
+
+
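+ # HTML helpers for colored status messages shown in the Gradio UI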
+ def format_error(msg):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def format_warning(msg):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def format_log(msg):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">{model_name}</a>'