Pierre Andrews committed
Commit d97ec7b · 0 Parent(s)

Initial commit


> Co-authored-by: Romain Froger <[email protected]>
> Co-authored-by: Pierre Andrews <[email protected]>
> Co-authored-by: Clémentine Fourrier <[email protected]>

Files changed (6)
  1. .gitattributes +35 -0
  2. README.md +16 -0
  3. app.py +566 -0
  4. content.py +66 -0
  5. requirements.txt +6 -0
  6. utils.py +35 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,16 @@
+ ---
+ title: Gaia 2 Agents Evaluation Leaderboard
+ emoji: 🐠
+ colorFrom: red
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 5.46.0
+ app_file: app.py
+ pinned: false
+ hf_oauth: true
+ hf_oauth_scopes:
+   - email
+   - read-repos
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,566 @@
+ import datetime
+ import json
+ import os
+ from pathlib import Path
+
+ import datasets
+
+ import gradio as gr
+ import pandas as pd
+ import requests
+ from apscheduler.schedulers.background import BackgroundScheduler
+
+ # InfoStrings
+ from content import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     CONTACT_DATASET,
+     INTRODUCTION_TEXT,
+     LEADERBOARD_PATH,
+     OWNER,
+     RESULTS_DATASET,
+     SCENARIO_LIST,
+     SUBMISSION_TEXT,
+     TITLE,
+ )
+ from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
+
+ from huggingface_hub import create_repo, snapshot_download, upload_folder
+
+ from utils import api, Experiment, format_log, model_hyperlink, TOKEN
+
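+ # Contact information for every submitter, loaded once at startup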
+ contact_infos = datasets.load_dataset(
+     CONTACT_DATASET, token=TOKEN, verification_mode=datasets.VerificationMode.NO_CHECKS
+ )  # download_mode="force_redownload"
+
+
+ def get_display_name(capability: str) -> str:
+     """
+     Convert internal capability names to user-friendly display names.
+
+     Args:
+         capability: Internal capability name from the benchmark
+
+     Returns:
+         User-friendly display name for the leaderboard
+     """
+     if "noise" in capability:
+         return "noise"
+     elif "agent2agent" in capability or "a2a" in capability:
+         return "A2A"
+     else:
+         return capability
+
+
+ def cleanup(row) -> dict:
+     """
+     Transform raw evaluation data into a clean format for the leaderboard display.
+
+     Args:
+         row: Raw evaluation result row from the dataset
+
+     Returns:
+         Dictionary with cleaned and formatted data for leaderboard display
+     """
+     result = {}
+
+     # Basic model information
+     result["Model"] = row["metadata.model"]
+     result["Provider"] = row["metadata.model_provider"]
+     result["Total score (%)"] = round(row["statistics.global.macro_success_rate"], 1)
+
+     # Define the order of capability columns for consistent display
+     scenario_order = [
+         "execution",
+         "search",
+         "ambiguity",
+         "adaptability",
+         "time",
+         "mini_noise",
+         "mini_agent2agent",
+     ]
+
+     # Process each capability score with aligned formatting
+     for capability in scenario_order:
+         if capability in SCENARIO_LIST:
+             display_name = get_display_name(capability)
+
+             # Extract score and standard error
+             score = row[f"statistics.per_capability.{capability}.success_rate"]
+             sem = row[f"statistics.per_capability.{capability}.success_rate_sem"]
+
+             # Format with decimal alignment using non-breaking spaces
+             score_str = f"{score:4.1f}".replace(" ", "\u00A0")
+             sem_str = f"{sem:.1f}"  # No width formatting for SEM to avoid extra spaces
+
+             result[f"{display_name} (%)"] = f"{score_str} ± {sem_str}"
+
+     # Add metadata fields
+     result["Number of runs"] = (
+         row["statistics.global.total_runs"] / row["statistics.global.total_scenarios"]
+         if row["statistics.global.total_scenarios"] != 0
+         else 0
+     )
+     result["Submitter"] = row["metadata.organisation"]
+     result["Submission date"] = row["metadata.timestamp"][:10]
+
+     return result
+
+
+ def get_dataframe_from_results() -> pd.DataFrame:
+     """
+     Load and process evaluation results from the dataset to create a leaderboard DataFrame.
+
+     Retrieves raw evaluation data, processes it through the cleanup function,
+     and returns a sorted DataFrame ready for leaderboard display.
+
+     Returns:
+         Pandas DataFrame with processed leaderboard data, sorted by total score
+         Returns empty DataFrame if no data is available
+     """
+     split = "train"
+
+     # Load evaluation results dataset
+     try:
+         eval_results = datasets.load_dataset(
+             RESULTS_DATASET,
+             token=TOKEN,
+             verification_mode=datasets.VerificationMode.NO_CHECKS,
+         )
+     except datasets.data_files.EmptyDatasetError:
+         eval_results = datasets.DatasetDict()
+
+     # Return empty DataFrame if no data available
+     if not eval_results or split not in eval_results or len(eval_results[split]) == 0:
+         return pd.DataFrame([])
+
+     results = eval_results[split]
+     local_df = results.flatten()
+
+     # Define columns to extract from the raw data
+     metadata_columns = [
+         "metadata.model",
+         "metadata.model_provider",
+         "metadata.organisation",
+         "metadata.timestamp",
+         "metadata.url",
+     ]
+
+     global_stats_columns = [
+         "statistics.global.macro_success_rate",
+         "statistics.global.total_runs",
+         "statistics.global.total_scenarios",
+     ]
+
+     # Add per-capability statistics columns
+     capability_columns = []
+     for capability in SCENARIO_LIST:
+         capability_columns.extend(
+             [
+                 f"statistics.per_capability.{capability}.success_rate",
+                 f"statistics.per_capability.{capability}.success_rate_sem",
+             ]
+         )
+
+     # Combine all required columns
+     columns = metadata_columns + global_stats_columns + capability_columns
+
+     # Process the data: select columns, clean up, and remove original columns
+     local_df = local_df.select_columns(columns)
+     mapped_df = local_df.map(cleanup, batched=False)
+     mapped_df = mapped_df.remove_columns(columns)
+
+     # Convert to pandas DataFrame and sort by total score (highest first)
+     df = pd.DataFrame(mapped_df)
+     df = df.sort_values(by=["Total score (%)"], ascending=False)
+
+     return df
+
+
+ # At the moment, only one result set is displayed
+ eval_dataframe_val = get_dataframe_from_results()
+
+
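+ # Restart this Space so it reloads the latest results from the hub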
+ def restart_space():
+     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
+
+
+ def add_new_eval(
+     organisation: str,
+     path_to_repository: str,
+     profile: gr.OAuthProfile,
+     token: gr.OAuthToken,
+ ):
+     # ---- USER CHECKS ----
+     # Was the profile created less than two months ago?
+     user_data = requests.get(
+         f"https://huggingface.co/api/users/{profile.username}/overview"
+     )
+     creation_date = json.loads(user_data.content)["createdAt"]
+     if datetime.datetime.now() - datetime.datetime.strptime(
+         creation_date, "%Y-%m-%dT%H:%M:%S.%fZ"
+     ) < datetime.timedelta(days=60):
+         raise Exception("This account is not authorized to submit on GAIA2.")
+
+     # Users can't submit several times per day
+     contact_infos = datasets.load_dataset(
+         CONTACT_DATASET,
+         token=TOKEN,
+         verification_mode=datasets.VerificationMode.NO_CHECKS,
+     )
+     user_submission_dates = sorted(
+         row["date"]
+         for row in contact_infos["train"]
+         if row["username"] == profile.username
+     )
+     # Dates are stored as "%Y-%m-%d-%H-%M", so compare on the day prefix only
+     if user_submission_dates and user_submission_dates[-1][:10] == datetime.datetime.today().strftime("%Y-%m-%d"):
+         raise Exception("You have already submitted once today; please try again tomorrow.")
+
+     # ---- EXPERIMENT MANAGEMENT ----
+     # Download locally with HF hub
+     snapshot_path = snapshot_download(
+         repo_id=path_to_repository, token=token.token, repo_type="dataset"
+     )
+
+     # Test completeness with datasets
+     try:
+         for scenario in SCENARIO_LIST:
+             # Loading what the user provided
+             datasets.load_dataset(
+                 snapshot_path,
+                 scenario,
+                 split="test",
+                 verification_mode=datasets.VerificationMode.NO_CHECKS,
+             )
+     except Exception as e:
+         print(e)
+         raise ValueError(
+ f"We cannot load the scenario {scenario} for your dataset ({path_to_repository}). Please make sure the dataset is accessible and all subsets are there."
+         )
+
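+     # Read the precomputed statistics shipped with the submission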
+     with open(Path(snapshot_path, "computed_stats.json")) as f:
+         results = json.load(f)
+         model = results["metadata"]["model"]
+         results["metadata"]["organisation"] = organisation
+         results["metadata"]["url"] = path_to_repository
+
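+     # Append the new entry to the public results dataset, rejecting exact duplicates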
+     try:
+         ds = datasets.load_dataset(RESULTS_DATASET, split="train")
+     except datasets.data_files.EmptyDatasetError:
+         ds = datasets.Dataset.from_dict({})
+
+     if results in ds:
+ raise Exception("This precise model and results file was already submitted")
+     ds = ds.add_item(results)
+     ds.push_to_hub(RESULTS_DATASET, split="train", private=True)
+
+     experiment = Experiment(path_to_repository, organisation, model)
+
+     # Save copy to hub
+     create_repo(
+         repo_id=f"{OWNER}/{str(experiment)}",
+         repo_type="dataset",
+         token=TOKEN,
+         private=True,
+     )
+     upload_folder(
+         folder_path=snapshot_path,
+         repo_id=f"{OWNER}/{str(experiment)}",
+         repo_type="dataset",
+         token=TOKEN,
+     )
+
+     print(f"Adding new eval: {str(experiment)}")
+
+     # SAVE ALL INFO
+     contact_info = {
+         "model": experiment.model,
+         "path_to_hub": experiment.path_to_hub,
+         "path_to_hub_private_copy": f"{OWNER}/{str(experiment)}",
+         "organisation": experiment.organisation,
+         "date": experiment.cur_date,
+         "username": profile.username,
+         "mail": getattr(profile, "email", None),
+     }
+ contact_infos["test"] = contact_infos["test"].add_item(contact_info)
+     contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN)
+
+     return format_log(
+ f"Model {model} submitted by {organisation} successfully.\nPlease wait a couple minutes and refresh the leaderboard to see your score displayed."
+     )
+
+
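+ # Rebuild the leaderboard dataframe on demand (wired to the refresh button below)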
+ def refresh():
+     return get_dataframe_from_results()
+
+
+ # Custom CSS for sleek styling
+ custom_css = """
+ <style>
+ /* Global styling */
+ .gradio-container {
+     max-width: 1400px !important;
+     margin: auto;
+     padding: 20px;
+     background: linear-gradient(135deg, #f8fbff 0%, #e3f2fd 100%);
+     min-height: auto !important; /* override HF default */
+     padding-bottom: 0 !important; /* remove extra bottom padding */
+ }
+
+ html, body, #root {
+     margin: 0;
+     padding: 0;
+     height: auto !important; /* don't lock to viewport height */
+     min-height: 100%;
+     overflow-x: hidden !important;
+     overflow-y: auto !important; /* ensure vertical scroll is possible */
+     box-sizing: border-box;
+ }
+
+ /* Markdown text styling */
+ .markdown-text {
+     background: white;
+     padding: 25px;
+     border-radius: 12px;
+     box-shadow: 0 4px 20px rgba(0,0,0,0.08);
+     margin: 20px 0;
+     border-left: 4px solid #0081FB;
+     font-size: 16px;
+     line-height: 1.6;
+ }
+
+ /* Button styling */
+ .gr-button {
+     background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%) !important;
+     border: none !important;
+     border-radius: 8px !important;
+     color: white !important;
+     font-weight: 600 !important;
+     padding: 12px 24px !important;
+     transition: all 0.3s ease !important;
+     box-shadow: 0 4px 15px rgba(0, 129, 251, 0.3) !important;
+ }
+
+ .gr-button:hover {
+     transform: translateY(-2px) !important;
+     box-shadow: 0 8px 25px rgba(0, 129, 251, 0.4) !important;
+ }
+
+ /* Input fields styling */
+ .gr-textbox {
+     border-radius: 8px !important;
+     border: 2px solid #e1e5e9 !important;
+     background: white !important;
+     transition: all 0.3s ease !important;
+ }
+
+ .gr-textbox:focus {
+     border-color: #667eea !important;
+     box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
+ }
+
+ /* Accordion styling */
+ .gr-accordion {
+     background: white !important;
+     border-radius: 12px !important;
+     box-shadow: 0 4px 20px rgba(0,0,0,0.08) !important;
+     border: none !important;
+     margin: 15px 0 !important;
+ }
+
+ /* Leaderboard styling */
+ .leaderboard-container {
+     background: white !important;
+     border-radius: 15px !important;
+     box-shadow: 0 8px 32px rgba(0,0,0,0.1) !important;
+     overflow: hidden !important;
+     margin: 25px 0 !important;
+     border: none !important;
+ }
+
+ /* Remove any default Gradio gray backgrounds */
+ .gradio-container .gr-column,
+ .gradio-container .gr-row {
+     background: transparent !important;
+ }
+
+ /* Ensure leaderboard table has clean white background */
+ .leaderboard-container table,
+ .leaderboard-container .gr-table {
+     background: white !important;
+     border: none !important;
+ }
+
+ /* Submission form styling */
+ .submission-section {
+     background: white;
+     padding: 30px;
+     border-radius: 15px;
+     box-shadow: 0 6px 25px rgba(0,0,0,0.08);
+     margin: 25px 0;
+ }
+ </style>
+ """
+
+ demo = gr.Blocks(
+     # css=custom_css,
+     theme=gr.themes.Soft(
+         font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"], primary_hue="blue"
+     ),
+ )
+ with demo:
+     gr.HTML(TITLE)
+
+     with gr.Accordion("About", open=True):
+         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     # Enhanced leaderboard with custom styling
+     with gr.Column(elem_classes="leaderboard-container"):
+         # gr.HTML(
+         #     """
+         # <div style="padding: 20px 20px 0 20px;">
+         #     <h2 style="margin: 0; font-weight: 700; font-size: 1.8em;">
+         #         🏆 GAIA2 Leaderboard Rankings
+         #     </h2>
+         #     <p style="margin: 10px 0 20px 0; color: #666; font-size: 16px;">
+         #         Click on column headers to sort • Use filters to narrow results
+         #     </p>
+         # </div>
+         # """
+         # )
+
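+         # Main leaderboard table: users can toggle columns, search, and filter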
+         leaderboard_table_val = Leaderboard(
+             value=eval_dataframe_val,
+             select_columns=SelectColumns(
+                 default_selection=[
+                     "Model",
+                     "Provider",
+                     "Total score (%)",
+                     "execution (%)",
+                     "search (%)",
+                     "ambiguity (%)",
+                     "adaptability (%)",
+                     "time (%)",
+                     "noise (%)",
+                     "A2A (%)",
+                     "Submission date",
+                 ],
+                 cant_deselect=[
+                     "Model",
+                     "Provider",
+                     "Total score (%)",
+                     "Submission date",
+                 ]
+             ),
+             search_columns=["Model", "Provider", "Submitter"],
+             filter_columns=[
+                 "Provider",
+                 ColumnFilter("Model", type="dropdown", label="🔍 Select Model"),
+             ],
+         )
+
+     # Enhanced submission section
+     with gr.Column(elem_classes="submission-section"):
+         gr.HTML(
+             """
+             <h2 style="margin: 0 0 20px 0; font-weight: 700; font-size: 1.8em;">
+                 🚀 Submit Your Model
+             </h2>
+             """
+         )
+
+         with gr.Accordion("📋 How to submit", open=True):
+             gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
+
+         with gr.Row(equal_height=True):
+             with gr.Column(scale=1):
+                 gr.LoginButton(size="lg")
+             with gr.Column(scale=2):
+                 organisation_tbox = gr.Textbox(
+                     label="🏢 Organization",
+                     placeholder="Enter your organization name",
+                     container=True,
+                 )
+             with gr.Column(scale=3):
+                 dataset_tbox = gr.Textbox(
+                     label="📊 Hub Dataset Path",
+                     placeholder="username/dataset-name",
+                     container=True,
+                 )
+             with gr.Column(scale=1):
+                 submit_button = gr.Button("Submit", variant="primary", size="lg")
+             with gr.Column(scale=1):
+                 refresh_button = gr.Button("🔄 Refresh the display", variant="secondary", size="lg")
+
+         submission_result = gr.Markdown()
+
+
+     with gr.Column():
+         gr.HTML(
+             """
+             <div style="text-align: center; margin: 20px 0; display: flex; justify-content: center; gap: 50px; flex-wrap: wrap;">
+                 <!-- GitHub Button -->
+                 <a href="https://github.com/facebookresearch/meta-agents-research-environments" target="_blank"
+                    style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
+                           background: linear-gradient(135deg, #24292e 0%, #000000 100%);
+                           color: white; font-weight: 600; padding: 14px 28px;
+                           border-radius: 10px; text-decoration: none; font-size: 16px;
+                           box-shadow: 0 4px 12px rgba(0,0,0,0.3); transition: all 0.3s ease;
+                           min-width: 220px; text-align: center;">
+                     <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="white" viewBox="0 0 24 24">
+                         <path d="M12 .5C5.7.5.5 5.7.5 12c0 5.1 3.3 9.4 7.9 10.9.6.1.8-.2.8-.6v-2.1c-3.2.7-3.9-1.4-3.9-1.4-.5-1.2-1.2-1.6-1.2-1.6-1-.7.1-.7.1-.7 1.1.1 1.7 1.1 1.7 1.1 1 .1.8 1.4 2.9 1.9.3-.8.6-1.3.6-1.3-2.6-.3-5.3-1.3-5.3-5.8 0-1.3.5-2.4 1.1-3.3 0-.3-.5-1.6.1-3.2 0 0 1-.3 3.3 1.2a11.5 11.5 0 0 1 6 0c2.3-1.5 3.3-1.2 3.3-1.2.6 1.6.1 2.9.1 3.2.7.9 1.1 2 1.1 3.3 0 4.5-2.7 5.5-5.3 5.8.4.3.7 1 .7 2v3c0 .3.2.7.8.6A11.5 11.5 0 0 0 23.5 12C23.5 5.7 18.3.5 12 .5Z"/>
+                     </svg>
+                     Star ARE on GitHub ⭐
+                 </a>
+                 <!-- Blog Post -->
+                 <a href="https://ai.meta.com/research/publications/are-scaling-up-agent-environments-and-evaluations/" target="_blank"
+                    style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
+                           background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%);
+                           color: white; font-weight: 600; padding: 14px 28px;
+                           border-radius: 10px; text-decoration: none; font-size: 16px;
+                           box-shadow: 0 4px 12px rgba(0,0,0,0.25); transition: all 0.3s ease;
+                           min-width: 220px; text-align: center;">
+                     🧑‍🔬 Read the paper
+                 </a>
+                 <!-- Demo Button -->
+                 <a href="https://huggingface.co/spaces/meta-agents-research-environments/demo" target="_blank"
+                    style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
+                           background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%);
+                           color: white; font-weight: 600; padding: 14px 28px;
+                           border-radius: 10px; text-decoration: none; font-size: 16px;
+                           box-shadow: 0 4px 12px rgba(0,0,0,0.25); transition: all 0.3s ease;
+                           min-width: 220px; text-align: center;">
+                     🚀 Try the ARE Demo
+                 </a>
+             </div>
+             """
+         )
+
+     with gr.Column():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
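+     # Gradio fills in the gr.OAuthProfile and gr.OAuthToken arguments of add_new_eval from the login session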
+     submit_button.click(
+         add_new_eval,
+         [organisation_tbox, dataset_tbox],
+         submission_result,
+     )
+
+     refresh_button.click(
+         refresh,
+         inputs=[],
+         outputs=[leaderboard_table_val],
+     )
+
+
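+ # Restart the Space every hour so the leaderboard stays in sync with new submissions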
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=3600)
+ scheduler.start()
+ demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)
content.py ADDED
@@ -0,0 +1,66 @@
+ OWNER = "meta-agents-research-environments"
+ SUBMISSION_DATASET = f"{OWNER}/leaderboard_submissions_internal"
+ CONTACT_DATASET = f"{OWNER}/leaderboard_contact_info_internal"
+ RESULTS_DATASET = f"{OWNER}/leaderboard_results"
+ LEADERBOARD_PATH = f"{OWNER}/leaderboard"
+
+ TITLE = """
+ <div style="text-align: center; padding: 20px 0; background: linear-gradient(135deg, #1877f2 0%, #42a5f5 100%); border-radius: 15px; margin-bottom: 30px;">
+     <h1 style="color: white; font-size: 2em; margin: 0; font-weight: 700; text-shadow: 2px 2px 4px rgba(0,0,0,0.3);">
+         GAIA2 Leaderboard 🏆
+     </h1>
+ </div>
+ """
+
+ SCENARIO_LIST = [
+     "adaptability",
+     "mini_noise",
+     "time",
+     "execution",
+     "ambiguity",
+     "mini_agent2agent",
+     "search",
+ ]
+
+ MAX_PARALLELISM = 10
+
+ INTRODUCTION_TEXT = """
+ [**GAIA2**](https://huggingface.co/datasets/meta-agents-research-environments/gaia2) is a benchmark designed to measure general agent capabilities. Beyond traditional search and execution tasks, GAIA2 runs asynchronously, requiring agents to handle ambiguities and noise, adapt to dynamic environments, collaborate with other agents, and operate under temporal constraints. As of publication, no system dominates across the task spectrum: stronger reasoning often comes at the cost of efficiency and of the ability to complete time-sensitive tasks in due time.
+
+ GAIA2 evaluates agents across the following dimensions: **Execution** (instruction following, multi-step tool use), **Search** (information retrieval), **Ambiguity** (handling unclear or incomplete instructions), **Adaptability** (responding to dynamic environment changes), **Time** (managing temporal constraints and scheduling), **Noise** (operating effectively despite irrelevant information and random tool failures), and **Agent-to-Agent** (collaboration and coordination with other agents).
+
+ ⚠️ All scores on this page are self-reported. The associated traces are made available to the open-source community to enable deeper study of the tradeoffs between model behavior and performance on GAIA2.
+ """
+
+ SUBMISSION_TEXT = """
+ You can find the complete setup guide [here](https://facebookresearch.github.io/meta-agents-research-environments/user_guide/gaia2_evaluation.html), but the simplified instructions below are enough to get started.
+
+ First, install Meta Agents Research Environments in the Python environment of your choice (uv, conda, virtualenv, ...):
+ ```bash
+ pip install meta-agents-research-environments
+ ```
+
+ Then, run the benchmark for all configurations: adaptability, mini_noise, time, execution, ambiguity, mini_agent2agent, search.
+ Don't forget to upload all results to the hub with the `--hf-upload` option!
+
+ ```bash
+ are-benchmark gaia2-run \\
+     --hf meta-agents-research-environments/gaia2 \\
+     --hf-split validation \\
+     --hf-config CONFIGURATION \\
+     --model YOUR_MODEL \\
+     --provider YOUR_PROVIDER \\
+     --agent default \\
+     --max-concurrent-scenarios 2 \\
+     --scenario-timeout 300 \\
+     --output-dir ./monitored_test_results \\
+     --hf-upload YOUR_HUB_DATASET_TO_SAVE_RESULTS
+ ```
+
+ Add all the relevant information about your model in the README!
+
+ Finally, log in on this page, fill in the requested information, and provide the path to your submission dataset.
+ """
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r""""""
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ datasets
+ gradio
+ huggingface-hub
+ pandas
+ APScheduler
+ gradio_leaderboard
utils.py ADDED
@@ -0,0 +1,35 @@
+ import datetime
+ import os
+
+ from dataclasses import dataclass, field
+
+ from huggingface_hub import HfApi
+
+ TOKEN = os.environ.get("TOKEN", None)
+
+ api = HfApi()
+
+
+ @dataclass
+ class Experiment:
+     path_to_hub: str
+     organisation: str
+     model: str
+     # default_factory so the timestamp is taken at instance creation, not frozen at import time
+     cur_date: str = field(default_factory=lambda: datetime.datetime.today().strftime("%Y-%m-%d-%H-%M"))
+
+     def __str__(self):
+         return f"{self.organisation}_{self.model}_{self.cur_date}"
+
+
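+ # HTML helpers for colored status messages shown in the Gradio UI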
+ def format_error(msg):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def format_warning(msg):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def format_log(msg):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">{model_name}</a>'