cchristophe committed on
Commit fc21df8 · verified
1 parent: 7ce1e6a

Clean up and remove redundant DataFrame copies causing a storage issue
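This commit drops the per-benchmark module-level copies (e.g. `harness_datasets_leaderboard_df = harness_datasets_original_df.copy()`) and keeps a single `ALL_DATASETS` dictionary that the UI reads from directly. A minimal sketch of that pattern, with hypothetical column values (the refactored lookup helper itself is not part of the hunks shown below):

```python
import pandas as pd

# Hypothetical stand-ins for the frames returned by get_leaderboard_df().
ALL_DATASETS = {
    "datasets": pd.DataFrame({"model": ["m1", "m2"], "accuracy": [0.81, 0.74]}),
    "open_ended": pd.DataFrame({"model": ["m1", "m2"], "score": [4.2, 3.9]}),
}

def update_df(shown_columns: list, subset: str = "datasets") -> pd.DataFrame:
    # Look the frame up in the shared dictionary instead of keeping a second
    # module-level *_leaderboard_df copy alive for every subset.
    df = ALL_DATASETS[subset]
    value_cols = ["model"] + [c for c in shown_columns if c in df.columns]
    return df[value_cols]

print(update_df(["accuracy"], subset="datasets"))
```

Applied across all fifteen subsets in app.py, this keeps only one copy of each leaderboard frame in memory.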

Files changed (1)
  1. app.py +223 -1257
app.py CHANGED
@@ -1,145 +1,64 @@
1
- import subprocess
2
-
3
  import gradio as gr
4
  import pandas as pd
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
  from huggingface_hub import snapshot_download
7
  import time
8
 
9
  from src.about import (
10
- CITATION_BUTTON_LABEL,
11
- CITATION_BUTTON_TEXT,
12
- EVALUATION_QUEUE_TEXT,
13
- INTRODUCTION_TEXT,
14
- LLM_BENCHMARKS_TEXT_1,
15
- LLM_BENCHMARKS_TEXT_2,
16
- CROSS_EVALUATION_METRICS,
17
- NOTE_GENERATION_METRICS,
18
- HEALTHBENCH_METRICS,
19
- # EVALUATION_EXAMPLE_IMG,
20
- # LLM_BENCHMARKS_TEXT_2,
21
- # ENTITY_DISTRIBUTION_IMG,
22
- # LLM_BENCHMARKS_TEXT_3,
23
- TITLE,
24
- LOGO,
25
- FIVE_PILLAR_DIAGRAM
26
  )
27
  from src.display.css_html_js import custom_css
28
- # changes to be made here
29
  from src.display.utils import (
30
- DATASET_BENCHMARK_COLS,
31
- OPEN_ENDED_BENCHMARK_COLS,
32
- MED_SAFETY_BENCHMARK_COLS,
33
- MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
34
- ACI_BENCHMARK_COLS,
35
- SOAP_BENCHMARK_COLS,
36
- HEALTHBENCH_BENCHMARK_COLS,
37
- HEALTHBENCH_HARD_BENCHMARK_COLS,
38
- DATASET_COLS,
39
- OPEN_ENDED_COLS,
40
- MED_SAFETY_COLS,
41
- MEDICAL_SUMMARIZATION_COLS,
42
- ACI_COLS,
43
- SOAP_COLS,
44
- HEALTHBENCH_COLS,
45
- HEALTHBENCH_HARD_COLS,
46
- EVAL_COLS,
47
- EVAL_TYPES,
48
- NUMERIC_INTERVALS,
49
- TYPES,
50
- AutoEvalColumn,
51
- ModelType,
52
- ModelArch,
53
- PromptTemplateName,
54
- Precision,
55
- WeightType,
56
- fields,
57
- render_generation_templates,
58
- OpenEndedArabic_COLS,
59
- OpenEndedArabic_BENCHMARK_COLS,
60
- OpenEndedFrench_COLS,
61
- OpenEndedFrench_BENCHMARK_COLS,
62
- OpenEndedPortuguese_COLS,
63
- OpenEndedPortuguese_BENCHMARK_COLS,
64
- OpenEndedRomanian_COLS,
65
- OpenEndedRomanian_BENCHMARK_COLS,
66
- OpenEndedGreek_COLS,
67
- OpenEndedGreek_BENCHMARK_COLS,
68
- OpenEndedSpanish_COLS,
69
- OpenEndedSpanish_BENCHMARK_COLS,
70
- ClosedEndedMultilingual_COLS,
71
- ClosedEndedMultilingual_BENCHMARK_COLS,
72
  )
73
-
74
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, PRIVATE_REPO
75
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
76
- from src.submission.submit import add_new_eval, PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG
77
 
78
  def restart_space():
79
  API.restart_space(repo_id=REPO_ID)
80
 
81
 
82
- print(f"QUEUE_REPO: {QUEUE_REPO}")
83
- print(f"RESULTS_REPO: {RESULTS_REPO}")
84
- print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
85
- print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
86
- print(f"TOKEN: {TOKEN}")
87
-
88
- try:
89
- print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
90
- snapshot_download(
91
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
92
- )
93
- print(f"EVAL_REQUESTS_PATH downloaded")
94
- except Exception:
95
- print("An error occurred while downloading EVAL_REQUESTS_PATH. Please check the connection or the repository settings.")
96
- restart_space()
97
  try:
98
- print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
99
- snapshot_download(
100
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
101
- )
102
- print(f"EVAL_RESULTS_PATH downloaded")
103
- except Exception:
104
- print("An error occurred while downloading EVAL_RESULTS_PATH. Please check the connection or the repository settings.")
105
  restart_space()
106
 
107
- # Span based results
108
- # changes to be made here
109
-
110
  start_time = time.time()
111
 
112
  _, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
113
- harness_datasets_leaderboard_df = harness_datasets_original_df.copy()
114
- print("Closed ended English results loaded")
115
-
116
  _, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
117
- open_ended_leaderboard_df = open_ended_original_df.copy()
118
- print("Open ended English results loaded")
119
-
120
  _, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
121
- med_safety_leaderboard_df = med_safety_original_df.copy()
122
- print("Med safety results loaded")
123
-
124
  _, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
125
- medical_summarization_leaderboard_df = medical_summarization_original_df.copy()
126
- print("Medical summarization results loaded")
127
-
128
  _, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
129
- aci_leaderboard_df = aci_original_df.copy()
130
- print("ACI results loaded")
131
-
132
  _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
133
- soap_leaderboard_df = soap_original_df.copy()
134
- print("SOAP results loaded")
135
-
136
  _, healthbench_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_COLS, HEALTHBENCH_BENCHMARK_COLS, "score", "healthbench")
137
- healthbench_leaderboard_df = healthbench_original_df.copy()
138
-
139
  _, healthbench_hard_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_HARD_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, "score", "healthbench_hard")
140
- healthbench_hard_leaderboard_df = healthbench_hard_original_df.copy()
141
- print("Healthbench results loaded")
142
-
143
  _, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
144
  _, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
145
  _, open_ended_portuguese_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS, "score", "open_ended_portuguese")
@@ -148,128 +67,36 @@ _, open_ended_greek_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PAT
148
  _, open_ended_spanish_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS, "score", "open_ended_spanish")
149
  _, closed_ended_multilingual_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS, "score", "closed_ended_multilingual")
150
 
151
-
152
- open_ended_arabic_leaderboard_df = open_ended_arabic_df.copy()
153
- open_ended_french_leaderboard_df = open_ended_french_df.copy()
154
- open_ended_portuguese_leaderboard_df = open_ended_portuguese_df.copy()
155
- open_ended_romanian_leaderboard_df = open_ended_romanian_df.copy()
156
- open_ended_greek_leaderboard_df = open_ended_greek_df.copy()
157
- open_ended_spanish_leaderboard_df = open_ended_spanish_df.copy()
158
- print("Open ended multilingual results loaded")
159
-
160
- closed_ended_multilingual_leaderboard_df = closed_ended_multilingual_df.copy()
161
- print("Closed ended multilingual results loaded")
162
-
163
  end_time = time.time()
164
- total_time = end_time - start_time
165
- print(f"Total time taken to load all results: {total_time:.2f} seconds")
166
-
167
- # breakpoint()
168
- # # Token based results
169
- # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
170
- # token_based_datasets_leaderboard_df = token_based_datasets_original_df.copy()
171
-
172
- # _, token_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "clinical_types")
173
- # token_based_types_leaderboard_df = token_based_types_original_df.copy()
174
 
 
 
175
 
176
- (
177
- finished_eval_queue_df,
178
- running_eval_queue_df,
179
- pending_eval_queue_df,
180
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
181
-
182
- # breakpoint()
183
- def update_df(shown_columns, subset="datasets"):
184
- # changes to be made here
185
- if subset == "datasets":
186
- leaderboard_table_df = harness_datasets_leaderboard_df.copy()
187
- hidden_leader_board_df = harness_datasets_original_df
188
- elif subset == "open_ended":
189
- leaderboard_table_df = open_ended_leaderboard_df.copy()
190
- hidden_leader_board_df = open_ended_original_df
191
- elif subset == "med_safety":
192
- leaderboard_table_df = med_safety_leaderboard_df.copy()
193
- hidden_leader_board_df = med_safety_original_df
194
- elif subset == "medical_summarization":
195
- leaderboard_table_df = medical_summarization_leaderboard_df.copy()
196
- hidden_leader_board_df = medical_summarization_original_df
197
- elif subset == "aci":
198
- leaderboard_table_df = aci_leaderboard_df.copy()
199
- hidden_leader_board_df = aci_original_df
200
- elif subset == "soap":
201
- leaderboard_table_df = soap_leaderboard_df.copy()
202
- hidden_leader_board_df = soap_original_df
203
- elif subset == "healthbench":
204
- leaderboard_table_df = healthbench_leaderboard_df.copy()
205
- hidden_leader_board_df = healthbench_original_df
206
- elif subset == "healthbench_hard":
207
- leaderboard_table_df = healthbench_hard_leaderboard_df.copy()
208
- hidden_leader_board_df = healthbench_hard_original_df
209
- elif subset == "open_ended_arabic":
210
- leaderboard_table_df = open_ended_arabic_df.copy()
211
- hidden_leader_board_df = open_ended_arabic_df
212
- elif subset == "open_ended_french":
213
- leaderboard_table_df = open_ended_french_df.copy()
214
- hidden_leader_board_df = open_ended_french_df
215
- elif subset == "open_ended_portuguese":
216
- leaderboard_table_df = open_ended_portuguese_df.copy()
217
- hidden_leader_board_df = open_ended_portuguese_df
218
- elif subset == "open_ended_romanian":
219
- leaderboard_table_df = open_ended_romanian_df.copy()
220
- hidden_leader_board_df = open_ended_romanian_df
221
- elif subset == "open_ended_greek":
222
- leaderboard_table_df = open_ended_greek_df.copy()
223
- hidden_leader_board_df = open_ended_greek_df
224
- elif subset == "open_ended_spanish":
225
- leaderboard_table_df = open_ended_spanish_df.copy()
226
- hidden_leader_board_df = open_ended_spanish_df
227
- elif subset == "closed_ended_multilingual":
228
- leaderboard_table_df = closed_ended_multilingual_df.copy()
229
- hidden_leader_board_df = closed_ended_multilingual_df
230
-
231
-
232
- value_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns
233
- # breakpoint()
234
- return leaderboard_table_df[value_cols], hidden_leader_board_df
235
-
236
-
237
- # Searching and filtering
238
- def update_table(
239
- hidden_df: pd.DataFrame,
240
- columns: list,
241
- query: str = "",
242
- # type_query: list = None,
243
- domain_specific_query: list = None,
244
- size_query: list = None,
245
- precision_query: str = None,
246
- show_deleted: bool = False,
247
- ):
248
- # breakpoint()
249
- type_query = None
250
- filtered_df = filter_models(hidden_df, type_query, domain_specific_query, size_query, precision_query, show_deleted)
251
- # breakpoint()
252
- filtered_df = filter_queries(query, filtered_df)
253
- # breakpoint()
254
- df = select_columns(filtered_df, columns, list(hidden_df.columns))
255
- # breakpoint()
256
- return df
257
-
258
 
259
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
260
  return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
261
 
262
-
263
- def select_columns(df: pd.DataFrame, columns: list, cols:list) -> pd.DataFrame:
264
- always_here_cols = [
265
- AutoEvalColumn.model_type_symbol.name,
266
- AutoEvalColumn.model.name,
267
- ]
268
- # We use COLS to maintain sorting
269
- filtered_df = df[always_here_cols + [c for c in cols if c in df.columns and c in columns]]
270
- return filtered_df
271
-
272
-
273
  def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
274
  final_df = []
275
  if query != "":
@@ -285,8 +112,6 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
285
  filtered_df = filtered_df.drop_duplicates(
286
  subset=[
287
  AutoEvalColumn.model.name,
288
- # AutoEvalColumn.precision.name,
289
- # AutoEvalColumn.revision.name,
290
  ]
291
  )
292
 
@@ -296,11 +121,6 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
296
  def filter_models(
297
  df: pd.DataFrame, type_query: list, domain_specific_query: list, size_query: list, precision_query: list, show_deleted: bool
298
  ) -> pd.DataFrame:
299
- # Show all models
300
- # if show_deleted:
301
- # filtered_df = df
302
- # else: # Show only still on the hub models
303
- # filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
304
 
305
  filtered_df = df
306
 
@@ -315,12 +135,7 @@ def filter_models(
315
  if "Generic models" in domain_specific_query:
316
  domain_specifics.append(False)
317
  filtered_df = filtered_df.loc[df[AutoEvalColumn.is_domain_specific.name].isin(domain_specifics)]
318
-
319
- # if architecture_query is not None:
320
- # arch_types = [t for t in architecture_query]
321
- # filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(arch_types)]
322
- # # filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(architecture_query + ["None"])]
323
-
324
  if precision_query is not None:
325
  if AutoEvalColumn.precision.name in df.columns:
326
  filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
@@ -333,1075 +148,225 @@ def filter_models(
333
 
334
  return filtered_df
335
 
336
-
337
  demo = gr.Blocks(css=custom_css)
 
338
  with demo:
339
- print("hello")
340
  gr.HTML(LOGO)
341
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
342
- filter_columns_type = None
343
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
344
  with gr.TabItem("πŸ… Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
345
  with gr.Tabs(elem_classes="tab-buttons6") as language_tabs:
346
  LANGUAGES = {
347
- "πŸ‡ΊπŸ‡Έ English": "open_ended",
348
- "πŸ‡¦πŸ‡ͺ Arabic": "open_ended_arabic",
349
- "πŸ‡«πŸ‡· French": "open_ended_french",
350
- "πŸ‡ͺπŸ‡Έ Spanish": "open_ended_spanish",
351
- "πŸ‡΅πŸ‡Ή Portuguese": "open_ended_portuguese",
352
- "πŸ‡·πŸ‡΄ Romanian": "open_ended_romanian",
353
  "πŸ‡¬πŸ‡· Greek": "open_ended_greek",
354
  }
355
-
356
  for idx, (label, subset) in enumerate(LANGUAGES.items()):
357
  with gr.TabItem(label, elem_id=f"llm-benchmark-tab-open-{subset}", id=idx):
358
- # Custom judge information for each language
359
- if label == "πŸ‡ΊπŸ‡Έ English":
360
- judge_text = "**Note:** Llama 3.1 70B Instruct has been used as judge for English."
361
- else:
362
- judge_text = "**Note:** Qwen 2.5 72B Instruct has been used as judge for this language."
363
-
364
  gr.Markdown(judge_text, elem_classes="markdown-text")
365
 
366
- with gr.Row():
367
- with gr.Column():
368
- with gr.Row():
369
- search_bar = gr.Textbox(
370
- placeholder=f"πŸ” Search for your model in {label}...",
371
- show_label=False,
372
- elem_id=f"search-bar-{subset}",
373
- )
374
- with gr.Row():
375
- shown_columns = gr.CheckboxGroup(
376
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
377
- value=[
378
- c.name
379
- for c in fields(AutoEvalColumn)
380
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)
381
- ],
382
- label="Select columns to show",
383
- elem_id=f"column-select-{subset}",
384
- interactive=True,
385
- )
386
- with gr.Column(min_width=320):
387
- # filter_columns_type = gr.CheckboxGroup(
388
- # label="Model Types",
389
- # choices=[t.to_str() for t in ModelType],
390
- # value=[t.to_str() for t in ModelType],
391
- # interactive=True,
392
- # elem_id=f"filter-columns-type-{subset}",
393
- # )
394
-
395
- filter_domain_specific = gr.CheckboxGroup(
396
- label="Domain Specificity",
397
- choices=["πŸ₯ Clinical models", "Generic models"],
398
- value=["πŸ₯ Clinical models", "Generic models"],
399
- interactive=True,
400
- elem_id=f"filter-columns-domain-{subset}",
401
- )
402
- filter_columns_size = gr.CheckboxGroup(
403
- label="Model sizes (in billions of parameters)",
404
- choices=list(NUMERIC_INTERVALS.keys()),
405
- value=list(NUMERIC_INTERVALS.keys()),
406
- interactive=True,
407
- elem_id=f"filter-columns-size-{subset}",
408
- )
409
-
410
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset=subset)
411
-
412
- leaderboard_table = gr.Dataframe(
413
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
414
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
415
- datatype=TYPES,
416
- elem_id=f"leaderboard-table-{subset}",
417
- interactive=False,
418
- visible=True,
419
  )
420
-
421
- hidden_leaderboard_table_for_search = gr.Dataframe(
422
- value=datasets_original_df[OPEN_ENDED_COLS],
423
- headers=OPEN_ENDED_COLS,
424
- datatype=TYPES,
425
- visible=False,
426
- )
427
-
428
- search_bar.submit(
429
- update_table,
430
- [
431
- hidden_leaderboard_table_for_search,
432
- shown_columns,
433
- search_bar,
434
- # filter_columns_type,
435
- filter_domain_specific,
436
- filter_columns_size
437
- ],
438
- leaderboard_table,
439
- )
440
-
441
- for selector in [
442
- shown_columns,
443
- # filter_columns_type,
444
- filter_domain_specific,
445
- filter_columns_size,
446
- ]:
447
- selector.change(
448
- update_table,
449
- [
450
- hidden_leaderboard_table_for_search,
451
- shown_columns,
452
- search_bar,
453
- # filter_columns_type,
454
- filter_domain_specific,
455
- filter_columns_size
456
- ],
457
- leaderboard_table,
458
- queue=True,
459
- )
460
-
461
  with gr.Accordion("πŸ’¬ Generation templates", open=False):
462
  with gr.Accordion("Response generation", open=False):
463
  render_generation_templates(task="open_ended", generation_type="response_generation")
464
  with gr.Accordion("Scoring Rubric", open=False):
465
  render_generation_templates(task="open_ended", generation_type="scoring_rubric")
466
-
467
  with gr.TabItem("πŸ… Medical Summarization", elem_id="llm-benchmark-tab-table", id=2):
468
  gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
469
- with gr.Row():
470
- with gr.Column():
471
- with gr.Row():
472
- search_bar = gr.Textbox(
473
- placeholder=" πŸ” Search for your model (separate multiple queries with `;`) and press ENTER...",
474
- show_label=False,
475
- elem_id="search-bar",
476
- )
477
- with gr.Row():
478
- shown_columns = gr.CheckboxGroup(
479
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
480
- value=[
481
- c.name
482
- for c in fields(AutoEvalColumn)
483
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
484
- ],
485
- label="Select columns to show",
486
- elem_id="column-select",
487
- interactive=True,
488
- )
489
- # with gr.Row():
490
- # deleted_models_visibility = gr.Checkbox(
491
- # value=False, label="Show gated/private/deleted models", interactive=True
492
- # )
493
- with gr.Column(min_width=320):
494
- # with gr.Box(elem_id="box-filter"):
495
- # filter_columns_type = gr.CheckboxGroup(
496
- # label="Model Types",
497
- # choices=[t.to_str() for t in ModelType],
498
- # value=[t.to_str() for t in ModelType],
499
- # interactive=True,
500
- # elem_id="filter-columns-type",
501
- # )
502
- # filter_columns_architecture = gr.CheckboxGroup(
503
- # label="Architecture Types",
504
- # choices=[i.value.name for i in ModelArch],
505
- # value=[i.value.name for i in ModelArch],
506
- # interactive=True,
507
- # elem_id="filter-columns-architecture",
508
- # )
509
- filter_domain_specific = gr.CheckboxGroup(
510
- label="Domain Specificity",
511
- choices=["πŸ₯ Clinical models", "Generic models"],
512
- value=["πŸ₯ Clinical models", "Generic models"],
513
- interactive=True,
514
- elem_id="filter-columns-type",
515
- )
516
- filter_columns_size = gr.CheckboxGroup(
517
- label="Model sizes (in billions of parameters)",
518
- choices=list(NUMERIC_INTERVALS.keys()),
519
- value=list(NUMERIC_INTERVALS.keys()),
520
- interactive=True,
521
- elem_id="filter-columns-size",
522
- )
523
-
524
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="medical_summarization")
525
-
526
- leaderboard_table = gr.components.Dataframe(
527
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
528
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
529
- datatype=TYPES,
530
- elem_id="leaderboard-table",
531
- interactive=False,
532
- visible=True,
533
- )
534
-
535
- # Dummy leaderboard for handling the case when the user uses backspace key
536
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
537
- value=datasets_original_df[MEDICAL_SUMMARIZATION_COLS],
538
- headers=MEDICAL_SUMMARIZATION_COLS,
539
- datatype=TYPES,
540
- visible=False,
541
  )
542
-
543
-
544
- search_bar.submit(
545
- update_table,
546
- [
547
- hidden_leaderboard_table_for_search,
548
- shown_columns,
549
- search_bar,
550
- # filter_columns_type,
551
- filter_domain_specific,
552
- filter_columns_size
553
- # filter_columns_architecture
554
- ],
555
- leaderboard_table,
556
- )
557
- for selector in [
558
- shown_columns,
559
- # filter_columns_type,
560
- filter_domain_specific,
561
- filter_columns_size,
562
- # deleted_models_visibility,
563
- ]:
564
- selector.change(
565
- update_table,
566
- [
567
- hidden_leaderboard_table_for_search,
568
- shown_columns,
569
- search_bar,
570
- # filter_columns_type,
571
- filter_domain_specific,
572
- filter_columns_size
573
- ],
574
- leaderboard_table,
575
- queue=True,
576
- )
577
  with gr.Accordion("πŸ’¬ Generation templates", open=False):
578
  with gr.Accordion("Response generation", open=False):
579
- system_prompt, user_prompt = render_generation_templates(task="medical_summarization", generation_type="response_generation")
580
  with gr.Accordion("Question generation", open=False):
581
- system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
582
  with gr.Accordion("Cross Examination", open=False):
583
- system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
584
-
585
  with gr.TabItem("πŸ… Note generation", elem_id="llm-benchmark-tab-table", id=3):
586
  gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
587
- with gr.Tabs(elem_classes="tab-buttons2") as tabs:
588
- with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-table2", id=0):
589
- with gr.Row():
590
- with gr.Column():
591
- with gr.Row():
592
- search_bar = gr.Textbox(
593
- placeholder=" πŸ” Search for your model (separate multiple queries with `;`) and press ENTER...",
594
- show_label=False,
595
- elem_id="search-bar",
596
- )
597
- with gr.Row():
598
- shown_columns = gr.CheckboxGroup(
599
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
600
- value=[
601
- c.name
602
- for c in fields(AutoEvalColumn)
603
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)
604
- ],
605
- label="Select columns to show",
606
- elem_id="column-select",
607
- interactive=True,
608
- )
609
- # with gr.Row():
610
- # deleted_models_visibility = gr.Checkbox(
611
- # value=False, label="Show gated/private/deleted models", interactive=True
612
- # )
613
- with gr.Column(min_width=320):
614
- # with gr.Box(elem_id="box-filter"):
615
- # filter_columns_type = gr.CheckboxGroup(
616
- # label="Model Types",
617
- # choices=[t.to_str() for t in ModelType],
618
- # value=[t.to_str() for t in ModelType],
619
- # interactive=True,
620
- # elem_id="filter-columns-type",
621
- # )
622
- # filter_columns_architecture = gr.CheckboxGroup(
623
- # label="Architecture Types",
624
- # choices=[i.value.name for i in ModelArch],
625
- # value=[i.value.name for i in ModelArch],
626
- # interactive=True,
627
- # elem_id="filter-columns-architecture",
628
- # )
629
- filter_domain_specific = gr.CheckboxGroup(
630
- label="Domain Specificity",
631
- choices=["πŸ₯ Clinical models", "Generic models"],
632
- value=["πŸ₯ Clinical models", "Generic models"],
633
- interactive=True,
634
- elem_id="filter-columns-type",
635
- )
636
- filter_columns_size = gr.CheckboxGroup(
637
- label="Model sizes (in billions of parameters)",
638
- choices=list(NUMERIC_INTERVALS.keys()),
639
- value=list(NUMERIC_INTERVALS.keys()),
640
- interactive=True,
641
- elem_id="filter-columns-size",
642
- )
643
-
644
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="aci")
645
-
646
- leaderboard_table = gr.components.Dataframe(
647
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
648
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
649
- datatype=TYPES,
650
- elem_id="leaderboard-table",
651
- interactive=False,
652
- visible=True,
653
- )
654
-
655
- # Dummy leaderboard for handling the case when the user uses backspace key
656
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
657
- value=datasets_original_df[ACI_COLS],
658
- headers=ACI_COLS,
659
- datatype=TYPES,
660
- visible=False,
661
- )
662
-
663
-
664
- search_bar.submit(
665
- update_table,
666
- [
667
- hidden_leaderboard_table_for_search,
668
- shown_columns,
669
- search_bar,
670
- # filter_columns_type,
671
- filter_domain_specific,
672
- filter_columns_size
673
- # filter_columns_architecture
674
- ],
675
- leaderboard_table,
676
- )
677
- for selector in [
678
- shown_columns,
679
- # filter_columns_type,
680
- filter_domain_specific,
681
- filter_columns_size,
682
- # deleted_models_visibility,
683
- ]:
684
- selector.change(
685
- update_table,
686
- [
687
- hidden_leaderboard_table_for_search,
688
- shown_columns,
689
- search_bar,
690
- # filter_columns_type,
691
- filter_domain_specific,
692
- filter_columns_size
693
- ],
694
- leaderboard_table,
695
- queue=True,
696
- )
697
- with gr.TabItem("SOAP Notes", elem_id="llm-benchmark-tab-table2", id=1):
698
- with gr.Row():
699
- with gr.Column():
700
- with gr.Row():
701
- search_bar = gr.Textbox(
702
- placeholder=" πŸ” Search for your model (separate multiple queries with `;`) and press ENTER...",
703
- show_label=False,
704
- elem_id="search-bar",
705
- )
706
- with gr.Row():
707
- shown_columns = gr.CheckboxGroup(
708
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
709
- value=[
710
- c.name
711
- for c in fields(AutoEvalColumn)
712
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)
713
- ],
714
- label="Select columns to show",
715
- elem_id="column-select",
716
- interactive=True,
717
- )
718
- # with gr.Row():
719
- # deleted_models_visibility = gr.Checkbox(
720
- # value=False, label="Show gated/private/deleted models", interactive=True
721
- # )
722
- with gr.Column(min_width=320):
723
- # with gr.Box(elem_id="box-filter"):
724
- # filter_columns_type = gr.CheckboxGroup(
725
- # label="Model Types",
726
- # choices=[t.to_str() for t in ModelType],
727
- # value=[t.to_str() for t in ModelType],
728
- # interactive=True,
729
- # elem_id="filter-columns-type",
730
- # )
731
- # filter_columns_architecture = gr.CheckboxGroup(
732
- # label="Architecture Types",
733
- # choices=[i.value.name for i in ModelArch],
734
- # value=[i.value.name for i in ModelArch],
735
- # interactive=True,
736
- # elem_id="filter-columns-architecture",
737
- # )
738
- filter_domain_specific = gr.CheckboxGroup(
739
- label="Domain Specificity",
740
- choices=["πŸ₯ Clinical models", "Generic models"],
741
- value=["πŸ₯ Clinical models", "Generic models"],
742
- interactive=True,
743
- elem_id="filter-columns-type",
744
- )
745
- filter_columns_size = gr.CheckboxGroup(
746
- label="Model sizes (in billions of parameters)",
747
- choices=list(NUMERIC_INTERVALS.keys()),
748
- value=list(NUMERIC_INTERVALS.keys()),
749
- interactive=True,
750
- elem_id="filter-columns-size",
751
- )
752
-
753
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="soap")
754
-
755
- leaderboard_table = gr.components.Dataframe(
756
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
757
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
758
- datatype=TYPES,
759
- elem_id="leaderboard-table",
760
- interactive=False,
761
- visible=True,
762
- )
763
-
764
- # Dummy leaderboard for handling the case when the user uses backspace key
765
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
766
- value=datasets_original_df[SOAP_COLS],
767
- headers=SOAP_COLS,
768
- datatype=TYPES,
769
- visible=False,
770
  )
771
-
772
-
773
- search_bar.submit(
774
- update_table,
775
- [
776
- hidden_leaderboard_table_for_search,
777
- shown_columns,
778
- search_bar,
779
- # filter_columns_type,
780
- filter_domain_specific,
781
- filter_columns_size
782
- # filter_columns_architecture
783
- ],
784
- leaderboard_table,
785
  )
786
- for selector in [
787
- shown_columns,
788
- # filter_columns_type,
789
- filter_domain_specific,
790
- filter_columns_size,
791
- # deleted_models_visibility,
792
- ]:
793
- selector.change(
794
- update_table,
795
- [
796
- hidden_leaderboard_table_for_search,
797
- shown_columns,
798
- search_bar,
799
- # filter_columns_type,
800
- filter_domain_specific,
801
- filter_columns_size
802
- ],
803
- leaderboard_table,
804
- queue=True,
805
- )
806
- with gr.Accordion("πŸ’¬ Generation templates", open=False):
807
- with gr.Accordion("ACI-Bench Response generation", open=False):
808
- system_prompt, user_prompt = render_generation_templates(task="aci", generation_type="response_generation")
809
- with gr.Accordion("SOAP Notes Response generation", open=False):
810
- system_prompt, user_prompt = render_generation_templates(task="soap", generation_type="response_generation")
811
- with gr.Accordion("Question generation", open=False):
812
- system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
813
- with gr.Accordion("Cross Examination", open=False):
814
- system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
815
 
816
  with gr.TabItem("πŸ… HealthBench", elem_id="llm-benchmark-tab-table", id=4):
817
  gr.Markdown(HEALTHBENCH_METRICS, elem_classes="markdown-text")
818
- with gr.Tabs(elem_classes="tab-buttons2") as tabs:
819
- with gr.TabItem("HealthBench", elem_id="llm-benchmark-tab-table3", id=0):
820
- with gr.Row():
821
- with gr.Column():
822
- with gr.Row():
823
- search_bar = gr.Textbox(
824
- placeholder=" πŸ” Search for your model (separate multiple queries with `;`) and press ENTER...",
825
- show_label=False,
826
- elem_id="search-bar",
827
- )
828
- with gr.Row():
829
- shown_columns = gr.CheckboxGroup(
830
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)],
831
- value=[
832
- c.name
833
- for c in fields(AutoEvalColumn)
834
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)
835
- ],
836
- label="Select columns to show",
837
- elem_id="column-select",
838
- interactive=True,
839
- )
840
- # with gr.Row():
841
- # deleted_models_visibility = gr.Checkbox(
842
- # value=False, label="Show gated/private/deleted models", interactive=True
843
- # )
844
- with gr.Column(min_width=320):
845
- # with gr.Box(elem_id="box-filter"):
846
- # filter_columns_type = gr.CheckboxGroup(
847
- # label="Model Types",
848
- # choices=[t.to_str() for t in ModelType],
849
- # value=[t.to_str() for t in ModelType],
850
- # interactive=True,
851
- # elem_id="filter-columns-type",
852
- # )
853
- # filter_columns_architecture = gr.CheckboxGroup(
854
- # label="Architecture Types",
855
- # choices=[i.value.name for i in ModelArch],
856
- # value=[i.value.name for i in ModelArch],
857
- # interactive=True,
858
- # elem_id="filter-columns-architecture",
859
- # )
860
- filter_domain_specific = gr.CheckboxGroup(
861
- label="Domain Specificity",
862
- choices=["πŸ₯ Clinical models", "Generic models"],
863
- value=["πŸ₯ Clinical models", "Generic models"],
864
- interactive=True,
865
- elem_id="filter-columns-type",
866
- )
867
- filter_columns_size = gr.CheckboxGroup(
868
- label="Model sizes (in billions of parameters)",
869
- choices=list(NUMERIC_INTERVALS.keys()),
870
- value=list(NUMERIC_INTERVALS.keys()),
871
- interactive=True,
872
- elem_id="filter-columns-size",
873
- )
874
-
875
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="healthbench")
876
-
877
- leaderboard_table = gr.components.Dataframe(
878
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
879
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
880
- datatype=TYPES,
881
- elem_id="leaderboard-table",
882
- interactive=False,
883
- visible=True,
884
- )
885
-
886
- # Dummy leaderboard for handling the case when the user uses backspace key
887
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
888
- value=datasets_original_df[HEALTHBENCH_COLS],
889
- headers=HEALTHBENCH_COLS,
890
- datatype=TYPES,
891
- visible=False,
892
- )
893
-
894
-
895
- search_bar.submit(
896
- update_table,
897
- [
898
- hidden_leaderboard_table_for_search,
899
- shown_columns,
900
- search_bar,
901
- # filter_columns_type,
902
- filter_domain_specific,
903
- filter_columns_size
904
- # filter_columns_architecture
905
- ],
906
- leaderboard_table,
907
- )
908
- for selector in [
909
- shown_columns,
910
- # filter_columns_type,
911
- filter_domain_specific,
912
- filter_columns_size,
913
- # deleted_models_visibility,
914
- ]:
915
- selector.change(
916
- update_table,
917
- [
918
- hidden_leaderboard_table_for_search,
919
- shown_columns,
920
- search_bar,
921
- # filter_columns_type,
922
- filter_domain_specific,
923
- filter_columns_size
924
- ],
925
- leaderboard_table,
926
- queue=True,
927
- )
928
- with gr.TabItem("HealthBench-Hard", elem_id="llm-benchmark-tab-table3", id=1):
929
- with gr.Row():
930
- with gr.Column():
931
- with gr.Row():
932
- search_bar = gr.Textbox(
933
- placeholder=" πŸ” Search for your model (separate multiple queries with `;`) and press ENTER...",
934
- show_label=False,
935
- elem_id="search-bar",
936
- )
937
- with gr.Row():
938
- shown_columns = gr.CheckboxGroup(
939
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)],
940
- value=[
941
- c.name
942
- for c in fields(AutoEvalColumn)
943
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)
944
- ],
945
- label="Select columns to show",
946
- elem_id="column-select",
947
- interactive=True,
948
- )
949
- # with gr.Row():
950
- # deleted_models_visibility = gr.Checkbox(
951
- # value=False, label="Show gated/private/deleted models", interactive=True
952
- # )
953
- with gr.Column(min_width=320):
954
- # with gr.Box(elem_id="box-filter"):
955
- # filter_columns_type = gr.CheckboxGroup(
956
- # label="Model Types",
957
- # choices=[t.to_str() for t in ModelType],
958
- # value=[t.to_str() for t in ModelType],
959
- # interactive=True,
960
- # elem_id="filter-columns-type",
961
- # )
962
- # filter_columns_architecture = gr.CheckboxGroup(
963
- # label="Architecture Types",
964
- # choices=[i.value.name for i in ModelArch],
965
- # value=[i.value.name for i in ModelArch],
966
- # interactive=True,
967
- # elem_id="filter-columns-architecture",
968
- # )
969
- filter_domain_specific = gr.CheckboxGroup(
970
- label="Domain Specificity",
971
- choices=["πŸ₯ Clinical models", "Generic models"],
972
- value=["πŸ₯ Clinical models", "Generic models"],
973
- interactive=True,
974
- elem_id="filter-columns-type",
975
- )
976
- filter_columns_size = gr.CheckboxGroup(
977
- label="Model sizes (in billions of parameters)",
978
- choices=list(NUMERIC_INTERVALS.keys()),
979
- value=list(NUMERIC_INTERVALS.keys()),
980
- interactive=True,
981
- elem_id="filter-columns-size",
982
- )
983
-
984
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="healthbench_hard")
985
-
986
- leaderboard_table = gr.components.Dataframe(
987
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
988
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
989
- datatype=TYPES,
990
- elem_id="leaderboard-table",
991
- interactive=False,
992
- visible=True,
993
- )
994
-
995
- # Dummy leaderboard for handling the case when the user uses backspace key
996
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
997
- value=datasets_original_df[HEALTHBENCH_HARD_COLS],
998
- headers=HEALTHBENCH_HARD_COLS,
999
- datatype=TYPES,
1000
- visible=False,
1001
  )
1002
-
1003
-
1004
- search_bar.submit(
1005
- update_table,
1006
- [
1007
- hidden_leaderboard_table_for_search,
1008
- shown_columns,
1009
- search_bar,
1010
- # filter_columns_type,
1011
- filter_domain_specific,
1012
- filter_columns_size
1013
- # filter_columns_architecture
1014
- ],
1015
- leaderboard_table,
1016
  )
1017
- for selector in [
1018
- shown_columns,
1019
- # filter_columns_type,
1020
- filter_domain_specific,
1021
- filter_columns_size,
1022
- # deleted_models_visibility,
1023
- ]:
1024
- selector.change(
1025
- update_table,
1026
- [
1027
- hidden_leaderboard_table_for_search,
1028
- shown_columns,
1029
- search_bar,
1030
- # filter_columns_type,
1031
- filter_domain_specific,
1032
- filter_columns_size
1033
- ],
1034
- leaderboard_table,
1035
- queue=True,
1036
- )
1037
 
1038
  with gr.TabItem("πŸ… Med Safety", elem_id="llm-benchmark-tab-table", id=5):
1039
- with gr.Row():
1040
- with gr.Column():
1041
- with gr.Row():
1042
- search_bar = gr.Textbox(
1043
- placeholder=" πŸ” Search for your model (separate multiple queries with `;`) and press ENTER...",
1044
- show_label=False,
1045
- elem_id="search-bar",
1046
- )
1047
- with gr.Row():
1048
- shown_columns = gr.CheckboxGroup(
1049
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
1050
- value=[
1051
- c.name
1052
- for c in fields(AutoEvalColumn)
1053
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)
1054
- ],
1055
- label="Select columns to show",
1056
- elem_id="column-select",
1057
- interactive=True,
1058
- )
1059
- # with gr.Row():
1060
- # deleted_models_visibility = gr.Checkbox(
1061
- # value=False, label="Show gated/private/deleted models", interactive=True
1062
- # )
1063
- with gr.Column(min_width=320):
1064
- # with gr.Box(elem_id="box-filter"):
1065
- # filter_columns_type = gr.CheckboxGroup(
1066
- # label="Model Types",
1067
- # choices=[t.to_str() for t in ModelType],
1068
- # value=[t.to_str() for t in ModelType],
1069
- # interactive=True,
1070
- # elem_id="filter-columns-type",
1071
- # )
1072
- # filter_columns_architecture = gr.CheckboxGroup(
1073
- # label="Architecture Types",
1074
- # choices=[i.value.name for i in ModelArch],
1075
- # value=[i.value.name for i in ModelArch],
1076
- # interactive=True,
1077
- # elem_id="filter-columns-architecture",
1078
- # )
1079
- filter_domain_specific = gr.CheckboxGroup(
1080
- label="Domain Specificity",
1081
- choices=["πŸ₯ Clinical models", "Generic models"],
1082
- value=["πŸ₯ Clinical models", "Generic models"],
1083
- interactive=True,
1084
- elem_id="filter-columns-type",
1085
- )
1086
- filter_columns_size = gr.CheckboxGroup(
1087
- label="Model sizes (in billions of parameters)",
1088
- choices=list(NUMERIC_INTERVALS.keys()),
1089
- value=list(NUMERIC_INTERVALS.keys()),
1090
- interactive=True,
1091
- elem_id="filter-columns-size",
1092
- )
1093
-
1094
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="med_safety")
1095
-
1096
- leaderboard_table = gr.components.Dataframe(
1097
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1098
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1099
- datatype=TYPES,
1100
- elem_id="leaderboard-table",
1101
- interactive=False,
1102
- visible=True,
1103
  )
1104
-
1105
- # Dummy leaderboard for handling the case when the user uses backspace key
1106
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
1107
- value=datasets_original_df[MED_SAFETY_COLS],
1108
- headers=MED_SAFETY_COLS,
1109
- datatype=TYPES,
1110
- visible=False,
1111
- )
1112
-
1113
-
1114
- search_bar.submit(
1115
- update_table,
1116
- [
1117
- hidden_leaderboard_table_for_search,
1118
- shown_columns,
1119
- search_bar,
1120
- # filter_columns_type,
1121
- filter_domain_specific,
1122
- filter_columns_size
1123
- # filter_columns_architecture
1124
- ],
1125
- leaderboard_table,
1126
- )
1127
- for selector in [
1128
- shown_columns,
1129
- # filter_columns_type,
1130
- filter_domain_specific,
1131
- filter_columns_size,
1132
- # deleted_models_visibility,
1133
- ]:
1134
- selector.change(
1135
- update_table,
1136
- [
1137
- hidden_leaderboard_table_for_search,
1138
- shown_columns,
1139
- search_bar,
1140
- # filter_columns_type,
1141
- filter_domain_specific,
1142
- filter_columns_size
1143
- ],
1144
- leaderboard_table,
1145
- queue=True,
1146
- )
1147
  with gr.Accordion("πŸ’¬ Generation templates", open=False):
1148
  with gr.Accordion("Response generation", open=False):
1149
- system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="response_generation")
1150
  with gr.Accordion("Scoring Rubric", open=False):
1151
- system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="scoring_rubric")
1152
-
1153
  with gr.TabItem("πŸ… Closed Ended Evaluation", elem_id="llm-benchmark-tab-closed", id=6):
1154
- with gr.Tabs(elem_classes="tab-buttons2") as closed_tabs:
1155
- # ENGLISH TAB
1156
- with gr.TabItem("English", elem_id="llm-benchmark-tab-closed-english", id=0):
1157
- with gr.Row():
1158
- with gr.Column():
1159
- with gr.Row():
1160
- search_bar = gr.Textbox(
1161
- placeholder=" πŸ” Search for your model (separate multiple queries with `;`) and press ENTER...",
1162
- show_label=False,
1163
- elem_id="search-bar-closed-english",
1164
- )
1165
- with gr.Row():
1166
- shown_columns = gr.CheckboxGroup(
1167
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
1168
- value=[
1169
- c.name
1170
- for c in fields(AutoEvalColumn)
1171
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
1172
- ],
1173
- label="Select columns to show",
1174
- elem_id="column-select-closed-english",
1175
- interactive=True,
1176
- )
1177
- with gr.Column(min_width=320):
1178
- # filter_columns_type = gr.CheckboxGroup(
1179
- # label="Model Types",
1180
- # choices=[t.to_str() for t in ModelType],
1181
- # value=[t.to_str() for t in ModelType],
1182
- # interactive=True,
1183
- # elem_id="filter-columns-type-closed-english",
1184
- # )
1185
- filter_domain_specific = gr.CheckboxGroup(
1186
- label="Domain Specificity",
1187
- choices=["πŸ₯ Clinical models", "Generic models"],
1188
- value=["πŸ₯ Clinical models", "Generic models"],
1189
- interactive=True,
1190
- elem_id="filter-domain-specific-closed-english",
1191
- )
1192
- filter_columns_size = gr.CheckboxGroup(
1193
- label="Model sizes (in billions of parameters)",
1194
- choices=list(NUMERIC_INTERVALS.keys()),
1195
- value=list(NUMERIC_INTERVALS.keys()),
1196
- interactive=True,
1197
- elem_id="filter-columns-size-closed-english",
1198
- )
1199
-
1200
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
1201
- leaderboard_table = gr.components.Dataframe(
1202
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1203
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1204
- datatype=TYPES,
1205
- elem_id="leaderboard-table-english",
1206
- interactive=False,
1207
- visible=True,
1208
- )
1209
-
1210
- # Dummy leaderboard for handling the case when the user uses backspace key
1211
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
1212
- value=datasets_original_df[DATASET_COLS],
1213
- headers=DATASET_COLS,
1214
- datatype=TYPES,
1215
- visible=False,
1216
  )
1217
-
1218
- search_bar.submit(
1219
- update_table,
1220
- [
1221
- hidden_leaderboard_table_for_search,
1222
- shown_columns,
1223
- search_bar,
1224
- # filter_columns_type,
1225
- filter_domain_specific,
1226
- filter_columns_size
1227
- ],
1228
- leaderboard_table,
1229
  )
1230
-
1231
- for selector in [
1232
- shown_columns,
1233
- # filter_columns_type,
1234
- filter_domain_specific,
1235
- filter_columns_size,
1236
- ]:
1237
- selector.change(
1238
- update_table,
1239
- [
1240
- hidden_leaderboard_table_for_search,
1241
- shown_columns,
1242
- search_bar,
1243
- # filter_columns_type,
1244
- filter_domain_specific,
1245
- filter_columns_size
1246
- ],
1247
- leaderboard_table,
1248
- queue=True,
1249
- )
1250
-
1251
- #MULTILINGUAL TAB - Same level as English tab
1252
- with gr.TabItem("🌍 Multilingual", elem_id="llm-benchmark-tab-table9", id=1):
1253
- with gr.Row():
1254
- gr.Markdown("πŸ“Š **Dataset Information:** This tab uses the Global MMLU dataset filtering only the subcategory: medical (10.7%)")
1255
-
1256
- with gr.Row():
1257
- with gr.Column():
1258
- with gr.Row():
1259
- search_bar = gr.Textbox(
1260
- placeholder=" πŸ” Search for your model (separate multiple queries with `;`) and press ENTER...",
1261
- show_label=False,
1262
- elem_id="search-bar",
1263
- )
1264
-
1265
- with gr.Row():
1266
- shown_columns = gr.CheckboxGroup(
1267
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)],
1268
- value=[
1269
- c.name
1270
- for c in fields(AutoEvalColumn)
1271
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)
1272
- ],
1273
- label="Select columns to show",
1274
- elem_id="column-select",
1275
- interactive=True,
1276
- )
1277
- with gr.Column(min_width=320):
1278
- # with gr.Box(elem_id="box-filter"):
1279
- # filter_columns_type = gr.CheckboxGroup(
1280
- # label="Model Types",
1281
- # choices=[t.to_str() for t in ModelType],
1282
- # value=[t.to_str() for t in ModelType],
1283
- # interactive=True,
1284
- # elem_id="filter-columns-type",
1285
- # )
1286
- filter_domain_specific = gr.CheckboxGroup(
1287
- label="Domain Specificity",
1288
- choices=["πŸ₯ Clinical models", "Generic models"],
1289
- value=["πŸ₯ Clinical models", "Generic models"],
1290
- interactive=True,
1291
- elem_id="filter-columns-type",
1292
- )
1293
- filter_columns_size = gr.CheckboxGroup(
1294
- label="Model sizes (in billions of parameters)",
1295
- choices=list(NUMERIC_INTERVALS.keys()),
1296
- value=list(NUMERIC_INTERVALS.keys()),
1297
- interactive=True,
1298
- elem_id="filter-columns-size",
1299
- )
1300
 
1301
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="closed_ended_multilingual")
1302
- leaderboard_table = gr.components.Dataframe(
1303
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1304
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1305
- datatype=TYPES,
1306
- elem_id="leaderboard-table",
1307
- interactive=False,
1308
- visible=True,
1309
- )
1310
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
1311
- value=datasets_original_df[ClosedEndedMultilingual_COLS],
1312
- headers=ClosedEndedMultilingual_COLS,
1313
- datatype=TYPES,
1314
- visible=False,
1315
- )
1316
-
1317
- search_bar.submit(
1318
- update_table,
1319
- [
1320
- hidden_leaderboard_table_for_search,
1321
- shown_columns,
1322
- search_bar,
1323
- # filter_columns_type,
1324
- filter_domain_specific,
1325
- filter_columns_size
1326
- # filter_columns_architecture
1327
- ],
1328
- leaderboard_table,
1329
- )
1330
- for selector in [
1331
- shown_columns,
1332
- # filter_columns_type,
1333
- filter_domain_specific,
1334
- # filter_columns_architecture,
1335
- filter_columns_size,
1336
- # deleted_models_visibility,
1337
- ]:
1338
- selector.change(
1339
- update_table,
1340
- [
1341
- hidden_leaderboard_table_for_search,
1342
- shown_columns,
1343
- search_bar,
1344
- # filter_columns_type,
1345
- filter_domain_specific,
1346
- filter_columns_size
1347
- # filter_columns_architecture,
1348
- ],
1349
- leaderboard_table,
1350
- queue=True,
1351
- )
1352
-
1353
  with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=7):
1354
  gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
1355
  gr.HTML(FIVE_PILLAR_DIAGRAM)
1356
  gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
1357
- # gr.HTML(EVALUATION_EXAMPLE_IMG, elem_classes="logo")
1358
- # gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
1359
- # gr.HTML(ENTITY_DISTRIBUTION_IMG, elem_classes="logo")
1360
- # gr.Markdown(LLM_BENCHMARKS_TEXT_3, elem_classes="markdown-text")
1361
 
1362
  with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=8):
 
1363
  with gr.Column():
1364
- with gr.Row():
1365
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
1366
-
1367
- with gr.Column():
1368
- with gr.Accordion(
1369
- f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
1370
- open=False,
1371
- ):
1372
- with gr.Row():
1373
- finished_eval_table = gr.components.Dataframe(
1374
- value=finished_eval_queue_df,
1375
- headers=EVAL_COLS,
1376
- datatype=EVAL_TYPES,
1377
- row_count=5,
1378
- )
1379
- with gr.Accordion(
1380
- f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
1381
- open=False,
1382
- ):
1383
- with gr.Row():
1384
- running_eval_table = gr.components.Dataframe(
1385
- value=running_eval_queue_df,
1386
- headers=EVAL_COLS,
1387
- datatype=EVAL_TYPES,
1388
- row_count=5,
1389
- )
1390
-
1391
- with gr.Accordion(
1392
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
1393
- open=False,
1394
- ):
1395
- with gr.Row():
1396
- pending_eval_table = gr.components.Dataframe(
1397
- value=pending_eval_queue_df,
1398
- headers=EVAL_COLS,
1399
- datatype=EVAL_TYPES,
1400
- row_count=5,
1401
- )
1402
  with gr.Row():
1403
  gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")
1404
-
1405
  with gr.Row():
1406
  with gr.Column():
1407
  model_name_textbox = gr.Textbox(label="Model name")
@@ -1459,10 +424,9 @@ with demo:
1459
  submission_result,
1460
  )
1461
 
1462
-
1463
  with gr.Row():
1464
  with gr.Accordion("πŸ“™ Citation", open=False):
1465
- citation_button = gr.Textbox(
1466
  value=CITATION_BUTTON_TEXT,
1467
  label=CITATION_BUTTON_LABEL,
1468
  lines=20,
@@ -1470,7 +434,9 @@ with demo:
1470
  show_copy_button=True,
1471
  )
1472
 
 
1473
  scheduler = BackgroundScheduler()
1474
- scheduler.add_job(restart_space, "interval", seconds=1800)
1475
  scheduler.start()
 
1476
  demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'], share=True , ssr_mode=False)
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
  import time
6
+ import functools
7
+ import gc
8
+
9
+ import os
10
 
11
  from src.about import (
12
+ CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT,
13
+ LLM_BENCHMARKS_TEXT_1, LLM_BENCHMARKS_TEXT_2, CROSS_EVALUATION_METRICS,
14
+ NOTE_GENERATION_METRICS, HEALTHBENCH_METRICS, TITLE, LOGO, FIVE_PILLAR_DIAGRAM
 
 
15
  )
16
  from src.display.css_html_js import custom_css
 
17
  from src.display.utils import (
18
+ DATASET_BENCHMARK_COLS, OPEN_ENDED_BENCHMARK_COLS, MED_SAFETY_BENCHMARK_COLS,
19
+ MEDICAL_SUMMARIZATION_BENCHMARK_COLS, ACI_BENCHMARK_COLS, SOAP_BENCHMARK_COLS,
20
+ HEALTHBENCH_BENCHMARK_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, DATASET_COLS,
21
+ OPEN_ENDED_COLS, MED_SAFETY_COLS, MEDICAL_SUMMARIZATION_COLS, ACI_COLS, SOAP_COLS,
22
+ HEALTHBENCH_COLS, HEALTHBENCH_HARD_COLS, EVAL_COLS, EVAL_TYPES, NUMERIC_INTERVALS,
23
+ TYPES, AutoEvalColumn, ModelType, Precision, WeightType, fields, render_generation_templates,
24
+ OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, OpenEndedFrench_COLS,
25
+ OpenEndedFrench_BENCHMARK_COLS, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS,
26
+ OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, OpenEndedGreek_COLS,
27
+ OpenEndedGreek_BENCHMARK_COLS, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS,
28
+ ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS,
 
 
 
 
29
  )
30
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 
31
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
32
+ from src.submission.submit import add_new_eval
33
+
34
+ # =====================================================================================
35
+ # 1. SETUP AND DATA LOADING
36
+ # =====================================================================================
37
 
38
  def restart_space():
39
  API.restart_space(repo_id=REPO_ID)
40
 
41
 
42
+ print("Downloading evaluation data...")
 
 
43
  try:
44
+ snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
45
+ snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=TOKEN)
46
+ print("Downloads complete.")
47
+ except Exception as e:
48
+ print(f"An error occurred during download: {e}")
 
 
49
  restart_space()

+ print("Loading all dataframes into a central dictionary...")
  start_time = time.time()

  _, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
  _, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
  _, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
  _, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
  _, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
  _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
  _, healthbench_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_COLS, HEALTHBENCH_BENCHMARK_COLS, "score", "healthbench")
  _, healthbench_hard_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_HARD_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, "score", "healthbench_hard")
  _, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
  _, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
  _, open_ended_portuguese_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS, "score", "open_ended_portuguese")
  _, open_ended_romanian_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, "score", "open_ended_romanian")
  _, open_ended_greek_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedGreek_COLS, OpenEndedGreek_BENCHMARK_COLS, "score", "open_ended_greek")
  _, open_ended_spanish_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS, "score", "open_ended_spanish")
  _, closed_ended_multilingual_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS, "score", "closed_ended_multilingual")

+ ALL_DATASETS = {
+     "datasets": harness_datasets_original_df,
+     "open_ended": open_ended_original_df,
+     "med_safety": med_safety_original_df,
+     "medical_summarization": medical_summarization_original_df,
+     "aci": aci_original_df,
+     "soap": soap_original_df,
+     "healthbench": healthbench_original_df,
+     "healthbench_hard": healthbench_hard_original_df,
+     "open_ended_arabic": open_ended_arabic_df,
+     "open_ended_french": open_ended_french_df,
+     "open_ended_portuguese": open_ended_portuguese_df,
+     "open_ended_romanian": open_ended_romanian_df,
+     "open_ended_greek": open_ended_greek_df,
+     "open_ended_spanish": open_ended_spanish_df,
+     "closed_ended_multilingual": closed_ended_multilingual_df,
+ }
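# Note (illustrative, not part of this commit): the tabs below look up their subset in this
# single registry instead of holding per-tab copies of the dataframes, which is the
# "df copy causing storage issue" the commit message refers to, e.g.:
#     subset_df = ALL_DATASETS["open_ended"]   # a reference to the loaded frame, not a copy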
  end_time = time.time()
+ print(f"Dataframes loaded in {end_time - start_time:.2f} seconds.")

+ # Evaluation Queue DataFrames
+ (finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

+ # =====================================================================================
+ # 2. EFFICIENT FILTERING LOGIC
+ # =====================================================================================

  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
      return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]


  def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
      final_df = []
      if query != "":

      filtered_df = filtered_df.drop_duplicates(
          subset=[
              AutoEvalColumn.model.name,
          ]
      )


  def filter_models(
      df: pd.DataFrame, type_query: list, domain_specific_query: list, size_query: list, precision_query: list, show_deleted: bool
  ) -> pd.DataFrame:

      filtered_df = df

      if "Generic models" in domain_specific_query:
          domain_specifics.append(False)
      filtered_df = filtered_df.loc[df[AutoEvalColumn.is_domain_specific.name].isin(domain_specifics)]
+
      if precision_query is not None:
          if AutoEvalColumn.precision.name in df.columns:
              filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]

      return filtered_df

+ def get_filtered_table(
+     shown_columns: list,
+     query: str,
+     domain_specific_query: list,
+     size_query: list,
+     *,  # force subset_name to be a keyword-only argument
+     subset_name: str
+ ):
+     original_df = ALL_DATASETS[subset_name]
+
+     type_query = None
+     filtered_df = filter_models(original_df, type_query, domain_specific_query, size_query, None, False)
+     filtered_df = filter_queries(query, filtered_df)
+
+     always_here_cols = [AutoEvalColumn.model.name]
+     available_cols = [c for c in shown_columns if c in filtered_df.columns]
+     final_df = filtered_df[always_here_cols + available_cols]
+
+     del filtered_df
+     gc.collect()
+
+     return final_df
+
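# Illustrative usage sketch (not part of this commit), assuming `functools` and `gc` are
# imported near the top of app.py (outside this hunk):
#     english_view = functools.partial(get_filtered_table, subset_name="open_ended")
#     df = english_view(shown_columns=[], query="llama",
#                       domain_specific_query=["Generic models"],
#                       size_query=list(NUMERIC_INTERVALS.keys()))
# Only the selected columns of the shared frame are returned; the temporary filtered
# frame is deleted and garbage-collected to keep memory flat between requests.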
+ # =====================================================================================
+ # 3. REUSABLE UI CREATION FUNCTION
+ # =====================================================================================
+
+ def create_leaderboard_ui(subset_name: str, column_choices: list, default_columns: list):
+     """Creates a full leaderboard UI block for a given subset."""
+     with gr.Row():
+         with gr.Column():
+             with gr.Row():
+                 search_bar = gr.Textbox(
+                     placeholder=f"πŸ” Search for models...",
+                     show_label=False,
+                     elem_id=f"search-bar-{subset_name}",
+                 )
+             with gr.Row():
+                 shown_columns = gr.CheckboxGroup(
+                     choices=column_choices,
+                     value=default_columns,
+                     label="Select columns to show",
+                     elem_id=f"column-select-{subset_name}",
+                     interactive=True,
+                 )
+         with gr.Column(min_width=320):
+             filter_domain_specific = gr.CheckboxGroup(
+                 label="Domain Specificity",
+                 choices=["πŸ₯ Clinical models", "Generic models"],
+                 value=["πŸ₯ Clinical models", "Generic models"],
+                 interactive=True,
+                 elem_id=f"filter-domain-{subset_name}",
+             )
+             filter_columns_size = gr.CheckboxGroup(
+                 label="Model sizes (in billions of parameters)",
+                 choices=list(NUMERIC_INTERVALS.keys()),
+                 value=list(NUMERIC_INTERVALS.keys()),
+                 interactive=True,
+                 elem_id=f"filter-size-{subset_name}",
+             )
+
+     update_fn = functools.partial(get_filtered_table, subset_name=subset_name)
+
+     initial_df = update_fn(
+         shown_columns=default_columns,
+         query="",
+         domain_specific_query=["πŸ₯ Clinical models", "Generic models"],
+         size_query=list(NUMERIC_INTERVALS.keys())
+     )
+
+     leaderboard_table = gr.Dataframe(
+         value=initial_df,
+         headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + default_columns,
+         datatype=TYPES,
+         elem_id=f"leaderboard-table-{subset_name}",
+         interactive=False,
+     )
+
+     inputs = [shown_columns, search_bar, filter_domain_specific, filter_columns_size]
+
+     # Attach listeners to all input components
+     for component in inputs:
+         if isinstance(component, gr.Textbox):
+             component.submit(update_fn, inputs, leaderboard_table)
+         else:
+             component.change(update_fn, inputs, leaderboard_table)
+
+     return leaderboard_table
+
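# Design note (editorial, not part of this commit): binding `subset_name` with functools.partial
# means every search or checkbox event simply re-runs get_filtered_table against the shared
# ALL_DATASETS entry and replaces the gr.Dataframe value, so no per-tab dataframe copies
# have to be kept alive between interactions.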
+ # =====================================================================================
+ # 4. GRADIO DEMO UI (Main application layout)
+ # =====================================================================================
+
  demo = gr.Blocks(css=custom_css)
+
  with demo:
      gr.HTML(LOGO)
      gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
      with gr.Tabs(elem_classes="tab-buttons") as tabs:
          with gr.TabItem("πŸ… Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
              with gr.Tabs(elem_classes="tab-buttons6") as language_tabs:
                  LANGUAGES = {
+                     "πŸ‡ΊπŸ‡Έ English": "open_ended", "πŸ‡¦πŸ‡ͺ Arabic": "open_ended_arabic",
+                     "πŸ‡«πŸ‡· French": "open_ended_french", "πŸ‡ͺπŸ‡Έ Spanish": "open_ended_spanish",
+                     "πŸ‡΅πŸ‡Ή Portuguese": "open_ended_portuguese", "πŸ‡·πŸ‡΄ Romanian": "open_ended_romanian",
                      "πŸ‡¬πŸ‡· Greek": "open_ended_greek",
                  }
                  for idx, (label, subset) in enumerate(LANGUAGES.items()):
                      with gr.TabItem(label, elem_id=f"llm-benchmark-tab-open-{subset}", id=idx):
+                         judge_text = "**Note:** Llama 3.1 70B Instruct has been used as judge for English." if label == "πŸ‡ΊπŸ‡Έ English" else "**Note:** Qwen 2.5 72B Instruct has been used as judge for this language."
                          gr.Markdown(judge_text, elem_classes="markdown-text")

+                         create_leaderboard_ui(
+                             subset_name=subset,
+                             column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
+                             default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)]
                          )
                          with gr.Accordion("πŸ’¬ Generation templates", open=False):
                              with gr.Accordion("Response generation", open=False):
                                  render_generation_templates(task="open_ended", generation_type="response_generation")
                              with gr.Accordion("Scoring Rubric", open=False):
                                  render_generation_templates(task="open_ended", generation_type="scoring_rubric")
+
  with gr.TabItem("πŸ… Medical Summarization", elem_id="llm-benchmark-tab-table", id=2):
277
  gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
278
+ create_leaderboard_ui(
279
+ subset_name="medical_summarization",
280
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
281
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  with gr.Accordion("πŸ’¬ Generation templates", open=False):
284
  with gr.Accordion("Response generation", open=False):
285
+ render_generation_templates(task="medical_summarization", generation_type="response_generation")
286
  with gr.Accordion("Question generation", open=False):
287
+ render_generation_templates(task="ce", generation_type="question_generation")
288
  with gr.Accordion("Cross Examination", open=False):
289
+ render_generation_templates(task="ce", generation_type="cross_examination")
290
+
          with gr.TabItem("πŸ… Note generation", elem_id="llm-benchmark-tab-table", id=3):
              gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
+             with gr.Tabs(elem_classes="tab-buttons2"):
+                 with gr.TabItem("ACI Bench", id=0):
+                     create_leaderboard_ui(
+                         subset_name="aci",
+                         column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
+                         default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)]
                      )
+                 with gr.TabItem("SOAP Notes", id=1):
+                     create_leaderboard_ui(
+                         subset_name="soap",
+                         column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
+                         default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)]
                      )
+                     # Add accordions for this section if needed, similar to other tabs

          with gr.TabItem("πŸ… HealthBench", elem_id="llm-benchmark-tab-table", id=4):
              gr.Markdown(HEALTHBENCH_METRICS, elem_classes="markdown-text")
+             with gr.Tabs(elem_classes="tab-buttons2"):
+                 with gr.TabItem("HealthBench", id=0):
+                     create_leaderboard_ui(
+                         subset_name="healthbench",
+                         column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)],
+                         default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)]
                      )
+                 with gr.TabItem("HealthBench-Hard", id=1):
+                     create_leaderboard_ui(
+                         subset_name="healthbench_hard",
+                         column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)],
+                         default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)]
                      )

          with gr.TabItem("πŸ… Med Safety", elem_id="llm-benchmark-tab-table", id=5):
+             create_leaderboard_ui(
+                 subset_name="med_safety",
+                 column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
+                 default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)]
              )
              with gr.Accordion("πŸ’¬ Generation templates", open=False):
                  with gr.Accordion("Response generation", open=False):
+                     render_generation_templates(task="med_safety", generation_type="response_generation")
                  with gr.Accordion("Scoring Rubric", open=False):
+                     render_generation_templates(task="med_safety", generation_type="scoring_rubric")
+
  with gr.TabItem("πŸ… Closed Ended Evaluation", elem_id="llm-benchmark-tab-closed", id=6):
337
+ with gr.Tabs(elem_classes="tab-buttons2"):
338
+ with gr.TabItem("English", id=0):
339
+ create_leaderboard_ui(
340
+ subset_name="datasets",
341
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
342
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  )
344
+ with gr.TabItem("🌍 Multilingual", id=1):
345
+ gr.Markdown("πŸ“Š **Dataset Information:** This tab uses the Global MMLU dataset filtering only the subcategory: medical (10.7%)")
346
+ create_leaderboard_ui(
347
+ subset_name="closed_ended_multilingual",
348
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)],
349
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)]
 
 
 
 
 
 
350
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
          with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=7):
              gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
              gr.HTML(FIVE_PILLAR_DIAGRAM)
              gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")

          with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=8):
+
              with gr.Column():
+                 gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                 with gr.Accordion(f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
+                     gr.Dataframe(value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
+                 with gr.Accordion(f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
+                     gr.Dataframe(value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
+                 with gr.Accordion(f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
+                     gr.Dataframe(value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
+
              with gr.Row():
                  gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")

              with gr.Row():
                  with gr.Column():
                      model_name_textbox = gr.Textbox(label="Model name")

                      submission_result,
                  )

      with gr.Row():
          with gr.Accordion("πŸ“™ Citation", open=False):
+             gr.Textbox(
                  value=CITATION_BUTTON_TEXT,
                  label=CITATION_BUTTON_LABEL,
                  lines=20,

                  show_copy_button=True,
              )

+
  scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=86400)
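# Note (editorial): 86400 seconds is 24 hours, so this job restarts the Space once per day.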
  scheduler.start()
+
  demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'], share=True, ssr_mode=False)