Commit · 1d6adda
Parent(s): 319b0b7

Added graphs tab

Added graphs tab to show the progress of all models over time against human baselines
app.py CHANGED
@@ -16,6 +16,13 @@ from src.assets.text_content import (
     LLM_BENCHMARKS_TEXT,
     TITLE,
 )
+from src.display_models.plot_results import (
+    create_metric_plot_obj,
+    create_scores_df,
+    create_plot_df,
+    join_model_info_with_results,
+    HUMAN_BASELINES,
+)
 from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
 from src.display_models.utils import (
     AutoEvalColumn,
@@ -97,6 +104,7 @@ else:
 
 original_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
 models = original_df["model_name_for_query"].tolist()  # needed for model backlinks in their to the leaderboard
+plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
 
 to_be_dumped = f"models = {repr(models)}\n"
 
@@ -349,7 +357,6 @@ with demo:
                         interactive=True,
                         elem_id="filter-columns-size",
                     )
-
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
                     [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
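
The helpers imported in the first hunk (create_metric_plot_obj, create_scores_df, create_plot_df, join_model_info_with_results, HUMAN_BASELINES) live in src/display_models/plot_results.py, which is not part of this diff; the second hunk chains three of them into plot_df. The chained .properties(title=...) call in the graphs-tab hunk below suggests the helper returns an Altair chart, which gr.Plot then renders. The sketch below is a hypothetical stand-in for such a helper, not the leaderboard's actual implementation: the function name make_metric_plot, the long-format columns (date, benchmark, score) and the baseline value are all assumptions.

# Hypothetical sketch only: the real helpers live in src/display_models/plot_results.py,
# which this diff does not show. Column names and the baseline value are assumptions.
import altair as alt
import pandas as pd

HUMAN_BASELINES = {"MMLU": 89.8}  # illustrative entry, not the leaderboard's actual table

def make_metric_plot(plot_df: pd.DataFrame, metrics: list, baselines: dict) -> alt.LayerChart:
    # One line per selected benchmark: the best score reached so far, over time.
    data = plot_df[plot_df["benchmark"].isin(metrics)]
    lines = (
        alt.Chart(data)
        .mark_line(point=True)
        .encode(x="date:T", y="score:Q", color="benchmark:N")
    )
    # Dashed horizontal rules mark the human baseline for each selected benchmark.
    baseline_df = pd.DataFrame(
        [{"benchmark": m, "score": s} for m, s in baselines.items() if m in metrics]
    )
    rules = (
        alt.Chart(baseline_df)
        .mark_rule(strokeDash=[4, 4])
        .encode(y="score:Q", color="benchmark:N")
    )
    # Layered chart; callers can chain .properties(title=...) on the result.
    return lines + rules

In the app, the resulting chart object is passed straight to gr.Plot, as the graphs-tab hunk below shows.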
@@ -466,6 +473,19 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
+        with gr.TabItem("📈 Benchmark Graphs", elem_id="llm-benchmark-tab-table", id=4):
+            with gr.Row():
+                with gr.Column():
+                    chart = create_metric_plot_obj(plot_df, ["Average ⬆️"], HUMAN_BASELINES).properties(
+                        title="Average of Top Scores and Human Baseline Over Time"
+                    )
+                    gr.Plot(value=chart, interactive=False, width=500, height=500)
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], HUMAN_BASELINES
+                    ).properties(title="Top Scores and Human Baseline Over Time")
+                    gr.Plot(value=chart, interactive=False, width=500, height=500)
+
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
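
The hunk above nests the two charts in a tab / row / column layout and hands them to gr.Plot with interactivity disabled. A self-contained sketch of the same layout pattern follows; a throwaway matplotlib figure stands in for the charts that create_metric_plot_obj builds, and the labels and numbers are illustrative only.

# Standalone sketch of the layout pattern used in the hunk above: one tab,
# one row, two columns, each column holding a gr.Plot. The figure below is a
# placeholder, not the leaderboard's actual chart.
import gradio as gr
import matplotlib.pyplot as plt

def dummy_figure(title: str):
    # Minimal placeholder chart: scores climbing over a few evaluation dates.
    fig, ax = plt.subplots()
    ax.plot(["2023-06", "2023-07", "2023-08"], [55.0, 60.5, 64.2], marker="o")
    ax.axhline(89.8, linestyle="--", label="human baseline (illustrative)")
    ax.set_title(title)
    ax.set_ylabel("score")
    ax.legend()
    return fig

demo = gr.Blocks()
with demo:
    with gr.Tabs():
        with gr.TabItem("Benchmark Graphs", id=0):
            with gr.Row():
                with gr.Column():
                    gr.Plot(value=dummy_figure("Average of Top Scores Over Time"))
                with gr.Column():
                    gr.Plot(value=dummy_figure("Per-Benchmark Top Scores Over Time"))

if __name__ == "__main__":
    demo.launch()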
@@ -588,4 +608,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(concurrency_count=40).launch()
+demo.queue(concurrency_count=40).launch()
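
The last hunk removes and re-adds an identical demo.queue(concurrency_count=40).launch() line, which usually reflects a trailing-newline change rather than a behavioural one. It sits next to the scheduler that restarts the Space every 30 minutes, presumably so freshly evaluated results are reloaded. Below is a minimal sketch of that serve-and-restart pattern, assuming the usual APScheduler plus huggingface_hub setup; restart_space is defined elsewhere in app.py, so the repo id and token handling here are placeholders.

# Sketch of the restart/serve pattern the last hunk touches. The restart_space
# function is not part of this diff; the repo id and token below are placeholders.
import os

import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

api = HfApi()

def restart_space():
    # Restart the Space so newly pushed evaluation results get reloaded.
    api.restart_space(repo_id="org/space-name", token=os.environ.get("HF_TOKEN"))

demo = gr.Blocks()
with demo:
    gr.Markdown("placeholder UI")

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)  # every 30 minutes
scheduler.start()
demo.queue(concurrency_count=40).launch()  # concurrency_count as in the diff (Gradio 3.x API)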