Clémentine committed · Commit 294422e · 1 parent: 388bfbd

added plots back

Files changed:
- app.py (+21, -21)
- src/display/utils.py (+1, -0)
- src/populate.py (+3, -3)
- src/tools/plots.py (+5, -9)
app.py
CHANGED

@@ -135,9 +135,9 @@ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queu
 
 
 # Data processing for plots now only on demand in the respective Gradio tab
-
-
-
+def load_and_create_plots():
+    plot_df = create_plot_df(create_scores_df(leaderboard_df))
+    return plot_df
 
 def init_leaderboard(dataframe):
     return Leaderboard(
@@ -182,24 +182,24 @@ with demo:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(leaderboard_df)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
+            with gr.Row():
+                with gr.Column():
+                    plot_df = load_and_create_plots()
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        [AutoEvalColumn.average.name],
+                        title="Average of Top Scores and Human Baseline Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
+                with gr.Column():
+                    plot_df = load_and_create_plots()
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        BENCHMARK_COLS,
+                        title="Top Scores and Human Baseline Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
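The commit reintroduces the plots tab: load_and_create_plots() wraps the dataframe construction and is called where the tab is declared, and the resulting charts are handed to gr.Plot. Below is a minimal, self-contained sketch of the same layout, assuming only gradio, pandas and plotly express are installed; the toy dataframe and the px.line figure stand in for create_plot_df, create_scores_df and create_metric_plot_obj, which are not shown in this diff.

```python
# Hedged sketch: toy data and plotly express replace the leaderboard's own plot helpers.
import gradio as gr
import pandas as pd
import plotly.express as px


def load_and_create_plots() -> pd.DataFrame:
    # Stand-in for create_plot_df(create_scores_df(leaderboard_df)).
    return pd.DataFrame(
        {
            "date": ["2024-01-01", "2024-02-01", "2024-03-01"],
            "score": [61.2, 64.8, 67.5],
            "task": ["average"] * 3,
        }
    )


with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("📈 Metrics through time"):
            with gr.Row():
                with gr.Column():
                    plot_df = load_and_create_plots()
                    chart = px.line(
                        plot_df, x="date", y="score", color="task",
                        title="Top scores over time (toy data)",
                    )
                    gr.Plot(value=chart, min_width=500)

if __name__ == "__main__":
    demo.launch()
```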
src/display/utils.py
CHANGED

@@ -93,6 +93,7 @@ auto_eval_column_dict.append(
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
+auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
 
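The one-line addition registers a hidden "date" column alongside the other hidden flags, which is what lets src/tools/plots.py read row[AutoEvalColumn.date.name] below. The sketch that follows shows one common way such a registry becomes that accessor; the ColumnContent definition and the make_dataclass call here are assumptions for illustration, not necessarily the repo's exact code.

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    # Assumed field layout; the real class is defined elsewhere in src/display/utils.py.
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False


auto_eval_column_dict = []
auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])

# Each [attr_name, type, default_instance] entry becomes a field of a generated dataclass,
# which is what allows other modules to write AutoEvalColumn.date.name.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.date.name)    # -> "date"
print(AutoEvalColumn.date.hidden)  # -> True
```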
src/populate.py
CHANGED

@@ -43,10 +43,10 @@ def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols:
     """Retrieve and process leaderboard data."""
     all_data_json = leaderboard_dataset.to_dict()
     num_items = leaderboard_dataset.num_rows
-
-    filter_models_flags(
+    all_data_json_list = [{k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)]
+    filter_models_flags(all_data_json_list)
 
-    df = pd.DataFrame.from_records(
+    df = pd.DataFrame.from_records(all_data_json_list)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
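Dataset.to_dict() returns a column-oriented mapping ({column_name: [values, ...]}), while filter_models_flags and pd.DataFrame.from_records expect one dict per row; the added comprehension pivots from the former to the latter. A toy illustration of the same transform (column names and values invented):

```python
import pandas as pd

# Shape of leaderboard_dataset.to_dict(): one list of values per column (toy data).
all_data_json = {
    "fullname": ["org/model-a", "org/model-b"],
    "average": [71.3, 64.9],
}
num_items = 2  # leaderboard_dataset.num_rows

# Pivot to one dict per row, as in the added line of the diff.
all_data_json_list = [
    {k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)
]
# -> [{"fullname": "org/model-a", "average": 71.3},
#     {"fullname": "org/model-b", "average": 64.9}]

df = pd.DataFrame.from_records(all_data_json_list)
print(df)
```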
src/tools/plots.py
CHANGED

@@ -28,22 +28,18 @@ def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
         last_date = ""
         column = task.col_name
         for _, row in results_df.iterrows():
-            current_model = row[
+            current_model = row[AutoEvalColumn.fullname.name]
             # We ignore models that are flagged/no longer on the hub/not finished
             to_ignore = (
-                not row[
-                or not row[
+                not row[AutoEvalColumn.still_on_hub.name]
+                or not row[AutoEvalColumn.not_flagged.name]
                 or current_model in FLAGGED_MODELS
-                or row["status"] != "FINISHED"
             )
             if to_ignore:
                 continue
 
-            current_date = row[
-
-                current_score = np.mean(list(row["results"].values()))
-            else:
-                current_score = row["results"][task.benchmark]
+            current_date = row[AutoEvalColumn.date.name]
+            current_score = row[task.col_name]
 
             if current_score > current_max:
                 if current_date == last_date and len(scores[column]) > 0: