Clémentine committed · Commit 294422e · 1 parent: 388bfbd

added plots back

Files changed:
- app.py (+21, -21)
- src/display/utils.py (+1, -0)
- src/populate.py (+3, -3)
- src/tools/plots.py (+5, -9)
app.py
CHANGED

@@ -135,9 +135,9 @@ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queu
 
 
 # Data processing for plots now only on demand in the respective Gradio tab
-
-
-
+def load_and_create_plots():
+    plot_df = create_plot_df(create_scores_df(leaderboard_df))
+    return plot_df
 
 def init_leaderboard(dataframe):
     return Leaderboard(
@@ -182,24 +182,24 @@ with demo:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(leaderboard_df)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
+            with gr.Row():
+                with gr.Column():
+                    plot_df = load_and_create_plots()
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        [AutoEvalColumn.average.name],
+                        title="Average of Top Scores and Human Baseline Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
+                with gr.Column():
+                    plot_df = load_and_create_plots()
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        BENCHMARK_COLS,
+                        title="Top Scores and Human Baseline Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
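The commit reintroduces the plots tab: load_and_create_plots() wraps the dataframe construction and is called where the tab is declared, and the resulting charts are handed to gr.Plot. Below is a minimal, self-contained sketch of the same layout, assuming only gradio, pandas and plotly express are installed; the toy dataframe and the px.line figure stand in for create_plot_df, create_scores_df and create_metric_plot_obj, which are not shown in this diff.

```python
# Hedged sketch: toy data and plotly express replace the leaderboard's own plot helpers.
import gradio as gr
import pandas as pd
import plotly.express as px


def load_and_create_plots() -> pd.DataFrame:
    # Stand-in for create_plot_df(create_scores_df(leaderboard_df)).
    return pd.DataFrame(
        {
            "date": ["2024-01-01", "2024-02-01", "2024-03-01"],
            "score": [61.2, 64.8, 67.5],
            "task": ["average"] * 3,
        }
    )


with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("📈 Metrics through time"):
            with gr.Row():
                with gr.Column():
                    plot_df = load_and_create_plots()
                    chart = px.line(
                        plot_df, x="date", y="score", color="task",
                        title="Top scores over time (toy data)",
                    )
                    gr.Plot(value=chart, min_width=500)

if __name__ == "__main__":
    demo.launch()
```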
src/display/utils.py
CHANGED

@@ -93,6 +93,7 @@ auto_eval_column_dict.append(
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
+auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
 
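The one-line addition registers a hidden "date" column alongside the other hidden flags, which is what lets src/tools/plots.py read row[AutoEvalColumn.date.name] below. The sketch that follows shows one common way such a registry becomes that accessor; the ColumnContent definition and the make_dataclass call here are assumptions for illustration, not necessarily the repo's exact code.

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    # Assumed field layout; the real class is defined elsewhere in src/display/utils.py.
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False


auto_eval_column_dict = []
auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])

# Each [attr_name, type, default_instance] entry becomes a field of a generated dataclass,
# which is what allows other modules to write AutoEvalColumn.date.name.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.date.name)    # -> "date"
print(AutoEvalColumn.date.hidden)  # -> True
```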
src/populate.py
CHANGED

@@ -43,10 +43,10 @@ def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols:
     """Retrieve and process leaderboard data."""
     all_data_json = leaderboard_dataset.to_dict()
     num_items = leaderboard_dataset.num_rows
-
-    filter_models_flags(
+    all_data_json_list = [{k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)]
+    filter_models_flags(all_data_json_list)
 
-    df = pd.DataFrame.from_records(
+    df = pd.DataFrame.from_records(all_data_json_list)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
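Dataset.to_dict() returns a column-oriented mapping ({column_name: [values, ...]}), while filter_models_flags and pd.DataFrame.from_records expect one dict per row; the added comprehension pivots from the former to the latter. A toy illustration of the same transform (column names and values invented):

```python
import pandas as pd

# Shape of leaderboard_dataset.to_dict(): one list of values per column (toy data).
all_data_json = {
    "fullname": ["org/model-a", "org/model-b"],
    "average": [71.3, 64.9],
}
num_items = 2  # leaderboard_dataset.num_rows

# Pivot to one dict per row, as in the added line of the diff.
all_data_json_list = [
    {k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)
]
# -> [{"fullname": "org/model-a", "average": 71.3},
#     {"fullname": "org/model-b", "average": 64.9}]

df = pd.DataFrame.from_records(all_data_json_list)
print(df)
```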
src/tools/plots.py
CHANGED

@@ -28,22 +28,18 @@ def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
         last_date = ""
         column = task.col_name
         for _, row in results_df.iterrows():
-            current_model = row[
+            current_model = row[AutoEvalColumn.fullname.name]
             # We ignore models that are flagged/no longer on the hub/not finished
             to_ignore = (
-                not row[
-                or not row[
+                not row[AutoEvalColumn.still_on_hub.name]
+                or not row[AutoEvalColumn.not_flagged.name]
                 or current_model in FLAGGED_MODELS
-                or row["status"] != "FINISHED"
             )
             if to_ignore:
                 continue
 
-            current_date = row[
-
-                current_score = np.mean(list(row["results"].values()))
-            else:
-                current_score = row["results"][task.benchmark]
+            current_date = row[AutoEvalColumn.date.name]
+            current_score = row[task.col_name]
 
             if current_score > current_max:
                 if current_date == last_date and len(scores[column]) > 0: