Commit 359d8a9 · Parent: ebb5810
Evaluation time metric and plot

Files changed:
- app.py +13 -2
- src/display/utils.py +8 -3
- src/leaderboard/filter_models.py +2 -2
- src/leaderboard/read_evals.py +5 -2
- src/tools/plots.py +55 -0
app.py
CHANGED
@@ -38,6 +38,7 @@ from src.tools.plots import (
     create_metric_plot_obj,
     create_plot_df,
     create_scores_df,
+    create_lat_score_mem_plot_obj
 )
 
 # Start ephemeral Spaces on PRs (see config in README.md)
@@ -344,7 +345,7 @@ with demo:
                 queue=True,
             )
 
-        with gr.TabItem("📈 Metrics
+        with gr.TabItem("📈 Metrics", elem_id="llm-benchmark-tab-table", id=4):
             with gr.Row():
                 with gr.Column():
                     chart = create_metric_plot_obj(
@@ -359,7 +360,17 @@ with demo:
                         BENCHMARK_COLS,
                         title="Top Scores and Human Baseline Over Time (from last update)",
                     )
-                    gr.Plot(value=chart, min_width=500)
+                    gr.Plot(value=chart, min_width=500)
+            with gr.Row():
+                with gr.Column():
+                    fig = create_lat_score_mem_plot_obj(leaderboard_df)
+                    plot = gr.components.Plot(
+                        value=fig,
+                        elem_id="plot",
+                        show_label=False,
+                    )
+                    gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
+                    gr.HTML('This plot shows the Evaluation Time on our backend GPU (Nvidia A100-80G) to run all the benchmarks; it is not a precise performance benchmark of the models. For that, see the <a href="https://huggingface.co/spaces/optimum/llm-perf-leaderboard" target="_blank">🤗 LLM-Perf Leaderboard</a>.', elem_id="text")
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
             gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
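To see how the new tab is wired together, here is a minimal, self-contained Gradio sketch of the same pattern: a ready-made Plotly figure passed to gr.Plot, with a gr.HTML caption underneath. The dataframe, tab layout, and values are made up for illustration; only the component nesting mirrors the diff above.

# Minimal sketch of the Gradio wiring added above: a Plotly figure inside a
# tab, followed by an HTML caption. Data and layout are illustrative,
# not the leaderboard's real dataframe or app structure.
import gradio as gr
import pandas as pd
import plotly.express as px

demo_df = pd.DataFrame({
    "Evaluation Time (min)": [30, 120, 480],
    "LLM Average Score": [45.0, 62.5, 71.2],
})
fig = px.scatter(demo_df, x="Evaluation Time (min)", y="LLM Average Score", log_x=True)

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("📈 Metrics"):
            with gr.Row():
                with gr.Column():
                    gr.Plot(value=fig, show_label=False)
                    gr.HTML("👆 Hover over the points 👆 for additional information.")

if __name__ == "__main__":
    demo.launch()

Running this locally should show a single tab with the scatter plot and the hover hint, which is all the new Metrics section adds on top of the existing time-series chart.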
src/display/utils.py
CHANGED
@@ -109,8 +109,11 @@ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Avai
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
+auto_eval_column_dict.append(["eval_time", ColumnContent, ColumnContent("Evaluation Time (s)", "number", False)])
 # Dummy column for the search bar (hidden by the custom CSS)
-auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("
+auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("Model Name", "str", False, dummy=True)])
+
+
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -147,7 +150,8 @@ baseline_row = {
     AutoEvalColumn.likes.name: 0,
     AutoEvalColumn.license.name: "",
     AutoEvalColumn.still_on_hub.name: False,
-    AutoEvalColumn.moe.name: False
+    AutoEvalColumn.moe.name: False,
+    AutoEvalColumn.eval_time.name: 0.0
 }
 
 baseline_list = []
@@ -187,7 +191,8 @@ human_baseline_row = {
     AutoEvalColumn.likes.name: 0,
     AutoEvalColumn.license.name: "",
     AutoEvalColumn.still_on_hub.name: False,
-    AutoEvalColumn.moe.name: False
+    AutoEvalColumn.moe.name: False,
+    AutoEvalColumn.eval_time.name: 0.0
 }
 
 baseline_list = []
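The new "Evaluation Time (s)" entry flows through the same make_dataclass machinery as the existing columns: each [field_name, type, default] triple in auto_eval_column_dict becomes a class attribute on AutoEvalColumn, which is why the rest of the code can write AutoEvalColumn.eval_time.name. A minimal sketch of that pattern, using a simplified stand-in for ColumnContent (the real class has more fields):

# Sketch of the make_dataclass pattern used in src/display/utils.py.
# ColumnContent is simplified here; the real class carries more attributes.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    dummy: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
auto_eval_column_dict.append(["eval_time", ColumnContent, ColumnContent("Evaluation Time (s)", "number", False)])
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("Model Name", "str", False, dummy=True)])

# Each [name, type, default] entry becomes a class attribute with that default,
# so columns are referenced as AutoEvalColumn.eval_time.name, etc.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.eval_time.name)  # Evaluation Time (s)
print(AutoEvalColumn.dummy.name)      # Model Name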
src/leaderboard/filter_models.py
CHANGED
@@ -99,7 +99,7 @@ def flag_models(leaderboard_data: list[dict]):
         if model_data[AutoEvalColumn.flagged.name] == True:
             flag_key = "merged"
         else:
-            flag_key = model_data[
+            flag_key = model_data[AutoEvalColumn.dummy.name]
 
         if flag_key in FLAGGED_MODELS:
             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
@@ -118,7 +118,7 @@ def flag_models(leaderboard_data: list[dict]):
 def remove_forbidden_models(leaderboard_data: list[dict]):
     indices_to_remove = []
     for ix, model in enumerate(leaderboard_data):
-        if model[
+        if model[AutoEvalColumn.dummy.name] in DO_NOT_SUBMIT_MODELS:
             indices_to_remove.append(ix)
 
     for ix in reversed(indices_to_remove):
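Both hunks switch the lookup key to the dummy "Model Name" column. As a quick illustration of the second hunk, here is a self-contained sketch of remove_forbidden_models with toy data; the loop body after reversed(...) is assumed to pop by index, since the diff does not show it:

# Stand-in sketch of remove_forbidden_models with toy data. The real
# DO_NOT_SUBMIT_MODELS list and AutoEvalColumn.dummy.name come from the
# leaderboard code; the pop-by-index body is an assumption.
DO_NOT_SUBMIT_MODELS = ["org/banned-model"]  # hypothetical entries
MODEL_NAME_COL = "Model Name"                # what AutoEvalColumn.dummy.name resolves to after this commit

def remove_forbidden_models(leaderboard_data: list[dict]) -> list[dict]:
    indices_to_remove = []
    for ix, model in enumerate(leaderboard_data):
        if model[MODEL_NAME_COL] in DO_NOT_SUBMIT_MODELS:
            indices_to_remove.append(ix)

    # Delete from the end so earlier indices stay valid while popping.
    for ix in reversed(indices_to_remove):
        leaderboard_data.pop(ix)
    return leaderboard_data

rows = [{"Model Name": "org/good-model"}, {"Model Name": "org/banned-model"}]
print(remove_forbidden_models(rows))  # [{'Model Name': 'org/good-model'}]

Deleting from the highest index down keeps the remaining indices valid, which is why the reversed() iteration matters.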
src/leaderboard/read_evals.py
CHANGED
@@ -36,6 +36,7 @@ class EvalResult:
     status: str = "FINISHED"
     tags: list = None
     json_filename: str = None
+    eval_time: float = 0.0
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -103,7 +104,8 @@ class EvalResult:
             results=results,
             precision=precision,
             revision= config.get("model_sha", ""),
-            json_filename=json_filename
+            json_filename=json_filename,
+            eval_time=config.get("total_evaluation_time_seconds", 0.0)
         )
 
     def update_with_request_file(self, requests_path):
@@ -151,7 +153,8 @@ class EvalResult:
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
             AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
             AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
-            AutoEvalColumn.flagged.name: self.flagged
+            AutoEvalColumn.flagged.name: self.flagged,
+            AutoEvalColumn.eval_time.name: self.eval_time,
         }
 
         for task in Tasks:
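eval_time is read from the results JSON with config.get("total_evaluation_time_seconds", 0.0), so result files written before this metric existed simply fall back to 0.0. A small sketch of that parsing step; the surrounding JSON structure here is an assumption, only the two keys actually read in the diff are taken from it:

# Sketch of the eval-time extraction done in init_from_json_file.
# The payload shape is illustrative; only "model_sha" and
# "total_evaluation_time_seconds" are taken from the diff above.
import json

raw = json.dumps({
    "config": {
        "model_sha": "abc1234",
        "total_evaluation_time_seconds": 5421.7,
    },
    "results": {},
})

data = json.loads(raw)
config = data.get("config", {})

revision = config.get("model_sha", "")
eval_time = config.get("total_evaluation_time_seconds", 0.0)  # defaults to 0.0 when absent

print(revision, eval_time)  # abc1234 5421.7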
src/tools/plots.py
CHANGED
@@ -151,6 +151,61 @@ def create_metric_plot_obj(
 
     return fig
 
+def create_lat_score_mem_plot_obj(leaderboard_df):
+    copy_df = leaderboard_df.copy()
+    copy_df = copy_df[~(copy_df[AutoEvalColumn.dummy.name].isin(["baseline", "human_baseline"]))]
+    # plot
+    SCORE_MEMORY_LATENCY_DATA = [
+        AutoEvalColumn.dummy.name,
+        AutoEvalColumn.average.name,
+        AutoEvalColumn.params.name,
+        AutoEvalColumn.architecture.name,
+        "Evaluation Time (min)"
+    ]
+
+    copy_df["LLM Average Score"] = copy_df[AutoEvalColumn.average.name]
+    copy_df["Evaluation Time (min)"] = copy_df[AutoEvalColumn.eval_time.name] / 60
+
+    #copy_df["size"] = copy_df[AutoEvalColumn.params.name]
+    copy_df["size"] = copy_df[AutoEvalColumn.params.name].apply(lambda x: 0.5 if 0 <= x < 0.8 else x)
+    copy_df["size"] = copy_df["size"].apply(lambda x: 0.8 if 0.8 <= x < 2 else x)
+    copy_df["size"] = copy_df["size"].apply(lambda x: 1.5 if 2 <= x < 5 else x)
+    copy_df["size"] = copy_df["size"].apply(lambda x: 2.0 if 5 <= x < 10 else x)
+    copy_df["size"] = copy_df["size"].apply(lambda x: 3.0 if 10 <= x < 20 else x)
+    copy_df["size"] = copy_df["size"].apply(lambda x: 4.5 if 20 <= x < 40 else x)
+    copy_df["size"] = copy_df["size"].apply(lambda x: 7.0 if x > 40 else x)
+
+    fig = px.scatter(
+        copy_df,
+        x="Evaluation Time (min)",
+        y="LLM Average Score",
+        size="size",
+        color=AutoEvalColumn.architecture.name,
+        custom_data=SCORE_MEMORY_LATENCY_DATA,
+        color_discrete_sequence=px.colors.qualitative.Light24,
+        log_x=True
+    )
+    fig.update_traces(
+        hovertemplate="<br>".join(
+            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)]
+        )
+    )
+    fig.update_layout(
+        title={
+            "text": "Eval Time vs. Score vs. #Params",
+            "y": 0.95,
+            "x": 0.5,
+            "xanchor": "center",
+            "yanchor": "top",
+        },
+        xaxis_title="Time To Evaluate (min)",
+        yaxis_title="LLM Average Score",
+        legend_title="LLM Architecture",
+        width=1200,
+        height=600,
+    )
+
+    return fig
 
 # Example Usage:
 # human_baselines dictionary is defined.
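The hover text in create_lat_score_mem_plot_obj relies on Plotly Express custom_data: every column listed in SCORE_MEMORY_LATENCY_DATA is attached to each point and then referenced positionally as %{customdata[i]} in the hovertemplate. A standalone sketch of just that technique, with made-up rows and a reduced column list:

# Standalone sketch of the custom_data / hovertemplate pattern used in
# create_lat_score_mem_plot_obj. The rows are made up; only the technique matches.
import pandas as pd
import plotly.express as px

df = pd.DataFrame({
    "Model Name": ["org/model-a", "org/model-b", "org/model-c"],
    "LLM Average Score": [48.2, 63.1, 71.9],
    "Evaluation Time (min)": [35.0, 140.0, 510.0],
    "Architecture": ["LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM"],
})

hover_cols = ["Model Name", "LLM Average Score", "Evaluation Time (min)", "Architecture"]

fig = px.scatter(
    df,
    x="Evaluation Time (min)",
    y="LLM Average Score",
    color="Architecture",
    custom_data=hover_cols,  # attaches these columns to every point
    log_x=True,
)
# Each %{customdata[i]} pulls the i-th column passed via custom_data.
fig.update_traces(
    hovertemplate="<br>".join(
        f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(hover_cols)
    )
)
fig.show()

The same positional indexing is what lets the leaderboard surface the model name, average score, parameter count, architecture, and evaluation time for each point without cluttering the axes.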