Commit 67cbded (parent: bf0a261): made scores clickable

Changed files:
- app.py (+9 -30)
- src/assets/css_html_js.py (+0 -36)
- src/assets/text_content.py (+7 -9)
app.py CHANGED

@@ -1,5 +1,4 @@
 import os
-import json
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler

@@ -21,7 +20,7 @@ COLUMNS_MAPPING = {
     "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
 }
-COLUMNS_DATATYPES = ["markdown", "str", "str", "number", "number", "number"]
+COLUMNS_DATATYPES = ["markdown", "str", "str", "markdown", "number", "number"]
 SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]


@@ -39,8 +38,8 @@ def get_benchmark_df(benchmark):
     scores_df = pd.read_csv(
         f"./llm-perf-dataset/reports/average_scores.csv")
     bench_df = bench_df.merge(scores_df, on="model", how="left")
-
-
+    bench_df["average"] = bench_df["average"].apply(
+        make_clickable_score)

     # preprocess
     bench_df["model"] = bench_df["model"].apply(make_clickable_model)

@@ -54,33 +53,19 @@ def get_benchmark_df(benchmark):
     return bench_df


-# def change_tab(query_param):
-#     query_param = query_param.replace("'", '"')
-#     query_param = json.loads(query_param)
-
-#     if (
-#         isinstance(query_param, dict)
-#         and "tab" in query_param
-#         and query_param["tab"] == "evaluation"
-#     ):
-#         return gr.Tabs.update(selected=1)
-#     else:
-#         return gr.Tabs.update(selected=0)
-
-
 def submit_query(text, backends, datatypes, threshold, raw_df):

     # extract the average score (float) from the clickable score (clickable markdown)
-
-
+    raw_df["Average H4 Score ⬆️"] = raw_df["Average H4 Score ⬆️"].apply(
+        extract_score_from_clickable)
     filtered_df = raw_df[
         raw_df["Model 🤗"].str.lower().str.contains(text.lower()) &
         raw_df["Backend 🏭"].isin(backends) &
         raw_df["Datatype 📥"].isin(datatypes) &
         (raw_df["Average H4 Score ⬆️"] >= threshold)
     ]
-
-
+    filtered_df["Average H4 Score ⬆️"] = filtered_df["Average H4 Score ⬆️"].apply(
+        make_clickable_score)

     return filtered_df

@@ -91,6 +76,7 @@ with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

+    # controls
     with gr.Row():
         search_bar = gr.Textbox(
             label="Model 🤗",

@@ -127,6 +113,7 @@ with demo:
             elem_id="submit-button",
         )

+    # leaderboard tabs
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🖥️ A100-80GB Benchmark 🏋️", elem_id="A100-benchmark", id=0):
             gr.HTML(SINGLE_A100_TEXT)

@@ -166,14 +153,6 @@ with demo:
         elem_id="citation-button",
     ).style(show_copy_button=True)

-    # dummy = gr.Textbox(visible=False)
-    # demo.load(
-    #     change_tab,
-    #     dummy,
-    #     tabs,
-    #     _js=get_window_url_params,
-    # )
-
     # Restart space every hour
     scheduler = BackgroundScheduler()
     scheduler.add_job(restart_space, "interval", seconds=3600,
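Note: the `make_clickable_score` and `extract_score_from_clickable` helpers this commit calls are not defined anywhere in the diff; they presumably live in a utils module alongside the existing `make_clickable_model`. A minimal sketch of what they might look like, assuming the clickable score is simply the raw float wrapped in a markdown link (names and link target here are illustrative, not the repo's actual code):

import re

# Hypothetical sketch -- the real helpers are defined elsewhere in the repo.
# Assumed link target: the Open LLM Leaderboard the H4 scores come from.
SCORE_URL = "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"

def make_clickable_score(score: float) -> str:
    # Wrapping the float in a markdown link is why the score column's
    # datatype changes from "number" to "markdown" in COLUMNS_DATATYPES.
    return f"[{score}]({SCORE_URL})"

def extract_score_from_clickable(clickable: str) -> float:
    # Recover the raw float from "[0.42](https://...)" so submit_query
    # can compare it against the numeric threshold slider.
    match = re.match(r"\[([0-9.]+)\]", str(clickable))
    return float(match.group(1)) if match else 0.0

This also explains the round trip in submit_query above: scores are unwrapped to floats for the threshold filter, then re-wrapped with make_clickable_score so the filtered table still renders links.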
src/assets/css_html_js.py CHANGED

@@ -1,12 +1,4 @@
 custom_css = """
-#changelog-text {
-    font-size: 16px !important;
-}
-
-#changelog-text h2 {
-    font-size: 18px !important;
-}
-
 .markdown-text {
     font-size: 16px !important;
 }

@@ -28,26 +20,11 @@ custom_css = """
     transform: scale(1.3);
 }

-#leaderboard-table {
-    margin-top: 15px
-}
-
-#leaderboard-table-lite {
-    margin-top: 15px
-}
-
 #search-bar-table-box > div:first-child {
     background: none;
     border: none;
 }

-
-/* Hides the final AutoEvalColumn */
-#llm-benchmark-tab-table table td:last-child,
-#llm-benchmark-tab-table table th:last-child {
-    display: none;
-}
-
 /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
 table td:first-child,
 table th:first-child {

@@ -59,19 +36,6 @@ table th:first-child {
 .tab-buttons button {
     font-size: 20px;
 }
-
-#scale-logo {
-    border-style: none !important;
-    box-shadow: none;
-    display: block;
-    margin-left: auto;
-    margin-right: auto;
-    max-width: 600px;
-}
-
-#scale-logo .download {
-    display: none;
-}
 """

 get_window_url_params = """
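The rules removed here (`#changelog-text`, `#leaderboard-table`, `#llm-benchmark-tab-table`, `#scale-logo`) target element ids that presumably no longer match any component in app.py, so they were dead CSS. For context, a minimal sketch of how `custom_css` is presumably consumed, assuming the standard Gradio pattern of passing a CSS string to `gr.Blocks` (the actual construction of `demo` is not shown in this diff):

import gradio as gr

from src.assets.css_html_js import custom_css  # the module trimmed above

# Assumed wiring: gr.Blocks injects the css string into the page, so any
# selector without a matching elem_id / elem_classes is dead weight and
# can be deleted with no visual effect.
demo = gr.Blocks(css=custom_css)

with demo:
    gr.Markdown("Hello", elem_classes="markdown-text")  # styled by .markdown-text

demo.launch()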
src/assets/text_content.py CHANGED

@@ -1,22 +1,20 @@
 TITLE = """<h1 align="center" id="space-title">🤗 Open LLM-Perf Leaderboard 🏋️</h1>"""

 INTRODUCTION_TEXT = f"""
-The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency & throughput) of Large Language Models (LLMs)
-Anyone from the community can request a model or a hardware+backend configuration for automated benchmarking:
-- Model requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the 🤗 Open LLM-Perf Leaderboard 🏋️ once they're publicly available.
-- Hardware+Backend requests should be made in the 🤗 Open LLM-Perf Leaderboard 🏋️ [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions).
+The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency & throughput) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.

-
+Anyone from the community can request a model or a hardware+backend+optimization configuration for automated benchmarking:
+- Model requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the 🤗 Open LLM-Perf Leaderboard 🏋️ automatically once they're publicly available. That's mostly because we don't want to benchmark models that don't have an evaluation score yet.
+- Hardware+Backend+Optimization requests should be made in the 🤗 Open LLM-Perf Leaderboard 🏋️ [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) for open discussion about their relevance and feasibility.
 """

-SINGLE_A100_TEXT = """<h3>Single-GPU (1xA100):</h3>
+SINGLE_A100_TEXT = """<h3>Single-GPU Benchmarks (1xA100):</h3>
 <ul>
 <li>Singleton Batch (1)</li>
 <li>Thousand Tokens (1000)</li>
 </ul>
 """

-
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
 CITATION_BUTTON_TEXT = r"""@misc{open-llm-perf-leaderboard,
   author = {Ilyas Moutawwakil},

@@ -25,8 +23,8 @@ CITATION_BUTTON_TEXT = r"""@misc{open-llm-perf-leaderboard,
   publisher = {Hugging Face},
   howpublished = "\url{https://huggingface.co/spaces/optimum/llm-perf-leaderboard}",
 @software{optimum-benchmark,
-  author
+  author = {Ilyas Moutawwakil},
   publisher = {Hugging Face},
-  title
+  title = {Optimum-Benchmark: A framework for benchmarking the performance of Transformers models with different hardwares, backends and optimizations.},
 }
 """