Commit e2e1ee9
Parent(s): 5a7b8dd
move model name to the end of table

Files changed:
- app.py (+8 -8)
- src/assets/css_html_js.py (+0 -7)
- src/assets/text_content.py (+5 -9)
app.py CHANGED
@@ -27,38 +27,38 @@ LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
 OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
 
 ALL_COLUMNS_MAPPING = {
-    "weight_class": "
-    "model_type": "
-    "best_scored_model": "Best Scored LLM 🏆",
+    "weight_class": "Class 🏋️",
+    "model_type": "Type 🤗",
     #
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Dtype 📥",
     "quantization": "Quantization 🗜️",
     "optimizations": "Optimizations 🛠️",
     #
-    "best_score": "Best Score (%) ⬆️",
     "generate.peak_memory(MB)": "Memory (MB) ⬇️",
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
     "generate.energy_consumption(kWh/token)": "Energy (kWh/token) ⬇️",
+    "best_score": "Best Score (%) ⬆️",
     #
+    "best_scored_model": "Best Scored LLM 🏆",
 }
 ALL_COLUMNS_DATATYPES = [
     "str",
     "str",
-    "markdown",
     #
     "str",
     "str",
     "str",
     "str",
     #
-    "str",
     "number",
     "number",
     "number",
+    "str",
     #
+    "markdown",
 ]
-SORTING_COLUMN = ["
+SORTING_COLUMN = ["generate.throughput(tokens/s)"]
 
 llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
 
@@ -110,7 +110,7 @@ def get_benchmark_table(bench_df):
     # add * to quantized models score since we can't garantee the score is the same
     copy_df["best_score"] = copy_df.apply(
         lambda x: f"{x['best_score']}**"
-        if x["backend.quantization_strategy"]
+        if x["backend.quantization_strategy"] in ["bnb", "gptq"]
         else x["best_score"],
         axis=1,
     )
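The hunk above only reorders the column constants; the code that applies them to the table sits outside this diff. As a rough sketch of how such constants are typically consumed (the function name `render_benchmark_table` and the Gradio wiring are assumptions, not code from app.py), the benchmark DataFrame is filtered to the mapped columns, sorted by the new SORTING_COLUMN, and renamed before being displayed, which is why reordering ALL_COLUMNS_MAPPING and ALL_COLUMNS_DATATYPES together is enough to move the model name to the last column:

```python
import gradio as gr
import pandas as pd

# assumes ALL_COLUMNS_MAPPING, ALL_COLUMNS_DATATYPES and SORTING_COLUMN
# from the module above are in scope

def render_benchmark_table(bench_df: pd.DataFrame) -> gr.Dataframe:
    # keep only the mapped columns, in the order of ALL_COLUMNS_MAPPING,
    # so the model name ("Best Scored LLM") now ends up as the last column
    table = bench_df[list(ALL_COLUMNS_MAPPING.keys())]
    # sort by decoding throughput (higher is better) before renaming
    table = table.sort_values(by=SORTING_COLUMN, ascending=False)
    # rename to the human-readable headers shown in the leaderboard
    table = table.rename(columns=ALL_COLUMNS_MAPPING)
    # ALL_COLUMNS_DATATYPES lines up with the mapping: "number" for the
    # measured metrics, "markdown" for the model link, "str" for the rest
    return gr.Dataframe(value=table, datatype=ALL_COLUMNS_DATATYPES, interactive=False)
```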
src/assets/css_html_js.py CHANGED
@@ -16,13 +16,6 @@ custom_css = """
     transform: scale(1.3);
 }
 
-table td:first-child,
-table th:first-child {
-    max-width: 300px;
-    overflow: auto;
-    white-space: nowrap;
-}
-
 .hardware-tabs button {
     font-size: 20px;
 }
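The dropped rule constrained the first table column to 300px, which presumably mattered when the long model-name column came first; with the model name moved to the end of the table it is no longer needed. For orientation, `custom_css` is just a CSS string that the Space hands to Gradio when building the demo, roughly like this minimal sketch (not the Space's exact code):

```python
import gradio as gr

from src.assets.css_html_js import custom_css  # the string edited above

# gr.Blocks accepts a css string that is injected into the rendered page
demo = gr.Blocks(css=custom_css)
with demo:
    ...  # leaderboard controls, tabs and tables go here
```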
src/assets/text_content.py CHANGED
@@ -1,7 +1,7 @@
 TITLE = """<h1 align="center" id="space-title">🤗 Open LLM-Perf Leaderboard 🏋️</h1>"""
 
 INTRODUCTION_TEXT = f"""
-The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency &
+The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency, throughput & memory) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
 
 Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
 - Model evaluation requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the 🤗 Open LLM-Perf Leaderboard 🏋️ automatically.
@@ -11,11 +11,10 @@ Anyone from the community can request a model or a hardware/backend/optimization
 ABOUT_TEXT = """<h3>About the 🤗 Open LLM-Perf Leaderboard 🏋️</h3>
 <ul>
 <li>To avoid communication-dependent results, only one GPU is used.</li>
-<li>LLMs are evaluated on a singleton batch and generating 1000 tokens.</li>
-<li>Peak memory is measured in MB during the
+<li>LLMs are evaluated on a singleton batch with a prompt size of 512 and generating 1000 tokens.</li>
+<li>Peak memory is measured in MB during the generate pass with py3nvml while assuring the GPU's isolation.</li>
 <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
 <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
-<li>Ranking is based on a composite metric which is the euclidean distance from the "Perfect LLM" (i.e. 0 latency and 100% accuracy).</li>
 </ul>
 """
 
@@ -39,9 +38,6 @@ hydra:
 experiment_name: {experiment_name}
 
 model: {model}
-hub_kwargs:
-  revision: {revision}
-  trust_remote_code: {trust_remote_code}
 
 device: cuda
 
@@ -49,7 +45,7 @@ backend:
   no_weights: true
   delete_cache: true
   torch_dtype: float16
-
+  quantization_strategy: gptq
   bettertransformer: true
 
 benchmark:
@@ -57,7 +53,7 @@ benchmark:
 
   input_shapes:
     batch_size: 1
-    sequence_length:
+    sequence_length: 512
 
   new_tokens: 1000
 ```
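The new ABOUT_TEXT bullet says peak memory is measured in MB with py3nvml during the generate pass; the actual measurement code lives in Optimum-Benchmark rather than in this Space. As a rough illustration of the idea only (the helper `track_peak_memory` and the polling loop are assumptions, not code from either repo), a background thread can poll NVML while generation runs and keep the maximum of the device's used memory:

```python
import threading
import time

from py3nvml import py3nvml as nvml


def track_peak_memory(device_index: int = 0, interval_s: float = 0.01):
    """Poll the GPU's used memory until stopped and report the peak in MB."""
    nvml.nvmlInit()
    handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
    peak_bytes = 0
    stop = threading.Event()

    def poll():
        nonlocal peak_bytes
        while not stop.is_set():
            used = nvml.nvmlDeviceGetMemoryInfo(handle).used
            peak_bytes = max(peak_bytes, used)
            time.sleep(interval_s)

    thread = threading.Thread(target=poll, daemon=True)
    thread.start()
    # the returned callable converts the running peak to MB,
    # matching the table's "Memory (MB)" column
    return stop, thread, lambda: peak_bytes / 1e6


# usage sketch: wrap the benchmarked generate pass with the tracker
# stop, thread, peak_mb = track_peak_memory()
# model.generate(**inputs, max_new_tokens=1000)
# stop.set(); thread.join()
# print(f"peak memory: {peak_mb():.0f} MB")
```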