Commit e2e1ee9
Parent(s): 5a7b8dd
move model name to the end of table

Files changed:
- app.py (+8 -8)
- src/assets/css_html_js.py (+0 -7)
- src/assets/text_content.py (+5 -9)
app.py CHANGED
@@ -27,38 +27,38 @@ LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
 OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
 
 ALL_COLUMNS_MAPPING = {
-    "weight_class": "
-    "model_type": "
-    "best_scored_model": "Best Scored LLM 🏆",
+    "weight_class": "Class 🏋️",
+    "model_type": "Type 🤗",
     #
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Dtype 📥",
     "quantization": "Quantization 🗜️",
     "optimizations": "Optimizations 🛠️",
     #
-    "best_score": "Best Score (%) ⬆️",
     "generate.peak_memory(MB)": "Memory (MB) ⬇️",
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
     "generate.energy_consumption(kWh/token)": "Energy (kWh/token) ⬇️",
+    "best_score": "Best Score (%) ⬆️",
     #
+    "best_scored_model": "Best Scored LLM 🏆",
 }
 ALL_COLUMNS_DATATYPES = [
     "str",
     "str",
-    "markdown",
     #
     "str",
     "str",
     "str",
     "str",
     #
-    "str",
     "number",
     "number",
     "number",
+    "str",
     #
+    "markdown",
 ]
-SORTING_COLUMN = ["
+SORTING_COLUMN = ["generate.throughput(tokens/s)"]
 
 llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
 
@@ -110,7 +110,7 @@ def get_benchmark_table(bench_df):
     # add * to quantized models score since we can't garantee the score is the same
     copy_df["best_score"] = copy_df.apply(
         lambda x: f"{x['best_score']}**"
-        if x["backend.quantization_strategy"]
+        if x["backend.quantization_strategy"] in ["bnb", "gptq"]
         else x["best_score"],
         axis=1,
     )
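The hunk above only reorders the column constants; the code that applies them to the table sits outside this diff. As a rough sketch of how such constants are typically consumed (the function name `render_benchmark_table` and the Gradio wiring are assumptions, not code from app.py), the benchmark DataFrame is filtered to the mapped columns, sorted by the new SORTING_COLUMN, and renamed before being displayed, which is why reordering ALL_COLUMNS_MAPPING and ALL_COLUMNS_DATATYPES together is enough to move the model name to the last column:

```python
import gradio as gr
import pandas as pd

# assumes ALL_COLUMNS_MAPPING, ALL_COLUMNS_DATATYPES and SORTING_COLUMN
# from the module above are in scope

def render_benchmark_table(bench_df: pd.DataFrame) -> gr.Dataframe:
    # keep only the mapped columns, in the order of ALL_COLUMNS_MAPPING,
    # so the model name ("Best Scored LLM") now ends up as the last column
    table = bench_df[list(ALL_COLUMNS_MAPPING.keys())]
    # sort by decoding throughput (higher is better) before renaming
    table = table.sort_values(by=SORTING_COLUMN, ascending=False)
    # rename to the human-readable headers shown in the leaderboard
    table = table.rename(columns=ALL_COLUMNS_MAPPING)
    # ALL_COLUMNS_DATATYPES lines up with the mapping: "number" for the
    # measured metrics, "markdown" for the model link, "str" for the rest
    return gr.Dataframe(value=table, datatype=ALL_COLUMNS_DATATYPES, interactive=False)
```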
src/assets/css_html_js.py CHANGED
@@ -16,13 +16,6 @@ custom_css = """
     transform: scale(1.3);
 }
 
-table td:first-child,
-table th:first-child {
-    max-width: 300px;
-    overflow: auto;
-    white-space: nowrap;
-}
-
 .hardware-tabs button {
     font-size: 20px;
 }
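The dropped rule constrained the first table column to 300px, which presumably mattered when the long model-name column came first; with the model name moved to the end of the table it is no longer needed. For orientation, `custom_css` is just a CSS string that the Space hands to Gradio when building the demo, roughly like this minimal sketch (not the Space's exact code):

```python
import gradio as gr

from src.assets.css_html_js import custom_css  # the string edited above

# gr.Blocks accepts a css string that is injected into the rendered page
demo = gr.Blocks(css=custom_css)
with demo:
    ...  # leaderboard controls, tabs and tables go here
```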
src/assets/text_content.py CHANGED
@@ -1,7 +1,7 @@
 TITLE = """<h1 align="center" id="space-title">🤗 Open LLM-Perf Leaderboard 🏋️</h1>"""
 
 INTRODUCTION_TEXT = f"""
-The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency &
+The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency, throughput & memory) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
 
 Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
 - Model evaluation requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the 🤗 Open LLM-Perf Leaderboard 🏋️ automatically.
@@ -11,11 +11,10 @@ Anyone from the community can request a model or a hardware/backend/optimization
 ABOUT_TEXT = """<h3>About the 🤗 Open LLM-Perf Leaderboard 🏋️</h3>
 <ul>
 <li>To avoid communication-dependent results, only one GPU is used.</li>
-<li>LLMs are evaluated on a singleton batch and generating 1000 tokens.</li>
-<li>Peak memory is measured in MB during the
+<li>LLMs are evaluated on a singleton batch with a prompt size of 512 and generating 1000 tokens.</li>
+<li>Peak memory is measured in MB during the generate pass with py3nvml while assuring the GPU's isolation.</li>
 <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
 <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
-<li>Ranking is based on a composite metric which is the euclidean distance from the "Perfect LLM" (i.e. 0 latency and 100% accuracy).</li>
 </ul>
 """
 
@@ -39,9 +38,6 @@ hydra:
 experiment_name: {experiment_name}
 
 model: {model}
-hub_kwargs:
-  revision: {revision}
-  trust_remote_code: {trust_remote_code}
 
 device: cuda
 
@@ -49,7 +45,7 @@ backend:
   no_weights: true
   delete_cache: true
   torch_dtype: float16
-
+  quantization_strategy: gptq
   bettertransformer: true
 
 benchmark:
@@ -57,7 +53,7 @@ benchmark:
 
   input_shapes:
     batch_size: 1
-    sequence_length:
+    sequence_length: 512
 
   new_tokens: 1000
 ```
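The new ABOUT_TEXT bullet says peak memory is measured in MB with py3nvml during the generate pass; the actual measurement code lives in Optimum-Benchmark rather than in this Space. As a rough illustration of the idea only (the helper `track_peak_memory` and the polling loop are assumptions, not code from either repo), a background thread can poll NVML while generation runs and keep the maximum of the device's used memory:

```python
import threading
import time

from py3nvml import py3nvml as nvml


def track_peak_memory(device_index: int = 0, interval_s: float = 0.01):
    """Poll the GPU's used memory until stopped and report the peak in MB."""
    nvml.nvmlInit()
    handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
    peak_bytes = 0
    stop = threading.Event()

    def poll():
        nonlocal peak_bytes
        while not stop.is_set():
            used = nvml.nvmlDeviceGetMemoryInfo(handle).used
            peak_bytes = max(peak_bytes, used)
            time.sleep(interval_s)

    thread = threading.Thread(target=poll, daemon=True)
    thread.start()
    # the returned callable converts the running peak to MB,
    # matching the table's "Memory (MB)" column
    return stop, thread, lambda: peak_bytes / 1e6


# usage sketch: wrap the benchmarked generate pass with the tracker
# stop, thread, peak_mb = track_peak_memory()
# model.generate(**inputs, max_new_tokens=1000)
# stop.set(); thread.join()
# print(f"peak memory: {peak_mb():.0f} MB")
```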