Commit 08604d0
Parent(s): affd732

added awq

Files changed:
- src/content.py +1 -1
- src/control_panel.py +26 -8
- src/exllama.py +5 -5
- src/llm_perf.py +1 -0
- src/utils.py +4 -0
src/content.py
CHANGED
@@ -7,7 +7,7 @@ The 🤗 LLM-Perf Leaderboard 🏎️ aims to benchmark the performance (latency
 
 Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
 - Model evaluation requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the [🤗 LLM Performance Leaderboard 🏎️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
-- Hardware/Backend/Optimization performance requests should be made in the [
+- Hardware/Backend/Optimization performance requests should be made in the [llm-perf-backend repository](https://github.com/IlyasMoutawwakil/llm-perf-backend) and will be added to the [🤗 LLM Performance Leaderboard 🏎️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
 """
 
 ABOUT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏎️</h3>
src/control_panel.py
CHANGED
@@ -10,7 +10,7 @@ from src.exllama import get_exllama_prefill_fig, get_exllama_decode_fig
 
 def create_control_panel(machine: str = "hf-dgx-01"):
     # descriptive text
-    gr.HTML("Use this control panel to filter
+    gr.HTML("Use this control panel to filter the leaderboard.", elem_id="text")
     # controls
     machine_textbox = gr.Textbox(value=machine, visible=False)
     with gr.Row():
@@ -21,14 +21,14 @@ def create_control_panel(machine: str = "hf-dgx-01"):
                 elem_id="search-bar",
             )
     with gr.Row():
-        with gr.Column(scale=1):
+        with gr.Column(scale=1, variant="panel"):
             score_slider = gr.Slider(
                 label="Open LLM Score (%) 📈",
                 info="🎚️ Slide to minimum Open LLM score",
                 value=0,
                 elem_id="threshold-slider",
             )
-        with gr.Column(scale=1):
+        with gr.Column(scale=1, variant="panel"):
             memory_slider = gr.Slider(
                 label="Peak Memory (MB) 📈",
                 info="🎚️ Slide to maximum Peak Memory",
@@ -46,7 +46,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
                 elem_id="backend-checkboxes",
             )
     with gr.Row():
-        with gr.Column(scale=1):
+        with gr.Column(scale=1, variant="panel"):
             datatype_checkboxes = gr.CheckboxGroup(
                 label="Load DTypes 📥",
                 choices=["float32", "float16", "bfloat16"],
@@ -54,7 +54,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
                 info="☑️ Select the load data types",
                 elem_id="dtype-checkboxes",
             )
-        with gr.Column(scale=1):
+        with gr.Column(scale=1, variant="panel"):
             optimization_checkboxes = gr.CheckboxGroup(
                 label="Optimizations 🛠️",
                 choices=["None", "BetterTransformer", "FlashAttentionV2"],
@@ -62,11 +62,29 @@ def create_control_panel(machine: str = "hf-dgx-01"):
                 info="☑️ Select the optimization",
                 elem_id="optimization-checkboxes",
             )
-        with gr.Column(scale=
+        with gr.Column(scale=2):
             quantization_checkboxes = gr.CheckboxGroup(
                 label="Quantizations 🗜️",
-                choices=[
-
+                choices=[
+                    "None",
+                    "BnB.4bit",
+                    "BnB.8bit",
+                    "GPTQ.4bit",
+                    "GPTQ.4bit+ExllamaV1",
+                    "GPTQ.4bit+ExllamaV2",
+                    "AWQ.4bit+GEMM",
+                    "AWQ.4bit+GEMV",
+                ],
+                value=[
+                    "None",
+                    "BnB.4bit",
+                    "BnB.8bit",
+                    "GPTQ.4bit",
+                    "GPTQ.4bit+ExllamaV1",
+                    "GPTQ.4bit+ExllamaV2",
+                    "AWQ.4bit+GEMM",
+                    "AWQ.4bit+GEMV",
+                ],
                 info="☑️ Select the quantization schemes",
                 elem_id="quantization-checkboxes",
             )
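Note: the change above only touches presentation (panel-styled columns) and extends the quantization filter with the two new AWQ kernel variants. The snippet below is a minimal, self-contained sketch of the same CheckboxGroup-driven filtering, not the leaderboard's actual wiring; the sample dataframe and the `filter_quantizations` helper are hypothetical.

```python
import gradio as gr
import pandas as pd

# Full set of quantization schemes after this commit (AWQ.4bit+GEMM/GEMV are new).
QUANTIZATIONS = [
    "None",
    "BnB.4bit",
    "BnB.8bit",
    "GPTQ.4bit",
    "GPTQ.4bit+ExllamaV1",
    "GPTQ.4bit+ExllamaV2",
    "AWQ.4bit+GEMM",
    "AWQ.4bit+GEMV",
]

# Hypothetical stand-in for the benchmark results table.
SAMPLE_DF = pd.DataFrame(
    {
        "Model": ["llama-7b", "llama-7b", "llama-7b"],
        "Quantization": ["None", "GPTQ.4bit", "AWQ.4bit+GEMM"],
        "Prefill Latency (s)": [0.21, 0.18, 0.17],
    }
)


def filter_quantizations(selected):
    # Keep only the rows whose quantization scheme is ticked.
    return SAMPLE_DF[SAMPLE_DF["Quantization"].isin(selected)]


with gr.Blocks() as demo:
    quantization_checkboxes = gr.CheckboxGroup(
        label="Quantizations",
        choices=QUANTIZATIONS,
        value=QUANTIZATIONS,  # everything selected by default, as in the commit
    )
    table = gr.Dataframe(value=SAMPLE_DF)
    quantization_checkboxes.change(
        filter_quantizations, inputs=quantization_checkboxes, outputs=table
    )

if __name__ == "__main__":
    demo.launch()
```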
src/exllama.py
CHANGED
@@ -29,11 +29,11 @@ EXLLAMA_DATA = [
 
 
 def get_exllama_df(llm_perf_df):
-
-    # seperate
-    gptq_df =
-    exllamav1_df =
-    exllamav2_df =
+    copy_df = llm_perf_df.copy()
+    # seperate vanilla GPTQ experiments from Exllama experiments
+    gptq_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit")]
+    exllamav1_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV1")]
+    exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
     # merge the three dataframes
     exllamav1_df = pd.merge(
         gptq_df,
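The `pd.merge` call that follows these lines is truncated in this view, so the sketch below only illustrates the split-and-merge pattern under assumptions: the `"Model"` join key, the suffixes, and the plain `"Quantization"` column name are placeholders, while the three quantization labels come from the commit.

```python
import pandas as pd


def get_exllama_comparison(llm_perf_df: pd.DataFrame) -> pd.DataFrame:
    copy_df = llm_perf_df.copy()
    # Separate vanilla GPTQ experiments from the two ExLlama kernel versions.
    gptq_df = copy_df[copy_df["Quantization"] == "GPTQ.4bit"]
    exllamav1_df = copy_df[copy_df["Quantization"] == "GPTQ.4bit+ExllamaV1"]
    exllamav2_df = copy_df[copy_df["Quantization"] == "GPTQ.4bit+ExllamaV2"]
    # Align the three variants per model so their latencies sit side by side.
    merged = pd.merge(gptq_df, exllamav1_df, on="Model", suffixes=("", " (ExllamaV1)"))
    merged = pd.merge(merged, exllamav2_df, on="Model", suffixes=("", " (ExllamaV2)"))
    return merged
```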
src/llm_perf.py
CHANGED
@@ -94,6 +94,7 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
         [
             "backend.quantization_scheme",
             "backend.quantization_config.bits",
+            "backend.quantization_config.version",
             "backend.quantization_config.load_in_4bit",
             "backend.quantization_config.load_in_8bit",
             "backend.quantization_config.exllama_config.version",
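The new "backend.quantization_config.version" entry keeps the AWQ kernel version available after the benchmark report is flattened into dotted column names, so `process_quantization_scheme` (src/utils.py below) can read it. Purely as an illustration of where such dotted columns come from, here is one way a nested report could be flattened with pandas; the report dict is made up and this is not necessarily how llm_perf.py builds its dataframe.

```python
import pandas as pd

# Hypothetical nested benchmark report.
report = {
    "backend": {
        "quantization_scheme": "awq",
        "quantization_config": {"bits": 4, "version": "gemm"},
    }
}

flat = pd.json_normalize(report)
print(flat.columns.tolist())
# ['backend.quantization_scheme', 'backend.quantization_config.bits',
#  'backend.quantization_config.version']
print(flat["backend.quantization_config.version"].iloc[0])  # gemm
```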
src/utils.py
CHANGED
@@ -62,6 +62,10 @@ def process_quantization_scheme(x):
         return "GPTQ.4bit+ExllamaV2"
     elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
         return "GPTQ.4bit"
+    elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemm":
+        return "AWQ.4bit+GEMM"
+    elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemv":
+        return "AWQ.4bit+GEMV"
     else:
         return "None"
 
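A quick sanity check of the two new AWQ branches, applied row-wise with pandas. Only the field names and return labels come from the commit; the sample rows are made up and the GPTQ/BnB branches are omitted here.

```python
import pandas as pd


def process_quantization_scheme(x):
    # Only the new AWQ branches from this commit; other schemes fall through.
    if x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemm":
        return "AWQ.4bit+GEMM"
    elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemv":
        return "AWQ.4bit+GEMV"
    else:
        return "None"


rows = pd.DataFrame(
    [
        {"backend.quantization_scheme": "awq", "backend.quantization_config.version": "gemm"},
        {"backend.quantization_scheme": "awq", "backend.quantization_config.version": "gemv"},
        {"backend.quantization_scheme": None, "backend.quantization_config.version": None},
    ]
)
print(rows.apply(process_quantization_scheme, axis=1).tolist())
# ['AWQ.4bit+GEMM', 'AWQ.4bit+GEMV', 'None']
```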
|