Update README.md
README.md CHANGED
@@ -187,8 +187,8 @@ We rely on [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-h
 
 | Benchmark |  |  |
 |----------------------------------|----------------|---------------------------|
-|  | google/gemma-3-12b-it |
-| mmlu |
+|  | google/gemma-3-12b-it | pytorch/gemma-3-12b-it-FP8 |
+| mmlu | 71.51 | 71.30 |
 
 
 <details>
@@ -204,7 +204,7 @@ lm_eval --model hf --model_args pretrained=google/gemma-3-12b-it --tasks mmlu --
 
 ## FP8
 ```Shell
-export MODEL=
+export MODEL=pytorch/gemma-3-12b-it-FP8
 lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 --batch_size 8
 ```
 </details>
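The `pytorch/gemma-3-12b-it-FP8` checkpoint referenced in this hunk is a float8-quantized variant of the base model. As a rough sketch of how such a checkpoint can be produced with torchao (the `quantize_` call and `Float8DynamicActivationFloat8WeightConfig` below follow torchao's quantization API, but this is an assumption about the recipe, not the exact script behind the published checkpoint):

```Python
# Hypothetical sketch of FP8 quantization with torchao; the published
# checkpoint may have been produced with a different recipe or entry point.
import torch
from transformers import AutoModelForCausalLM
from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig, PerRow

model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-3-12b-it", torch_dtype=torch.bfloat16, device_map="cuda"
)

# Swap linear layers for float8 dynamic-activation / float8-weight
# versions with per-row scales.
quantize_(model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))

# torchao tensor subclasses are saved as a plain PyTorch state dict,
# hence safe_serialization=False (and the --pt-load-map-location flag
# used when serving the checkpoint with vLLM).
model.save_pretrained("gemma-3-12b-it-FP8", safe_serialization=False)
```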
@@ -218,8 +218,8 @@ lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 -
 
 | Benchmark |  |  |
 |------------------|----------------|--------------------------------|
-|  | google/gemma-3-12b-it |
-| Peak Memory (GB) |
+|  | google/gemma-3-12b-it | pytorch/gemma-3-12b-it-FP8 |
+| Peak Memory (GB) | 24.50 | 15.47 (37% reduction) |
 
 
 
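The peak-memory figures in this hunk come from a snippet like the one the next hunk header quotes (`print(f"Peak Memory Usage: {mem:.02f} GB")`). A minimal sketch of that style of measurement using the standard `torch.cuda` accounting APIs (the prompt and generation length are illustrative placeholders, not necessarily the exact settings used):

```Python
# Minimal sketch of a peak-memory measurement; prompt and token counts
# are assumptions, not the exact benchmark configuration.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "google/gemma-3-12b-it"  # or the FP8 checkpoint

torch.cuda.reset_peak_memory_stats()
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("What are we having for dinner?", return_tensors="pt").to("cuda")
model.generate(**inputs, max_new_tokens=128)

# Highest CUDA memory allocated at any point across load + generate.
mem = torch.cuda.max_memory_allocated() / 1e9
print(f"Peak Memory Usage: {mem:.02f} GB")
```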
@@ -279,7 +279,8 @@ print(f"Peak Memory Usage: {mem:.02f} GB")
 | Benchmark (Latency) |  |  |
 |----------------------------------|----------------|--------------------------|
 |  | google/gemma-3-12b-it | jerryzh168/gemma-3-12b-it-FP8 |
-| latency (batch_size=1) |
+| latency (batch_size=1) | 3.73s | 2.76s (1.35x speedup) |
+| latency (batch_size=256) | 13.63s | 11.49s (1.19x speedup) |
 
 <details>
 <summary> Reproduce Model Performance Results </summary>
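The latency rows above are reproduced with vLLM's `benchmarks/benchmark_latency.py` (commands in the collapsed section below). For a rough offline equivalent, a hand-rolled timing of vLLM's `LLM.generate` looks like this (a sketch, not the benchmark script itself; the warmup pass and token counts are assumptions):

```Python
# Rough stand-in for benchmarks/benchmark_latency.py: time one
# end-to-end generate call after a warmup pass.
import time
from vllm import LLM, SamplingParams

llm = LLM(model="jerryzh168/gemma-3-12b-it-FP8")
params = SamplingParams(max_tokens=256, ignore_eos=True)

llm.generate(["warmup"], params)  # exclude one-time compile/cache costs

start = time.perf_counter()
llm.generate(["What are we having for dinner?"], params)
print(f"end-to-end latency: {time.perf_counter() - start:.2f}s")
```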
@@ -311,48 +312,6 @@ python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model
 export MODEL=jerryzh168/gemma-3-12b-it-FP8
 VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
 ```
-
-## benchmark_serving
-
-We benchmarked the throughput in a serving environment.
-
-Download sharegpt dataset:
-
-```Shell
-wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-```
-
-
-
-Other datasets can be found in: https://github.com/vllm-project/vllm/tree/main/benchmarks
-
-Note: you can change the number of prompts to be benchmarked with `--num-prompts` argument for `benchmark_serving` script.
-
-### baseline
-Server:
-```Shell
-export MODEL=google/gemma-3-12b-it
-vllm serve $MODEL --tokenizer $MODEL -O3
-```
-
-Client:
-```Shell
-export MODEL=google/gemma-3-12b-it
-python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
-```
-
-### FP8
-Server:
-```Shell
-export MODEL=jerryzh168/gemma-3-12b-it-FP8
-VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3 --pt-load-map-location cuda:0
-```
-
-Client:
-```Shell
-export MODEL=jerryzh168/gemma-3-12b-it-FP8
-python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
-```
 </details>
 
 