add bf16 inference
README.md CHANGED
@@ -177,6 +177,27 @@ So, the sum of 100, 520, and 60 is 680.
 """
 ```
 
+### BF16 Inference with Intel Extension for Transformers and Intel Extension for PyTorch
+```python
+from transformers import AutoTokenizer, TextStreamer
+import torch
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
+import intel_extension_for_pytorch as ipex
+
+model_name = "Intel/neural-chat-7b-v3-1"
+prompt = "Once upon a time, there existed a little girl,"
+
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+inputs = tokenizer(prompt, return_tensors="pt").input_ids
+streamer = TextStreamer(tokenizer)
+
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
+model = ipex.optimize(model.eval(), dtype=torch.bfloat16, inplace=True, level="O1", auto_kernel_selection=True)
+
+outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
+```
+
+
 ### INT4 Inference with Transformers and Intel Extension for Transformers
 ```python
 from transformers import AutoTokenizer, TextStreamer
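
The added snippet streams tokens through `TextStreamer` as they are generated, while `outputs` still holds the raw token ids. As a minimal follow-up sketch, assuming the `model`, `tokenizer`, `inputs`, and `streamer` objects defined in the added block above, generation can also be wrapped in `torch.inference_mode()` with CPU bf16 autocast (the common Intel Extension for PyTorch bf16 recipe; this wrapper and the `generated_text` variable are illustrative, not part of this commit):

```python
# Hedged sketch, not part of the commit: reuses model, tokenizer, inputs, and
# streamer from the BF16 snippet above. inference_mode() disables autograd
# bookkeeping; autocast("cpu", dtype=torch.bfloat16) runs eligible ops in bf16.
import torch

with torch.inference_mode(), torch.autocast("cpu", dtype=torch.bfloat16):
    outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)

# The streamer already printed the text; decode the ids if the string is needed.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
```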