Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
0e29f16
1
Parent(s):
1bb2b37
Load model once
Browse files
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import os
|
|
| 2 |
import subprocess
|
| 3 |
import threading
|
| 4 |
|
| 5 |
-
subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
|
| 6 |
|
| 7 |
import spaces
|
| 8 |
import gradio as gr
|
|
@@ -22,11 +22,14 @@ def load_model():
|
|
| 22 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 23 |
model = AutoModelForCausalLM.from_pretrained(
|
| 24 |
MODEL_ID,
|
| 25 |
-
torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
|
| 26 |
-
device_map="auto" if device == "cuda" else None,
|
| 27 |
-
)
|
|
|
|
| 28 |
return model, tokenizer, device
|
| 29 |
|
|
|
|
|
|
|
| 30 |
|
| 31 |
@spaces.GPU
|
| 32 |
def respond(
|
|
@@ -37,11 +40,10 @@ def respond(
|
|
| 37 |
temperature,
|
| 38 |
top_p,
|
| 39 |
):
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
# Build conversation
|
| 44 |
-
messages =
|
| 45 |
{"role": "user", "content": message}
|
| 46 |
]
|
| 47 |
|
|
@@ -50,7 +52,7 @@ def respond(
|
|
| 50 |
tokenize=False,
|
| 51 |
add_generation_prompt=True
|
| 52 |
)
|
| 53 |
-
inputs = tokenizer(input_text, return_tensors="pt").to(device)
|
| 54 |
|
| 55 |
# Streamer setup
|
| 56 |
streamer = TextIteratorStreamer(
|
|
|
|
| 2 |
import subprocess
|
| 3 |
import threading
|
| 4 |
|
| 5 |
+
# subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
|
| 6 |
|
| 7 |
import spaces
|
| 8 |
import gradio as gr
|
|
|
|
| 22 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 23 |
model = AutoModelForCausalLM.from_pretrained(
|
| 24 |
MODEL_ID,
|
| 25 |
+
#torch_dtype=torch.bfloat16, #if device == "cuda" else torch.float32,
|
| 26 |
+
#device_map="auto"# if device == "cuda" else None,
|
| 27 |
+
).to("cuda")
|
| 28 |
+
print(f"Selected device:", device)
|
| 29 |
return model, tokenizer, device
|
| 30 |
|
| 31 |
+
# Load model/tokenizer once at startup so every request reuses the same objects
|
| 32 |
+
model, tokenizer, device = load_model()
|
| 33 |
|
| 34 |
@spaces.GPU
|
| 35 |
def respond(
|
|
|
|
| 40 |
temperature,
|
| 41 |
top_p,
|
| 42 |
):
|
| 43 |
+
|
| 44 |
+
# [{"role": "system", "content": system_message}] +
|
|
|
|
| 45 |
# Build conversation
|
| 46 |
+
messages = history + [
|
| 47 |
{"role": "user", "content": message}
|
| 48 |
]
|
| 49 |
|
|
|
|
| 52 |
tokenize=False,
|
| 53 |
add_generation_prompt=True
|
| 54 |
)
|
| 55 |
+
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")# .to(device)
|
| 56 |
|
| 57 |
# Streamer setup
|
| 58 |
streamer = TextIteratorStreamer(
|