iamthewalrus67 committed
Commit 0e29f16 · 1 Parent(s): 1bb2b37

Load model once

Files changed (1):
  1. app.py +11 -9
app.py CHANGED

@@ -2,7 +2,7 @@ import os
 import subprocess
 import threading
 
-subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
+# subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
 
 import spaces
 import gradio as gr
@@ -22,11 +22,14 @@ def load_model():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
-        device_map="auto" if device == "cuda" else None,
-    )
+        # torch_dtype=torch.bfloat16,  # if device == "cuda" else torch.float32,
+        # device_map="auto",  # if device == "cuda" else None,
+    ).to("cuda")
+    print("Selected device:", device)
     return model, tokenizer, device
 
+# Load model/tokenizer once at import time
+model, tokenizer, device = load_model()
 
 @spaces.GPU
 def respond(
@@ -37,11 +40,10 @@ def respond(
     temperature,
     top_p,
 ):
-    # Load model/tokenizer each request → allows zeroGPU to cold start & then release
-    model, tokenizer, device = load_model()
-
+
+    # [{"role": "system", "content": system_message}] +
     # Build conversation
-    messages = [{"role": "system", "content": system_message}] + history + [
+    messages = history + [
         {"role": "user", "content": message}
     ]
 
@@ -50,7 +52,7 @@
         tokenize=False,
         add_generation_prompt=True
     )
-    inputs = tokenizer(input_text, return_tensors="pt").to(device)
+    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")  # .to(device)
 
     # Streamer setup
     streamer = TextIteratorStreamer(
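
For context, the net effect of this commit is the usual "load once at import" pattern for ZeroGPU Spaces: the weights are created at module scope, and only the request handler is decorated with @spaces.GPU. Below is a minimal sketch of what the resulting app.py plausibly looks like; MODEL_ID is a placeholder, history is assumed to be a list of OpenAI-style message dicts, and the generation/streaming code follows the standard TextIteratorStreamer recipe rather than this Space's exact (unshown) lines.

import threading

import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MODEL_ID = "your-org/your-model"  # placeholder, not the Space's actual model


def load_model():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    # Mirrors the commit's hardcoded .to("cuda")
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to("cuda")
    return model, tokenizer


# Runs once per worker process at import, not once per request
model, tokenizer = load_model()


@spaces.GPU  # ZeroGPU attaches a GPU only for the duration of each call
def respond(message, history, max_tokens, temperature, top_p):
    # Assumption: history is already [{"role": ..., "content": ...}, ...]
    messages = history + [{"role": "user", "content": message}]
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # generate() blocks, so run it in a thread while we drain the streamer
    thread = threading.Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        ),
    )
    thread.start()

    # Yield a growing string so the UI streams tokens as they arrive
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial

The design point: loading at import time means the weights are fetched once per worker process, while @spaces.GPU still lets ZeroGPU attach a GPU around each call and release it afterwards, so reloading the model on every request gained nothing.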