Spaces: Running on Zero
Vladyslav Humennyy committed
Commit 381e299 · 1 Parent(s): 1af0400
Add token
app.py CHANGED
@@ -20,6 +20,8 @@ from typing import Any
 
 #torch._dynamo.config.disable = True
 
+HF_LE_LLM_READ_TOKEN = os.environ.get('HF_LE_LLM_READ_TOKEN')
+
 #MODEL_ID = "le-llm/lapa-v0.1-reasoning-only-32768"
 MODEL_ID = "le-llm/lapa-v0.1-instruct"
 MODEL_ID = "le-llm/lapa-v0.1-matt-instruction-5e06"
@@ -34,10 +36,10 @@ def _begin_analytics_session():
 def load_model():
     """Lazy-load model, tokenizer, and optional processor (for zeroGPU)."""
     device = "cuda"  # if torch.cuda.is_available() else "cpu"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_LE_LLM_READ_TOKEN)
     processor = None
     try:
-        processor = AutoProcessor.from_pretrained(MODEL_ID)
+        processor = AutoProcessor.from_pretrained(MODEL_ID, token=HF_LE_LLM_READ_TOKEN)
     except Exception as err:  # pragma: no cover - informative fallback
         print(f"Warning: AutoProcessor not available ({err}). Falling back to tokenizer.")
 
@@ -46,6 +48,7 @@ def load_model():
         dtype=torch.bfloat16,  # if device == "cuda" else torch.float32,
         device_map="auto",  # if device == "cuda" else None,
         attn_implementation="flash_attention_2",  # "kernels-community/vllm-flash-attn3"
+        token=HF_LE_LLM_READ_TOKEN
     )  # .cuda()
     print(f"Selected device:", device)
     return model, tokenizer, processor, device
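For context, passing token= to from_pretrained authenticates the Hub download, which is needed when the le-llm repositories are private or gated; the diff reads the token from the HF_LE_LLM_READ_TOKEN environment variable (presumably a Space secret). A minimal standalone sketch of the authenticated load under those assumptions (not the Space's full app.py):

import os
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: the read token is exposed to the process as an env var / Space secret.
HF_LE_LLM_READ_TOKEN = os.environ.get("HF_LE_LLM_READ_TOKEN")
MODEL_ID = "le-llm/lapa-v0.1-matt-instruction-5e06"

# token= authenticates access to a private or gated repo; without it,
# from_pretrained on such a repo fails with an authorization error.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_LE_LLM_READ_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=HF_LE_LLM_READ_TOKEN)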