Spaces: Running on Zero
Vladyslav Humennyy committed
Commit 381e299 · 1 Parent(s): 1af0400
Add token
app.py CHANGED
@@ -20,6 +20,8 @@ from typing import Any
 
 #torch._dynamo.config.disable = True
 
+HF_LE_LLM_READ_TOKEN = os.environ.get('HF_LE_LLM_READ_TOKEN')
+
 #MODEL_ID = "le-llm/lapa-v0.1-reasoning-only-32768"
 MODEL_ID = "le-llm/lapa-v0.1-instruct"
 MODEL_ID = "le-llm/lapa-v0.1-matt-instruction-5e06"
@@ -34,10 +36,10 @@ def _begin_analytics_session():
 def load_model():
     """Lazy-load model, tokenizer, and optional processor (for zeroGPU)."""
     device = "cuda"  # if torch.cuda.is_available() else "cpu"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_LE_LLM_READ_TOKEN)
     processor = None
     try:
-        processor = AutoProcessor.from_pretrained(MODEL_ID)
+        processor = AutoProcessor.from_pretrained(MODEL_ID, token=HF_LE_LLM_READ_TOKEN)
     except Exception as err:  # pragma: no cover - informative fallback
         print(f"Warning: AutoProcessor not available ({err}). Falling back to tokenizer.")
 
@@ -46,6 +48,7 @@ def load_model():
         dtype=torch.bfloat16,  # if device == "cuda" else torch.float32,
         device_map="auto",  # if device == "cuda" else None,
         attn_implementation="flash_attention_2",  # "kernels-community/vllm-flash-attn3"
+        token=HF_LE_LLM_READ_TOKEN
     )  # .cuda()
     print(f"Selected device:", device)
     return model, tokenizer, processor, device
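For context, passing token= to from_pretrained authenticates the Hub download, which is needed when the le-llm repositories are private or gated; the diff reads the token from the HF_LE_LLM_READ_TOKEN environment variable (presumably a Space secret). A minimal standalone sketch of the authenticated load under those assumptions (not the Space's full app.py):

import os
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: the read token is exposed to the process as an env var / Space secret.
HF_LE_LLM_READ_TOKEN = os.environ.get("HF_LE_LLM_READ_TOKEN")
MODEL_ID = "le-llm/lapa-v0.1-matt-instruction-5e06"

# token= authenticates access to a private or gated repo; without it,
# from_pretrained on such a repo fails with an authorization error.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_LE_LLM_READ_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=HF_LE_LLM_READ_TOKEN)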