Vladyslav Humennyy committed
Commit 381e299 · 1 Parent(s): 1af0400
Files changed (1): app.py (+5 -2)
app.py CHANGED
@@ -20,6 +20,8 @@ from typing import Any
 
 #torch._dynamo.config.disable = True
 
+HF_LE_LLM_READ_TOKEN = os.environ.get('HF_LE_LLM_READ_TOKEN')
+
 #MODEL_ID = "le-llm/lapa-v0.1-reasoning-only-32768"
 MODEL_ID = "le-llm/lapa-v0.1-instruct"
 MODEL_ID = "le-llm/lapa-v0.1-matt-instruction-5e06"
@@ -34,10 +36,10 @@ def _begin_analytics_session():
 def load_model():
     """Lazy-load model, tokenizer, and optional processor (for zeroGPU)."""
     device = "cuda"  # if torch.cuda.is_available() else "cpu"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_LE_LLM_READ_TOKEN)
     processor = None
     try:
-        processor = AutoProcessor.from_pretrained(MODEL_ID)
+        processor = AutoProcessor.from_pretrained(MODEL_ID, token=HF_LE_LLM_READ_TOKEN)
     except Exception as err:  # pragma: no cover - informative fallback
         print(f"Warning: AutoProcessor not available ({err}). Falling back to tokenizer.")
 
@@ -46,6 +48,7 @@ def load_model():
         dtype=torch.bfloat16,  # if device == "cuda" else torch.float32,
         device_map="auto",  # if device == "cuda" else None,
         attn_implementation="flash_attention_2",  # "kernels-community/vllm-flash-attn3", # #
+        token=HF_LE_LLM_READ_TOKEN
     )  # .cuda()
     print(f"Selected device:", device)
     return model, tokenizer, processor, device
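
For context, a minimal sketch of the load path after this change. The model id, env var name, and from_pretrained kwargs are taken from the diff; the AutoModelForCausalLM import and the standalone-script framing are assumptions, since the actual imports and model class sit outside these hunks. Passing token= to from_pretrained is what authenticates the download of a private or gated repo; in a Space, HF_LE_LLM_READ_TOKEN would typically be injected as a repository secret.

import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "le-llm/lapa-v0.1-matt-instruction-5e06"
# Read token comes from the environment (e.g. a Space secret). If unset,
# os.environ.get returns None and the hub falls back to anonymous access,
# which fails with 401 on a private/gated repo.
HF_LE_LLM_READ_TOKEN = os.environ.get("HF_LE_LLM_READ_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_LE_LLM_READ_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16,   # as in the diff; bf16 weights on GPU
    device_map="auto",      # let accelerate place the weights
    token=HF_LE_LLM_READ_TOKEN,
)

Note that every from_pretrained call that touches the hub (tokenizer, processor, model) needs the token, which is why the commit threads it through all three call sites rather than only the model load.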