thinkingnew committed
Commit 4ec308a · 1 Parent(s): 8c6d7e5
Files changed (1)
  1. app.py +27 -6
app.py CHANGED
@@ -5,18 +5,39 @@ import torch
 
 app = FastAPI()
 
-# Load Model from Hugging Face Hub
+# Define paths
 base_model_path = "NousResearch/Hermes-3-Llama-3.2-3B"
 adapter_path = "thinkingnew/llama_invs_adapter"
 
+# Check if GPU is available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Load base model
 base_model = AutoModelForCausalLM.from_pretrained(
-    base_model_path, torch_dtype=torch.float16, device_map="auto"
-)
-model = PeftModel.from_pretrained(base_model, adapter_path)
+    base_model_path, torch_dtype=torch.float16 if device == "cuda" else torch.float32, device_map="auto"
+).to(device)
+
+# Load adapter
+model = PeftModel.from_pretrained(base_model, adapter_path).to(device)
+
+# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(base_model_path)
 
+# Load pipeline once (for better performance)
+text_pipe = pipeline(
+    task="text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_length=512
+)
+
+# Root endpoint for testing
+@app.get("/")
+async def root():
+    return {"message": "Model is running! Use /generate/ for text generation."}
+
+# Text generation endpoint
 @app.post("/generate/")
 async def generate_text(prompt: str):
-    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=512)
-    result = pipe(f"<s>[INST] {prompt} [/INST]")
+    result = text_pipe(f"<s>[INST] {prompt} [/INST]")
     return {"response": result[0]['generated_text']}