```python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

app = FastAPI()

# Define paths
base_model_path = "NousResearch/Hermes-3-Llama-3.2-3B"
adapter_path = "zach9111/llama_startup_adapter"

# Check if a GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the base model with device_map="auto" so weights are placed on available GPUs automatically
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path, torch_dtype=torch.float16, device_map="auto"
)

# Load the adapter on top of the base model and ensure it is on the correct device
model = PeftModel.from_pretrained(base_model, adapter_path).to(device)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_path)


class GenerateRequest(BaseModel):
    prompt: str


# Use model.generate() directly instead of pipeline()
def generate_text_from_model(prompt: str):
    try:
        input_ids = tokenizer(f"<s>[INST] {prompt} [/INST]", return_tensors="pt").input_ids.to(device)
        output_ids = model.generate(input_ids, max_length=512)
        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return generated_text
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# Root endpoint for testing
@app.get("/")
async def root():
    return {"message": "Model is running! Use /generate/ for text generation."}


# Text generation endpoint
@app.post("/generate/")
async def generate_text(request: GenerateRequest):
    response = generate_text_from_model(request.prompt)
    return {"response": response}
```
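For reference, here is a minimal sketch of how the two endpoints could be exercised once the server is running. It assumes the listing above is saved as `app.py` and served locally on port 7860 (e.g. with `uvicorn app:app --host 0.0.0.0 --port 7860`); the module name, port, and the example prompt are assumptions, not part of the original listing.

```python
# Minimal client sketch; assumes the FastAPI app above is running at localhost:7860
# (module name, port, and prompt text are assumptions, not given in the listing).
import requests

BASE_URL = "http://localhost:7860"

# Health check against the root endpoint
print(requests.get(f"{BASE_URL}/").json())

# POST a prompt to /generate/, matching the GenerateRequest schema
payload = {"prompt": "Give me three startup ideas in climate tech."}
resp = requests.post(f"{BASE_URL}/generate/", json=payload)
resp.raise_for_status()
print(resp.json()["response"])
```

Because `/generate/` accepts a Pydantic model, the prompt must be sent as a JSON body rather than a query parameter.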