Spaces: Runtime error
```python
from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch

app = FastAPI()

# Define paths
base_model_path = "NousResearch/Hermes-3-Llama-3.2-3B"
adapter_path = "thinkingnew/llama_invs_adapter"

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load base model; device_map="auto" already places the weights, so the
# follow-up .to(device) is unnecessary (and raises an error when accelerate
# has offloaded any layers)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
)

# Load the LoRA adapter on top of the base model (again, no .to(device))
model = PeftModel.from_pretrained(base_model, adapter_path)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_path)

# Build the pipeline once at startup (for better performance);
# max_new_tokens bounds only the generated text, unlike max_length,
# which also counts the prompt
text_pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
)

# Root endpoint for testing; the @app.get decorator was missing, so
# FastAPI never registered the route
@app.get("/")
async def root():
    return {"message": "Model is running! Use /generate/ for text generation."}

# Text generation endpoint
@app.get("/generate/")
async def generate_text(prompt: str):
    # Use the tokenizer's chat template instead of hard-coded [INST] tags:
    # Hermes-3 is trained on ChatML, not the Llama-2/Mistral format
    formatted = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )
    result = text_pipe(formatted)
    return {"response": result[0]["generated_text"]}
```
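With the routes registered, the app can be served and queried. A minimal sketch of a client call, assuming the file is saved as `app.py`, the server is started with `uvicorn app:app --host 0.0.0.0 --port 7860` (7860 is the usual Spaces port), and the `requests` library is available; the filename, port, and prompt are illustrative, not from the original:

```python
import requests

# Query the /generate/ endpoint; the prompt travels as a query parameter
resp = requests.get(
    "http://localhost:7860/generate/",
    params={"prompt": "Explain what a LoRA adapter is in one sentence."},
)
resp.raise_for_status()
print(resp.json()["response"])
```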