from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch
import os
# =========================
# CONFIG
# =========================
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
LORA_MODEL = "Delta0723/techmind-pro-v9"
# Create the offload folder if it does not exist
os.makedirs("offload", exist_ok=True)
# =========================
# FastAPI Setup
# =========================
app = FastAPI(title="TechMind Pro API")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# =========================
# Load Model
# =========================
print("π Cargando modelo y tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    quant_config = BitsAndBytesConfig(load_in_4bit=True)
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map="auto",
        trust_remote_code=True,
        offload_folder="offload",
        quantization_config=quant_config,
    )
    model = PeftModel.from_pretrained(base_model, LORA_MODEL)
    model.eval()
print("β
Modelo listo para usar")
except Exception as e:
print("β Error al cargar el modelo:", e)
raise e
# =========================
# Data Models
# =========================
class Query(BaseModel):
    question: str
    max_tokens: Optional[int] = 300
    temperature: Optional[float] = 0.7
# =========================
# Utilities
# =========================
def generate_answer(question: str, max_tokens=300, temperature=0.7) -> str:
prompt = f"<s>[INST] {question} [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=max_tokens,
temperature=temperature,
top_p=0.95,
do_sample=True,
pad_token_id=tokenizer.eos_token_id
)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
return decoded.split("[/INST]")[-1].strip() if "[/INST]" in decoded else decoded
# =========================
# Endpoints
# =========================
@app.get("/")
def root():
return {"TechMind": "Mistral-7B Instruct + LoRA v9", "status": "online"}
@app.post("/ask")
def ask_q(req: Query):
    try:
        result = generate_answer(req.question, req.max_tokens, req.temperature)
        return {"response": result}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
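# Illustrative usage sketch (assumptions: the app is served locally, e.g. with
# `uvicorn app:app --port 8000`; the question text and parameter values below
# are placeholders, and the Space's actual host/port may differ):
#
#   curl -X POST http://localhost:8000/ask \
#     -H "Content-Type: application/json" \
#     -d '{"question": "How do I configure a VLAN?", "max_tokens": 200, "temperature": 0.5}'
#
# Expected response shape: {"response": "<generated answer>"}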