# app.py for Hugging Face Spaces
# Runs on CPU with aggressive memory optimizations
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import os
# =========================
# CONFIG
# =========================
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
LORA_MODEL = "Delta0723/techmind-pro-v9"
OFFLOAD_DIR = "./offload_folder"
os.makedirs(OFFLOAD_DIR, exist_ok=True)
# =========================
# FastAPI Setup
# =========================
app = FastAPI(title="TechMind Pro v9")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"]
)
# Global handles for the lazily loaded model and tokenizer
model = None
tokenizer = None
# =========================
# Load Model (lazy loading)
# =========================
def load_model():
    global model, tokenizer
    if model is not None:
        return
    print("🚀 Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    # Load the base model on CPU with disk offloading to keep RAM usage low.
    # bfloat16 is used instead of float16 because half-precision kernels are
    # poorly supported on CPU.
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": "cpu"},
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        offload_folder=OFFLOAD_DIR,
        offload_state_dict=True
    )
    # Attach the LoRA adapter
    model = PeftModel.from_pretrained(
        base_model,
        LORA_MODEL,
        device_map={"": "cpu"},
        offload_folder=OFFLOAD_DIR
    )
    model.eval()
    print("✅ Model loaded")
# =========================
# Data Models
# =========================
class Query(BaseModel):
    question: str
    max_tokens: Optional[int] = 200
    temperature: Optional[float] = 0.7
# =========================
# Helpers
# =========================
def generate_answer(question: str, max_tokens=200, temperature=0.7) -> str:
    load_model()  # lazy load on first request
    # The tokenizer adds the BOS token itself, so the prompt only needs the [INST] tags
    prompt = f"[INST] {question} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            num_beams=1  # single beam for speed
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text generated after the instruction
    return decoded.split("[/INST]")[-1].strip() if "[/INST]" in decoded else decoded
# =========================
# Endpoints
# =========================
@app.get("/")
def root():
    return {
        "model": "TechMind Pro v9",
        "base": BASE_MODEL,
        "lora": LORA_MODEL,
        "status": "online"
    }
@app.post("/ask")
def ask_q(req: Query):
    try:
        result = generate_answer(req.question, req.max_tokens, req.temperature)
        return {"response": result}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# =========================
# README.md for the Space
# =========================
"""
---
title: TechMind Pro v9
emoji: 🤖
colorFrom: blue
colorTo: purple
sdk: docker
pinned: false
---
# TechMind Pro v9
API for the TechMind Pro v9 model (Mistral-7B with a LoRA fine-tune).
## Usage
```bash
curl -X POST "https://YOUR-SPACE.hf.space/ask" \
  -H "Content-Type: application/json" \
  -d '{"question": "What is Python?"}'
```
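Or from Python, a minimal client sketch (the Space URL and timeout below are placeholders to adapt to your deployment):

```python
import requests

# Hypothetical URL: replace with your actual Space endpoint
API_URL = "https://YOUR-SPACE.hf.space/ask"

payload = {"question": "What is Python?", "max_tokens": 200, "temperature": 0.7}
# CPU inference on a 7B model is slow, so allow a generous timeout
resp = requests.post(API_URL, json=payload, timeout=600)
resp.raise_for_status()
print(resp.json()["response"])
```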
"""
# =========================
# Dockerfile for the Space
# =========================
"""
FROM python:3.10-slim
WORKDIR /app
RUN apt-get update && apt-get install -y \
    git \
    && rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
"""
# =========================
# requirements.txt
# =========================
"""
fastapi
uvicorn[standard]
transformers>=4.35.0
peft
torch
accelerate
sentencepiece
protobuf
"""