# app.py for Hugging Face Spaces
# Runs on CPU with maximum optimizations
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import os

# =========================
# CONFIG
# =========================
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
LORA_MODEL = "Delta0723/techmind-pro-v9"
OFFLOAD_DIR = "./offload_folder"

os.makedirs(OFFLOAD_DIR, exist_ok=True)

# =========================
# FastAPI Setup
# =========================
app = FastAPI(title="TechMind Pro v9")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"]
)

# Global variables for the model (populated on first request)
model = None
tokenizer = None

# =========================
# Load Model (lazy loading)
# =========================
def load_model():
    global model, tokenizer
    if model is not None:
        return

    print("🚀 Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token

    # Load the base model on CPU in float16 with disk offloading to keep RAM usage low
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": "cpu"},
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        offload_folder=OFFLOAD_DIR,
        offload_state_dict=True
    )

    # Load the LoRA adapter on top of the base model
    model = PeftModel.from_pretrained(
        base_model,
        LORA_MODEL,
        device_map={"": "cpu"},
        offload_folder=OFFLOAD_DIR
    )
    model.eval()
    print("✅ Model loaded")

# =========================
# Data Models
# =========================
class Query(BaseModel):
    question: str
    max_tokens: Optional[int] = 200
    temperature: Optional[float] = 0.7

# =========================
# Utilities
# =========================
def generate_answer(question: str, max_tokens=200, temperature=0.7) -> str:
    load_model()  # Lazy load on first call

    prompt = f"[INST] {question} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            num_beams=1  # Single beam for speed
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text after the instruction tag when it is present
    return decoded.split("[/INST]")[-1].strip() if "[/INST]" in decoded else decoded

# =========================
# Endpoints
# =========================
@app.get("/")
def root():
    return {
        "model": "TechMind Pro v9",
        "base": BASE_MODEL,
        "lora": LORA_MODEL,
        "status": "online"
    }

@app.post("/ask")
def ask_q(req: Query):
    try:
        result = generate_answer(req.question, req.max_tokens, req.temperature)
        return {"response": result}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# =========================
# README.md for the Space
# =========================
"""
---
title: TechMind Pro v9
emoji: 🤖
colorFrom: blue
colorTo: purple
sdk: docker
pinned: false
---

# TechMind Pro v9

API for the TechMind Pro v9 model (Mistral-7B + fine-tuned LoRA adapter)

## Usage

```bash
curl -X POST "https://YOUR-SPACE.hf.space/ask" \
  -H "Content-Type: application/json" \
  -d '{"question": "What is Python?"}'
```
"""

# =========================
# Dockerfile for the Space
# =========================
"""
FROM python:3.10-slim

WORKDIR /app

RUN apt-get update && apt-get install -y \
    git \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"] """ # ========================= # requirements.txt # ========================= """ fastapi uvicorn[standard] transformers>=4.35.0 peft torch accelerate sentencepiece protobuf """