from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch
import os

# =========================
# CONFIG
# =========================

BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
LORA_MODEL = "Delta0723/techmind-pro-v9"

# Create the offload folder if it doesn't already exist
os.makedirs("offload", exist_ok=True)

# =========================
# FastAPI Setup
# =========================

app = FastAPI(title="TechMind Pro API")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"]
)

# =========================
# Load Model
# =========================

print("πŸš€ Cargando modelo y tokenizer...")

try:
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
    # The Mistral tokenizer has no pad token by default; reuse the EOS token
    tokenizer.pad_token = tokenizer.eos_token

    # Load the base model weights in 4-bit to reduce GPU memory usage
    quant_config = BitsAndBytesConfig(load_in_4bit=True)

    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map="auto",
        trust_remote_code=True,
        offload_folder="offload",
        quantization_config=quant_config
    )

    # Attach the TechMind LoRA adapter on top of the quantized base model
    model = PeftModel.from_pretrained(base_model, LORA_MODEL)
    model.eval()

    print("βœ… Modelo listo para usar")

except Exception as e:
    print("❌ Error al cargar el modelo:", e)
    raise e

# =========================
# Data Models
# =========================

class Query(BaseModel):
    question: str
    max_tokens: Optional[int] = 300
    temperature: Optional[float] = 0.7

# =========================
# Utilities
# =========================

def generate_answer(question: str, max_tokens=300, temperature=0.7) -> str:
    # Wrap the question in Mistral's [INST] instruction format
    prompt = f"<s>[INST] {question} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Return only the generated answer, dropping the echoed prompt
    return decoded.split("[/INST]")[-1].strip() if "[/INST]" in decoded else decoded

# =========================
# Endpoints
# =========================

@app.get("/")
def root():
    return {"TechMind": "Mistral-7B Instruct + LoRA v9", "status": "online"}

@app.post("/ask")
def ask_q(req: Query):
    try:
        result = generate_answer(req.question, req.max_tokens, req.temperature)
        return {"response": result}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
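
# =========================
# Local usage (assumed)
# =========================
# A minimal sketch for running and querying the API locally. The file name
# (app.py), host, and port below are assumptions, not part of the original file.

if __name__ == "__main__":
    import uvicorn

    # Start the server with `python app.py`; equivalent to running:
    #   uvicorn app:app --host 0.0.0.0 --port 8000
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request once the server is running:
#   curl -X POST http://localhost:8000/ask \
#        -H "Content-Type: application/json" \
#        -d '{"question": "What does the lspci command do?", "max_tokens": 200}'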