import os

# Set Hugging Face cache locations before importing transformers / huggingface_hub
# so both libraries pick up these paths when they initialize.
os.environ["HF_HOME"] = "/app/cache/huggingface"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/app/cache/huggingface/hub"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import json
import shutil

import torch
from fastapi import FastAPI, HTTPException
from huggingface_hub import hf_hub_download
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI(title="GPT-OSS-20B API")

# Model ID and local directory for the raw checkpoint files
MODEL_ID = "openai/gpt-oss-20b"
MODEL_DIR = "/app/gpt-oss-20b"

# Clear any stale files (e.g. leftover download locks) from the cache directory
cache_dir = os.environ["HF_HOME"]
if os.path.exists(cache_dir):
    print(f"Clearing cache directory: {cache_dir}")
    for item in os.listdir(cache_dir):
        item_path = os.path.join(cache_dir, item)
        if os.path.isdir(item_path):
            shutil.rmtree(item_path, ignore_errors=True)
        elif os.path.exists(item_path):
            os.remove(item_path)

# Create cache and model directories
os.makedirs(cache_dir, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# Download the raw checkpoint files into MODEL_DIR/original/; the tokenizer and
# model below are still loaded from the Hub via the shared cache.
print("Downloading model files...")
try:
    for file in ["config.json", "dtypes.json", "model.safetensors"]:
        hf_hub_download(
            repo_id=MODEL_ID,
            filename=f"original/{file}",
            local_dir=MODEL_DIR,
            cache_dir=cache_dir
        )
    print("Model files downloaded successfully.")
except Exception as e:
    raise RuntimeError(f"Failed to download model files: {str(e)}")

# Ensure config.json declares the expected model_type
config_path = os.path.join(MODEL_DIR, "original/config.json")
try:
    with open(config_path, "r") as f:
        config = json.load(f)
    if config.get("model_type") != "gpt_oss":
        print("Fixing config.json: setting model_type to 'gpt_oss'")
        config["model_type"] = "gpt_oss"
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)
except Exception as e:
    print(f"Warning: Failed to check or fix config.json: {str(e)}")

# Load tokenizer
print("Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,  # Load directly from Hub
        cache_dir=cache_dir,
        trust_remote_code=True
    )
except Exception as e:
    raise RuntimeError(f"Failed to load tokenizer: {str(e)}")

# Load model with CPU offloading
print("Loading model (this may take several minutes)...")
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,  # Load directly from Hub
        cache_dir=cache_dir,
        device_map="auto",  # Spread layers across GPU/CPU as memory allows
        torch_dtype="auto",  # Use the checkpoint's native precision
        offload_folder="/app/offload",  # Spill weights that don't fit in memory to disk
        max_memory={0: "14GB", "cpu": "15GB"},  # Per-device memory budget (GPU 0 / CPU)
        trust_remote_code=True
    )
    print(f"Model loaded on: {model.device}")
    print(f"Model dtype: {model.dtype}")
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}")

# Note: gradient checkpointing only saves memory during training (backward pass),
# so it is not enabled for this inference-only server.

class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 256
    temperature: float = 0.7

@app.post("/chat")
async def chat_endpoint(request: ChatRequest):
    try:
        # Build chat-formatted inputs and move them to the model's primary device
        messages = [{"role": "user", "content": request.message}]
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True
        ).to(model.device)

        # Generate response
        with torch.no_grad():
            generated = model.generate(
                **inputs,
                max_new_tokens=request.max_tokens,
                temperature=request.temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1
            )

        # Decode response
        response = tokenizer.decode(
            generated[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True
        )
        return {"response": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Release any GPU memory cached during model loading (no-op on CPU-only hosts)
torch.cuda.empty_cache()

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
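
# Example usage: a minimal sketch of how a client might call the /chat endpoint,
# assuming the server is reachable at http://localhost:8000 (adjust the host and
# port mapping for your deployment). The request fields mirror ChatRequest above.
#
#   curl -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello!", "max_tokens": 128, "temperature": 0.7}'
#
# The endpoint returns JSON of the form {"response": "<generated text>"}; errors
# are reported as HTTP 500 with a "detail" field.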