# app.py - FastAPI server exposing openai/gpt-oss-20b for CPU inference
import os
import shutil
import json

# Set Hugging Face cache locations before importing transformers/huggingface_hub,
# so the libraries pick these values up at import time.
os.environ["HF_HOME"] = "/app/cache/huggingface"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/app/cache/huggingface/hub"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import hf_hub_download
import torch

app = FastAPI(title="GPT-OSS-20B API")
# Model ID and local directory
MODEL_ID = "openai/gpt-oss-20b"
MODEL_DIR = "/app/gpt-oss-20b"
# Clear the cache directory to remove stale files or locks left by previous runs
cache_dir = os.environ["HF_HOME"]
if os.path.exists(cache_dir):
    print(f"Clearing cache directory: {cache_dir}")
    for item in os.listdir(cache_dir):
        item_path = os.path.join(cache_dir, item)
        if os.path.isdir(item_path):
            shutil.rmtree(item_path, ignore_errors=True)
        elif os.path.exists(item_path):
            os.remove(item_path)
# Create cache and model directories
os.makedirs(cache_dir, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
# Download the original model files into MODEL_DIR
print("Downloading model files...")
try:
    for file in ["config.json", "dtypes.json", "model.safetensors"]:
        hf_hub_download(
            repo_id=MODEL_ID,
            filename=f"original/{file}",
            local_dir=MODEL_DIR,
            cache_dir=cache_dir,
        )
    print("Model files downloaded successfully.")
except Exception as e:
    raise RuntimeError(f"Failed to download model files: {e}")
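
# Alternative (untested sketch): the whole repo can be fetched in one call with
# snapshot_download from huggingface_hub, instead of listing files by hand:
#   from huggingface_hub import snapshot_download
#   snapshot_download(repo_id=MODEL_ID, local_dir=MODEL_DIR, cache_dir=cache_dir)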
# Patch the downloaded config.json if model_type is missing or wrong.
# Note: this edits only the local copy under MODEL_DIR; the from_pretrained
# calls below load from the Hub by MODEL_ID, so they do not read this file.
config_path = os.path.join(MODEL_DIR, "original/config.json")
try:
    with open(config_path, "r") as f:
        config = json.load(f)
    if config.get("model_type") != "gpt_oss":
        print("Fixing config.json: setting model_type to 'gpt_oss'")
        config["model_type"] = "gpt_oss"
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)
except Exception as e:
    print(f"Warning: Failed to check or fix config.json: {e}")
# Load tokenizer directly from the Hub
print("Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        cache_dir=cache_dir,
        trust_remote_code=True,
    )
except Exception as e:
    raise RuntimeError(f"Failed to load tokenizer: {e}")
# Load model with CPU placement and disk offload for weights that do not fit in RAM
print("Loading model (this may take several minutes)...")
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        cache_dir=cache_dir,
        device_map="auto",              # let accelerate place weights (CPU here)
        torch_dtype="auto",             # use the checkpoint's native precision
        offload_folder="/app/offload",  # spill weights to disk if RAM runs out
        max_memory={"cpu": "15GB"},     # CPU-only Space: no GPU key needed
        trust_remote_code=True,
    )
    print(f"Model loaded on: {model.device}")
    print(f"Model dtype: {model.dtype}")
except Exception as e:
    raise RuntimeError(f"Failed to load model: {e}")
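
# Optional sanity check (sketch): estimate the in-memory weight footprint.
#   total_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
#   print(f"Weights: ~{total_bytes / 1e9:.1f} GB")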
# Put the model in inference mode. (Gradient checkpointing is a training-time
# memory optimization and has no effect under torch.no_grad().)
model.eval()
class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 256
    temperature: float = 0.7
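
# Example payload (sketch) matching ChatRequest's fields and defaults:
#   {"message": "Hello!", "max_tokens": 256, "temperature": 0.7}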
@app.post("/chat")
async def chat_endpoint(request: ChatRequest):
    try:
        # Format the single-turn conversation with the model's chat template
        messages = [{"role": "user", "content": request.message}]
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to("cpu")
        # Generate the response without tracking gradients
        with torch.no_grad():
            generated = model.generate(
                **inputs,
                max_new_tokens=request.max_tokens,
                temperature=request.temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
            )
        # Decode only the newly generated tokens, skipping the prompt
        response = tokenizer.decode(
            generated[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True,
        )
        return {"response": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
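
# Example client call (sketch, assuming the server runs locally on port 8000;
# `requests` is an extra dependency, not used by the server itself):
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/chat",
#       json={"message": "Hello!", "max_tokens": 64},
#       timeout=600,
#   )
#   print(resp.json()["response"])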
# Free any cached GPU memory (a no-op on this CPU-only deployment)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)