# app.py - FastAPI server exposing openai/gpt-oss-20b for CPU inference
import os
import shutil
import json

# Set Hugging Face cache locations before importing transformers/huggingface_hub,
# so the libraries pick these values up at import time.
os.environ["HF_HOME"] = "/app/cache/huggingface"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/app/cache/huggingface/hub"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import hf_hub_download
import torch

app = FastAPI(title="GPT-OSS-20B API")
# Model ID and local directory
MODEL_ID = "openai/gpt-oss-20b"
MODEL_DIR = "/app/gpt-oss-20b"
# Clear the cache directory to remove stale files or locks left by previous runs
cache_dir = os.environ["HF_HOME"]
if os.path.exists(cache_dir):
    print(f"Clearing cache directory: {cache_dir}")
    for item in os.listdir(cache_dir):
        item_path = os.path.join(cache_dir, item)
        if os.path.isdir(item_path):
            shutil.rmtree(item_path, ignore_errors=True)
        elif os.path.exists(item_path):
            os.remove(item_path)
# Create cache and model directories
os.makedirs(cache_dir, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
# Download the original model files into MODEL_DIR
print("Downloading model files...")
try:
    for file in ["config.json", "dtypes.json", "model.safetensors"]:
        hf_hub_download(
            repo_id=MODEL_ID,
            filename=f"original/{file}",
            local_dir=MODEL_DIR,
            cache_dir=cache_dir,
        )
    print("Model files downloaded successfully.")
except Exception as e:
    raise RuntimeError(f"Failed to download model files: {e}")
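
# Alternative (untested sketch): the whole repo can be fetched in one call with
# snapshot_download from huggingface_hub, instead of listing files by hand:
#   from huggingface_hub import snapshot_download
#   snapshot_download(repo_id=MODEL_ID, local_dir=MODEL_DIR, cache_dir=cache_dir)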
# Patch the downloaded config.json if model_type is missing or wrong.
# Note: this edits only the local copy under MODEL_DIR; the from_pretrained
# calls below load from the Hub by MODEL_ID, so they do not read this file.
config_path = os.path.join(MODEL_DIR, "original/config.json")
try:
    with open(config_path, "r") as f:
        config = json.load(f)
    if config.get("model_type") != "gpt_oss":
        print("Fixing config.json: setting model_type to 'gpt_oss'")
        config["model_type"] = "gpt_oss"
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)
except Exception as e:
    print(f"Warning: Failed to check or fix config.json: {e}")
# Load tokenizer directly from the Hub
print("Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        cache_dir=cache_dir,
        trust_remote_code=True,
    )
except Exception as e:
    raise RuntimeError(f"Failed to load tokenizer: {e}")
# Load model with CPU placement and disk offload for weights that do not fit in RAM
print("Loading model (this may take several minutes)...")
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        cache_dir=cache_dir,
        device_map="auto",              # let accelerate place weights (CPU here)
        torch_dtype="auto",             # use the checkpoint's native precision
        offload_folder="/app/offload",  # spill weights to disk if RAM runs out
        max_memory={"cpu": "15GB"},     # CPU-only Space: no GPU key needed
        trust_remote_code=True,
    )
    print(f"Model loaded on: {model.device}")
    print(f"Model dtype: {model.dtype}")
except Exception as e:
    raise RuntimeError(f"Failed to load model: {e}")
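
# Optional sanity check (sketch): estimate the in-memory weight footprint.
#   total_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
#   print(f"Weights: ~{total_bytes / 1e9:.1f} GB")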
# Put the model in inference mode. (Gradient checkpointing is a training-time
# memory optimization and has no effect under torch.no_grad().)
model.eval()
class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 256
    temperature: float = 0.7
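
# Example payload (sketch) matching ChatRequest's fields and defaults:
#   {"message": "Hello!", "max_tokens": 256, "temperature": 0.7}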
@app.post("/chat")
async def chat_endpoint(request: ChatRequest):
    try:
        # Format the single-turn conversation with the model's chat template
        messages = [{"role": "user", "content": request.message}]
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to("cpu")
        # Generate the response without tracking gradients
        with torch.no_grad():
            generated = model.generate(
                **inputs,
                max_new_tokens=request.max_tokens,
                temperature=request.temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
            )
        # Decode only the newly generated tokens, skipping the prompt
        response = tokenizer.decode(
            generated[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True,
        )
        return {"response": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
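
# Example client call (sketch, assuming the server runs locally on port 8000;
# `requests` is an extra dependency, not used by the server itself):
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/chat",
#       json={"message": "Hello!", "max_tokens": 64},
#       timeout=600,
#   )
#   print(resp.json()["response"])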
# Free any cached GPU memory (a no-op on this CPU-only deployment)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)