Spaces: Runtime error

```python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os
import shutil
import json
from huggingface_hub import hf_hub_download

app = FastAPI(title="GPT-OSS-20B API")

# Set environment variables for Hugging Face cache
os.environ["HF_HOME"] = "/app/cache/huggingface"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/app/cache/huggingface/hub"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Model ID and local directory
MODEL_ID = "openai/gpt-oss-20b"
MODEL_DIR = "/app/gpt-oss-20b"

# Clear the cache directory if it already exists (e.g. stale lock files)
cache_dir = os.environ["HF_HOME"]
if os.path.exists(cache_dir):
    print(f"Clearing cache directory: {cache_dir}")
    for item in os.listdir(cache_dir):
        item_path = os.path.join(cache_dir, item)
        if os.path.isdir(item_path):
            shutil.rmtree(item_path, ignore_errors=True)
        elif os.path.exists(item_path):
            os.remove(item_path)

# Create cache and model directories
os.makedirs(cache_dir, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# Download model files
print("Downloading model files...")
try:
    for file in ["config.json", "dtypes.json", "model.safetensors"]:
        hf_hub_download(
            repo_id=MODEL_ID,
            filename=f"original/{file}",
            local_dir=MODEL_DIR,
            cache_dir=cache_dir,
        )
    print("Model files downloaded successfully.")
except Exception as e:
    raise RuntimeError(f"Failed to download model files: {e}")

# Fix config.json if model_type is missing or wrong
config_path = os.path.join(MODEL_DIR, "original/config.json")
try:
    with open(config_path, "r") as f:
        config = json.load(f)
    if config.get("model_type") != "gpt_oss":
        print("Fixing config.json: setting model_type to 'gpt_oss'")
        config["model_type"] = "gpt_oss"
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)
except Exception as e:
    print(f"Warning: Failed to check or fix config.json: {e}")

# Load tokenizer
print("Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,  # load directly from the Hub
        cache_dir=cache_dir,
        trust_remote_code=True,
    )
except Exception as e:
    raise RuntimeError(f"Failed to load tokenizer: {e}")

# Load model with CPU/disk offloading
print("Loading model (this may take several minutes)...")
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,                               # load directly from the Hub
        cache_dir=cache_dir,
        device_map="auto",                      # spread weights across available devices
        torch_dtype="auto",                     # automatic precision
        offload_folder="/app/offload",          # offload weights to disk
        max_memory={0: "14GB", "cpu": "15GB"},  # memory budget per device
        trust_remote_code=True,
    )
    print(f"Model loaded on: {model.device}")
    print(f"Model dtype: {model.dtype}")
except Exception as e:
    raise RuntimeError(f"Failed to load model: {e}")

# Enable gradient checkpointing to reduce memory usage
model.gradient_checkpointing_enable()

class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 256
    temperature: float = 0.7

@app.post("/chat")
async def chat_endpoint(request: ChatRequest):
    try:
        # Prepare input
        messages = [{"role": "user", "content": request.message}]
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)  # place inputs on the same device as the model's first layer

        # Generate response
        with torch.no_grad():
            generated = model.generate(
                **inputs,
                max_new_tokens=request.max_tokens,
                temperature=request.temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
            )

        # Decode only the newly generated tokens
        response = tokenizer.decode(
            generated[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True,
        )
        return {"response": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Free any cached GPU memory left over from model loading
if torch.cuda.is_available():
    torch.cuda.empty_cache()

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
```