dbmoradi60 committed
Commit 667d8e1 · verified · 1 Parent(s): cd5f64c

Update app.py

Files changed (1):
  1. app.py +25 -81

app.py CHANGED
@@ -1,24 +1,23 @@
  from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
- from transformers import AutoModelForCausalLM, AutoTokenizer
- import torch
+ from ctransformers import AutoModelForCausalLM
  import os
  import shutil
- import json
  from huggingface_hub import hf_hub_download

  app = FastAPI(title="GPT-OSS-20B API")

- # Set environment variables for Hugging Face cache
+ # Set environment variables
  os.environ["HF_HOME"] = "/app/cache/huggingface"
  os.environ["HUGGINGFACE_HUB_CACHE"] = "/app/cache/huggingface/hub"
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

  # Model ID and local directory
- MODEL_ID = "openai/gpt-oss-20b"
+ MODEL_ID = "unsloth/gpt-oss-20b-GGUF"
  MODEL_DIR = "/app/gpt-oss-20b"
+ MODEL_FILE = "gpt-oss-20b.Q4_K_M.gguf"  # Adjust based on actual filename

- # Clear cache directory if lock files exist
+ # Clear cache directory
  cache_dir = os.environ["HF_HOME"]
  if os.path.exists(cache_dir):
      print(f"Clearing cache directory: {cache_dir}")
@@ -29,68 +28,35 @@ if os.path.exists(cache_dir):
          else:
              os.remove(item_path) if os.path.exists(item_path) else None

- # Create cache and model directories
+ # Create directories
  os.makedirs(cache_dir, exist_ok=True)
  os.makedirs(MODEL_DIR, exist_ok=True)

- # Download model files
- print("Downloading model files...")
+ # Download model file
+ print("Downloading model file...")
  try:
-     for file in ["config.json", "dtypes.json", "model.safetensors"]:
-         hf_hub_download(
-             repo_id=MODEL_ID,
-             filename=f"original/{file}",
-             local_dir=MODEL_DIR,
-             cache_dir=cache_dir
-         )
-     print("Model files downloaded successfully.")
- except Exception as e:
-     raise RuntimeError(f"Failed to download model files: {str(e)}")
-
- # Fix config.json if model_type is missing
- config_path = os.path.join(MODEL_DIR, "original/config.json")
- try:
-     with open(config_path, "r") as f:
-         config = json.load(f)
-     if "model_type" not in config or config["model_type"] != "gpt_oss":
-         print("Fixing config.json: setting model_type to 'gpt_oss'")
-         config["model_type"] = "gpt_oss"
-         with open(config_path, "w") as f:
-             json.dump(config, f, indent=2)
- except Exception as e:
-     print(f"Warning: Failed to check or fix config.json: {str(e)}")
-
- # Load tokenizer
- print("Loading tokenizer...")
- try:
-     tokenizer = AutoTokenizer.from_pretrained(
-         MODEL_ID,  # Load directly from Hub
-         cache_dir=cache_dir,
-         trust_remote_code=True
+     hf_hub_download(
+         repo_id=MODEL_ID,
+         filename=MODEL_FILE,
+         local_dir=MODEL_DIR,
+         cache_dir=cache_dir
      )
+     print("Model file downloaded successfully.")
  except Exception as e:
-     raise RuntimeError(f"Failed to load tokenizer: {str(e)}")
+     raise RuntimeError(f"Failed to download model: {str(e)}")

- # Load model with CPU offloading
- print("Loading model (this may take several minutes)...")
+ # Load model
+ print("Loading model...")
  try:
      model = AutoModelForCausalLM.from_pretrained(
-         MODEL_ID,  # Load directly from Hub
-         cache_dir=cache_dir,
-         device_map="auto",  # Automatically place on CPU
-         torch_dtype="auto",  # Automatic precision
-         offload_folder="/app/offload",  # Offload weights to disk
-         max_memory={0: "14GB", "cpu": "15GB"},  # Adjusted memory constraints
-         trust_remote_code=True
+         MODEL_DIR,
+         model_type="gguf",
+         model_file=MODEL_FILE
      )
-     print(f"Model loaded on: {model.device}")
-     print(f"Model dtype: {model.dtype}")
+     print("Model loaded successfully.")
  except Exception as e:
      raise RuntimeError(f"Failed to load model: {str(e)}")

- # Enable gradient checkpointing to reduce memory usage
- model.gradient_checkpointing_enable()
-
  class ChatRequest(BaseModel):
      message: str
      max_tokens: int = 256
@@ -99,38 +65,16 @@ class ChatRequest(BaseModel):
  @app.post("/chat")
  async def chat_endpoint(request: ChatRequest):
      try:
-         # Prepare input
-         messages = [{"role": "user", "content": request.message}]
-         inputs = tokenizer.apply_chat_template(
-             messages,
-             add_generation_prompt=True,
-             return_tensors="pt",
-             return_dict=True
-         ).to("cpu")
-
          # Generate response
-         with torch.no_grad():
-             generated = model.generate(
-                 **inputs,
-                 max_new_tokens=request.max_tokens,
-                 temperature=request.temperature,
-                 do_sample=True,
-                 pad_token_id=tokenizer.eos_token_id,
-                 repetition_penalty=1.1
-             )
-
-         # Decode response
-         response = tokenizer.decode(
-             generated[0][inputs["input_ids"].shape[-1]:],
-             skip_special_tokens=True
+         response = model(
+             request.message,
+             max_new_tokens=request.max_tokens,
+             temperature=request.temperature
          )
          return {"response": response}
      except Exception as e:
          raise HTTPException(status_code=500, detail=str(e))

- # Clear cache regularly to manage memory
- torch.cuda.empty_cache()
-
  if __name__ == "__main__":
      import uvicorn
      uvicorn.run(app, host="0.0.0.0", port=8000)
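Note on MODEL_FILE: the new code hardcodes the GGUF filename with a "# Adjust based on actual filename" caveat. A minimal sketch for checking the real filename before hardcoding one, using huggingface_hub's list_repo_files; the quantization name used elsewhere in this commit is an assumption, not a verified file in the repo:

from huggingface_hub import list_repo_files

# List every GGUF file published in the repo and pick one as MODEL_FILE.
files = list_repo_files("unsloth/gpt-oss-20b-GGUF")
gguf_files = [f for f in files if f.endswith(".gguf")]
print(gguf_files)  # choose a quantization, e.g. a Q4_K_M variant if present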
 
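Note on the load step: in ctransformers, model_type conventionally names the architecture (for example "llama" or "gpt2") rather than the container format, so model_type="gguf" may be rejected. A hedged variant omits it and relies on the GGUF file's own metadata; whether ctransformers can infer, or supports, the gpt-oss architecture at all is an assumption to verify:

from ctransformers import AutoModelForCausalLM

# model_type omitted: let ctransformers try to read the architecture from
# the GGUF metadata. Unverified assumption for gpt-oss; if loading fails,
# a llama.cpp-based loader is the usual fallback for GGUF files.
model = AutoModelForCausalLM.from_pretrained(
    "/app/gpt-oss-20b",                    # MODEL_DIR from the script above
    model_file="gpt-oss-20b.Q4_K_M.gguf",  # MODEL_FILE from the script above
)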
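Once the server is running, the /chat endpoint can be exercised with a small client. A hypothetical call, assuming the container exposes port 8000 on localhost:

import requests

# Matches the ChatRequest schema: message is required; max_tokens and
# temperature fall back to server-side defaults if omitted.
resp = requests.post(
    "http://localhost:8000/chat",
    json={"message": "Hello!", "max_tokens": 128, "temperature": 0.7},
    timeout=300,  # CPU generation on a 20B model can be slow
)
resp.raise_for_status()
print(resp.json()["response"])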