dbmoradi60 committed
Commit 667d8e1 · verified · 1 Parent(s): cd5f64c

Update app.py

Files changed (1):
  1. app.py +25 -81

app.py CHANGED
@@ -1,24 +1,23 @@
  from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
- from transformers import AutoModelForCausalLM, AutoTokenizer
- import torch
+ from ctransformers import AutoModelForCausalLM
  import os
  import shutil
- import json
  from huggingface_hub import hf_hub_download

  app = FastAPI(title="GPT-OSS-20B API")

- # Set environment variables for Hugging Face cache
+ # Set environment variables
  os.environ["HF_HOME"] = "/app/cache/huggingface"
  os.environ["HUGGINGFACE_HUB_CACHE"] = "/app/cache/huggingface/hub"
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

  # Model ID and local directory
- MODEL_ID = "openai/gpt-oss-20b"
+ MODEL_ID = "unsloth/gpt-oss-20b-GGUF"
  MODEL_DIR = "/app/gpt-oss-20b"
+ MODEL_FILE = "gpt-oss-20b.Q4_K_M.gguf"  # Adjust based on actual filename

- # Clear cache directory if lock files exist
+ # Clear cache directory
  cache_dir = os.environ["HF_HOME"]
  if os.path.exists(cache_dir):
      print(f"Clearing cache directory: {cache_dir}")
@@ -29,68 +28,35 @@ if os.path.exists(cache_dir):
          else:
              os.remove(item_path) if os.path.exists(item_path) else None

- # Create cache and model directories
+ # Create directories
  os.makedirs(cache_dir, exist_ok=True)
  os.makedirs(MODEL_DIR, exist_ok=True)

- # Download model files
- print("Downloading model files...")
+ # Download model file
+ print("Downloading model file...")
  try:
-     for file in ["config.json", "dtypes.json", "model.safetensors"]:
-         hf_hub_download(
-             repo_id=MODEL_ID,
-             filename=f"original/{file}",
-             local_dir=MODEL_DIR,
-             cache_dir=cache_dir
-         )
-     print("Model files downloaded successfully.")
- except Exception as e:
-     raise RuntimeError(f"Failed to download model files: {str(e)}")
-
- # Fix config.json if model_type is missing
- config_path = os.path.join(MODEL_DIR, "original/config.json")
- try:
-     with open(config_path, "r") as f:
-         config = json.load(f)
-     if "model_type" not in config or config["model_type"] != "gpt_oss":
-         print("Fixing config.json: setting model_type to 'gpt_oss'")
-         config["model_type"] = "gpt_oss"
-         with open(config_path, "w") as f:
-             json.dump(config, f, indent=2)
- except Exception as e:
-     print(f"Warning: Failed to check or fix config.json: {str(e)}")
-
- # Load tokenizer
- print("Loading tokenizer...")
- try:
-     tokenizer = AutoTokenizer.from_pretrained(
-         MODEL_ID,  # Load directly from Hub
-         cache_dir=cache_dir,
-         trust_remote_code=True
+     hf_hub_download(
+         repo_id=MODEL_ID,
+         filename=MODEL_FILE,
+         local_dir=MODEL_DIR,
+         cache_dir=cache_dir
      )
+     print("Model file downloaded successfully.")
  except Exception as e:
-     raise RuntimeError(f"Failed to load tokenizer: {str(e)}")
+     raise RuntimeError(f"Failed to download model: {str(e)}")

- # Load model with CPU offloading
- print("Loading model (this may take several minutes)...")
+ # Load model
+ print("Loading model...")
  try:
      model = AutoModelForCausalLM.from_pretrained(
-         MODEL_ID,  # Load directly from Hub
-         cache_dir=cache_dir,
-         device_map="auto",  # Automatically place on CPU
-         torch_dtype="auto",  # Automatic precision
-         offload_folder="/app/offload",  # Offload weights to disk
-         max_memory={0: "14GB", "cpu": "15GB"},  # Adjusted memory constraints
-         trust_remote_code=True
+         MODEL_DIR,
+         model_type="gguf",
+         model_file=MODEL_FILE
      )
-     print(f"Model loaded on: {model.device}")
-     print(f"Model dtype: {model.dtype}")
+     print("Model loaded successfully.")
  except Exception as e:
      raise RuntimeError(f"Failed to load model: {str(e)}")

- # Enable gradient checkpointing to reduce memory usage
- model.gradient_checkpointing_enable()
-
  class ChatRequest(BaseModel):
      message: str
      max_tokens: int = 256
@@ -99,38 +65,16 @@ class ChatRequest(BaseModel):
  @app.post("/chat")
  async def chat_endpoint(request: ChatRequest):
      try:
-         # Prepare input
-         messages = [{"role": "user", "content": request.message}]
-         inputs = tokenizer.apply_chat_template(
-             messages,
-             add_generation_prompt=True,
-             return_tensors="pt",
-             return_dict=True
-         ).to("cpu")
-
          # Generate response
-         with torch.no_grad():
-             generated = model.generate(
-                 **inputs,
-                 max_new_tokens=request.max_tokens,
-                 temperature=request.temperature,
-                 do_sample=True,
-                 pad_token_id=tokenizer.eos_token_id,
-                 repetition_penalty=1.1
-             )
-
-         # Decode response
-         response = tokenizer.decode(
-             generated[0][inputs["input_ids"].shape[-1]:],
-             skip_special_tokens=True
+         response = model(
+             request.message,
+             max_new_tokens=request.max_tokens,
+             temperature=request.temperature
          )
          return {"response": response}
      except Exception as e:
          raise HTTPException(status_code=500, detail=str(e))

- # Clear cache regularly to manage memory
- torch.cuda.empty_cache()
-
  if __name__ == "__main__":
      import uvicorn
      uvicorn.run(app, host="0.0.0.0", port=8000)
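Note on MODEL_FILE: the new code hardcodes the GGUF filename with a "# Adjust based on actual filename" caveat. A minimal sketch for checking the real filename before hardcoding one, using huggingface_hub's list_repo_files; the quantization name used elsewhere in this commit is an assumption, not a verified file in the repo:

from huggingface_hub import list_repo_files

# List every GGUF file published in the repo and pick one as MODEL_FILE.
files = list_repo_files("unsloth/gpt-oss-20b-GGUF")
gguf_files = [f for f in files if f.endswith(".gguf")]
print(gguf_files)  # choose a quantization, e.g. a Q4_K_M variant if present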
 
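Note on the load step: in ctransformers, model_type conventionally names the architecture (for example "llama" or "gpt2") rather than the container format, so model_type="gguf" may be rejected. A hedged variant omits it and relies on the GGUF file's own metadata; whether ctransformers can infer, or supports, the gpt-oss architecture at all is an assumption to verify:

from ctransformers import AutoModelForCausalLM

# model_type omitted: let ctransformers try to read the architecture from
# the GGUF metadata. Unverified assumption for gpt-oss; if loading fails,
# a llama.cpp-based loader is the usual fallback for GGUF files.
model = AutoModelForCausalLM.from_pretrained(
    "/app/gpt-oss-20b",                    # MODEL_DIR from the script above
    model_file="gpt-oss-20b.Q4_K_M.gguf",  # MODEL_FILE from the script above
)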
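Once the server is running, the /chat endpoint can be exercised with a small client. A hypothetical call, assuming the container exposes port 8000 on localhost:

import requests

# Matches the ChatRequest schema: message is required; max_tokens and
# temperature fall back to server-side defaults if omitted.
resp = requests.post(
    "http://localhost:8000/chat",
    json={"message": "Hello!", "max_tokens": 128, "temperature": 0.7},
    timeout=300,  # CPU generation on a 20B model can be slow
)
resp.raise_for_status()
print(resp.json()["response"])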