wu981526092 committed on
Commit
4d77f4f
·
1 Parent(s): fea46d2

Deploy Edge LLM to Hugging Face Space

Browse files
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # You will also find guides on how best to write your Dockerfile
3
+
4
+ FROM python:3.9
5
+
6
+ RUN useradd -m -u 1000 user
7
+ USER user
8
+ ENV PATH="/home/user/.local/bin:$PATH"
9
+
10
+ WORKDIR /app
11
+
12
+ COPY --chown=user ./requirements.txt requirements.txt
13
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
+
15
+ COPY --chown=user . /app
16
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.staticfiles import StaticFiles
4
+ from fastapi.responses import FileResponse
5
+ from pydantic import BaseModel
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer
7
+ import torch
8
+ from typing import Optional, Dict, Any
9
+ import os
10
+
11
# ASGI application object; every route below is registered on it.
app = FastAPI(title="Edge LLM API")

# Enable CORS for Hugging Face Space.
# Origins, methods and headers are wildcards, so credentials must stay
# disabled: the FastAPI CORS documentation states that none of those three
# may be ["*"] when allow_credentials is True. Nothing in this file reads
# cookies or Authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins for HF Space
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount static files (the bundled frontend assets referenced by index.html)
app.mount("/assets", StaticFiles(directory="static/assets"), name="assets")
24
+
25
# Catalogue of models this API is willing to load, keyed by Hugging Face
# repo id. Each entry carries display metadata plus a flag telling whether
# the model emits a separate reasoning section.
AVAILABLE_MODELS = {
    "Qwen/Qwen3-4B-Thinking-2507": dict(
        name="Qwen3-4B-Thinking-2507",
        supports_thinking=True,
        description="Shows thinking process",
        size_gb="~8GB",
    ),
    "Qwen/Qwen3-4B-Instruct-2507": dict(
        name="Qwen3-4B-Instruct-2507",
        supports_thinking=False,
        description="Direct instruction following",
        size_gb="~8GB",
    ),
}

# Loaded models keyed by repo id; each value holds a "model" and a
# "tokenizer" object.
models_cache: Dict[str, Dict[str, Any]] = dict()

# Repo id of the model used when a request does not name one.
current_model_name = None  # No model loaded by default
44
+
45
class PromptRequest(BaseModel):
    """Request body for ``POST /generate``."""

    # User message sent to the model.
    prompt: str
    # Optional system message placed ahead of the user message.
    system_prompt: Optional[str] = None
    # Repo id of the model to use; the current model is used when omitted.
    model_name: Optional[str] = None
    # Sampling temperature handed to generation.
    temperature: Optional[float] = 0.7
    # Cap on the number of newly generated tokens.
    max_new_tokens: Optional[int] = 1024


class PromptResponse(BaseModel):
    """Response body for ``POST /generate``."""

    # Reasoning text extracted from the output (empty when none was found).
    thinking_content: str
    # Final answer text.
    content: str
    # Repo id of the model that produced the answer.
    model_used: str
    # Copied from the model's catalogue entry.
    supports_thinking: bool


class ModelInfo(BaseModel):
    """One catalogue entry as reported by ``GET /models``."""

    # Hugging Face repo id (the catalogue key).
    model_name: str
    # Short display name.
    name: str
    supports_thinking: bool
    description: str
    # Human-readable size label, e.g. "~8GB".
    size_gb: str
    # True when the model is currently held in the cache.
    is_loaded: bool


class ModelsResponse(BaseModel):
    """Response body for ``GET /models``."""

    models: list[ModelInfo]
    # Repo id of the active model, or "" when none is selected.
    current_model: str


class ModelLoadRequest(BaseModel):
    """Request body naming the model to load or to make current."""

    model_name: str


class ModelUnloadRequest(BaseModel):
    """Request body naming the model to unload."""

    model_name: str
75
+
76
def load_model_by_name(model_name: str):
    """Make sure ``model_name`` is present in ``models_cache``.

    Returns ``True`` when the model was already cached or has just been
    loaded, and ``False`` when the name is not in ``AVAILABLE_MODELS`` or
    loading raised an exception (the error is printed, not re-raised).
    """
    already_cached = model_name in models_cache
    if already_cached:
        return True

    is_known = model_name in AVAILABLE_MODELS
    if not is_known:
        return False

    try:
        print(f"Loading model: {model_name}")
        tok = AutoTokenizer.from_pretrained(model_name)
        net = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        # Only register the pair once both pieces loaded without error.
        models_cache[model_name] = {"model": net, "tokenizer": tok}
        print(f"Model {model_name} loaded successfully")
    except Exception as err:
        print(f"Error loading model {model_name}: {err}")
        return False
    return True
104
+
105
def unload_model_by_name(model_name: str):
    """Remove a model from the cache and release the memory it was holding.

    Args:
        model_name: Repo id used as the key in ``models_cache``.

    Returns:
        ``True`` if the model was cached and has been removed, ``False`` if
        it was not in the cache.

    Side effects:
        Resets ``current_model_name`` to ``None`` when the unloaded model was
        the current one.
    """
    global current_model_name

    import gc  # local import: only this function needs it

    entry = models_cache.pop(model_name, None)
    if entry is None:
        return False

    if current_model_name == model_name:
        current_model_name = None

    # Dropping the cache entry alone does not promptly give the memory back:
    # reference cycles keep the weights alive until a collection runs, and
    # the CUDA caching allocator keeps freed blocks reserved until asked to
    # release them.
    del entry
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"Model {model_name} unloaded")
    return True
116
+
117
@app.on_event("startup")
async def startup_event():
    """Print the startup banner; no model is loaded here."""
    banner = (
        "🚀 Edge LLM API is starting up...",
        "💡 Models will be loaded on demand",
    )
    for line in banner:
        print(line)
122
+
123
@app.get("/")
async def read_index():
    """Return the frontend's entry page for the site root."""
    index_page = 'static/index.html'
    return FileResponse(index_page)
127
+
128
@app.get("/health")
async def health_check():
    """Return a fixed payload signalling that the API process is up."""
    payload = dict(status="healthy", message="Edge LLM API is running")
    return payload
131
+
132
@app.get("/models", response_model=ModelsResponse)
async def get_models():
    """List every catalogue model with its load status and the active model.

    ``current_model`` is the empty string when no model is selected.
    """
    catalogue = [
        ModelInfo(
            model_name=repo_id,
            name=meta["name"],
            supports_thinking=meta["supports_thinking"],
            description=meta["description"],
            size_gb=meta["size_gb"],
            is_loaded=repo_id in models_cache,
        )
        for repo_id, meta in AVAILABLE_MODELS.items()
    ]
    active = current_model_name or ""
    return ModelsResponse(models=catalogue, current_model=active)
152
+
153
@app.post("/load-model")
async def load_model(request: ModelLoadRequest):
    """Load the requested model and make it the current one.

    Raises:
        HTTPException: 400 when the model is not in the catalogue, 500 when
            loading fails.
    """
    global current_model_name

    requested = request.model_name
    if requested not in AVAILABLE_MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Model {requested} not available",
        )

    # NOTE(review): this is a plain synchronous call inside an async handler,
    # so the handler does not yield until loading has finished.
    if not load_model_by_name(requested):
        raise HTTPException(
            status_code=500,
            detail=f"Failed to load model {requested}",
        )

    current_model_name = requested
    return {
        "message": f"Model {requested} loaded successfully",
        "current_model": current_model_name,
    }
176
+
177
@app.post("/unload-model")
async def unload_model(request: ModelUnloadRequest):
    """Drop the requested model from the cache.

    Raises:
        HTTPException: 404 when the model is not currently cached.
    """
    target = request.model_name
    removed = unload_model_by_name(target)
    if not removed:
        raise HTTPException(
            status_code=404,
            detail=f"Model {target} not found in cache",
        )

    # Read the current model only after unloading, since unloading may have
    # reset it.
    return {
        "message": f"Model {target} unloaded successfully",
        "current_model": current_model_name or "",
    }
193
+
194
@app.post("/set-current-model")
async def set_current_model(request: ModelLoadRequest):
    """Select which already-loaded model serves requests that name no model.

    Raises:
        HTTPException: 400 when the model has not been loaded yet.
    """
    global current_model_name

    chosen = request.model_name
    is_loaded = chosen in models_cache
    if not is_loaded:
        raise HTTPException(
            status_code=400,
            detail=f"Model {chosen} is not loaded. Please load it first.",
        )

    current_model_name = chosen
    response = {
        "message": f"Current model set to {current_model_name}",
        "current_model": current_model_name,
    }
    return response
210
+
211
# Opening/closing tag pairs that may delimit a model's reasoning section.
# Qwen3 models use <think>...</think>; <thinking> is kept for compatibility
# with the tags this endpoint originally looked for.
_THINKING_TAG_PAIRS = (
    ("<think>", "</think>"),
    ("<thinking>", "</thinking>"),
)


def _split_thinking(generated_text: str):
    """Split decoded output into ``(thinking, answer)``.

    The split happens at the last closing tag, mirroring the parsing recipe
    in the Qwen3-Thinking model card. The opening tag is optional: per that
    model card the chat template already emits it as part of the prompt, so
    the generated text may contain only the closing tag.

    Returns ``("", generated_text)`` unchanged when no closing tag is found.
    """
    for open_tag, close_tag in _THINKING_TAG_PAIRS:
        reasoning, found, answer = generated_text.rpartition(close_tag)
        if not found:
            continue
        if open_tag in reasoning:
            # Keep only what follows the opening tag.
            reasoning = reasoning.partition(open_tag)[2]
        return reasoning.strip(), answer.strip()
    return "", generated_text


@app.post("/generate", response_model=PromptResponse)
async def generate_text(request: PromptRequest):
    """Generate text using a loaded model.

    The model named in the request is used, falling back to the current
    model. For models flagged ``supports_thinking`` the reasoning section is
    separated from the final answer.

    Raises:
        HTTPException: 400 when no model is specified or the model is not
            loaded; 500 when generation fails.
    """
    # Use the model specified in request, or fall back to current model
    model_to_use = request.model_name or current_model_name

    if not model_to_use:
        raise HTTPException(
            status_code=400,
            detail="No model specified. Please load a model first."
        )

    if model_to_use not in models_cache:
        raise HTTPException(
            status_code=400,
            detail=f"Model {model_to_use} is not loaded. Please load it first."
        )

    try:
        cached = models_cache[model_to_use]
        model = cached["model"]
        tokenizer = cached["tokenizer"]
        model_info = AVAILABLE_MODELS[model_to_use]

        # Build the prompt
        messages = []
        if request.system_prompt:
            messages.append({"role": "system", "content": request.system_prompt})
        messages.append({"role": "user", "content": request.prompt})

        # Apply chat template
        formatted_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize
        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

        # Only forward options the caller actually set, so an explicit null
        # in the request is not passed to generate() as None.
        generation_kwargs = {"pad_token_id": tokenizer.eos_token_id}
        if request.max_new_tokens is not None:
            generation_kwargs["max_new_tokens"] = request.max_new_tokens
        if request.temperature is None:
            # Sample with the model's own default temperature.
            generation_kwargs["do_sample"] = True
        elif request.temperature > 0:
            generation_kwargs["do_sample"] = True
            generation_kwargs["temperature"] = request.temperature
        else:
            # Sampling requires a strictly positive temperature; treat zero
            # (or below) as a request for greedy decoding instead of failing.
            generation_kwargs["do_sample"] = False

        # Generate
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        # Decode only the newly generated tokens (skip the echoed prompt)
        prompt_length = inputs["input_ids"].shape[1]
        generated_text = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

        # Parse thinking vs final content for thinking models
        thinking_content = ""
        final_content = generated_text
        if model_info["supports_thinking"]:
            thinking_content, final_content = _split_thinking(generated_text)

        return PromptResponse(
            thinking_content=thinking_content,
            content=final_content,
            model_used=model_to_use,
            supports_thinking=model_info["supports_thinking"]
        )

    except Exception as e:
        print(f"Generation error: {e}")
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}") from e
289
+
290
# Script entry point: serve the app directly when this file is executed.
if __name__ == "__main__":
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi>=0.104.1
2
+ uvicorn>=0.24.0
3
+ transformers>=4.51.0
4
+ torch>=2.1.0
5
+ accelerate>=0.24.0
6
+ pydantic>=2.5.0
7
+ python-multipart>=0.0.6
static/assets/index-5d859784.css ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Tailwind CSS styles - simplified for HF Space */
2
+ *,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}
3
+ :before,:after{--tw-content: ""}
4
+ html,:host{line-height:1.5;-webkit-text-size-adjust:100%;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji"}
5
+ body{margin:0;line-height:inherit;background-color:#f9fafb;color:#111827}
6
+ :root{--background: 0 0% 100%;--foreground: 0 0% 3.9%;--card: 0 0% 100%;--card-foreground: 0 0% 3.9%;--popover: 0 0% 100%;--popover-foreground: 0 0% 3.9%;--primary: 0 0% 9%;--primary-foreground: 0 0% 98%;--secondary: 0 0% 96.1%;--secondary-foreground: 0 0% 9%;--muted: 0 0% 96.1%;--muted-foreground: 0 0% 45.1%;--accent: 0 0% 96.1%;--accent-foreground: 0 0% 9%;--destructive: 0 84.2% 60.2%;--destructive-foreground: 0 0% 98%;--border: 0 0% 89.8%;--input: 0 0% 89.8%;--ring: 0 0% 3.9%;--radius: .5rem}
7
+ *{border-color:hsl(var(--border))}
8
+ body{background-color:hsl(var(--background));color:hsl(var(--foreground))}
9
+ .container{width:100%;margin:0 auto;padding:0 1rem}
10
+ .flex{display:flex}
11
+ .hidden{display:none}
12
+ .items-center{align-items:center}
13
+ .justify-center{justify-content:center}
14
+ .gap-4{gap:1rem}
15
+ .rounded{border-radius:.25rem}
16
+ .bg-primary{background-color:hsl(var(--primary))}
17
+ .text-primary-foreground{color:hsl(var(--primary-foreground))}
18
+ .p-4{padding:1rem}
19
+ .text-center{text-align:center}
20
+ .text-2xl{font-size:1.5rem;line-height:2rem}
21
+ .font-bold{font-weight:700}
22
+ .min-h-screen{min-height:100vh}
static/assets/index-5d859784.css~ ADDED
File without changes
static/assets/index-9cfccc0c.js ADDED
The diff for this file is too large to render. See raw diff
 
static/index.html ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <link rel="icon" type="image/svg+xml" href="/vite.svg" />
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
+ <title>Edge LLM</title>
8
+ <script type="module" crossorigin src="/assets/index-9cfccc0c.js"></script>
9
+ <link rel="stylesheet" crossorigin href="/assets/index-5d859784.css">
10
+ </head>
11
+ <body>
12
+ <div id="root"></div>
13
+ </body>
14
+ </html>