Semnykcz committed on
Commit d342ec0 · verified · 1 Parent(s): c0f04fe

Upload 2 files

Files changed (2):
  1. app.py +233 -94
  2. requirements.txt +8 -9
app.py CHANGED
@@ -1,8 +1,7 @@
  #!/usr/bin/env python3
  """
- AI Chat Application for HuggingFace Spaces
- Integration with Qwen/Qwen3-Coder-30B-A3B-Instruct model
- OPENAI API compatibility features
  """

  import os
@@ -13,31 +12,165 @@ import time
  from typing import Optional, Dict, Any, Generator, List
  import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
- import gradio as gr
  from fastapi import FastAPI, HTTPException, Response
- from fastapi.responses import StreamingResponse
  from fastapi.staticfiles import StaticFiles
  from fastapi.middleware.cors import CORSMiddleware
- import redis
  import asyncio
  import threading
  from threading import Thread
-
- # Import utility modules
- from utils.model_utils import ModelManager
- from utils.conversation import ConversationManager
- from utils.api_compat import ChatRequest, ChatResponse, convert_openai_request_to_model_input, create_openai_response, format_messages_for_frontend

  # Configure logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- # Initialize managers
- conversation_manager = ConversationManager()
- model_manager = ModelManager()

- # FastAPI app for OPENAI API compatibility
- app = FastAPI(title="AI Chat API", description="OPENAI API compatible interface for Qwen model")

  # Add CORS middleware
  app.add_middleware(
@@ -48,94 +181,100 @@ app.add_middleware(
      allow_headers=["*"],
  )

- # Mount static files
- app.mount("/public", StaticFiles(directory="public"), name="public")

- @app.head("/ping")
- @app.get("/ping")
  async def health_check():
-     """Health check endpoint for connection monitoring"""
-     return {"status": "ok", "timestamp": time.time()}

- @app.post("/v1/chat/completions", response_model=ChatResponse)
  async def chat_completion(request: ChatRequest):
-     """OPENAI API compatible chat completion endpoint"""
      try:
-         # Convert messages to prompt
-         prompt = convert_openai_request_to_model_input(request)
-
-         # Generate response
-         response_text = model_manager.generate_response(
-             prompt,
-             request.max_tokens or 1024,
-             request.temperature or 0.7
-         )
-
-         # Return in OPENAI format
-         return create_openai_response(response_text, request)
      except Exception as e:
          logger.error(f"Error in chat completion: {e}")
          raise HTTPException(status_code=500, detail=str(e))

- @app.post("/chat")
- async def chat_endpoint(request: dict):
-     """Endpoint for frontend chat interface"""
-     try:
-         message = request.get("message", "")
-         history = request.get("history", [])
-
-         # Convert history to prompt
-         prompt = ""
-         for msg in history:
-             if msg["role"] == "user":
-                 prompt += f"User: {msg['content']}\n"
-             elif msg["role"] == "assistant":
-                 prompt += f"Assistant: {msg['content']}\n"
-         prompt += f"User: {message}\nAssistant:"
-
-         # Return streaming response
-         return StreamingResponse(
-             model_manager.generate_streaming_response(prompt),
-             media_type="text/plain"
-         )
-     except Exception as e:
-         logger.error(f"Error in chat endpoint: {e}")
-         raise HTTPException(status_code=500, detail=str(e))
-
- # Gradio interface
- def predict(message, history):
-     """Gradio prediction function"""
-     # Convert history to prompt
-     prompt = ""
-     for human, ai in history:
-         prompt += f"User: {human}\nAssistant: {ai}\n"
-     prompt += f"User: {message}\nAssistant:"
-
-     # Generate response
-     response = model_manager.generate_response(prompt)
-     return response
-
- # Create Gradio interface
- gradio_interface = gr.ChatInterface(
-     fn=predict,
-     title="AI Chat with Qwen Coder",
-     description="Chat with Qwen/Qwen3-Coder-30B-A3B-Instruct model",
-     examples=[
-         ["Hello, how are you today?"],
-         ["Can you explain quantum computing in simple terms?"],
-         ["Write a Python function to calculate Fibonacci numbers"]
-     ],
-     cache_examples=False
- )

- # Mount Gradio interface
- app.mount("/", gradio_interface.app)

- # Run the app
- def launch_app():
-     """Launch the combined FastAPI and Gradio app"""
-     import uvicorn
-     uvicorn.run(app, host="0.0.0.0", port=7860)

  if __name__ == "__main__":
-     launch_app()

  #!/usr/bin/env python3
  """
+ AI Chat Application - Pure FastAPI Backend
+ Serves custom frontend with OpenAI compatible API
  """

  import os

  from typing import Optional, Dict, Any, Generator, List
  import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
  from fastapi import FastAPI, HTTPException, Response
+ from fastapi.responses import StreamingResponse, FileResponse
  from fastapi.staticfiles import StaticFiles
  from fastapi.middleware.cors import CORSMiddleware
  import asyncio
  import threading
  from threading import Thread
+ from pydantic import BaseModel

  # Configure logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # Pydantic models for API requests/responses
+ class ChatMessage(BaseModel):
+     role: str
+     content: str
+
+ class ChatRequest(BaseModel):
+     messages: List[ChatMessage]
+     model: Optional[str] = "qwen-coder-3-30b"
+     temperature: Optional[float] = 0.7
+     max_tokens: Optional[int] = 2048
+     stream: Optional[bool] = False
+
+ class ChatResponse(BaseModel):
+     id: str
+     object: str = "chat.completion"
+     created: int
+     model: str
+     choices: List[Dict[str, Any]]
+
+ # Global model variables
+ tokenizer = None
+ model = None

+ def load_model():
+     """Load the Qwen model and tokenizer"""
+     global tokenizer, model
+
+     try:
+         model_name = "Qwen/Qwen3-Coder-30B-A3B-Instruct"  # Adjust model name as needed
+
+         logger.info(f"Loading model: {model_name}")
+         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+         model = AutoModelForCausalLM.from_pretrained(
+             model_name,
+             torch_dtype=torch.float16,
+             device_map="auto",
+             trust_remote_code=True
+         )
+         logger.info("Model loaded successfully")
+
+     except Exception as e:
+         logger.error(f"Error loading model: {e}")
+         # For development/testing, use a fallback
+         logger.warning("Using fallback model response")
+
+ def generate_response(messages: List[ChatMessage], temperature: float = 0.7, max_tokens: int = 2048):
+     """Generate response from the model"""
+     try:
+         if model is None or tokenizer is None:
+             # Fallback response for development
+             return "I'm a Qwen AI assistant. The model is currently loading, please try again in a moment."
+
+         # Format messages for the model
+         formatted_messages = []
+         for msg in messages:
+             formatted_messages.append({"role": msg.role, "content": msg.content})
+
+         # Apply chat template
+         text = tokenizer.apply_chat_template(
+             formatted_messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+
+         # Tokenize
+         inputs = tokenizer(text, return_tensors="pt").to(model.device)
+
+         # Generate
+         with torch.no_grad():
+             outputs = model.generate(
+                 **inputs,
+                 max_new_tokens=max_tokens,
+                 temperature=temperature,
+                 do_sample=True,
+                 pad_token_id=tokenizer.eos_token_id
+             )
+
+         # Decode response
+         response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+         return response.strip()
+
+     except Exception as e:
+         logger.error(f"Error generating response: {e}")
+         return f"I apologize, but I encountered an error while processing your request: {str(e)}"
+
+ def generate_streaming_response(messages: List[ChatMessage], temperature: float = 0.7, max_tokens: int = 2048):
+     """Generate streaming response from the model"""
+     try:
+         if model is None or tokenizer is None:
+             # Fallback streaming response
+             response = "I'm a Qwen AI assistant. The model is currently loading, please try again in a moment."
+             for char in response:
+                 yield f"data: {json.dumps({'choices': [{'delta': {'content': char}}]})}\n\n"
+                 time.sleep(0.05)
+             yield f"data: {json.dumps({'choices': [{'finish_reason': 'stop'}]})}\n\n"
+             yield "data: [DONE]\n\n"
+             return
+
+         # Format messages
+         formatted_messages = []
+         for msg in messages:
+             formatted_messages.append({"role": msg.role, "content": msg.content})
+
+         # Apply chat template
+         text = tokenizer.apply_chat_template(
+             formatted_messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+
+         # Tokenize
+         inputs = tokenizer(text, return_tensors="pt").to(model.device)
+
+         # Setup streaming
+         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+         generation_kwargs = {
+             **inputs,
+             "max_new_tokens": max_tokens,
+             "temperature": temperature,
+             "do_sample": True,
+             "pad_token_id": tokenizer.eos_token_id,
+             "streamer": streamer
+         }
+
+         # Start generation in a thread
+         thread = Thread(target=model.generate, kwargs=generation_kwargs)
+         thread.start()
+
+         # Stream the response
+         for new_text in streamer:
+             if new_text:
+                 yield f"data: {json.dumps({'choices': [{'delta': {'content': new_text}}]})}\n\n"
+
+         yield f"data: {json.dumps({'choices': [{'finish_reason': 'stop'}]})}\n\n"
+         yield "data: [DONE]\n\n"
+
+     except Exception as e:
+         logger.error(f"Error in streaming generation: {e}")
+         error_msg = f"Error: {str(e)}"
+         yield f"data: {json.dumps({'choices': [{'delta': {'content': error_msg}}]})}\n\n"
+         yield f"data: {json.dumps({'choices': [{'finish_reason': 'stop'}]})}\n\n"
+         yield "data: [DONE]\n\n"
+
+ # FastAPI app
+ app = FastAPI(title="AI Chat API", description="OpenAI compatible interface for Qwen model")

  # Add CORS middleware
  app.add_middleware(

      allow_headers=["*"],
  )

+ # API endpoints
+ @app.get("/")
+ async def serve_index():
+     """Serve the main HTML file"""
+     return FileResponse("public/index.html")

+ @app.get("/health")
  async def health_check():
+     """Health check endpoint"""
+     return {"status": "healthy", "model_loaded": model is not None}
+
+ @app.get("/ping")
+ async def ping():
+     """Simple ping endpoint"""
+     return {"status": "pong"}
+
+ @app.get("/api/models")
+ async def list_models():
+     """List available models"""
+     return {
+         "data": [
+             {
+                 "id": "qwen-coder-3-30b",
+                 "object": "model",
+                 "created": int(time.time()),
+                 "owned_by": "qwen"
+             }
+         ]
+     }

+ @app.post("/api/chat")
  async def chat_completion(request: ChatRequest):
+     """OpenAI compatible chat completion endpoint"""
      try:
+         if request.stream:
+             return StreamingResponse(
+                 generate_streaming_response(
+                     request.messages,
+                     request.temperature or 0.7,
+                     request.max_tokens or 2048
+                 ),
+                 media_type="text/plain"
+             )
+         else:
+             response_content = generate_response(
+                 request.messages,
+                 request.temperature or 0.7,
+                 request.max_tokens or 2048
+             )
+
+             return ChatResponse(
+                 id=f"chatcmpl-{int(time.time())}",
+                 created=int(time.time()),
+                 model=request.model or "qwen-coder-3-30b",
+                 choices=[{
+                     "index": 0,
+                     "message": {
+                         "role": "assistant",
+                         "content": response_content
+                     },
+                     "finish_reason": "stop"
+                 }]
+             )
+
      except Exception as e:
          logger.error(f"Error in chat completion: {e}")
          raise HTTPException(status_code=500, detail=str(e))

+ @app.post("/v1/chat/completions")
+ async def openai_chat_completion(request: ChatRequest):
+     """OpenAI API compatible endpoint"""
+     return await chat_completion(request)

+ # Mount static files AFTER API routes
+ app.mount("/", StaticFiles(directory="public", html=True), name="static")

+ # Startup event
+ @app.on_event("startup")
+ async def startup_event():
+     """Initialize the model on startup"""
+     # Load model in background thread to avoid blocking startup
+     thread = Thread(target=load_model)
+     thread.daemon = True
+     thread.start()

  if __name__ == "__main__":
+     import uvicorn
+
+     # For Hugging Face Spaces
+     port = int(os.environ.get("PORT", 7860))
+
+     uvicorn.run(
+         app,
+         host="0.0.0.0",
+         port=port,
+         access_log=True
+     )
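
The new backend keeps an OpenAI-style request and response shape, so the non-streaming endpoints can be exercised with any HTTP client. A minimal client sketch, assuming the app is running locally on the default port 7860 and that requests is installed on the client side (it is not part of requirements.txt):

import requests

BASE_URL = "http://localhost:7860"  # assumed local run on the default port

payload = {
    "model": "qwen-coder-3-30b",
    "messages": [
        {"role": "user", "content": "Write a Python function to calculate Fibonacci numbers"}
    ],
    "temperature": 0.7,
    "max_tokens": 512,
    "stream": False,
}

# /api/chat and /v1/chat/completions accept the same ChatRequest body.
resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
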
requirements.txt CHANGED
@@ -1,9 +1,8 @@
- gradio>=3.0.0
- transformers>=4.0.0
- torch>=1.9.0
- fastapi>=0.68.0
- uvicorn>=0.15.0
- redis>=3.5.0
- aiohttp>=3.7.0
- pydantic>=1.8.0
- accelerate>=0.20.0
+ fastapi==0.104.1
+ uvicorn[standard]==0.24.0
+ torch>=2.0.0
+ transformers>=4.36.0
+ accelerate>=0.24.0
+ pydantic>=2.0.0
+ python-multipart>=0.0.6
+ aiofiles>=23.0.0
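
With the pinned dependencies installed and app.py started, the streaming path can be consumed the same way. A sketch of a streaming client under the same assumptions (local server on port 7860, requests available on the client side); the endpoint emits server-sent-event style "data:" lines and finishes with "data: [DONE]":

import json
import requests

BASE_URL = "http://localhost:7860"  # assumed local run on the default port

payload = {
    "messages": [{"role": "user", "content": "Explain quantum computing in simple terms"}],
    "stream": True,
}

# iter_lines() yields one "data: ..." line per streamed chunk.
with requests.post(f"{BASE_URL}/api/chat", json=payload, stream=True, timeout=300) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        chunk = line[len("data: "):]
        if chunk == "[DONE]":
            break
        delta = json.loads(chunk)["choices"][0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)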