import spaces
import torch
import gradio as gr
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from functools import lru_cache

MODEL_ID = "remyxai/SpaceThinker-Qwen2.5VL-3B"

@lru_cache(maxsize=1)  # cache so repeated calls reuse the loaded weights
def _load_model():
    """Load and cache the model and processor inside the GPU worker."""
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16
    ).to("cuda")
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    return model, processor

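# On ZeroGPU Spaces, CUDA is only available inside functions decorated with
# @spaces.GPU; touching the GPU from undecorated code raises a runtime error,
# so the inference entry point below carries the decorator.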
@spaces.GPU
def gpu_inference(image_path: str, prompt: str) -> str:
    """Perform inference entirely in the GPU subprocess."""
    model, processor = _load_model()

    # Load the image and downscale wide images to 512 px, keeping aspect ratio
    image = Image.open(image_path).convert("RGB")
    if image.width > 512:
        ratio = image.height / image.width
        image = image.resize((512, int(512 * ratio)), Image.Resampling.LANCZOS)

    # Build the conversation in chat-template format
    system_msg = (
        "You are VL-Thinking 🤔, a helpful assistant with excellent reasoning ability.\n"
        "A user asks you a question, and you should try to solve it. "
        "You should first think about the reasoning process in the mind and then provide the user with the answer.\n"
        "The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, "
        "respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>."
    )
    conversation = [
        {"role": "system", "content": [{"type": "text", "text": system_msg}]},
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]}
    ]

    # Tokenize, generate, decode
    chat_input = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[chat_input], images=[image], return_tensors="pt").to("cuda")
    output_ids = model.generate(**inputs, max_new_tokens=1024)

    # Decode only the newly generated tokens; splitting the full decode on
    # "assistant" would also match the word "assistant" in the system prompt
    trimmed_ids = [out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)]
    decoded = processor.batch_decode(
        trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return decoded.strip()

# Message handling
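# The MultimodalTextbox submits a dict with "text" and "files" keys; each
# uploaded file path is stored in history as a one-element tuple so that
# inference_interface can distinguish image entries from plain-text entries.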
def add_message(history, user_input):
    if history is None:
        history = []
    for f in user_input.get("files", []):
        history.append({"role": "user", "content": (f,)})
    text = user_input.get("text", "")
    if text:
        history.append({"role": "user", "content": text})
    return history, gr.MultimodalTextbox(value=None)

def inference_interface(history):
    if not history:
        return history, gr.MultimodalTextbox(value=None)

    # Last user text
    user_text = next(
        (m["content"] for m in reversed(history)
         if m["role"] == "user" and isinstance(m["content"], str)),
        None
    )
    if user_text is None:
        return history, gr.MultimodalTextbox(value=None)

    # Last user image
    image_path = next(
        (m["content"][0] for m in reversed(history)
         if m["role"] == "user" and isinstance(m["content"], tuple)),
        None
    )
    if image_path is None:
        return history, gr.MultimodalTextbox(value=None)

    # GPU inference
    reply = gpu_inference(image_path, user_text)
    history.append({"role": "assistant", "content": reply})
    return history, gr.MultimodalTextbox(value=None)

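# The UI wires both the textbox submit event and the Send button to the same
# two-step flow: add_message records the user turn, then inference_interface
# generates and appends the assistant reply.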
def build_demo():
    with gr.Blocks() as demo:
        gr.Markdown("# SpaceThinker-Qwen2.5VL-3B")
        chatbot = gr.Chatbot([], type="messages", label="Conversation")
        chat_input = gr.MultimodalTextbox(
            interactive=True,
            file_types=["image"],
            placeholder="Enter text and upload an image.",
            show_label=True
        )

        submit_evt = chat_input.submit(
            add_message, [chatbot, chat_input], [chatbot, chat_input]
        )
        submit_evt.then(
            inference_interface, [chatbot], [chatbot, chat_input]
        )

        with gr.Row():
            send_btn = gr.Button("Send")
            clear_btn = gr.ClearButton([chatbot, chat_input])

        send_click = send_btn.click(
            add_message, [chatbot, chat_input], [chatbot, chat_input]
        )
        send_click.then(
            inference_interface, [chatbot], [chatbot, chat_input]
        )
    return demo

| if __name__ == "__main__": | |
| demo = build_demo() | |
| demo.launch(share=True) | |
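
# Rough local-run sketch (assumes a CUDA GPU, a transformers release with
# Qwen2.5-VL support, and that this file is saved as app.py):
#   pip install spaces torch transformers gradio pillow
#   python app.py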