broadfield-dev committed
Commit c2ef06d · verified · 1 Parent(s): a498c92

Create app.py

Files changed (1)
  1. app.py +110 -0
app.py ADDED
@@ -0,0 +1,110 @@
+ import gradio as gr
+ from PIL import Image
+ from sglang import Engine
+ from qwen_vl_utils import process_vision_info
+ from transformers import AutoProcessor
+
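+ # (Assumed packaging: qwen_vl_utils ships in the `qwen-vl-utils` pip package,
+ # and Engine comes from `sglang`, typically installed with its GPU extras,
+ # e.g. pip install "sglang[all]".)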
+ # --- Configuration ---
+ CHECKPOINT_PATH = "Qwen/Qwen3-VL-2B-Instruct-FP8"
+
+ # --- Model and Processor Loading ---
+ # Note: this is a heavy operation and runs once when the Space starts.
+ processor = AutoProcessor.from_pretrained(CHECKPOINT_PATH, trust_remote_code=True)
+
+ # SGLang engine setup. On a CPU Space this will be extremely slow (and the
+ # "fa3" FlashAttention backend requires a GPU), so a GPU is strongly recommended.
+ llm_engine = Engine(
+     model_path=CHECKPOINT_PATH,
+     enable_multimodal=True,
+     mem_fraction_static=0.8,  # fraction of GPU memory reserved for the engine
+     tp_size=1,                # tensor parallel size; 1 for a single GPU
+     attention_backend="fa3",
+ )
+
+ # --- Inference Function ---
+ def process_and_generate(image_input, text_prompt):
+     """
+     Process the image and text prompt and generate a response from the model.
+     """
+     if image_input is None or not text_prompt.strip():
+         return "Please provide both an image and a text prompt."
+
+     # Convert Gradio's image input (an RGB uint8 numpy array) to a PIL Image
+     pil_image = Image.fromarray(image_input)
+
+     # Prepare the messages payload for the model
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": pil_image},
+                 {"type": "text", "text": text_prompt},
+             ],
+         }
+     ]
+
+     # Apply the chat template and process vision info
+     text = processor.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True,
+     )
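+     # For illustration only: with Qwen-VL-style chat templates the templated
+     # prompt comes out roughly as
+     #   <|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{prompt}<|im_end|>\n<|im_start|>assistant\n
+     # but the exact special tokens are defined by the checkpoint's template,
+     # so treat this sketch as an assumption rather than a guarantee.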
+
+     image_inputs, _ = process_vision_info(
+         messages,
+         image_patch_size=processor.image_processor.patch_size,
+     )
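+     # process_vision_info gathers (and, if needed, resizes) the images
+     # referenced in `messages`; the second return value holds video inputs,
+     # which this demo does not use.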
+
+     # Define sampling parameters
+     sampling_params = {"max_new_tokens": 1024, "temperature": 0.7}
+
+     # Generate the response
+     try:
+         response = llm_engine.generate(
+             prompt=text,
+             image_data=image_inputs,
+             sampling_params=sampling_params,
+         )
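+         # Assumption: for a single prompt, sglang's offline Engine.generate
+         # returns a dict with a "text" field (batched calls return a list of
+         # such dicts); adjust the access below if your sglang version differs.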
+         return response["text"]
+     except Exception as e:
+         return f"An error occurred during generation: {e}"
+
+ # --- Gradio Interface ---
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # Qwen3-VL-2B-Instruct-FP8 Demo
+         This Space demonstrates the capabilities of the Qwen3-VL-2B-Instruct-FP8 model.
+         Upload an image, type a question or a command, and see the model's response.
+         **Note:** On CPU hardware this demo will be very slow; a GPU Space is strongly recommended.
+         """
+     )
+
+     with gr.Row():
+         with gr.Column():
+             image_input = gr.Image(type="numpy", label="Upload Image")
+             text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this image in detail.")
+             submit_button = gr.Button("Generate Response")
+         with gr.Column():
+             output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)
+
+     submit_button.click(
+         fn=process_and_generate,
+         inputs=[image_input, text_prompt],
+         outputs=output_text,
+     )
+
+     gr.Examples(
+         examples=[
+             ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png", "Read all the text in the image."],
+             ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/what_is_in_the_box.jpg", "What is in the red box?"],
+             ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/chart.png", "What is the value for 'Training & Other'?"],
+         ],
+         inputs=[image_input, text_prompt],
+     )
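+     # Note: these example entries are remote URLs. Recent Gradio versions
+     # fetch URL examples and hand the Image component a numpy array; if your
+     # Gradio build does not resolve URLs, swap in local file paths instead.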
+
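+ # Optional: calling demo.queue() before launch() queues concurrent requests
+ # instead of running them in parallel against the single SGLang engine.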
+ if __name__ == "__main__":
+     demo.launch()