Spaces:

broadfield-dev
/

qwen3-vl-2b-instruct

Running

App Files Files Community

broadfield-dev committed on 20 days ago

Commit

c2ef06d

verified ·

1 Parent(s): a498c92

Create app.py

Browse files

Files changed (1) hide show

app.py +110 -0

app.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import gradio as gr
+import torch
+from PIL import Image
+import requests
+from io import BytesIO
+from sglang import Engine
+from qwen_vl_utils import process_vision_info
+from transformers import AutoProcessor
+# --- Configuration ---
+CHECKPOINT_PATH = "Qwen/Qwen3-VL-2B-Instruct-FP8"
+# --- Model and Processor Loading ---
+# Note: This is a heavy operation and will be done once when the Space starts.
+processor = AutoProcessor.from_pretrained(CHECKPOINT_PATH, trust_remote_code=True)
+# SGLang Engine setup for GPU
+# For a CPU space, this will be extremely slow. A GPU is strongly recommended.
+llm_engine = Engine(
+    model_path=CHECKPOINT_PATH,
+    enable_multimodal=True,
+    mem_fraction_static=0.8,
+    tp_size=1, # Set to 1 for a single GPU
+    attention_backend="fa3"
+)
+# --- Inference Function ---
+def process_and_generate(image_input, text_prompt):
+    """
+    Processes the image and text prompt, and generates a response from the model.
+    """
+    if image_input is None or text_prompt.strip() == "":
+        return "Please provide both an image and a text prompt."
+    # Convert Gradio's image input (numpy array) to a PIL Image
+    pil_image = Image.fromarray(image_input)
+    # Prepare the messages payload for the model
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": pil_image},
+                {"type": "text", "text": text_prompt},
+            ],
+        }
+    ]
+    # Apply the chat template and process vision info
+    text = processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    image_inputs, _ = process_vision_info(
+        messages,
+        image_patch_size=processor.image_processor.patch_size
+    )
+    # Define sampling parameters
+    sampling_params = {"max_new_tokens": 1024, "temperature": 0.7}
+    # Generate the response
+    try:
+        response = llm_engine.generate(
+            prompt=text,
+            image_data=image_inputs,
+            sampling_params=sampling_params
+        )
+        return response['text']
+    except Exception as e:
+        return f"An error occurred during generation: {str(e)}"
+# --- Gradio Interface ---
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        # Qwen3-VL-2B-Instruct-FP8 Demo
+        This Space demonstrates the capabilities of the Qwen3-VL-2B-Instruct-FP8 model.
+        Upload an image, type a question or a command, and see the model's response.
+        **Note:** This demo is running on a CPU and may be slow. For better performance, consider upgrading to a GPU Space.
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            image_input = gr.Image(type="numpy", label="Upload Image")
+            text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this image in detail.")
+            submit_button = gr.Button("Generate Response")
+        with gr.Column():
+            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)
+    submit_button.click(
+        fn=process_and_generate,
+        inputs=[image_input, text_prompt],
+        outputs=output_text
+    )
+    gr.Examples(
+        examples=[
+            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png", "Read all the text in the image."],
+            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/what_is_in_the_box.jpg", "What is in the red box?"],
+            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/chart.png", "What is the value for 'Training & Other'?"],
+        ],
+        inputs=[image_input, text_prompt]
+    )
+if __name__ == "__main__":
+    demo.launch()