prithivMLmods committed
Commit 23c94ef · verified · 1 Parent(s): 4306537

Update app.py

Files changed (1): app.py (+197 -110)
app.py CHANGED
@@ -19,83 +19,54 @@ DTYPE = "auto"

CATEGORIES = ["Query", "Caption", "Point", "Detect"]
PLACEHOLDERS = {
-    "Query": "What is in this image?",
-    "Caption": "Select a caption length from the suggestions below.",
-    "Point": "Select an object from suggestions or enter a custom one.",
-    "Detect": "Select an object from suggestions or enter a custom one.",
+    "Query": "What's in this image?",
+    "Caption": "Select caption length: short, normal, or long",
+    "Point": "Select an object from suggestions or enter manually",
+    "Detect": "Select an object from suggestions or enter manually",
}

+# --- Model Loading ---
+# Load Qwen3-VL
qwen_model = Qwen3VLForConditionalGeneration.from_pretrained(
-    "Qwen/Qwen3-VL-32B-Instruct",
-    torch_dtype=DTYPE,
+    "Qwen/Qwen3-VL-4B-Instruct",
+    dtype=DTYPE,
    device_map=DEVICE,
).eval()
qwen_processor = Qwen3VLProcessor.from_pretrained(
-    "Qwen/Qwen3-VL-32B-Instruct",
+    "Qwen/Qwen3-VL-4B-Instruct",
)
-print("Model loaded successfully.")


# --- Utility Functions ---
def safe_parse_json(text: str):
-    """Safely parse JSON or Python literal from a string, cleaning it first."""
-    # Find the JSON object within the text
-    match = re.search(r'\{.*\}', text, re.DOTALL)
-    if not match:
-        return {}
-    text = match.group(0)
+    """Safely parse a string that may be JSON or a Python literal."""
+    text = text.strip()
+    # Remove markdown code blocks
+    text = re.sub(r"^```(json)?", "", text)
+    text = re.sub(r"```$", "", text)
+    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
-        try:
-            # Fallback for Python dictionary literals
-            return ast.literal_eval(text)
-        except (ValueError, SyntaxError):
-            return {}
-
-
-def annotate_image(image: Image.Image, result: dict, category: str):
-    """Draws annotations on the image based on the model's output."""
-    if not isinstance(image, Image.Image) or not isinstance(result, dict):
-        return image
-
-    image_np = np.array(image.convert("RGB"))
-
-    # Handle Point annotations
-    if category == "Point" and "points" in result and result["points"]:
-        points_xy = np.array(result["points"])
-        if points_xy.size == 0:
-            return image
-
-        # Denormalize points from [0, 1] range to image dimensions
-        points_xy *= np.array([image.width, image.height])
-
-        key_points = sv.KeyPoints(xy=points_xy.reshape(1, -1, 2))
-        annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
-        annotated_image = annotator.annotate(scene=image_np.copy(), key_points=key_points)
-        return Image.fromarray(annotated_image)
-
-    # Handle Detection annotations
-    if category == "Detect" and "objects" in result and result["objects"]:
-        boxes_xyxy = np.array(result["objects"])
-        if boxes_xyxy.size == 0:
-            return image
-
-        # Denormalize boxes from [0, 1] range to image dimensions
-        boxes_xyxy *= np.array([image.width, image.height, image.width, image.height])
-
-        detections = sv.Detections(xyxy=boxes_xyxy)
-        annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=4)
-        annotated_image = annotator.annotate(scene=image_np.copy(), detections=detections)
-        return Image.fromarray(annotated_image)
-
-    return image
-
+        pass
+    try:
+        # Fallback to literal_eval for Python-like dictionary/list strings
+        return ast.literal_eval(text)
+    except Exception:
+        return {}

# --- Inference Functions ---
def run_qwen_inference(image: Image.Image, prompt: str):
-    """Core function to run inference with the Qwen3-VL model."""
-    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
+    """Core function to run inference with the Qwen model."""
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
    inputs = qwen_processor.apply_chat_template(
        messages,
        tokenize=True,
@@ -105,9 +76,15 @@ def run_qwen_inference(image: Image.Image, prompt: str):
    ).to(DEVICE)

    with torch.inference_mode():
-        generated_ids = qwen_model.generate(**inputs, max_new_tokens=512)
+        generated_ids = qwen_model.generate(
+            **inputs,
+            max_new_tokens=512,
+        )

-    generated_ids_trimmed = generated_ids[:, inputs.input_ids.shape[1]:]
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :]
+        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
    output_text = qwen_processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
@@ -118,88 +95,174 @@

@GPU
def get_suggested_objects(image: Image.Image):
-    """Get suggested objects in the image using Qwen3-VL to populate radio buttons."""
+    """Get suggested objects in the image using Qwen."""
    if image is None:
-        return gr.Radio(choices=[], visible=False)
-
+        return []
    try:
-        prompt = "List the 3 most prominent objects in this image as a Python list of strings. Example: ['car', 'tree', 'person']"
-        result_text = run_qwen_inference(image, prompt)
-
+        # Resize image for faster suggestion generation
+        suggest_image = image.copy()
+        suggest_image.thumbnail((512, 512))
+
+        prompt = "List the main objects in the image in a Python list format. For example: ['cat', 'dog', 'table']"
+        result_text = run_qwen_inference(suggest_image, prompt)
+
+        # Clean up the output to find the list
        match = re.search(r'\[.*?\]', result_text)
        if match:
-            suggestions = ast.literal_eval(match.group())
-            if isinstance(suggestions, list) and suggestions:
-                return gr.Radio(choices=suggestions, visible=True, interactive=True)
+            suggested_objects = ast.literal_eval(match.group())
+            if isinstance(suggested_objects, list):
+                # Return up to 3 suggestions
+                return suggested_objects[:3]
+        return []
    except Exception as e:
        print(f"Error getting suggestions with Qwen: {e}")
-
-    return gr.Radio(choices=[], visible=False)
+        return []
+
+
+def annotate_image(image: Image.Image, result: dict):
+    """Annotates the image with points or bounding boxes based on model output."""
+    if not isinstance(image, Image.Image) or not isinstance(result, dict):
+        return image
+
+    original_width, original_height = image.size
+    scene_np = np.array(image.copy())
+
+    # Handle Point annotations
+    if "points" in result and result["points"]:
+        points_list = []
+        for point in result.get("points", []):
+            x = int(point["x"] * original_width)
+            y = int(point["y"] * original_height)
+            points_list.append([x, y])
+
+        if not points_list:
+            return image
+
+        points_array = np.array(points_list).reshape(-1, 2)
+        key_points = sv.KeyPoints(xy=points_array)
+        vertex_annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
+        annotated_image_np = vertex_annotator.annotate(
+            scene=scene_np, key_points=key_points
+        )
+        return Image.fromarray(annotated_image_np)
+
+    # Handle Detection annotations
+    if "objects" in result and result["objects"]:
+        boxes = []
+        for obj in result["objects"]:
+            x_min = obj["x_min"] * original_width
+            y_min = obj["y_min"] * original_height
+            x_max = obj["x_max"] * original_width
+            y_max = obj["y_max"] * original_height
+            boxes.append([x_min, y_min, x_max, y_max])
+
+        if not boxes:
+            return image
+
+        detections = sv.Detections(xyxy=np.array(boxes))
+        box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=4)
+        label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
+
+        annotated_image_np = box_annotator.annotate(
+            scene=scene_np, detections=detections
+        )
+        return Image.fromarray(annotated_image_np)
+
+    return image


@GPU
def process_qwen(image: Image.Image, category: str, prompt: str):
-    """Process inputs based on the selected category, returning text and data for annotation."""
+    """Processes the input based on the selected category using the Qwen model."""
    if category == "Query":
        return run_qwen_inference(image, prompt), {}
-
+
    elif category == "Caption":
        full_prompt = f"Provide a {prompt} length caption for the image."
        return run_qwen_inference(image, full_prompt), {}
-
+
    elif category == "Point":
        full_prompt = (
-            f"Provide 2D point coordinates for '{prompt}'. Respond ONLY with a JSON object like "
-            f"`{{\"points\": [[x1, y1], [x2, y2], ...]}}`. The coordinates must be normalized between 0.0 and 1.0."
+            f"Provide 2d point coordinates for {prompt}. Report in JSON format like "
+            '[{"point_2d": [x, y]}] where coordinates are from 0 to 1000.'
        )
        output_text = run_qwen_inference(image, full_prompt)
        parsed_json = safe_parse_json(output_text)
-        # Ensure the parsed data has the correct structure
-        if "points" not in parsed_json or not isinstance(parsed_json["points"], list):
-            return output_text, {}
-        return output_text, parsed_json
-
+        points_result = {"points": []}
+        if isinstance(parsed_json, list):
+            for item in parsed_json:
+                if "point_2d" in item and len(item["point_2d"]) == 2:
+                    x, y = item["point_2d"]
+                    points_result["points"].append({"x": x / 1000.0, "y": y / 1000.0})
+        return json.dumps(points_result, indent=2), points_result
+
    elif category == "Detect":
        full_prompt = (
-            f"Provide bounding box coordinates for '{prompt}'. Respond ONLY with a JSON object like "
-            f"`{{\"objects\": [[x_min, y_min, x_max, y_max], ...]}}`. The coordinates must be normalized between 0.0 and 1.0."
+            f"Provide bounding box coordinates for {prompt}. Report in JSON format like "
+            '[{"bbox_2d": [xmin, ymin, xmax, ymax]}] where coordinates are from 0 to 1000.'
        )
        output_text = run_qwen_inference(image, full_prompt)
        parsed_json = safe_parse_json(output_text)
-        if "objects" not in parsed_json or not isinstance(parsed_json["objects"], list):
-            return output_text, {}
-        return output_text, parsed_json
+        objects_result = {"objects": []}
+        if isinstance(parsed_json, list):
+            for item in parsed_json:
+                if "bbox_2d" in item and len(item["bbox_2d"]) == 4:
+                    xmin, ymin, xmax, ymax = item["bbox_2d"]
+                    objects_result["objects"].append(
+                        {
+                            "x_min": xmin / 1000.0,
+                            "y_min": ymin / 1000.0,
+                            "x_max": xmax / 1000.0,
+                            "y_max": ymax / 1000.0,
+                        }
+                    )
+        return json.dumps(objects_result, indent=2), objects_result

    return "Invalid category", {}


# --- Gradio Interface Logic ---
def on_category_and_image_change(image, category):
-    """Handle UI changes when the image or category is updated."""
+    """Generate suggestions when category or image changes."""
    text_box = gr.Textbox(value="", placeholder=PLACEHOLDERS.get(category, ""), interactive=True)

    if category == "Caption":
-        return gr.Radio(choices=["short", "normal", "long"], value="normal", visible=True), text_box
-
+        return gr.Radio(choices=["short", "normal", "long"], label="Caption Length", value="normal", visible=True), text_box
+
    if image is None or category not in ["Point", "Detect"]:
        return gr.Radio(choices=[], visible=False), text_box

-    return get_suggested_objects(image), text_box
+    suggestions = get_suggested_objects(image)
+    if suggestions:
+        return gr.Radio(choices=suggestions, label="Suggestions", visible=True, interactive=True), text_box
+    else:
+        return gr.Radio(choices=[], visible=False), text_box
+
+
+def update_prompt_from_radio(selected_object):
+    """Update prompt textbox when a radio option is selected."""
+    if selected_object:
+        return gr.Textbox(value=selected_object)
+    return gr.Textbox(value="")


def process_inputs(image, category, prompt):
-    """Main function to handle the user's submission."""
+    """Main function to handle the user's request."""
    if image is None:
        raise gr.Error("Please upload an image.")
    if not prompt and category not in ["Caption"]:
-        raise gr.Error("Please provide a prompt or select a suggestion.")
-    if category == "Caption" and not prompt:
-        prompt = "normal"  # Default caption length
+        # Caption can have an empty prompt if a length is selected
+        if category == "Caption" and not prompt:
+            prompt = "normal"  # default
+        else:
+            raise gr.Error("Please provide a prompt or select a suggestion.")

-    image.thumbnail((1024, 1024))  # Resize for faster inference
+    # Resize the image to make inference quicker
+    image.thumbnail((1024, 1024))

+    # Process with Qwen
    qwen_text, qwen_data = process_qwen(image, category, prompt)
-    qwen_annotated_image = annotate_image(image, qwen_data, category)
+    qwen_annotated_image = annotate_image(image, qwen_data)

    return qwen_annotated_image, qwen_text
@@ -207,38 +270,54 @@ def process_inputs(image, category, prompt):
# --- Gradio UI Layout ---
with gr.Blocks(theme=Ocean()) as demo:
    gr.Markdown("# 👓 Object Understanding with Qwen3-VL")
-    gr.Markdown("### Explore object detection, keypoint detection, and captioning using natural language prompts.")
+    gr.Markdown(
+        "### Explore object detection, visual grounding, and keypoint detection through natural language prompts."
+    )
+    gr.Markdown("""
+    *Powered by [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct). Inspired by the tutorial [Object Detection and Visual Grounding with Qwen 2.5](https://pyimagesearch.com/2025/06/09/object-detection-and-visual-grounding-with-qwen-2-5/) on PyImageSearch.*
+    """)

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="Input Image")
            category_select = gr.Radio(
-                choices=CATEGORIES, value=CATEGORIES[0], label="Select Task", interactive=True
+                choices=CATEGORIES,
+                value=CATEGORIES[0],
+                label="Select Task Category",
+                interactive=True,
            )
            suggestions_radio = gr.Radio(
-                choices=[], label="Suggestions", visible=False, interactive=True
+                choices=[],
+                label="Suggestions",
+                visible=False,
+                interactive=True,
            )
            prompt_input = gr.Textbox(
-                placeholder=PLACEHOLDERS[CATEGORIES[0]], label="Prompt", lines=2
+                placeholder=PLACEHOLDERS[CATEGORIES[0]],
+                label="Prompt",
+                lines=2,
            )
-            submit_btn = gr.Button("Generate", variant="primary")
+            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("### Qwen/Qwen3-VL-4B-Instruct Output")
            qwen_img_output = gr.Image(label="Annotated Image")
-            qwen_text_output = gr.Textbox(label="Text Output", lines=8, interactive=False, show_copy_button=True)
+            qwen_text_output = gr.Textbox(
+                label="Text Output", lines=10, interactive=False
+            )

    gr.Examples(
        examples=[
-            ["examples/cars.jpg", "Query", "How many cars are in the image?"],
-            ["examples/dog_beach.jpg", "Detect", "dog"],
-            ["examples/person_skiing.jpg", "Point", "the person's head"],
-            ["examples/dog_beach.jpg", "Caption", "short"],
+            ["examples/example_1.jpg", "Query", "How many cars are in the image?"],
+            ["examples/example_1.jpg", "Detect", "car"],
+            ["examples/example_2.JPG", "Point", "the person's face"],
+            ["examples/example_2.JPG", "Caption", "short"],
        ],
        inputs=[image_input, category_select, prompt_input],
    )

    # --- Event Listeners ---
+    # When image or category changes, update suggestions
    category_select.change(
        fn=on_category_and_image_change,
        inputs=[image_input, category_select],
@@ -249,7 +328,15 @@ with gr.Blocks(theme=Ocean()) as demo:
        inputs=[image_input, category_select],
        outputs=[suggestions_radio, prompt_input],
    )
-    suggestions_radio.change(fn=lambda x: x, inputs=suggestions_radio, outputs=prompt_input)
+
+    # When a suggestion is clicked, update the prompt box
+    suggestions_radio.change(
+        fn=update_prompt_from_radio,
+        inputs=[suggestions_radio],
+        outputs=[prompt_input],
+    )
+
+    # Main submission action
    submit_btn.click(
        fn=process_inputs,
        inputs=[image_input, category_select, prompt_input],
@@ -257,4 +344,4 @@ with gr.Blocks(theme=Ocean()) as demo:
    )

if __name__ == "__main__":
-    demo.launch()
+    demo.launch(debug=True)
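
For reference, a minimal sketch of the output convention this commit switches Detect to: the prompt now asks Qwen3-VL for "bbox_2d" values in the 0-1000 range, and process_qwen() rescales them to [0, 1] so annotate_image() can multiply by the image size. The model reply below is invented for illustration:

import json

# Invented model reply, in the format the new Detect prompt requests.
model_reply = '[{"bbox_2d": [120, 80, 560, 440]}, {"bbox_2d": [600, 120, 940, 480]}]'

objects_result = {"objects": []}
for item in json.loads(model_reply):
    if "bbox_2d" in item and len(item["bbox_2d"]) == 4:
        xmin, ymin, xmax, ymax = item["bbox_2d"]
        objects_result["objects"].append(
            {
                "x_min": xmin / 1000.0,  # rescale 0-1000 -> 0-1, as process_qwen() does
                "y_min": ymin / 1000.0,
                "x_max": xmax / 1000.0,
                "y_max": ymax / 1000.0,
            }
        )

print(objects_result)  # normalized boxes; annotate_image() maps them back to pixel coordinates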
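
Likewise, a minimal sketch of what the reworked safe_parse_json() is built to tolerate: replies wrapped in markdown code fences, plain JSON, or Python-style literals. The reply below is invented for illustration:

import ast
import json
import re

reply = '```json\n[{"point_2d": [512, 256]}]\n```'  # invented fenced reply

text = reply.strip()
text = re.sub(r"^```(json)?", "", text)  # strip an opening ``` or ```json fence
text = re.sub(r"```$", "", text)         # strip a closing fence
text = text.strip()

try:
    parsed = json.loads(text)
except json.JSONDecodeError:
    parsed = ast.literal_eval(text)      # fallback for Python-style literals

print(parsed)  # [{'point_2d': [512, 256]}]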