prithivMLmods committed on
Commit 1b33b1c · verified · 1 Parent(s): dd45b2a

update app

Files changed (1)
  1. app.py +149 -94
app.py CHANGED
@@ -2,7 +2,8 @@ import os
 import gradio as gr
 import numpy as np
 import torch
-from PIL import Image
+import random
+from PIL import Image, ImageDraw
 from typing import Iterable
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
@@ -100,96 +101,133 @@ try:
     print("Model loaded successfully.")
 except Exception as e:
     print(f"Error loading model: {e}")
+    print("Ensure you have the correct libraries installed (transformers>=4.40.0) and access to the model.")
     model = None
     processor = None

+# --- Helper Functions ---
+
+def parse_boxes(box_str):
+    """
+    Parses a string of coordinates into a list of lists.
+    Format expected: "x1,y1,x2,y2" or "x1,y1,x2,y2; x3,y3,x4,y4"
+    """
+    try:
+        boxes = []
+        # Split by semicolon for multiple boxes
+        segments = box_str.split(';')
+        for seg in segments:
+            if not seg.strip():
+                continue
+            coords = [float(c.strip()) for c in seg.split(',')]
+            if len(coords) != 4:
+                raise ValueError(f"Expected 4 coordinates per box, got {len(coords)}")
+            boxes.append(coords)
+        return boxes
+    except Exception as e:
+        raise ValueError(f"Invalid box format: {e}")
+
 @spaces.GPU(duration=60)
-def process_image(input_image, task_type, text_prompt, threshold=0.5):
+def process_sam3(input_image, task_type, text_prompt, box_input, threshold=0.5):
     if input_image is None:
         raise gr.Error("Please upload an image.")

     if model is None or processor is None:
         raise gr.Error("Model not loaded correctly.")

-    # Convert image to RGB
     image_pil = input_image.convert("RGB")
-    annotations = []
-
-    with torch.no_grad():
-        if task_type == "Instance Segmentation":
+    inputs = {}
+
+    # Logic branching based on Task Type
+    try:
+        if task_type == "Text Prompt":
             if not text_prompt:
-                raise gr.Error("Please enter a text prompt for Instance Segmentation.")
-
-            # 1. Instance Segmentation Flow (Text Prompt)
+                raise gr.Error("Please enter a text prompt.")
             inputs = processor(images=image_pil, text=text_prompt, return_tensors="pt").to(device)
-            outputs = model(**inputs)
-
-            # Post-process instance masks
-            results = processor.post_process_instance_segmentation(
-                outputs,
-                threshold=threshold,
-                mask_threshold=0.5,
-                target_sizes=inputs.get("original_sizes").tolist()
-            )[0]
+            display_label_prefix = text_prompt

-            masks_np = results['masks'].cpu().numpy() # [N, H, W]
-            scores_np = results['scores'].cpu().numpy()
+        elif task_type == "Single Bounding Box":
+            if not box_input:
+                raise gr.Error("Please enter box coordinates.")
+            boxes = parse_boxes(box_input)
+            if len(boxes) != 1:
+                raise gr.Error("Please provide exactly one box for this mode.")
+
+            input_boxes = [boxes]  # [batch_size, num_boxes, 4]
+            input_boxes_labels = [[1]]  # 1 = positive

-            for i, mask in enumerate(masks_np):
-                score_val = scores_np[i]
-                label = f"{text_prompt} ({score_val:.2f})"
-                annotations.append((mask, label))
+            inputs = processor(
+                images=image_pil,
+                input_boxes=input_boxes,
+                input_boxes_labels=input_boxes_labels,
+                return_tensors="pt"
+            ).to(device)
+            display_label_prefix = "Box"

-        elif task_type == "Semantic Segmentation":
-            # 2. Semantic Segmentation Flow (No Prompt)
-            # Call processor without text
-            inputs = processor(images=image_pil, return_tensors="pt").to(device)
-            outputs = model(**inputs)
+        elif task_type == "Multiple Boxes (Positive)":
+            if not box_input:
+                raise gr.Error("Please enter box coordinates.")
+            boxes = parse_boxes(box_input)  # Returns list of [x1,y1,x2,y2]

-            # Extract semantic segmentation map
-            # Shape: [batch, channels, height, width]
-            semantic_seg = outputs.semantic_seg
+            input_boxes = [boxes]  # [batch, num_boxes, 4]
+            # All labels 1 (positive)
+            input_boxes_labels = [[1] * len(boxes)]

-            # Process for visualization:
-            # Assuming semantic_seg is a dense map (e.g., saliency or class probabilities).
-            # Since the snippet implies a single channel [batch, 1, H, W], we threshold it.
+            inputs = processor(
+                images=image_pil,
+                input_boxes=input_boxes,
+                input_boxes_labels=input_boxes_labels,
+                return_tensors="pt"
+            ).to(device)
+            display_label_prefix = "Multi-Box"
+
+        elif task_type == "Text + Negative Box":
+            if not text_prompt or not box_input:
+                raise gr.Error("Please provide both Text Prompt and Box Coordinates.")

-            # Remove batch dim -> [1, H, W] or [C, H, W]
-            seg_map = semantic_seg.squeeze(0)
+            boxes = parse_boxes(box_input)

-            # If 1 channel, create binary mask based on threshold/sigmoid
-            if seg_map.shape[0] == 1:
-                # Apply sigmoid if logits, or just threshold if probs
-                # Assuming logits for general safety in torch models
-                mask_tensor = torch.sigmoid(seg_map[0]) > threshold
-                mask_np = mask_tensor.cpu().numpy()
-
-                # Resize mask to original image size if needed
-                # (Note: outputs.semantic_seg is usually feature map size, might need upscaling)
-                # For simplicity in this snippet, we assume processor/output aligns or AnnotatedImage handles resizing (it usually requires matching sizes).
-                # If size mismatch occurs, we convert mask to PIL, resize, then back to numpy.
-
-                if mask_np.shape != (image_pil.height, image_pil.width):
-                    mask_img = Image.fromarray(mask_np.astype(np.uint8) * 255)
-                    mask_img = mask_img.resize(image_pil.size, Image.NEAREST)
-                    mask_np = np.array(mask_img) > 128
-
-                annotations.append((mask_np, "Semantic Region"))
-            else:
-                # If multiple channels (classes), take argmax
-                # This logic depends on specific SAM3 output structure
-                mask_idx = torch.argmax(seg_map, dim=0).cpu().numpy()
-                # Just visualize non-background (assuming 0 is background)
-                mask_np = mask_idx > 0
-
-                if mask_np.shape != (image_pil.height, image_pil.width):
-                    mask_img = Image.fromarray(mask_np.astype(np.uint8) * 255)
-                    mask_img = mask_img.resize(image_pil.size, Image.NEAREST)
-                    mask_np = np.array(mask_img) > 128
+            input_boxes = [boxes]
+            # Labels 0 (negative/exclude)
+            input_boxes_labels = [[0] * len(boxes)]
+
+            inputs = processor(
+                images=image_pil,
+                text=text_prompt,
+                input_boxes=input_boxes,
+                input_boxes_labels=input_boxes_labels,
+                return_tensors="pt"
+            ).to(device)
+            display_label_prefix = f"{text_prompt} (Excl. Box)"

-                annotations.append((mask_np, "Segmented Objects"))
+    except ValueError as e:
+        raise gr.Error(str(e))
+
+    # Inference
+    with torch.no_grad():
+        outputs = model(**inputs)

-    # Return tuple format for AnnotatedImage: (original_image, list_of_annotations)
+    # Post-processing
+    results = processor.post_process_instance_segmentation(
+        outputs,
+        threshold=threshold,
+        mask_threshold=0.5,
+        target_sizes=inputs.get("original_sizes").tolist()
+    )[0]
+
+    masks = results['masks']
+    scores = results['scores']
+
+    # Prepare AnnotatedImage Output
+    annotations = []
+    masks_np = masks.cpu().numpy()
+    scores_np = scores.cpu().numpy()
+
+    for i, mask in enumerate(masks_np):
+        score_val = scores_np[i]
+        label = f"{display_label_prefix} ({score_val:.2f})"
+        annotations.append((mask, label))
+
     return (image_pil, annotations)

 # --- UI Logic ---
@@ -207,12 +245,6 @@ css="""
 }
 """

-def update_visibility(task):
-    if task == "Instance Segmentation":
-        return gr.update(visible=True)
-    else:
-        return gr.update(visible=False)
-
 with gr.Blocks(css=css, theme=plum_theme) as demo:
     with gr.Column(elem_id="col-container"):
         # Header with Logo
@@ -221,59 +253,82 @@ with gr.Blocks(css=css, theme=plum_theme) as demo:
             elem_id="main-title"
         )

-        gr.Markdown("Segment objects using **SAM3** (Segment Anything Model 3). Choose **Instance** for specific text prompts or **Semantic** for automatic segmentation.")
+        gr.Markdown("Perform advanced segmentation using **SAM3** with Text, Boxes, or Combined prompts.")

         with gr.Row():
             # Left Column: Inputs
             with gr.Column(scale=1):
                 input_image = gr.Image(label="Input Image", type="pil", height=350)

-                task_type = gr.Radio(
-                    choices=["Instance Segmentation", "Semantic Segmentation"],
-                    value="Instance Segmentation",
+                task_type = gr.Dropdown(
                     label="Task Type",
+                    choices=[
+                        "Text Prompt",
+                        "Single Bounding Box",
+                        "Multiple Boxes (Positive)",
+                        "Text + Negative Box"
+                    ],
+                    value="Text Prompt",
                     interactive=True
                 )

-                text_prompt = gr.Textbox(
+                # Conditional Inputs
+                text_prompt_input = gr.Textbox(
                     label="Text Prompt",
-                    placeholder="e.g., cat, ear, car wheel...",
-                    info="Required for Instance Segmentation",
+                    placeholder="e.g., cat, ear, car wheel",
                     visible=True
                 )

+                box_input = gr.Textbox(
+                    label="Box Coordinates (x1, y1, x2, y2)",
+                    placeholder="e.g., 100, 150, 500, 450",
+                    info="For multiple boxes, separate with semicolon ';'. E.g., 10,10,50,50; 60,60,100,100",
+                    visible=False
+                )
+
                 threshold = gr.Slider(label="Confidence Threshold", minimum=0.0, maximum=1.0, value=0.4, step=0.05)

-                run_button = gr.Button("Run Segmentation", variant="primary")
+                run_button = gr.Button("Segment Image", variant="primary")

             # Right Column: Output
             with gr.Column(scale=1.5):
                 output_image = gr.AnnotatedImage(label="Segmented Output", height=500)

-        # Event: Hide text prompt when Semantic Segmentation is selected
+        # Logic to toggle visibility of inputs based on dropdown
+        def update_inputs(task):
+            if task == "Text Prompt":
+                return gr.update(visible=True), gr.update(visible=False)
+            elif task == "Single Bounding Box":
+                return gr.update(visible=False), gr.update(visible=True, label="Single Box (x1, y1, x2, y2)")
+            elif task == "Multiple Boxes (Positive)":
+                return gr.update(visible=False), gr.update(visible=True, label="Multiple Boxes (x1,y1,x2,y2; x1,y1,x2,y2)")
+            elif task == "Text + Negative Box":
+                return gr.update(visible=True), gr.update(visible=True, label="Negative Box to Exclude (x1, y1, x2, y2)")
+            return gr.update(visible=True), gr.update(visible=True)
+
         task_type.change(
-            fn=update_visibility,
+            fn=update_inputs,
             inputs=[task_type],
-            outputs=[text_prompt]
+            outputs=[text_prompt_input, box_input]
         )

         # Examples
         gr.Examples(
             examples=[
-                ["examples/cat.jpg", "Instance Segmentation", "cat", 0.5],
-                ["examples/room.jpg", "Semantic Segmentation", "", 0.5],
-                ["examples/car.jpg", "Instance Segmentation", "tire", 0.4],
+                ["examples/cat.jpg", "Text Prompt", "cat", "", 0.5],
+                ["examples/car.jpg", "Single Bounding Box", "", "100, 200, 400, 500", 0.5],
+                ["examples/fruit.jpg", "Text + Negative Box", "apple", "50, 50, 100, 100", 0.4],
             ],
-            inputs=[input_image, task_type, text_prompt, threshold],
+            inputs=[input_image, task_type, text_prompt_input, box_input, threshold],
             outputs=[output_image],
-            fn=process_image,
+            fn=process_sam3,
             cache_examples=False,
-            label="Examples"
+            label="Examples (Ensure files exist and coordinates match images)"
         )

         run_button.click(
-            fn=process_image,
-            inputs=[input_image, task_type, text_prompt, threshold],
+            fn=process_sam3,
+            inputs=[input_image, task_type, text_prompt_input, box_input, threshold],
             outputs=[output_image]
         )
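
A minimal usage sketch (not part of the commit) of the box format the new prompt modes expect, reusing the parse_boxes helper and the input_boxes / input_boxes_labels conventions from the diff above; the coordinate values below are made up for illustration:

    # String as typed into the "Box Coordinates" textbox (semicolon separates boxes)
    boxes = parse_boxes("10,10,50,50; 60,60,100,100")
    # boxes == [[10.0, 10.0, 50.0, 50.0], [60.0, 60.0, 100.0, 100.0]]

    # Shapes passed to the processor in process_sam3
    input_boxes = [boxes]                     # [batch_size, num_boxes, 4]
    input_boxes_labels = [[1] * len(boxes)]   # 1 = positive box; the "Text + Negative Box" mode uses 0 to exclude a region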