Spaces:

prithivMLmods
/

SAM3-Demo

Running on Zero

App Files Community

prithivMLmods committed 27 days ago

Commit

b01e247

verified ·

1 Parent(s): 1b33b1c

update app

Browse files

Files changed (1) hide show

app.py +36 -157

app.py CHANGED Viewed

@@ -94,6 +94,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 # --- Model Loading ---
 try:
     print("Loading SAM3 Model and Processor...")
     model = Sam3Model.from_pretrained("facebook/sam3").to(device)
@@ -101,113 +102,32 @@ try:
     print("Model loaded successfully.")
 except Exception as e:
     print(f"Error loading model: {e}")
-    print("Ensure you have the correct libraries installed (transformers>=4.40.0) and access to the model.")
     model = None
     processor = None
-# --- Helper Functions ---
-def parse_boxes(box_str):
-    """
-    Parses a string of coordinates into a list of lists.
-    Format expected: "x1,y1,x2,y2" or "x1,y1,x2,y2; x3,y3,x4,y4"
-    """
-    try:
-        boxes = []
-        # Split by semicolon for multiple boxes
-        segments = box_str.split(';')
-        for seg in segments:
-            if not seg.strip():
-                continue
-            coords = [float(c.strip()) for c in seg.split(',')]
-            if len(coords) != 4:
-                raise ValueError(f"Expected 4 coordinates per box, got {len(coords)}")
-            boxes.append(coords)
-        return boxes
-    except Exception as e:
-        raise ValueError(f"Invalid box format: {e}")
 @spaces.GPU(duration=60)
-def process_sam3(input_image, task_type, text_prompt, box_input, threshold=0.5):
     if input_image is None:
         raise gr.Error("Please upload an image.")
     if model is None or processor is None:
         raise gr.Error("Model not loaded correctly.")
     image_pil = input_image.convert("RGB")
-    inputs = {}
-    # Logic branching based on Task Type
-    try:
-        if task_type == "Text Prompt":
-            if not text_prompt:
-                raise gr.Error("Please enter a text prompt.")
-            inputs = processor(images=image_pil, text=text_prompt, return_tensors="pt").to(device)
-            display_label_prefix = text_prompt
-        elif task_type == "Single Bounding Box":
-            if not box_input:
-                raise gr.Error("Please enter box coordinates.")
-            boxes = parse_boxes(box_input)
-            if len(boxes) != 1:
-                raise gr.Error("Please provide exactly one box for this mode.")
-            input_boxes = [boxes] # [batch_size, num_boxes, 4]
-            input_boxes_labels = [[1]] # 1 = positive
-            inputs = processor(
-                images=image_pil,
-                input_boxes=input_boxes,
-                input_boxes_labels=input_boxes_labels,
-                return_tensors="pt"
-            ).to(device)
-            display_label_prefix = "Box"
-        elif task_type == "Multiple Boxes (Positive)":
-            if not box_input:
-                raise gr.Error("Please enter box coordinates.")
-            boxes = parse_boxes(box_input) # Returns list of [x1,y1,x2,y2]
-            input_boxes = [boxes] # [batch, num_boxes, 4]
-            # All labels 1 (positive)
-            input_boxes_labels = [[1] * len(boxes)]
-            inputs = processor(
-                images=image_pil,
-                input_boxes=input_boxes,
-                input_boxes_labels=input_boxes_labels,
-                return_tensors="pt"
-            ).to(device)
-            display_label_prefix = "Multi-Box"
-        elif task_type == "Text + Negative Box":
-            if not text_prompt or not box_input:
-                raise gr.Error("Please provide both Text Prompt and Box Coordinates.")
-            boxes = parse_boxes(box_input)
-            input_boxes = [boxes]
-            # Labels 0 (negative/exclude)
-            input_boxes_labels = [[0] * len(boxes)]
-            inputs = processor(
-                images=image_pil,
-                text=text_prompt,
-                input_boxes=input_boxes,
-                input_boxes_labels=input_boxes_labels,
-                return_tensors="pt"
-            ).to(device)
-            display_label_prefix = f"{text_prompt} (Excl. Box)"
-    except ValueError as e:
-        raise gr.Error(str(e))
     # Inference
     with torch.no_grad():
         outputs = model(**inputs)
-    # Post-processing
     results = processor.post_process_instance_segmentation(
         outputs,
         threshold=threshold,
@@ -215,120 +135,79 @@ def process_sam3(input_image, task_type, text_prompt, box_input, threshold=0.5):
         target_sizes=inputs.get("original_sizes").tolist()
     )[0]
-    masks = results['masks']
     scores = results['scores']
-    # Prepare AnnotatedImage Output
     annotations = []
     masks_np = masks.cpu().numpy()
     scores_np = scores.cpu().numpy()
     for i, mask in enumerate(masks_np):
         score_val = scores_np[i]
-        label = f"{display_label_prefix} ({score_val:.2f})"
         annotations.append((mask, label))
     return (image_pil, annotations)
-# --- UI Logic ---
 css="""
 #col-container {
     margin: 0 auto;
-    max-width: 1100px;
-}
-#main-title h1 {
-    font-size: 2.1em !important;
-    display: flex;
-    align-items: center;
-    justify-content: center;
-    gap: 10px;
 }
 """
 with gr.Blocks(css=css, theme=plum_theme) as demo:
     with gr.Column(elem_id="col-container"):
-        # Header with Logo
         gr.Markdown(
-            "# **SAM3 Image Segmentation** <img src='https://huggingface.co/spaces/prithivMLmods/Qwen-Image-Edit-2509-LoRAs-Fast-Fusion/resolve/main/Lora%20Huggy.png' alt='Logo' width='35' height='35' style='display: inline-block; vertical-align: text-bottom; margin-left: 5px;'>",
             elem_id="main-title"
         )
-        gr.Markdown("Perform advanced segmentation using **SAM3** with Text, Boxes, or Combined prompts.")
         with gr.Row():
             # Left Column: Inputs
             with gr.Column(scale=1):
                 input_image = gr.Image(label="Input Image", type="pil", height=350)
-                task_type = gr.Dropdown(
-                    label="Task Type",
-                    choices=[
-                        "Text Prompt",
-                        "Single Bounding Box",
-                        "Multiple Boxes (Positive)",
-                        "Text + Negative Box"
-                    ],
-                    value="Text Prompt",
-                    interactive=True
-                )
-                # Conditional Inputs
-                text_prompt_input = gr.Textbox(
                     label="Text Prompt",
-                    placeholder="e.g., cat, ear, car wheel",
-                    visible=True
-                )
-                box_input = gr.Textbox(
-                    label="Box Coordinates (x1, y1, x2, y2)",
-                    placeholder="e.g., 100, 150, 500, 450",
-                    info="For multiple boxes, separate with semicolon ';'. E.g., 10,10,50,50; 60,60,100,100",
-                    visible=False
                 )
                 threshold = gr.Slider(label="Confidence Threshold", minimum=0.0, maximum=1.0, value=0.4, step=0.05)
-                run_button = gr.Button("Segment Image", variant="primary")
             # Right Column: Output
             with gr.Column(scale=1.5):
                 output_image = gr.AnnotatedImage(label="Segmented Output", height=500)
-        # Logic to toggle visibility of inputs based on dropdown
-        def update_inputs(task):
-            if task == "Text Prompt":
-                return gr.update(visible=True), gr.update(visible=False)
-            elif task == "Single Bounding Box":
-                return gr.update(visible=False), gr.update(visible=True, label="Single Box (x1, y1, x2, y2)")
-            elif task == "Multiple Boxes (Positive)":
-                return gr.update(visible=False), gr.update(visible=True, label="Multiple Boxes (x1,y1,x2,y2; x1,y1,x2,y2)")
-            elif task == "Text + Negative Box":
-                return gr.update(visible=True), gr.update(visible=True, label="Negative Box to Exclude (x1, y1, x2, y2)")
-            return gr.update(visible=True), gr.update(visible=True)
-        task_type.change(
-            fn=update_inputs,
-            inputs=[task_type],
-            outputs=[text_prompt_input, box_input]
-        )
         # Examples
         gr.Examples(
             examples=[
-                ["examples/cat.jpg", "Text Prompt", "cat", "", 0.5],
-                ["examples/car.jpg", "Single Bounding Box", "", "100, 200, 400, 500", 0.5],
-                ["examples/fruit.jpg", "Text + Negative Box", "apple", "50, 50, 100, 100", 0.4],
             ],
-            inputs=[input_image, task_type, text_prompt_input, box_input, threshold],
             outputs=[output_image],
-            fn=process_sam3,
             cache_examples=False,
-            label="Examples (Ensure files exist and coordinates match images)"
         )
     run_button.click(
-        fn=process_sam3,
-        inputs=[input_image, task_type, text_prompt_input, box_input, threshold],
         outputs=[output_image]
     )

 print(f"Using device: {device}")
 # --- Model Loading ---
+# Using the facebook/sam3 model as requested
 try:
     print("Loading SAM3 Model and Processor...")
     model = Sam3Model.from_pretrained("facebook/sam3").to(device)
     print("Model loaded successfully.")
 except Exception as e:
     print(f"Error loading model: {e}")
+    print("Ensure you have the correct libraries installed and access to the model.")
+    # Fallback/Placeholder for demonstration if model doesn't exist in environment yet
     model = None
     processor = None
 @spaces.GPU(duration=60)
+def segment_image(input_image, text_prompt, threshold=0.5):
     if input_image is None:
         raise gr.Error("Please upload an image.")
+    if not text_prompt:
+        raise gr.Error("Please enter a text prompt (e.g., 'cat', 'face').")
     if model is None or processor is None:
         raise gr.Error("Model not loaded correctly.")
+    # Convert image to RGB
     image_pil = input_image.convert("RGB")
+    # Preprocess
+    inputs = processor(images=image_pil, text=text_prompt, return_tensors="pt").to(device)
     # Inference
     with torch.no_grad():
         outputs = model(**inputs)
+    # Post-process results
     results = processor.post_process_instance_segmentation(
         outputs,
         threshold=threshold,
         target_sizes=inputs.get("original_sizes").tolist()
     )[0]
+    masks = results['masks'] # Boolean tensor [N, H, W]
     scores = results['scores']
+    # Prepare for Gradio AnnotatedImage
+    # Gradio expects (image, [(mask, label), ...])
     annotations = []
     masks_np = masks.cpu().numpy()
     scores_np = scores.cpu().numpy()
     for i, mask in enumerate(masks_np):
+        # mask is a boolean array (True/False).
+        # AnnotatedImage handles the coloring automatically.
+        # We just pass the mask and a label.
         score_val = scores_np[i]
+        label = f"{text_prompt} ({score_val:.2f})"
         annotations.append((mask, label))
+    # Return tuple format for AnnotatedImage
     return (image_pil, annotations)
 css="""
 #col-container {
     margin: 0 auto;
+    max-width: 980px;
 }
+#main-title h1 {font-size: 2.1em !important;}
 """
 with gr.Blocks(css=css, theme=plum_theme) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown(
+            "# **SAM3 Image Segmentation**",
             elem_id="main-title"
         )
+        gr.Markdown("Segment objects in images using **SAM3** (Segment Anything Model 3) with text prompts.")
         with gr.Row():
             # Left Column: Inputs
             with gr.Column(scale=1):
                 input_image = gr.Image(label="Input Image", type="pil", height=350)
+                text_prompt = gr.Textbox(
                     label="Text Prompt",
+                    placeholder="e.g., cat, ear, car wheel...",
+                    info="What do you want to segment?"
                 )
                 threshold = gr.Slider(label="Confidence Threshold", minimum=0.0, maximum=1.0, value=0.4, step=0.05)
+                run_button = gr.Button("Segment", variant="primary")
             # Right Column: Output
             with gr.Column(scale=1.5):
+                # AnnotatedImage creates a nice overlay visualization
                 output_image = gr.AnnotatedImage(label="Segmented Output", height=500)
         # Examples
         gr.Examples(
             examples=[
+                ["examples/cat.jpg", "cat", 0.5],
+                ["examples/car.jpg", "tire", 0.4],
+                ["examples/fruit.jpg", "apple", 0.5],
             ],
+            inputs=[input_image, text_prompt, threshold],
             outputs=[output_image],
+            fn=segment_image,
             cache_examples=False,
+            label="Examples"
         )
     run_button.click(
+        fn=segment_image,
+        inputs=[input_image, text_prompt, threshold],
         outputs=[output_image]
     )