Mountchicken committed on
Commit
e2e6048
·
verified ·
1 Parent(s): 60f587b

Update app.py

Browse files

Fix Visual Prompting bug

Files changed (1) hide show
  1. app.py +70 -40
app.py CHANGED
@@ -1,26 +1,32 @@
1
  #!/usr/bin/env python
2
  # -*- coding: utf-8 -*-
3
- import spaces
4
  import argparse
5
  import json
6
  import os
7
 
 
8
 
9
- os.system("pip install torch==2.4.0 torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cu124")
10
-
 
 
11
  import subprocess
12
- subprocess.run('pip install flash-attn==2.7.4.post1 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
13
 
 
 
 
 
 
 
 
14
  import sys
15
  import threading
16
- import re
17
  from typing import Any, Dict, List
18
 
19
  import gradio as gr
20
  import numpy as np
21
- from gradio_image_prompter import ImagePrompter
22
  from PIL import Image
23
-
24
  from rex_omni import RexOmniVisualize, RexOmniWrapper, TaskType
25
  from rex_omni.tasks import KEYPOINT_CONFIGS, TASK_CONFIGS, get_task_config
26
 
@@ -234,22 +240,33 @@ EXAMPLE_CONFIGS = [
234
  ]
235
 
236
 
237
def parse_visual_prompt(points: List) -> List[List[float]]:
    """Convert raw image-prompter points into [x1, y1, x2, y2] boxes.

    Each entry is a 6-tuple whose 3rd and last elements are marker codes:
    (2, 3) marks a drawn rectangle, (1, 4) marks a positive click.
    Clicks are expanded into a small box centered on the click, clamped
    to the image origin.
    """
    half_width = 10  # half-size of the box synthesized around a click
    boxes: List[List[float]] = []
    for entry in points:
        if entry[2] == 2 and entry[-1] == 3:
            # Rectangle: corners are stored directly in the entry.
            left, top, _, right, bottom, _ = entry
            boxes.append([left, top, right, bottom])
        elif entry[2] == 1 and entry[-1] == 4:
            # Positive click: grow a 2*half_width square around (cx, cy).
            cx, cy, _, _, _, _ = entry
            boxes.append(
                [
                    max(0, cx - half_width),
                    max(0, cy - half_width),
                    cx + half_width,
                    cy + half_width,
                ]
            )
    return boxes
 
 
 
 
 
 
 
 
 
 
 
253
 
254
 
255
  def convert_boxes_to_visual_prompt_format(
@@ -344,6 +361,7 @@ def get_task_prompt(
344
  else:
345
  return task_config.prompt_template.replace("{categories}", "objects")
346
 
 
347
  @spaces.GPU
348
  def run_inference(
349
  image,
@@ -362,7 +380,6 @@ def run_inference(
362
  if image is None:
363
  return None, "Please upload an image first."
364
 
365
-
366
  # Convert numpy array to PIL Image if needed
367
  if isinstance(image, np.ndarray):
368
  image = Image.fromarray(image)
@@ -375,8 +392,8 @@ def run_inference(
375
  # Check if we have predefined visual prompt boxes from examples
376
  if hasattr(image, "_example_visual_prompts"):
377
  visual_prompt_boxes = image._example_visual_prompts
378
- elif visual_prompt_data is not None and "points" in visual_prompt_data:
379
- visual_prompt_boxes = parse_visual_prompt(visual_prompt_data["points"])
380
 
381
  # Determine task type and categories based on task selection
382
  if task_selection == "OCR":
@@ -406,9 +423,7 @@ def run_inference(
406
  task_key = task_type.value
407
 
408
  # Split categories by comma and clean up
409
- categories_list = [
410
- cat.strip() for cat in categories.split(",") if cat.strip()
411
- ]
412
  if not categories_list:
413
  categories_list = ["object"]
414
 
@@ -456,6 +471,7 @@ def run_inference(
456
  except Exception as e:
457
  return image, f"Visualization failed: {str(e)}\n\nRaw output:\n{raw_output}"
458
 
 
459
  def update_interface(task_selection):
460
  """Update interface based on task selection"""
461
  config = DEMO_TASK_CONFIGS.get(task_selection, {})
@@ -580,8 +596,8 @@ def update_prompt_preview(
580
 
581
  # Parse visual prompts
582
  visual_prompt_boxes = []
583
- if "points" in visual_prompt_data:
584
- visual_prompt_boxes = parse_visual_prompt(visual_prompt_data["points"])
585
 
586
  # Generate prompt preview
587
  prompt = get_task_prompt(
@@ -697,7 +713,7 @@ def create_demo():
697
  with visual_prompt_tab:
698
  gr.Markdown("### 🎯 Visual Prompt Configuration")
699
  gr.Markdown(
700
- "Draw bounding boxes on the image to provide visual examples"
701
  )
702
 
703
  # Prompt Preview
@@ -735,10 +751,9 @@ def create_demo():
735
  )
736
 
737
  # Visual Prompt Interface (only visible for Visual Prompting task)
738
- visual_prompter = ImagePrompter(
739
  label="🎯 Visual Prompt Interface",
740
- width=420,
741
- height=315, # 4:3 aspect ratio (420 * 3/4 = 315)
742
  visible=False,
743
  elem_classes=["preserve-aspect-ratio"],
744
  )
@@ -857,15 +872,30 @@ def create_demo():
857
  show_labels,
858
  custom_color,
859
  ):
860
- # For Visual Prompting task, use the visual prompter image
861
  if task_selection == "Visual Prompting":
862
- if visual_prompter_data is not None and "image" in visual_prompter_data:
863
- image_to_use = visual_prompter_data["image"]
864
- else:
 
 
865
  return (
866
  None,
867
- "Please upload an image in the Visual Prompt Interface for Visual Prompting task.",
868
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
869
  else:
870
  image_to_use = input_image
871
 
 
1
  #!/usr/bin/env python
2
  # -*- coding: utf-8 -*-
 
3
  import argparse
4
  import json
5
  import os
6
 
7
+ import spaces
8
 
9
+ os.system(
10
+ "pip install torch==2.4.0 torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cu124"
11
+ )
12
+ os.system("pip install gradio_bbox_annotator")
13
  import subprocess
 
14
 
15
+ subprocess.run(
16
+ "pip install flash-attn==2.7.4.post1 --no-build-isolation",
17
+ env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
18
+ shell=True,
19
+ )
20
+
21
+ import re
22
  import sys
23
  import threading
 
24
  from typing import Any, Dict, List
25
 
26
  import gradio as gr
27
  import numpy as np
28
+ from gradio_bbox_annotator import BBoxAnnotator
29
  from PIL import Image
 
30
  from rex_omni import RexOmniVisualize, RexOmniWrapper, TaskType
31
  from rex_omni.tasks import KEYPOINT_CONFIGS, TASK_CONFIGS, get_task_config
32
 
 
240
  ]
241
 
242
 
243
def parse_visual_prompt(bbox_data) -> List[List[float]]:
    """Parse BBoxAnnotator output into [x1, y1, x2, y2] bounding boxes.

    Args:
        bbox_data: Either the raw ``(image, boxes_list)`` tuple emitted by
            ``BBoxAnnotator`` or a bare list of boxes. Each box is expected
            to start with four corner coordinates ``[x1, y1, x2, y2]``;
            any trailing fields (e.g. the category label) are ignored.
            NOTE(review): boxes are taken as corner coordinates as-is —
            if the widget ever emits ``[x, y, width, height]`` this would
            need an explicit conversion; confirm against the component.

    Returns:
        List of ``[x1, y1, x2, y2]`` boxes; empty list when the input is
        missing, empty, or unparsable.
    """
    if bbox_data is None:
        return []

    try:
        # BBoxAnnotator emits (image, boxes_list); also accept a bare list.
        if isinstance(bbox_data, tuple) and len(bbox_data) >= 2:
            boxes_list = bbox_data[1]
        else:
            boxes_list = bbox_data

        if not boxes_list:
            return []

        boxes = []
        for box in boxes_list:
            # Keep only the first four fields (corner coordinates);
            # skip malformed boxes that are too short.
            if len(box) >= 4:
                x1, y1, x2, y2 = box[:4]
                boxes.append([x1, y1, x2, y2])

        return boxes
    except Exception as e:
        # Best-effort parsing: a malformed widget payload should not crash
        # the demo, so log and fall back to "no visual prompts".
        print(f"Error parsing visual prompt: {e}")
        return []
270
 
271
 
272
  def convert_boxes_to_visual_prompt_format(
 
361
  else:
362
  return task_config.prompt_template.replace("{categories}", "objects")
363
 
364
+
365
  @spaces.GPU
366
  def run_inference(
367
  image,
 
380
  if image is None:
381
  return None, "Please upload an image first."
382
 
 
383
  # Convert numpy array to PIL Image if needed
384
  if isinstance(image, np.ndarray):
385
  image = Image.fromarray(image)
 
392
  # Check if we have predefined visual prompt boxes from examples
393
  if hasattr(image, "_example_visual_prompts"):
394
  visual_prompt_boxes = image._example_visual_prompts
395
+ elif visual_prompt_data is not None:
396
+ visual_prompt_boxes = parse_visual_prompt(visual_prompt_data)
397
 
398
  # Determine task type and categories based on task selection
399
  if task_selection == "OCR":
 
423
  task_key = task_type.value
424
 
425
  # Split categories by comma and clean up
426
+ categories_list = [cat.strip() for cat in categories.split(",") if cat.strip()]
 
 
427
  if not categories_list:
428
  categories_list = ["object"]
429
 
 
471
  except Exception as e:
472
  return image, f"Visualization failed: {str(e)}\n\nRaw output:\n{raw_output}"
473
 
474
+
475
  def update_interface(task_selection):
476
  """Update interface based on task selection"""
477
  config = DEMO_TASK_CONFIGS.get(task_selection, {})
 
596
 
597
  # Parse visual prompts
598
  visual_prompt_boxes = []
599
+ if visual_prompt_data is not None:
600
+ visual_prompt_boxes = parse_visual_prompt(visual_prompt_data)
601
 
602
  # Generate prompt preview
603
  prompt = get_task_prompt(
 
713
  with visual_prompt_tab:
714
  gr.Markdown("### 🎯 Visual Prompt Configuration")
715
  gr.Markdown(
716
+ "Select the pen tool and draw one or multiple boxes on the image. "
717
  )
718
 
719
  # Prompt Preview
 
751
  )
752
 
753
  # Visual Prompt Interface (only visible for Visual Prompting task)
754
+ visual_prompter = BBoxAnnotator(
755
  label="🎯 Visual Prompt Interface",
756
+ categories="D",
 
757
  visible=False,
758
  elem_classes=["preserve-aspect-ratio"],
759
  )
 
872
  show_labels,
873
  custom_color,
874
  ):
875
+ # For Visual Prompting task, extract image from BBoxAnnotator data
876
  if task_selection == "Visual Prompting":
877
+ if (
878
+ visual_prompter_data is None
879
+ or not isinstance(visual_prompter_data, tuple)
880
+ or len(visual_prompter_data) < 1
881
+ ):
882
  return (
883
  None,
884
+ "Please upload an image and draw bounding boxes in the Visual Prompt Interface for Visual Prompting task.",
885
  )
886
+ # Extract image from BBoxAnnotator data (first element of the tuple)
887
+ image_to_use = visual_prompter_data[0]
888
+ # If image_to_use is a string (file path), convert to PIL Image
889
+ if isinstance(image_to_use, str):
890
+ try:
891
+ from PIL import Image
892
+
893
+ image_to_use = Image.open(image_to_use).convert("RGB")
894
+ except Exception as e:
895
+ return (
896
+ None,
897
+ f"Error loading image from path: {e}",
898
+ )
899
  else:
900
  image_to_use = input_image
901