Update app.py
app.py CHANGED

@@ -15,7 +15,7 @@ from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
 import supervision as sv
 
 # Model ID for Hugging Face
-model_id = "
+model_id = "rziga/mm_grounding_dino_base_all"
 
 # Load model and processor using Transformers
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -32,8 +32,12 @@ def run_grounding(input_image, grounding_caption, box_threshold, text_threshold)
 
     init_image = input_image.convert("RGB")
 
+    # Process caption into list of list format for mm grounding dino
+    # Split by period and strip whitespace
+    text_labels = [[label.strip() for label in grounding_caption.split('.') if label.strip()]]
+
     # Process input using transformers
-    inputs = processor(images=init_image, text=
+    inputs = processor(images=init_image, text=text_labels, return_tensors="pt").to(device)
 
     # Run inference
     with torch.no_grad():
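For reviewers, the new list-of-lists shape can be sanity-checked on its own. A minimal standalone sketch, reusing only the splitting expression from the hunk above (the sample caption is illustrative):

# Standalone check of the caption-splitting step (sample caption is
# hypothetical): one inner list per image, one entry per phrase.
grounding_caption = "a cat. a dog."
text_labels = [[label.strip() for label in grounding_caption.split('.') if label.strip()]]
print(text_labels)  # [['a cat', 'a dog']]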
@@ -42,10 +46,8 @@ def run_grounding(input_image, grounding_caption, box_threshold, text_threshold)
     # Post-process results
     results = processor.post_process_grounded_object_detection(
         outputs,
-        inputs.input_ids,
         threshold=box_threshold,
-
-        target_sizes=[init_image.size[::-1]]
+        target_sizes=[(init_image.size[1], init_image.size[0])]
     )
 
     result = results[0]
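One detail worth noting here: PIL's Image.size is (width, height), while target_sizes expects (height, width) per image, so the added tuple is equivalent to the removed init_image.size[::-1]. A quick equivalence check, assuming any PIL image:

# Hypothetical standalone snippet: both spellings produce (height, width).
from PIL import Image
init_image = Image.new("RGB", (640, 480))  # size == (width, height)
assert (init_image.size[1], init_image.size[0]) == init_image.size[::-1]  # both (480, 640)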
@@ -140,8 +142,8 @@ if __name__ == "__main__":
     }
     """
     with gr.Blocks(css=css) as demo:
-        gr.Markdown("<h1><center>Grounding DINO Base<h1><center>")
-        gr.Markdown("<h3><center>Open-World Detection with <a href='https://
+        gr.Markdown("<h1><center>MM Grounding DINO Base<h1><center>")
+        gr.Markdown("<h3><center>Open-World Detection with <a href='https://huggingface.co/openmmlab-community/mm_grounding_dino_base_all'>MM Grounding DINO</a><h3><center>")
 
         with gr.Row():
             with gr.Column():
@@ -159,7 +161,8 @@ if __name__ == "__main__":
                 )
                 text_threshold = gr.Slider(
                     minimum=0.0, maximum=1.0, value=0.25, step=0.001,
-                    label="Text Threshold"
+                    label="Text Threshold (not used in MM Grounding DINO)",
+                    visible=False
                 )
 
             with gr.Column():
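On the slider change: a Gradio component created with visible=False still exists and still passes its value to any event handler it is wired into, so run_grounding can keep its four-argument signature. A minimal sketch of the pattern (standalone, with illustrative component names, not the app's actual layout):

import gradio as gr

# Hidden components still feed their values to handlers, so function
# signatures stay stable (sketch only; names are hypothetical).
with gr.Blocks() as demo:
    caption = gr.Textbox(label="Detection Prompt")
    text_threshold = gr.Slider(
        minimum=0.0, maximum=1.0, value=0.25, step=0.001,
        label="Text Threshold (not used in MM Grounding DINO)",
        visible=False,
    )
    out = gr.Textbox(label="Echo")
    # The hidden slider's value still arrives as the second argument.
    caption.submit(lambda c, t: f"{c} @ {t}", [caption, text_threshold], out)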
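Taken together, the updated inference path reads roughly as below. A minimal end-to-end sketch: the model ID, caption splitting, and post-processing arguments come from the diff above; the sample image path, caption, and threshold value are illustrative:

# Minimal end-to-end sketch of the updated pipeline. Assumptions: model ID,
# caption splitting, and post-processing arguments are taken from the diff;
# everything else (image path, caption, threshold) is illustrative.
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

model_id = "rziga/mm_grounding_dino_base_all"
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

image = Image.open("example.jpg").convert("RGB")
text_labels = [[label.strip() for label in "a cat. a dog.".split(".") if label.strip()]]

inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

results = processor.post_process_grounded_object_detection(
    outputs,
    threshold=0.3,
    target_sizes=[(image.size[1], image.size[0])],  # (height, width)
)
print(results[0])  # boxes, scores, and matched labels for the first image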