Mohammed Abdeldayem
committed
Update app.py
app.py CHANGED
@@ -3,7 +3,6 @@ from transformers import AutoTokenizer, VisionEncoderDecoderModel, AutoImageProc
 from PIL import Image
 from torchvision.transforms.functional import crop
 import gradio as gr
-import json
 import base64
 import io
 from huggingface_hub import hf_hub_download
@@ -101,20 +100,15 @@ def process_image(image):
             caption = tokenizer.decode(caption_ids[0], skip_special_tokens=True)
             captions.append(caption)

-        # Prepare the result for visualization
-        detection_results = []
+        # Prepare the result for visualization as a formatted string
+        detection_results = ""
         for i, (label, box, score, caption) in enumerate(zip(labels, boxes, scores, captions)):
-            detection_results.append({
-                "label": label,
-                "caption": caption,
-                "bounding_box": [float(coord) for coord in box],  # Convert to float
-                "confidence_score": float(score)  # Convert to float
-            })
+            detection_results += f"Object {i + 1}: {label} - Caption: {caption}\n"

         # Render image with bounding boxes
         result_image = results.render()[0]

-        # Return the image with detections and the caption
+        # Return the image with detections, formatted captions, and the whole image caption
         return result_image, detection_results, original_caption

     except Exception as e:
@@ -129,7 +123,7 @@ interface = gr.Interface(
     inputs=gr.Image(type="pil"),  # Input: Image upload
     outputs=[
         gr.Image(type="pil", label="Detected Objects"),  # Output 1: Image with bounding boxes
-        gr.JSON(label="Object Captions & Bounding Boxes"),  # Output 2: Detection results as JSON
+        gr.Textbox(label="Object Captions & Bounding Boxes", lines=10),  # Output 2: Formatted captions
         gr.Textbox(label="Whole Image Caption")  # Output 3: Caption for the whole image
     ],
     live=True
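For context, a minimal runnable sketch of the string formatting this commit switches to; the detection values below are hypothetical placeholders, since the real app derives labels, boxes, scores, and captions from its detection and captioning models:

# Hypothetical stand-in values; the real app computes these from its models.
labels = ["dog", "ball"]
boxes = [[10.0, 20.0, 110.0, 140.0], [150.0, 60.0, 210.0, 120.0]]
scores = [0.91, 0.78]
captions = ["a dog running", "a red ball"]

# New behavior: accumulate one human-readable line per detected object.
detection_results = ""
for i, (label, box, score, caption) in enumerate(zip(labels, boxes, scores, captions)):
    detection_results += f"Object {i + 1}: {label} - Caption: {caption}\n"

print(detection_results)
# Object 1: dog - Caption: a dog running
# Object 2: ball - Caption: a red ball

Note that the loop still unpacks box and score even though the new format string only uses label and caption, so the bounding-box detail promised by the Textbox label never actually appears in the output string.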
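And a sketch of how the three outputs wire into the Gradio interface after this change; process_image here is a stub returning canned values (an assumption for illustration), while the inputs and outputs mirror the diff:

import gradio as gr

def process_image(image):
    # Stub standing in for the app's real detection + captioning pipeline.
    return image, "Object 1: dog - Caption: a dog running\n", "a dog playing with a ball"

interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),  # Input: image upload
    outputs=[
        gr.Image(type="pil", label="Detected Objects"),  # Output 1: annotated image
        gr.Textbox(label="Object Captions & Bounding Boxes", lines=10),  # Output 2: per-object lines
        gr.Textbox(label="Whole Image Caption"),  # Output 3: whole-image caption
    ],
    live=True,  # re-run on every input change instead of waiting for a submit
)

if __name__ == "__main__":
    interface.launch()

With live=True, Gradio re-runs the function whenever the input image changes, which is why the second output needs to be a plain multi-line string a Textbox can redraw cheaply rather than a structured JSON payload.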