app.py CHANGED
@@ -78,6 +78,22 @@ def visualize(pred_mask, image_path, work_dir):
 
 @spaces.GPU
 def image_vision(image_input_path, prompt):
+    """Perform image-based visual question answering and segmentation.
+
+    This function takes an image and a text prompt (instruction) as input, processes the image with a
+    multimodal model, and returns a textual answer. If the model response includes a segmentation token
+    ("[SEG]") and segmentation visualization is available, a visual output is also generated.
+
+    Args:
+        image_input_path (str): The path to the input image file.
+        prompt (str): The instruction or question about the image.
+
+    Returns:
+        Tuple[str, Optional[str]]:
+            - A textual answer generated by the model.
+            - If segmentation is requested (indicated by '[SEG]' in the answer), the path to the
+              segmented image file; otherwise, None.
+    """
     image_path = image_input_path
     text_prompts = f"<image>{prompt}"
     image = Image.open(image_path).convert('RGB')
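A minimal usage sketch of the contract this docstring documents; the image path and prompt are hypothetical, and the call assumes the Space's model is already loaded:

    # Hypothetical example, not part of the Space's code.
    answer, seg_image = image_vision("examples/cat.jpg", "Please segment the cat.")
    print(answer)  # plain answer; contains "[SEG]" when a mask was produced
    if seg_image is not None:
        print("Segmentation visualization saved to:", seg_image)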
@@ -106,6 +122,23 @@ def image_vision(image_input_path, prompt):
 
 @spaces.GPU(duration=80)
 def video_vision(video_input_path, prompt, video_interval):
+    """Perform video-based visual question answering and segmentation.
+
+    This function analyzes a video file using a multimodal vision-language model. It extracts frames based
+    on a sampling interval, feeds the frames and prompt to the model, and returns a response. If segmentation
+    is requested, it produces two videos: one with overlaid masks and one with binary masks only.
+
+    Args:
+        video_input_path (str): The path to the input video file.
+        prompt (str): The instruction or question about the video.
+        video_interval (int): Frame sampling interval. A value of 1 processes every frame, 2 every second frame, etc.
+
+    Returns:
+        Tuple[str, Optional[str], Optional[str]]:
+            - The model-generated textual answer.
+            - If segmentation is requested (the answer contains '[SEG]'), the path to the segmented output video file; otherwise, None.
+            - If segmentation is requested, the path to a binary mask-only video; otherwise, None.
+    """
     # Open the original video
     cap = cv2.VideoCapture(video_input_path)
 
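The @spaces.GPU(duration=80) decorator is Hugging Face's ZeroGPU hook, requesting up to 80 seconds of GPU time per call. As a reference for the video_interval semantics described above, interval-based frame sampling with OpenCV typically looks like the sketch below; this is an illustration, not the Space's exact loop:

    import cv2

    def sample_frames(video_input_path, video_interval):
        # Keep every video_interval-th frame, converted to RGB (OpenCV decodes BGR).
        cap = cv2.VideoCapture(video_input_path)
        frames, idx = [], 0
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            if idx % video_interval == 0:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            idx += 1
        cap.release()
        return frames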
@@ -243,4 +276,4 @@ with gr.Blocks(analytics_enabled=False) as demo:
         outputs = [vid_output_res, output_video, masked_output]
     )
 
-demo.queue().launch(show_api=
+demo.queue().launch(show_api=True, show_error=True, ssr_mode=False, mcp_server=True)
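For context, these launch flags behave as follows in recent Gradio releases (mcp_server requires a Gradio build with MCP support); this is a sketch of the call with the options annotated, not a guarantee of this Space's exact behavior:

    demo.queue().launch(
        show_api=True,    # keep the "Use via API" page linked from the app
        show_error=True,  # surface server-side exceptions in the client UI
        ssr_mode=False,   # disable server-side rendering of the frontend
        mcp_server=True,  # additionally expose the app's API endpoints as MCP tools
    )

With mcp_server=True, Gradio documents an MCP (SSE) endpoint under the app's /gradio_api/mcp/ path, which is what marks the Space as MCP ready.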