Spaces: Running on Zero
update app
app.py CHANGED
@@ -4,11 +4,11 @@ import uuid
 import json
 import time
 import asyncio
+import re
 from threading import Thread
 from pathlib import Path
 from io import BytesIO
 from typing import Optional, Tuple, Dict, Any, Iterable
-import re
 
 import gradio as gr
 import spaces
@@ -30,6 +30,9 @@ from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 
+# --- Theme and CSS Definition ---
+
+# Define the new OrangeRed color palette
 colors.orange_red = colors.Color(
     name="orange_red",
     c50="#FFF0E5",
@@ -37,7 +40,7 @@ colors.orange_red = colors.Color(
     c200="#FFC299",
     c300="#FFA366",
     c400="#FF8533",
-    c500="#FF4500",
+    c500="#FF4500", # OrangeRed base color
     c600="#E63E00",
     c700="#CC3700",
     c800="#B33000",
@@ -96,6 +99,7 @@ class OrangeRedTheme(Soft):
     block_label_background_fill="*primary_200",
 )
 
+# Instantiate the new theme
 orange_red_theme = OrangeRedTheme()
 
 css = """
@@ -173,6 +177,59 @@ model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
     dtype=torch.float16
 ).to(device).eval()
 
+# --- Utility functions for Detection and Drawing ---
+
+def parse_detection_output(text: str) -> list:
+    """Parses the model's text output to extract bounding boxes or points."""
+    match = re.search(r'\[\s*\[.*?\]\s*\]', text)
+    if not match:
+        return []
+    try:
+        result = json.loads(match.group(0))
+        if isinstance(result, list) and all(isinstance(item, list) for item in result):
+            return result
+        return []
+    except (json.JSONDecodeError, TypeError):
+        return []
+
+def draw_object_detections(image: Image.Image, detections: list, labels: list) -> Image.Image:
+    """Draws bounding boxes on the image."""
+    image_np = np.array(image.convert("RGB"))
+    h, w, _ = image_np.shape
+    boxes = []
+    for box in detections:
+        if len(box) == 4:
+            x1, y1, x2, y2 = box
+            boxes.append([x1 * w, y1 * h, x2 * w, y2 * h])
+    if not boxes:
+        return image
+    detections_sv = sv.Detections(xyxy=np.array(boxes))
+    bounding_box_annotator = sv.BoxAnnotator(thickness=2)
+    label_annotator = sv.LabelAnnotator(text_thickness=1, text_scale=0.5)
+    annotated_image = bounding_box_annotator.annotate(scene=image_np.copy(), detections=detections_sv)
+    annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections_sv, labels=labels)
+    return Image.fromarray(annotated_image)
+
+def draw_point_detections(image: Image.Image, points: list) -> Image.Image:
+    """Draws points on the image."""
+    image_np = np.array(image.convert("RGB"))
+    h, w, _ = image_np.shape
+    pts = []
+    for point in points:
+        if len(point) == 2:
+            x, y = point
+            pts.append([x * w, y * h])
+    if not pts:
+        return image
+    points_np = np.array(pts).reshape(1, -1, 2)
+    key_points = sv.KeyPoints(xy=points_np)
+    point_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.RED)
+    annotated_image = point_annotator.annotate(scene=image_np.copy(), key_points=key_points)
+    return Image.fromarray(annotated_image)
+
+
+# --- Core Generation Functions ---
+
 def extract_gif_frames(gif_path: str):
     if not gif_path:
         return []
@@ -249,74 +306,6 @@ def navigate_pdf_page(direction: str, state: Dict[str, Any]):
     page_info_html = f'<div style="text-align:center;">Page {new_index + 1} / {total_pages}</div>'
     return image_preview, state, page_info_html
 
-def draw_boxes_on_image(image: Image.Image, text_output: str, object_name: str) -> Tuple[Image.Image, str]:
-    try:
-        # Extract the JSON part of the text output
-        match = re.search(r'\[\s*\[.*?\]\s*\]', text_output, re.DOTALL)
-        if not match:
-            return image, f"Could not find coordinates in the model output: {text_output}"
-
-        boxes_str = match.group(0)
-        boxes = json.loads(boxes_str)
-
-        if not boxes or not isinstance(boxes[0], list):
-            return image, f"No valid boxes found in parsed data: {boxes}"
-
-        width, height = image.size
-        np_image = np.array(image.convert("RGB"))
-
-        # Denormalize coordinates
-        xyxy = []
-        for box in boxes:
-            x1, y1, x2, y2 = box
-            xyxy.append([x1 * width, y1 * height, x2 * width, y2 * height])
-
-        detections = sv.Detections(xyxy=np.array(xyxy))
-
-        bounding_box_annotator = sv.BoxAnnotator(thickness=2)
-        label_annotator = sv.LabelAnnotator(text_thickness=1, text_scale=0.5)
-
-        labels = [f"{object_name} #{i+1}" for i in range(len(detections))]
-
-        annotated_image = bounding_box_annotator.annotate(scene=np_image.copy(), detections=detections)
-        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
-
-        return Image.fromarray(annotated_image), text_output
-    except (json.JSONDecodeError, IndexError, TypeError) as e:
-        return image, f"Failed to parse or draw boxes. Error: {e}\nModel Output:\n{text_output}"
-
-def draw_points_on_image(image: Image.Image, text_output: str) -> Tuple[Image.Image, str]:
-    try:
-        match = re.search(r'\[\s*\[.*?\]\s*\]', text_output, re.DOTALL)
-        if not match:
-            return image, f"Could not find coordinates in the model output: {text_output}"
-
-        points_str = match.group(0)
-        points = json.loads(points_str)
-
-        if not points or not isinstance(points[0], list):
-            return image, f"No valid points found in parsed data: {points}"
-
-        width, height = image.size
-        np_image = np.array(image.convert("RGB"))
-
-        # Denormalize coordinates
-        xy = []
-        for point in points:
-            x, y = point
-            xy.append([x * width, y * height])
-
-        points_array = np.array(xy).reshape(1, -1, 2)
-        key_points = sv.KeyPoints(xy=points_array)
-
-        point_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.RED)
-        annotated_image = point_annotator.annotate(scene=np_image.copy(), key_points=key_points)
-
-        return Image.fromarray(annotated_image), text_output
-    except (json.JSONDecodeError, IndexError, TypeError) as e:
-        return image, f"Failed to parse or draw points. Error: {e}\nModel Output:\n{text_output}"
-
-
 @spaces.GPU
 def generate_image(text: str, image: Image.Image, max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
     if image is None:
@@ -437,62 +426,55 @@ def generate_gif(text: str, gif_path: str, max_new_tokens: int = 1024, temperatu
     yield buffer, buffer
 
 @spaces.GPU
-def
+def generate_detection(
+    image: Image.Image, user_prompt: str, task_type: str, max_new_tokens: int = 256,
+    temperature: float = 0.1, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2
+):
     if image is None:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        return None, "Please upload an image."
+    if not user_prompt:
+        return image, "Please provide a prompt describing what to detect."
+
+    if task_type == "Object Detection":
+        system_prompt = (
+            f"You are an expert object detector. Find all instances of '{user_prompt}' in the image. "
+            "Respond ONLY with a Python list of bounding boxes in the format [[x_min, y_min, x_max, y_max], ...]. "
+            "The coordinates must be normalized between 0.0 and 1.0."
+        )
+    elif task_type == "Point Detection":
+        system_prompt = (
+            f"You are an expert keypoint detector. Find the specific points for '{user_prompt}' in the image. "
+            "Respond ONLY with a Python list of points in the format [[x, y], ...]. "
+            "The coordinates must be normalized between 0.0 and 1.0."
+        )
+    else:
+        return image, "Invalid task type specified."
+
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": system_prompt}]}]
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
-
-    # This task is not streamed because we need the full output to parse and draw boxes
-    outputs = model_q3vl.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
-    response_text = processor_q3vl.decode(outputs[0], skip_special_tokens=True).strip()
-
-    # Extract only the user-facing part of the response
-    final_text = response_text.split('<|im_end|>')[-1].strip() if '<|im_end|>' in response_text else response_text
 
-
-
+    generation_kwargs = {
+        **inputs, "max_new_tokens": max_new_tokens, "do_sample": True, "temperature": temperature,
+        "top_p": top_p, "top_k": top_k, "repetition_penalty": repetition_penalty,
+    }
 
-
-
-
-        yield image, "Please upload an image."
-        return
-    if not text:
-        yield image, "Please enter the object/point name to detect."
-        return
-
-    prompt = (
-        f"You are an expert point detection model. Your task is to find the specific location of '{text}' in the image. "
-        "You must respond ONLY with a JSON list containing a single coordinate pair. The coordinate must be in the format "
-        "[[x, y]], where the coordinates are normalized to be between 0 and 1. "
-        "Do not provide any other text, explanation, or preamble. For example: [[0.45, 0.67]]"
-    )
-
-    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
-    prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
-
-    outputs = model_q3vl.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
-    response_text = processor_q3vl.decode(outputs[0], skip_special_tokens=True).strip()
-
-    final_text = response_text.split('<|im_end|>')[-1].strip() if '<|im_end|>' in response_text else response_text
+    generate_ids = model_q3vl.generate(**generation_kwargs)
+    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+    response_text = processor_q3vl.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
-
-
+    try:
+        coords = parse_detection_output(response_text)
+        if not coords:
+            return image, f"Could not detect '{user_prompt}'.\nModel raw output:\n{response_text}"
+        if task_type == "Object Detection":
+            labels = [f"{user_prompt} #{i+1}" for i in range(len(coords))]
+            annotated_image = draw_object_detections(image, coords, labels)
+        else: # Point Detection
+            annotated_image = draw_point_detections(image, coords)
+        return annotated_image, response_text
+    except Exception as e:
+        return image, f"An error occurred during processing:\n{str(e)}\n\nModel raw output:\n{response_text}"
 
 
 image_examples = [["Perform OCR on the image...", "examples/images/1.jpg"],
@@ -506,10 +488,9 @@ gif_examples = [["Describe this GIF.", "examples/gifs/1.gif"],
                 ["Describe this GIF.", "examples/gifs/2.gif"]]
 caption_examples = [["examples/captions/1.JPG"],
                     ["examples/captions/2.jpeg"], ["examples/captions/3.jpeg"]]
-
-
-
-                            ["the clock on the wall", "examples/detection/room.jpg"]]
+# NOTE: You'll need to create these example image files in a directory named 'examples/detection/'
+obj_det_examples = [["examples/detection/obj1.jpg", "the two people"], ["examples/detection/obj2.jpg", "the yellow taxi"]]
+point_det_examples = [["examples/detection/point1.jpg", "the eyes of the person"], ["examples/detection/point2.jpg", "the headlights of the car"]]
 
 
 with gr.Blocks(theme=orange_red_theme, css=css) as demo:
@@ -524,17 +505,11 @@ with gr.Blocks(theme=orange_red_theme, css=css) as demo:
                     image_submit = gr.Button("Submit", variant="primary")
                     gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
 
-                with gr.TabItem("
-
-
-
-                    gr.Examples(examples=
-
-                with gr.TabItem("Point Detection"):
-                    point_det_query = gr.Textbox(label="Point to Detect", placeholder="e.g., 'the cat's left eye'")
-                    point_det_upload = gr.Image(type="pil", label="Upload Image", height=290)
-                    point_det_submit = gr.Button("Detect Point", variant="primary")
-                    gr.Examples(examples=point_detection_examples, inputs=[point_det_query, point_det_upload])
+                with gr.TabItem("Video Inference"):
+                    video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    video_upload = gr.Video(label="Upload Video(≤30s)", height=290)
+                    video_submit = gr.Button("Submit", variant="primary")
+                    gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
 
                 with gr.TabItem("PDF Inference"):
                     with gr.Row():
@@ -555,17 +530,33 @@ with gr.Blocks(theme=orange_red_theme, css=css) as demo:
                     gif_upload = gr.Image(type="filepath", label="Upload GIF", height=290)
                     gif_submit = gr.Button("Submit", variant="primary")
                     gr.Examples(examples=gif_examples, inputs=[gif_query, gif_upload])
-
+
                 with gr.TabItem("Caption"):
                     caption_image_upload = gr.Image(type="pil", label="Image to Caption", height=290)
                     caption_submit = gr.Button("Generate Caption", variant="primary")
                     gr.Examples(examples=caption_examples, inputs=[caption_image_upload])
-
-                with gr.TabItem("
-
-
-
-
+
+                with gr.TabItem("Object Detection"):
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            obj_det_image_upload = gr.Image(type="pil", label="Upload Image", height=290)
+                            obj_det_query = gr.Textbox(label="Object to Detect", placeholder="e.g., car, person, dog")
+                            obj_det_submit = gr.Button("Detect Objects", variant="primary")
+                        with gr.Column(scale=1):
+                            obj_det_output_image = gr.Image(type="pil", label="Detection Result", height=290)
+                            obj_det_output_text = gr.Textbox(label="Model Raw Output", interactive=False, lines=5)
+                    gr.Examples(examples=obj_det_examples, inputs=[obj_det_image_upload, obj_det_query])
+
+                with gr.TabItem("Point Detection"):
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            point_det_image_upload = gr.Image(type="pil", label="Upload Image", height=290)
+                            point_det_query = gr.Textbox(label="Point(s) to Detect", placeholder="e.g., the eyes of the cat")
+                            point_det_submit = gr.Button("Detect Points", variant="primary")
+                        with gr.Column(scale=1):
+                            point_det_output_image = gr.Image(type="pil", label="Detection Result", height=290)
+                            point_det_output_text = gr.Textbox(label="Model Raw Output", interactive=False, lines=5)
+                    gr.Examples(examples=point_det_examples, inputs=[point_det_image_upload, point_det_query])
 
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -576,70 +567,39 @@ with gr.Blocks(theme=orange_red_theme, css=css) as demo:
 
         with gr.Column(scale=3):
            gr.Markdown("## Output", elem_id="output-title")
-            output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=
-
+            output = gr.Textbox(label="Raw Output Stream (General Tasks)", interactive=False, lines=20, show_copy_button=True)
+            with gr.Accordion("(Result.md)", open=False):
+                markdown_output = gr.Markdown(label="(Result.Md)", latex_delimiters=[
                     {"left": "$$", "right": "$$", "display": True},
                     {"left": "$", "right": "$", "display": False}
-                ]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                       outputs=[output, markdown_output])
-    video_submit.click(fn=generate_video,
-                       inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-                       outputs=[output, markdown_output])
-    pdf_submit.click(fn=generate_pdf,
-                     inputs=[pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-                     outputs=[output, markdown_output])
-    gif_submit.click(fn=generate_gif,
-                     inputs=[gif_query, gif_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-                     outputs=[output, markdown_output])
-    caption_submit.click(fn=generate_caption,
-                         inputs=[caption_image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-                         outputs=[output, markdown_output])
-
+                ])
+
+    # Click handlers for original tabs
+    image_submit.click(fn=generate_image, inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
+    video_submit.click(fn=generate_video, inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
+    pdf_submit.click(fn=generate_pdf, inputs=[pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
+    gif_submit.click(fn=generate_gif, inputs=[gif_query, gif_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
+    caption_submit.click(fn=generate_caption, inputs=[caption_image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
+
+    # PDF navigation handlers
+    pdf_upload.change(fn=load_and_preview_pdf, inputs=[pdf_upload], outputs=[pdf_preview_img, pdf_state, page_info])
+    prev_page_btn.click(fn=lambda s: navigate_pdf_page("prev", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
+    next_page_btn.click(fn=lambda s: navigate_pdf_page("next", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
+
+    # Click handlers for NEW tabs
     obj_det_submit.click(
-        fn=
-
-
-
-        markdown_output: gr.update(visible=False)
-        },
-        outputs=[annotated_image_output, raw_detection_output, output, markdown_output]
-    ).then(
-        fn=generate_object_detection,
-        inputs=[obj_det_upload, obj_det_query, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[annotated_image_output, raw_detection_output]
+        fn=generate_detection,
+        inputs=[obj_det_image_upload, obj_det_query, gr.Textbox(value="Object Detection", visible=False),
+                max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[obj_det_output_image, obj_det_output_text]
     )
-
     point_det_submit.click(
-
-
-
-
-        markdown_output: gr.update(visible=False)
-        },
-        outputs=[annotated_image_output, raw_detection_output, output, markdown_output]
-    ).then(
-        fn=generate_point_detection,
-        inputs=[point_det_upload, point_det_query, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[annotated_image_output, raw_detection_output]
+        fn=generate_detection,
+        inputs=[point_det_image_upload, point_det_query, gr.Textbox(value="Point Detection", visible=False),
+                max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[point_det_output_image, point_det_output_text]
    )
 
-    pdf_upload.change(fn=load_and_preview_pdf, inputs=[pdf_upload], outputs=[pdf_preview_img, pdf_state, page_info])
-    prev_page_btn.click(fn=lambda s: navigate_pdf_page("prev", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
-    next_page_btn.click(fn=lambda s: navigate_pdf_page("next", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
 
 if __name__ == "__main__":
     demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)