# Spaces: Runtime error / Runtime error — Hugging Face Spaces status-page
# header captured with this scrape; commented out so the file is valid Python.
import gradio as gr
import cv2
import numpy as np
import os
import json
from PIL import Image
import io
import base64
from openai import OpenAI
from ultralytics import YOLO

# Location of the Latex2Layout YOLOv8 weights on disk.
MODEL_PATH = "latex2layout_object_detection_yolov8.pt"

# Fail fast at import time when the weights file is absent.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model file not found at {MODEL_PATH}")

# Load the detector once at startup; wrap loader failures in a clear error.
try:
    model = YOLO(MODEL_PATH)
except Exception as load_error:
    raise RuntimeError(f"Failed to load Latex2Layout model: {load_error}")

# Qwen (DashScope) OpenAI-compatible endpoint, and the UI-name -> model-id map.
QWEN_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
QWEN_MODELS = {
    "Qwen2.5-VL-3B-Instruct": "qwen2.5-vl-3b-instruct",
    "Qwen2.5-VL-7B-Instruct": "qwen2.5-vl-7b-instruct",
    "Qwen2.5-VL-14B-Instruct": "qwen2.5-vl-14b-instruct",
}

# Default system prompt; the {layout_info} placeholder is replaced with the
# JSON detection results before the prompt is sent to the model.
default_system_prompt = """You are an assistant specialized in document layout analysis.
The following layout elements were detected in the image (confidence >= 0.5):
{layout_info}
Use this information and the image to answer layout-related questions."""
def encode_image(image_array):
    """Serialize a numpy image array into a base64-encoded PNG string.

    Args:
        image_array: Numpy array holding the image pixels.

    Returns:
        str: Base64 text of the PNG-encoded image.

    Raises:
        ValueError: If the array cannot be converted or encoded.
    """
    try:
        buffer = io.BytesIO()
        Image.fromarray(image_array).save(buffer, format='PNG')
        encoded = base64.b64encode(buffer.getvalue())
        return encoded.decode("utf-8")
    except Exception as e:
        raise ValueError(f"Failed to encode image: {e}")
def detect_layout(image, confidence_threshold=0.5):
    """
    Detect layout elements in an image using the Latex2Layout model.

    Args:
        image: Input image as a numpy array (RGB).
        confidence_threshold: Minimum confidence score a detection must reach
            to be drawn and reported (default: 0.5).

    Returns:
        tuple: (annotated_image, layout_info_str)
            - annotated_image: Copy of the image with labeled bounding boxes,
              or None on error.
            - layout_info_str: JSON string of kept detections, or an
              explanatory message when nothing was kept / an error occurred.
    """
    if image is None or not isinstance(image, np.ndarray):
        return None, "Error: No image uploaded or invalid image format."
    try:
        # Run inference; ultralytics returns one Results object per image.
        result = model(image)[0]
        annotated_image = image.copy()
        layout_info = []
        for box in result.boxes:
            conf = float(box.conf[0])
            if conf < confidence_threshold:
                continue  # drop low-confidence detections
            x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
            cls_name = result.names[int(box.cls[0])]
            # Random color per box so adjacent elements stay distinguishable.
            color = tuple(np.random.randint(0, 255, 3).tolist())
            cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
            label = f"{cls_name} {conf:.2f}"
            (label_width, label_height), _ = cv2.getTextSize(
                label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1
            )
            # BUGFIX: clamp the label so it is not drawn at negative y
            # (off-image) when a box touches the top edge.
            label_top = max(y1 - label_height - 5, 0)
            text_y = max(y1 - 5, label_height)
            cv2.rectangle(
                annotated_image, (x1, label_top), (x1 + label_width, y1), color, -1
            )
            cv2.putText(
                annotated_image, label, (x1, text_y),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1,
            )
            layout_info.append({
                "bbox": [x1, y1, x2, y2],
                "class": cls_name,
                "confidence": conf,
            })
        if layout_info:
            layout_info_str = json.dumps(layout_info, indent=2)
        else:
            # BUGFIX: report the threshold actually used instead of a
            # hard-coded 0.5 (identical text at the default threshold).
            layout_info_str = (
                f"No layout elements detected with confidence >= {confidence_threshold}."
            )
        return annotated_image, layout_info_str
    except Exception as e:
        return None, f"Error during layout detection: {str(e)}"
def detect_example_image():
    """
    Run layout detection on the bundled example image (./image1.png).

    Returns:
        tuple: (example_image, annotated_image, layout_info_str)
            - example_image: The example image as an RGB numpy array.
            - annotated_image: The same image with detections drawn.
            - layout_info_str: JSON string of detections, or an error message.
    """
    example_image_path = "./image1.png"
    if not os.path.exists(example_image_path):
        return None, None, "Error: Example image not found."
    try:
        bgr_image = cv2.imread(example_image_path)
        if bgr_image is None:
            return None, None, "Error: Failed to load example image."
        # OpenCV decodes to BGR; the rest of the app works in RGB.
        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        annotated, info = detect_layout(rgb_image)
        return rgb_image, annotated, info
    except Exception as e:
        return None, None, f"Error processing example image: {str(e)}"
def qa_about_layout(image, question, layout_info, api_key, model_name, system_prompt_template):
    """
    Answer a layout-related question via the Qwen API, using an editable
    system prompt seeded with the detection results.

    Args:
        image: Uploaded image as a numpy array.
        question: User's question about the layout.
        layout_info: JSON string of layout detection results.
        api_key: User's Qwen API key.
        model_name: Display name of the selected Qwen model.
        system_prompt_template: Prompt template; its {layout_info}
            placeholder is replaced with the detection JSON.

    Returns:
        str: The model's answer, or an error message.
    """
    # Guard clauses: validate every required input before touching the API.
    if image is None or not isinstance(image, np.ndarray):
        return "Error: Please upload a valid image."
    if not question:
        return "Error: Please enter a question."
    if not api_key:
        return "Error: Please provide a Qwen API key."
    if not layout_info:
        return "Error: No layout information available. Detect layout first."
    try:
        # The image travels to the API as a base64 data URL.
        image_b64 = encode_image(image)
        model_id = QWEN_MODELS.get(model_name)
        if not model_id:
            return "Error: Invalid Qwen model selected."
        # Inject detection results into the user-editable prompt.
        system_prompt = system_prompt_template.replace("{layout_info}", layout_info)
        client = OpenAI(api_key=api_key, base_url=QWEN_BASE_URL)
        messages = [
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                    {"type": "text", "text": question},
                ],
            },
        ]
        response = client.chat.completions.create(model=model_id, messages=messages)
        return response.choices[0].message.content
    except Exception as e:
        return f"Error during QA: {str(e)}"
# --- Gradio interface -------------------------------------------------------
with gr.Blocks(title="Latex2Layout QA System") as demo:
    gr.Markdown("# Latex2Layout QA System")
    gr.Markdown("Upload an image or use the example to detect layout elements and ask questions using Qwen models.")

    # Top row: image input/controls on the left, detection output on the right.
    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Image", type="numpy")
            detect_btn = gr.Button("Detect Layout")
            example_btn = gr.Button("Detect Example Image")
            gr.Markdown("**Tip**: Use clear images for best results.")
        with gr.Column(scale=1):
            output_image = gr.Image(label="Detected Layout")
            layout_info = gr.Textbox(label="Layout Information", lines=10, interactive=False)

    # Bottom row: QA controls on the left, model answer on the right.
    with gr.Row():
        with gr.Column(scale=1):
            api_key_input = gr.Textbox(
                label="Qwen API Key",
                placeholder="Enter your Qwen API key",
                type="password",
            )
            model_select = gr.Dropdown(
                label="Select Qwen Model",
                choices=list(QWEN_MODELS.keys()),
                value="Qwen2.5-VL-3B-Instruct",
            )
            gr.Markdown("**System Prompt Template**: Edit the prompt sent to Qwen. Include `{layout_info}` to insert detection results.")
            system_prompt_input = gr.Textbox(
                label="System Prompt Template",
                value=default_system_prompt,
                lines=5,
                placeholder="Edit the system prompt here. Keep {layout_info} to include detection results.",
            )
            question_input = gr.Textbox(
                label="Ask About the Layout",
                placeholder="e.g., 'Where is the heading?'",
            )
            qa_btn = gr.Button("Ask Question")
        with gr.Column(scale=1):
            answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)

    # Wire the buttons to their handlers.
    detect_btn.click(
        fn=detect_layout,
        inputs=[input_image],
        outputs=[output_image, layout_info],
    )
    example_btn.click(
        fn=detect_example_image,
        inputs=[],
        outputs=[input_image, output_image, layout_info],
    )
    qa_btn.click(
        fn=qa_about_layout,
        inputs=[input_image, question_input, layout_info, api_key_input, model_select, system_prompt_input],
        outputs=[answer_output],
    )

# Start the app only when executed as a script.
if __name__ == "__main__":
    demo.launch()