prithivMLmods committed (verified)
Commit f821a2b · Parent(s): e715667

Update app.py

Files changed (1): app.py (+92 -46)

app.py CHANGED
@@ -5,6 +5,7 @@ import json
 import time
 import asyncio
 from threading import Thread
+from typing import Iterable
 
 import gradio as gr
 import spaces
@@ -21,6 +22,62 @@ from transformers import (
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
+from gradio.themes import Soft
+from gradio.themes.utils import colors, fonts, sizes
+
+# --- Theme and CSS Definition ---
+
+# Define the new LightBlue color palette
+colors.light_blue = colors.Color(
+    name="light_blue",
+    c50="#F0F8FF",
+    c100="#E0FFFF",
+    c200="#BFEFFF",
+    c300="#B0E0E6",
+    c400="#87CEEB",
+    c500="#ADD8E6",  # LightBlue base color
+    c600="#6495ED",
+    c700="#4682B4",
+    c800="#4169E1",
+    c900="#0000CD",
+    c950="#00008B",
+)
+
+class LightBlueTheme(Soft):
+    def __init__(
+        self,
+        *,
+        primary_hue: colors.Color | str = colors.gray,
+        secondary_hue: colors.Color | str = colors.light_blue,
+        neutral_hue: colors.Color | str = colors.slate,
+        text_size: sizes.Size | str = sizes.text_lg,
+        font: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
+        ),
+        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
+        ),
+    ):
+        super().__init__(
+            primary_hue=primary_hue,
+            secondary_hue=secondary_hue,
+            neutral_hue=neutral_hue,
+            text_size=text_size,
+            font=font,
+            font_mono=font_mono,
+        )
+        super().set(
+            button_primary_background_fill="linear-gradient(90deg, *secondary_400, *secondary_500)",
+            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_500, *secondary_600)",
+            button_primary_text_color="white",
+            slider_color="*secondary_500",
+            block_title_text_weight="600",
+            block_border_width="2px",
+            block_shadow="*shadow_drop_lg",
+        )
+
+# Instantiate the new theme
+light_blue_theme = LightBlueTheme()
 
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
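The block added above follows Gradio's standard recipe for a custom theme: register a `colors.Color` palette, subclass a built-in theme such as `Soft`, and override individual design tokens with `set()`. A minimal runnable sketch of that same recipe, not taken from this commit (`DemoTheme` is a hypothetical name; assumes Gradio 4.x):

```python
import gradio as gr
from gradio.themes import Soft

class DemoTheme(Soft):
    """Hypothetical theme using the same subclass-and-set() pattern as LightBlueTheme."""
    def __init__(self):
        super().__init__(secondary_hue="blue")  # hues accept built-in palette names
        # Override individual design tokens, as the commit does.
        super().set(
            button_primary_background_fill="linear-gradient(90deg, *secondary_400, *secondary_500)",
            button_primary_text_color="white",
        )

with gr.Blocks(theme=DemoTheme()) as demo:
    gr.Button("Submit", variant="primary")  # rendered with the gradient above

if __name__ == "__main__":
    demo.launch()
```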
@@ -144,7 +201,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     messages = [{
         "role": "user",
         "content": [
-            {"type": "image", "image": image},
+            {"type": "image"},
             {"type": "text", "text": text},
         ]
     }]
@@ -154,7 +211,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         images=[image],
         return_tensors="pt",
         padding=True,
-        truncation=False,
+        truncation=True,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
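With `truncation=False`, Hugging Face tokenizers ignore `max_length` (and warn about it), so the previous code never actually capped prompts at `MAX_INPUT_TOKEN_LENGTH`; flipping to `truncation=True` enforces the cap. A quick demonstration (gpt2 is only a stand-in tokenizer):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
long_text = "word " * 50

uncapped = tok(long_text, truncation=False, max_length=8)["input_ids"]
capped = tok(long_text, truncation=True, max_length=8)["input_ids"]
print(len(uncapped), len(capped))  # e.g. 50 8: max_length only takes effect with truncation=True
```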
@@ -202,24 +259,24 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield "Please upload a video.", "Please upload a video."
         return
 
-    frames = downsample_video(video_path)
-    messages = [
-        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-        {"role": "user", "content": [{"type": "text", "text": text}]}
-    ]
-    for frame in frames:
-        image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-        messages[1]["content"].append({"type": "image", "image": image})
-    inputs = processor.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_dict=True,
+    frames_with_ts = downsample_video(video_path)
+    images_for_processor = [frame for frame, ts in frames_with_ts]
+
+    messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+    for frame in images_for_processor:
+        messages[0]["content"].insert(0, {"type": "image"})
+
+    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    inputs = processor(
+        text=[prompt_full],
+        images=images_for_processor,
         return_tensors="pt",
-        truncation=False,
+        padding=True,
+        truncation=True,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
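As in the image hunk above, the rewritten video path no longer embeds PIL images inside the chat messages: it inserts one bare `{"type": "image"}` placeholder per frame, renders the prompt with `apply_chat_template(tokenize=False)`, and passes the frames to the processor separately (note that the per-frame timestamps from `downsample_video` are discarded in the new version). A sketch of that pattern, assuming a Qwen2-VL-style `AutoProcessor`; the checkpoint name is illustrative, not taken from the app:

```python
from PIL import Image
from transformers import AutoProcessor

# Illustrative checkpoint; the Space loads its own per-model processors.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

frames = [Image.new("RGB", (64, 64)) for _ in range(2)]  # stand-ins for sampled video frames
messages = [{"role": "user", "content": [{"type": "text", "text": "Describe the clip."}]}]
for _ in frames:
    # One placeholder per frame; the chat template expands each into image tokens.
    messages[0]["content"].insert(0, {"type": "image"})

prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=frames, return_tensors="pt", padding=True)
print(inputs["input_ids"].shape)  # one batched sequence containing both frames
```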
@@ -256,38 +313,32 @@ video_examples = [
 ]
 
 css = """
-.submit-btn {
-    background-color: #2980b9 !important;
-    color: white !important;
-}
-.submit-btn:hover {
-    background-color: #3498db !important;
+#main-title h1 {
+    font-size: 2.3em !important;
 }
-.canvas-output {
-    border: 2px solid #4682B4;
-    border-radius: 10px;
-    padding: 20px;
+#output-title h2 {
+    font-size: 2.1em !important;
 }
 """
 
 # Create the Gradio Interface
-with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-    gr.Markdown("# **[Multimodal OCR hpc/.](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+with gr.Blocks(css=css, theme=light_blue_theme) as demo:
+    gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
                 with gr.TabItem("Image Inference"):
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                    image_upload = gr.Image(type="pil", label="Image", height=290)
-                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    image_upload = gr.Image(type="pil", label="Upload Image", height=290)
+                    image_submit = gr.Button("Submit", variant="primary")
                     gr.Examples(
                         examples=image_examples,
                         inputs=[image_query, image_upload]
                     )
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                    video_upload = gr.Video(label="Video", height=290)
-                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    video_upload = gr.Video(label="Upload Video", height=290)
+                    video_submit = gr.Button("Submit", variant="primary")
                     gr.Examples(
                         examples=video_examples,
                         inputs=[video_query, video_upload]
@@ -301,11 +352,14 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
 
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
-                gr.Markdown("## Output")
-                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5, show_copy_button=True)
+                gr.Markdown("## Output", elem_id="output-title")
+                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
 
                 with gr.Accordion("(Result.md)", open=False):
-                    markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
+                    markdown_output = gr.Markdown(label="(Result.Md)", latex_delimiters=[
+                        {"left": "$$", "right": "$$", "display": True},
+                        {"left": "$", "right": "$", "display": False}
+                    ])
 
             model_choice = gr.Radio(
                 choices=["olmOCR-7B-0725", "Nanonets-OCR-s", "RolmOCR-7B",
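`gr.Markdown` only renders LaTeX for delimiters that are explicitly configured, which is what the added `latex_delimiters` argument above enables for inline `$...$` and display `$$...$$` math. A standalone sketch of the effect:

```python
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown(
        r"Inline $E = mc^2$ and display: $$\int_0^1 x^2\,dx = \tfrac{1}{3}$$",
        latex_delimiters=[
            {"left": "$$", "right": "$$", "display": True},   # block math
            {"left": "$", "right": "$", "display": False},    # inline math
        ],
    )

if __name__ == "__main__":
    demo.launch()
```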
@@ -313,15 +367,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 label="Select Model",
                 value="olmOCR-7B-0725"
             )
-            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR/discussions)")
-            gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
-            gr.Markdown("> [olmOCR-7B-0725](https://huggingface.co/allenai/olmOCR-7B-0725): olmocr-7b-0725 — fine-tuned with olmocr-mix-0225 on top of Qwen2.5-VL-7B-Instruct, pushing the boundaries of OCR technology. high-quality, openly available approach to parsing pdfs and other complex documents optical character recognition.")
-            gr.Markdown("> [Qwen2-VL-OCR-2B](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve [messy] optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
-            gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents optical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
-            gr.Markdown("> [Aya-Vision](https://huggingface.co/CohereLabs/aya-vision-8b): cohere labs aya vision 8b is an open weights research release of an 8-billion parameter model with advanced capabilities optimized for a variety of vision-language use cases, including ocr, captioning, visual reasoning, summarization, question answering, code, and more.")
-
-            gr.Markdown("> ⚠️ Note: Models in this space may not perform well on video inference tasks.")
-
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
@@ -334,4 +380,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
+    demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
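For context on the unchanged lines around `generation_kwargs`: both `generate_image` and `generate_video` share the same streaming skeleton, in which `model.generate` runs on a worker `Thread` while a `TextIteratorStreamer` yields decoded text for the UI to emit incrementally. A minimal text-only sketch of that skeleton (gpt2 stands in for the Space's vision-language models):

```python
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("The quick brown fox", return_tensors="pt")

streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
# Generation runs in the background; the streamer is consumed on this thread.
Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": 20}).start()

text = ""
for chunk in streamer:
    text += chunk
    print(text)  # in the app, this running buffer is yield-ed to the Gradio Textbox
```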