prithivMLmods committed (verified)
Commit f821a2b · Parent(s): e715667

Update app.py

Files changed (1): app.py (+92 -46)

app.py CHANGED
@@ -5,6 +5,7 @@ import json
 import time
 import asyncio
 from threading import Thread
+from typing import Iterable
 
 import gradio as gr
 import spaces
@@ -21,6 +22,62 @@ from transformers import (
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
+from gradio.themes import Soft
+from gradio.themes.utils import colors, fonts, sizes
+
+# --- Theme and CSS Definition ---
+
+# Define the new LightBlue color palette
+colors.light_blue = colors.Color(
+    name="light_blue",
+    c50="#F0F8FF",
+    c100="#E0FFFF",
+    c200="#BFEFFF",
+    c300="#B0E0E6",
+    c400="#87CEEB",
+    c500="#ADD8E6",  # LightBlue base color
+    c600="#6495ED",
+    c700="#4682B4",
+    c800="#4169E1",
+    c900="#0000CD",
+    c950="#00008B",
+)
+
+class LightBlueTheme(Soft):
+    def __init__(
+        self,
+        *,
+        primary_hue: colors.Color | str = colors.gray,
+        secondary_hue: colors.Color | str = colors.light_blue,
+        neutral_hue: colors.Color | str = colors.slate,
+        text_size: sizes.Size | str = sizes.text_lg,
+        font: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
+        ),
+        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
+        ),
+    ):
+        super().__init__(
+            primary_hue=primary_hue,
+            secondary_hue=secondary_hue,
+            neutral_hue=neutral_hue,
+            text_size=text_size,
+            font=font,
+            font_mono=font_mono,
+        )
+        super().set(
+            button_primary_background_fill="linear-gradient(90deg, *secondary_400, *secondary_500)",
+            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_500, *secondary_600)",
+            button_primary_text_color="white",
+            slider_color="*secondary_500",
+            block_title_text_weight="600",
+            block_border_width="2px",
+            block_shadow="*shadow_drop_lg",
+        )
+
+# Instantiate the new theme
+light_blue_theme = LightBlueTheme()
 
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
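The block added above follows Gradio's standard recipe for a custom theme: register a `colors.Color` palette, subclass a built-in theme such as `Soft`, and override individual design tokens with `set()`. A minimal runnable sketch of that same recipe, not taken from this commit (`DemoTheme` is a hypothetical name; assumes Gradio 4.x):

```python
import gradio as gr
from gradio.themes import Soft

class DemoTheme(Soft):
    """Hypothetical theme using the same subclass-and-set() pattern as LightBlueTheme."""
    def __init__(self):
        super().__init__(secondary_hue="blue")  # hues accept built-in palette names
        # Override individual design tokens, as the commit does.
        super().set(
            button_primary_background_fill="linear-gradient(90deg, *secondary_400, *secondary_500)",
            button_primary_text_color="white",
        )

with gr.Blocks(theme=DemoTheme()) as demo:
    gr.Button("Submit", variant="primary")  # rendered with the gradient above

if __name__ == "__main__":
    demo.launch()
```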
@@ -144,7 +201,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     messages = [{
         "role": "user",
         "content": [
-            {"type": "image", "image": image},
+            {"type": "image"},
             {"type": "text", "text": text},
         ]
     }]
@@ -154,7 +211,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         images=[image],
         return_tensors="pt",
         padding=True,
-        truncation=False,
+        truncation=True,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
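With `truncation=False`, Hugging Face tokenizers ignore `max_length` (and warn about it), so the previous code never actually capped prompts at `MAX_INPUT_TOKEN_LENGTH`; flipping to `truncation=True` enforces the cap. A quick demonstration (gpt2 is only a stand-in tokenizer):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
long_text = "word " * 50

uncapped = tok(long_text, truncation=False, max_length=8)["input_ids"]
capped = tok(long_text, truncation=True, max_length=8)["input_ids"]
print(len(uncapped), len(capped))  # e.g. 50 8: max_length only takes effect with truncation=True
```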
@@ -202,24 +259,24 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield "Please upload a video.", "Please upload a video."
         return
 
-    frames = downsample_video(video_path)
-    messages = [
-        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-        {"role": "user", "content": [{"type": "text", "text": text}]}
-    ]
-    for frame in frames:
-        image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-        messages[1]["content"].append({"type": "image", "image": image})
-    inputs = processor.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_dict=True,
+    frames_with_ts = downsample_video(video_path)
+    images_for_processor = [frame for frame, ts in frames_with_ts]
+
+    messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+    for frame in images_for_processor:
+        messages[0]["content"].insert(0, {"type": "image"})
+
+    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    inputs = processor(
+        text=[prompt_full],
+        images=images_for_processor,
         return_tensors="pt",
-        truncation=False,
+        padding=True,
+        truncation=True,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
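As in the image hunk above, the rewritten video path no longer embeds PIL images inside the chat messages: it inserts one bare `{"type": "image"}` placeholder per frame, renders the prompt with `apply_chat_template(tokenize=False)`, and passes the frames to the processor separately (note that the per-frame timestamps from `downsample_video` are discarded in the new version). A sketch of that pattern, assuming a Qwen2-VL-style `AutoProcessor`; the checkpoint name is illustrative, not taken from the app:

```python
from PIL import Image
from transformers import AutoProcessor

# Illustrative checkpoint; the Space loads its own per-model processors.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

frames = [Image.new("RGB", (64, 64)) for _ in range(2)]  # stand-ins for sampled video frames
messages = [{"role": "user", "content": [{"type": "text", "text": "Describe the clip."}]}]
for _ in frames:
    # One placeholder per frame; the chat template expands each into image tokens.
    messages[0]["content"].insert(0, {"type": "image"})

prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=frames, return_tensors="pt", padding=True)
print(inputs["input_ids"].shape)  # one batched sequence containing both frames
```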
@@ -256,38 +313,32 @@ video_examples = [
 ]
 
 css = """
-.submit-btn {
-    background-color: #2980b9 !important;
-    color: white !important;
-}
-.submit-btn:hover {
-    background-color: #3498db !important;
+#main-title h1 {
+    font-size: 2.3em !important;
 }
-.canvas-output {
-    border: 2px solid #4682B4;
-    border-radius: 10px;
-    padding: 20px;
+#output-title h2 {
+    font-size: 2.1em !important;
 }
 """
 
 # Create the Gradio Interface
-with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-    gr.Markdown("# **[Multimodal OCR hpc/.](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+with gr.Blocks(css=css, theme=light_blue_theme) as demo:
+    gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
                 with gr.TabItem("Image Inference"):
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                    image_upload = gr.Image(type="pil", label="Image", height=290)
-                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    image_upload = gr.Image(type="pil", label="Upload Image", height=290)
+                    image_submit = gr.Button("Submit", variant="primary")
                     gr.Examples(
                         examples=image_examples,
                         inputs=[image_query, image_upload]
                     )
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                    video_upload = gr.Video(label="Video", height=290)
-                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    video_upload = gr.Video(label="Upload Video", height=290)
+                    video_submit = gr.Button("Submit", variant="primary")
                     gr.Examples(
                         examples=video_examples,
                         inputs=[video_query, video_upload]
@@ -301,11 +352,14 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
 
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
-                gr.Markdown("## Output")
-                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5, show_copy_button=True)
+                gr.Markdown("## Output", elem_id="output-title")
+                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
 
                 with gr.Accordion("(Result.md)", open=False):
-                    markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
+                    markdown_output = gr.Markdown(label="(Result.Md)", latex_delimiters=[
+                        {"left": "$$", "right": "$$", "display": True},
+                        {"left": "$", "right": "$", "display": False}
+                    ])
 
             model_choice = gr.Radio(
                 choices=["olmOCR-7B-0725", "Nanonets-OCR-s", "RolmOCR-7B",
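`gr.Markdown` only renders LaTeX for delimiters that are explicitly configured, which is what the added `latex_delimiters` argument above enables for inline `$...$` and display `$$...$$` math. A standalone sketch of the effect:

```python
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown(
        r"Inline $E = mc^2$ and display: $$\int_0^1 x^2\,dx = \tfrac{1}{3}$$",
        latex_delimiters=[
            {"left": "$$", "right": "$$", "display": True},   # block math
            {"left": "$", "right": "$", "display": False},    # inline math
        ],
    )

if __name__ == "__main__":
    demo.launch()
```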
@@ -313,15 +367,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 label="Select Model",
                 value="olmOCR-7B-0725"
             )
-            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR/discussions)")
-            gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
-            gr.Markdown("> [olmOCR-7B-0725](https://huggingface.co/allenai/olmOCR-7B-0725): olmocr-7b-0725 — fine-tuned with olmocr-mix-0225 on top of Qwen2.5-VL-7B-Instruct, pushing the boundaries of OCR technology. high-quality, openly available approach to parsing pdfs and other complex documents optical character recognition.")
-            gr.Markdown("> [Qwen2-VL-OCR-2B](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve [messy] optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
-            gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents optical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
-            gr.Markdown("> [Aya-Vision](https://huggingface.co/CohereLabs/aya-vision-8b): cohere labs aya vision 8b is an open weights research release of an 8-billion parameter model with advanced capabilities optimized for a variety of vision-language use cases, including ocr, captioning, visual reasoning, summarization, question answering, code, and more.")
-
-            gr.Markdown("> ⚠️ Note: Models in this space may not perform well on video inference tasks.")
-
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
@@ -334,4 +380,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
+    demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
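For context on the unchanged lines around `generation_kwargs`: both `generate_image` and `generate_video` share the same streaming skeleton, in which `model.generate` runs on a worker `Thread` while a `TextIteratorStreamer` yields decoded text for the UI to emit incrementally. A minimal text-only sketch of that skeleton (gpt2 stands in for the Space's vision-language models):

```python
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("The quick brown fox", return_tensors="pt")

streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
# Generation runs in the background; the streamer is consumed on this thread.
Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": 20}).start()

text = ""
for chunk in streamer:
    text += chunk
    print(text)  # in the app, this running buffer is yield-ed to the Gradio Textbox
```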