update app
app.py CHANGED
@@ -160,7 +160,8 @@ div.no-padding { padding: 0 !important; }
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
-
+# Increased max_length to accommodate more complex inputs, especially with multiple images
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
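The new MAX_INPUT_TOKEN_LENGTH constant is read from the environment with a default of 8192, so the cap can be raised for a given deployment without editing app.py. A minimal sketch of how that override behaves (the variable name and default come from the diff; the demo `__main__` block is illustrative only):

```python
import os

# Defaults to 8192 unless the host sets MAX_INPUT_TOKEN_LENGTH before launch.
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))

if __name__ == "__main__":
    # e.g. `MAX_INPUT_TOKEN_LENGTH=16384 python app.py` raises the cap to 16384
    print(f"Input token cap: {MAX_INPUT_TOKEN_LENGTH}")
```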
@@ -230,7 +231,8 @@ def downsample_video(video_path):
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-
+    # Use a maximum of 10 frames to avoid excessive memory usage
+    frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
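The replacement sampling line picks at most 10 evenly spaced frame indices with np.linspace and then seeks to each one. A self-contained sketch of that sampling logic, assuming the rest of downsample_video follows the usual OpenCV pattern (the timestamp math and RGB conversion are assumptions, not shown in this diff):

```python
import cv2
import numpy as np

def downsample_video(video_path: str, max_frames: int = 10):
    """Return up to `max_frames` evenly spaced (RGB frame, timestamp) pairs."""
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS) or 30.0  # guard against missing fps metadata
    frames = []
    # Use a maximum of `max_frames` frames to avoid excessive memory usage.
    frame_indices = np.linspace(0, total_frames - 1, min(total_frames, max_frames), dtype=int)
    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV decodes frames as BGR
            frames.append((image, round(i / fps, 2)))
    vidcap.release()
    return frames
```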
@@ -284,14 +286,16 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         ]
     }]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # FIX: Set truncation to False to avoid the ValueError
     inputs = processor(
         text=[prompt_full],
         images=[image],
         return_tensors="pt",
         padding=True,
-        truncation=
-        max_length=MAX_INPUT_TOKEN_LENGTH
+        truncation=False,  # Disabled truncation
     ).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
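The context lines around this hunk show the streaming pattern the fix feeds into: the processor output goes to model.generate on a background thread while a TextIteratorStreamer yields decoded text incrementally. A hedged sketch of that pattern with transformers, using a Qwen2-VL-style checkpoint as a stand-in for whichever model the Space actually selects:

```python
from threading import Thread

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, TextIteratorStreamer

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_id = "Qwen/Qwen2-VL-2B-Instruct"  # placeholder checkpoint, not necessarily the one app.py loads
processor = AutoProcessor.from_pretrained(model_id)
model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16).to(device)

def stream_reply(text: str, image: Image.Image, max_new_tokens: int = 1024):
    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
    # Generation runs on a worker thread so the streamer can be consumed as tokens arrive.
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    buffer = ""
    for chunk in streamer:
        buffer += chunk
        yield buffer
```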
@@ -350,9 +354,8 @@ def generate_video(model_name: str, text: str, video_path: str,
         text=[prompt_full],
         images=images_for_processor,
         return_tensors="pt",
-        padding=True
-        truncation=
-        max_length=MAX_INPUT_TOKEN_LENGTH
+        padding=True
+        #truncation=False,  # Disabled truncation
     ).to(device)
 
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
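With truncation disabled in both generate_image and generate_video, the processor no longer clips prompts to MAX_INPUT_TOKEN_LENGTH, so very long inputs now reach the model unchanged. If a hard cap is still wanted, one option not taken in this diff is an explicit length check after tokenization; a hedged sketch follows (the helper name and error wording are invented here):

```python
def check_input_length(inputs, max_input_tokens: int = 8192) -> None:
    """Fail loudly instead of silently truncating an over-long prompt.

    `inputs` is the BatchFeature returned by the processor; its `input_ids`
    tensor has shape (batch, sequence_length). The default limit mirrors
    MAX_INPUT_TOKEN_LENGTH's fallback value.
    """
    seq_len = inputs["input_ids"].shape[-1]
    if seq_len > max_input_tokens:
        raise ValueError(
            f"Prompt is {seq_len} tokens, exceeding the {max_input_tokens}-token limit; "
            "shorten the input or raise MAX_INPUT_TOKEN_LENGTH."
        )
```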
@@ -421,11 +424,7 @@ with gr.Blocks(css=css, theme=thistle_theme) as demo:
         output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
         with gr.Accordion("(Result.md)", open=False):
             markdown_output = gr.Markdown(label="(Result.Md)")
-
-        # {"left": "$$", "right": "$$", "display": True},
-        # {"left": "$", "right": "$", "display": False}
-        #])
-
+
         model_choice = gr.Radio(
             choices=["Nanonets-OCR2-3B", "olmOCR-7B-0725", "RolmOCR-7B",
                      "Aya-Vision-8B", "Qwen2-VL-OCR-2B"],