prithivMLmods committed on
Commit
710f72c
·
verified ·
1 Parent(s): 4485da2

update app

Browse files
Files changed (1) hide show
  1. app.py +11 -12
app.py CHANGED
@@ -160,7 +160,8 @@ div.no-padding { padding: 0 !important; }
160
  # Constants for text generation
161
  MAX_MAX_NEW_TOKENS = 2048
162
  DEFAULT_MAX_NEW_TOKENS = 1024
163
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
164
 
165
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
166
 
@@ -230,7 +231,8 @@ def downsample_video(video_path):
230
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
231
  fps = vidcap.get(cv2.CAP_PROP_FPS)
232
  frames = []
233
- frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
 
234
  for i in frame_indices:
235
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
236
  success, image = vidcap.read()
@@ -284,14 +286,16 @@ def generate_image(model_name: str, text: str, image: Image.Image,
284
  ]
285
  }]
286
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
287
  inputs = processor(
288
  text=[prompt_full],
289
  images=[image],
290
  return_tensors="pt",
291
  padding=True,
292
- truncation=True,
293
- max_length=MAX_INPUT_TOKEN_LENGTH
294
  ).to(device)
 
295
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
296
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
297
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
@@ -350,9 +354,8 @@ def generate_video(model_name: str, text: str, video_path: str,
350
  text=[prompt_full],
351
  images=images_for_processor,
352
  return_tensors="pt",
353
- padding=True,
354
- truncation=True,
355
- max_length=MAX_INPUT_TOKEN_LENGTH
356
  ).to(device)
357
 
358
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
@@ -421,11 +424,7 @@ with gr.Blocks(css=css, theme=thistle_theme) as demo:
421
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
422
  with gr.Accordion("(Result.md)", open=False):
423
  markdown_output = gr.Markdown(label="(Result.Md)")
424
- # , latex_delimiters=[
425
- # {"left": "$$", "right": "$$", "display": True},
426
- # {"left": "$", "right": "$", "display": False}
427
- #])
428
-
429
  model_choice = gr.Radio(
430
  choices=["Nanonets-OCR2-3B", "olmOCR-7B-0725", "RolmOCR-7B",
431
  "Aya-Vision-8B", "Qwen2-VL-OCR-2B"],
 
160
  # Constants for text generation
161
  MAX_MAX_NEW_TOKENS = 2048
162
  DEFAULT_MAX_NEW_TOKENS = 1024
163
+ # Raised the default input token limit from 4096 to 8192 to accommodate more complex inputs, especially with multiple images
164
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))
165
 
166
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
167
 
 
231
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
232
  fps = vidcap.get(cv2.CAP_PROP_FPS)
233
  frames = []
234
+ # Use a maximum of 10 frames to avoid excessive memory usage
235
+ frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
236
  for i in frame_indices:
237
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
238
  success, image = vidcap.read()
 
286
  ]
287
  }]
288
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
289
+
290
+ # FIX: Disable truncation and drop max_length to avoid the ValueError
291
  inputs = processor(
292
  text=[prompt_full],
293
  images=[image],
294
  return_tensors="pt",
295
  padding=True,
296
+ truncation=False, # Disabled truncation
 
297
  ).to(device)
298
+
299
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
300
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
301
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
 
354
  text=[prompt_full],
355
  images=images_for_processor,
356
  return_tensors="pt",
357
+ padding=True
358
+ #truncation=False, # Disabled truncation
 
359
  ).to(device)
360
 
361
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
 
424
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
425
  with gr.Accordion("(Result.md)", open=False):
426
  markdown_output = gr.Markdown(label="(Result.Md)")
427
+
 
 
 
 
428
  model_choice = gr.Radio(
429
  choices=["Nanonets-OCR2-3B", "olmOCR-7B-0725", "RolmOCR-7B",
430
  "Aya-Vision-8B", "Qwen2-VL-OCR-2B"],