update app
app.py CHANGED
@@ -160,7 +160,8 @@ div.no-padding { padding: 0 !important; }
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
-
+# Increased max_length to accommodate more complex inputs, especially with multiple images
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
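The new MAX_INPUT_TOKEN_LENGTH constant is read from the environment with a default of 8192, so the cap can be raised for a given deployment without editing app.py. A minimal sketch of how that override behaves (the variable name and default come from the diff; the demo `__main__` block is illustrative only):

```python
import os

# Defaults to 8192 unless the host sets MAX_INPUT_TOKEN_LENGTH before launch.
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))

if __name__ == "__main__":
    # e.g. `MAX_INPUT_TOKEN_LENGTH=16384 python app.py` raises the cap to 16384
    print(f"Input token cap: {MAX_INPUT_TOKEN_LENGTH}")
```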
@@ -230,7 +231,8 @@ def downsample_video(video_path):
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-
+    # Use a maximum of 10 frames to avoid excessive memory usage
+    frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
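The replacement sampling line picks at most 10 evenly spaced frame indices with np.linspace and then seeks to each one. A self-contained sketch of that sampling logic, assuming the rest of downsample_video follows the usual OpenCV pattern (the timestamp math and RGB conversion are assumptions, not shown in this diff):

```python
import cv2
import numpy as np

def downsample_video(video_path: str, max_frames: int = 10):
    """Return up to `max_frames` evenly spaced (RGB frame, timestamp) pairs."""
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS) or 30.0  # guard against missing fps metadata
    frames = []
    # Use a maximum of `max_frames` frames to avoid excessive memory usage.
    frame_indices = np.linspace(0, total_frames - 1, min(total_frames, max_frames), dtype=int)
    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV decodes frames as BGR
            frames.append((image, round(i / fps, 2)))
    vidcap.release()
    return frames
```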
@@ -284,14 +286,16 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         ]
     }]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # FIX: Set truncation to False to avoid the ValueError
     inputs = processor(
         text=[prompt_full],
         images=[image],
         return_tensors="pt",
         padding=True,
-        truncation=
-        max_length=MAX_INPUT_TOKEN_LENGTH
+        truncation=False,  # Disabled truncation
     ).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
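The context lines around this hunk show the streaming pattern the fix feeds into: the processor output goes to model.generate on a background thread while a TextIteratorStreamer yields decoded text incrementally. A hedged sketch of that pattern with transformers, using a Qwen2-VL-style checkpoint as a stand-in for whichever model the Space actually selects:

```python
from threading import Thread

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, TextIteratorStreamer

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_id = "Qwen/Qwen2-VL-2B-Instruct"  # placeholder checkpoint, not necessarily the one app.py loads
processor = AutoProcessor.from_pretrained(model_id)
model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16).to(device)

def stream_reply(text: str, image: Image.Image, max_new_tokens: int = 1024):
    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
    # Generation runs on a worker thread so the streamer can be consumed as tokens arrive.
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    buffer = ""
    for chunk in streamer:
        buffer += chunk
        yield buffer
```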
@@ -350,9 +354,8 @@ def generate_video(model_name: str, text: str, video_path: str,
         text=[prompt_full],
         images=images_for_processor,
         return_tensors="pt",
-        padding=True
-        truncation=
-        max_length=MAX_INPUT_TOKEN_LENGTH
+        padding=True
+        #truncation=False,  # Disabled truncation
     ).to(device)
 
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
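With truncation disabled in both generate_image and generate_video, the processor no longer clips prompts to MAX_INPUT_TOKEN_LENGTH, so very long inputs now reach the model unchanged. If a hard cap is still wanted, one option not taken in this diff is an explicit length check after tokenization; a hedged sketch follows (the helper name and error wording are invented here):

```python
def check_input_length(inputs, max_input_tokens: int = 8192) -> None:
    """Fail loudly instead of silently truncating an over-long prompt.

    `inputs` is the BatchFeature returned by the processor; its `input_ids`
    tensor has shape (batch, sequence_length). The default limit mirrors
    MAX_INPUT_TOKEN_LENGTH's fallback value.
    """
    seq_len = inputs["input_ids"].shape[-1]
    if seq_len > max_input_tokens:
        raise ValueError(
            f"Prompt is {seq_len} tokens, exceeding the {max_input_tokens}-token limit; "
            "shorten the input or raise MAX_INPUT_TOKEN_LENGTH."
        )
```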
@@ -421,11 +424,7 @@ with gr.Blocks(css=css, theme=thistle_theme) as demo:
         output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
         with gr.Accordion("(Result.md)", open=False):
             markdown_output = gr.Markdown(label="(Result.Md)")
-
-        # {"left": "$$", "right": "$$", "display": True},
-        # {"left": "$", "right": "$", "display": False}
-        #])
-
+
         model_choice = gr.Radio(
             choices=["Nanonets-OCR2-3B", "olmOCR-7B-0725", "RolmOCR-7B",
                      "Aya-Vision-8B", "Qwen2-VL-OCR-2B"],