Spaces:

prithivMLmods
/

Multimodal-OCR

Running on Zero

prithivMLmods committed on Feb 4

Commit

99eb8ee

verified ·

1 Parent(s): a0e8fb6

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -51,6 +51,18 @@ def identify_and_save_blob(blob_path):
     except Exception as e:
         raise ValueError(f"An error occurred while processing the file: {e}")
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"]
@@ -95,9 +107,8 @@ def model_inference(input_dict, history):
     # Apply chat template and process inputs
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    # Prepare inputs for the processor
-    image_inputs = [load_image(path) for path, media_type in zip(media_paths, media_types) if media_type == "image"]
-    video_inputs = [path for path, media_type in zip(media_paths, media_types) if media_type == "video"]
     # Ensure video_inputs is not empty
     if not video_inputs:

     except Exception as e:
         raise ValueError(f"An error occurred while processing the file: {e}")
+def process_vision_info(messages):
+    """Processes vision inputs (images and videos) from messages."""
+    image_inputs = []
+    video_inputs = []
+    for message in messages:
+        for content in message["content"]:
+            if content["type"] == "image":
+                image_inputs.append(load_image(content["image"]))
+            elif content["type"] == "video":
+                video_inputs.append(content["video"])
+    return image_inputs, video_inputs
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"]
     # Apply chat template and process inputs
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Process vision inputs (images and videos)
+    image_inputs, video_inputs = process_vision_info(messages)
     # Ensure video_inputs is not empty
     if not video_inputs: