Spaces:

prithivMLmods
/

Multimodal-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Jun 15

Commit

4eaf777

verified ·

1 Parent(s): 128e479

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -22

app.py CHANGED Viewed

@@ -28,7 +28,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# Load Cosmos-Reason1-7B
 MODEL_ID_M = "reducto/RolmOCR"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -37,7 +37,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# Load DocScope
 MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
 model_x = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -46,16 +46,7 @@ model_x = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# Load Relaxed
-MODEL_ID_Z = "lingshu-medical-mllm/Lingshu-7B"
-processor_z = AutoProcessor.from_pretrained(MODEL_ID_Z, trust_remote_code=True)
-model_z = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_Z,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-# Load visionOCR
 MODEL_ID_V = "nanonets/Nanonets-OCR-s"
 processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -101,9 +92,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Qwen2-VL-OCR-2B-Instruct":
         processor = processor_x
         model = model_x
-    elif model_name == "Lingshu-7B":
-        processor = processor_z
-        model = model_z
     elif model_name == "Nanonets-OCR-s":
         processor = processor_v
         model = model_v
@@ -157,9 +145,6 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Qwen2-VL-OCR-2B-Instruct":
         processor = processor_x
         model = model_x
-    elif model_name == "Lingshu-7B":
-        processor = processor_z
-        model = model_z
     elif model_name == "Nanonets-OCR-s":
         processor = processor_v
         model = model_v
@@ -215,7 +200,6 @@ image_examples = [
 ]
 video_examples = [
-    ["Explain the watch ad in detail.", "videos/1.mp4"],
     ["Identify the main actions in the cartoon video", "videos/2.mp4"]
 ]
@@ -260,16 +244,15 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.Column():
             output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
             model_choice = gr.Radio(
-                choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B-Instruct", "RolmOCR", "Lingshu-7B"],
                 label="Select Model",
-                value="RolmOCR"
             )
             gr.Markdown("**Model Info**")
             gr.Markdown("⤷ [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
             gr.Markdown("⤷ [Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve <messy> optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
             gr.Markdown("⤷ [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents oprical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
-            gr.Markdown("⤷ [Lingshu-7B](https://huggingface.co/lingshu-medical-mllm/Lingshu-7B): lingshu-7b is a generalist foundation model for unified multimodal medical understanding and reasoning, virtual assistants, and content generation.")
     image_submit.click(
         fn=generate_image,

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Load RolmOCR
 MODEL_ID_M = "reducto/RolmOCR"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
+# Load Qwen2-VL-OCR-2B-Instruct
 MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
 model_x = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
+# Load Nanonets-OCR-s
 MODEL_ID_V = "nanonets/Nanonets-OCR-s"
 processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     elif model_name == "Qwen2-VL-OCR-2B-Instruct":
         processor = processor_x
         model = model_x
     elif model_name == "Nanonets-OCR-s":
         processor = processor_v
         model = model_v
     elif model_name == "Qwen2-VL-OCR-2B-Instruct":
         processor = processor_x
         model = model_x
     elif model_name == "Nanonets-OCR-s":
         processor = processor_v
         model = model_v
 ]
 video_examples = [
     ["Identify the main actions in the cartoon video", "videos/2.mp4"]
 ]
         with gr.Column():
             output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
             model_choice = gr.Radio(
+                choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B-Instruct", "RolmOCR"],
                 label="Select Model",
+                value="Nanonets-OCR-s"
             )
             gr.Markdown("**Model Info**")
             gr.Markdown("⤷ [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
             gr.Markdown("⤷ [Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve <messy> optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
             gr.Markdown("⤷ [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents oprical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
     image_submit.click(
         fn=generate_image,