update app models(++)
app.py CHANGED
@@ -150,14 +150,15 @@ model_o = AutoModelForVision2Seq.from_pretrained(
     MODEL_ID_O, trust_remote_code=True, torch_dtype=torch.float16, _attn_implementation="flash_attention_2"
 ).to(device).eval()
 
-# ---
-
-
-
-
-
-
+# --- AIDC-AI/Ovis2-1B Model Loading ---
+MODEL_ID_O2 = 'AIDC-AI/Ovis2-1B'
+model_o2 = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_O2,
+    torch_dtype=torch.bfloat16,
+    multimodal_max_length=8192,
+    trust_remote_code=True
 ).to(device).eval()
+text_tokenizer_o2 = model_o2.get_text_tokenizer()
 
 
 # --- PDF Generation and Preview Utility Function ---
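Unlike the app's other models, which pair an AutoProcessor with a vision-to-sequence class, Ovis2-1B loads through AutoModelForCausalLM with trust_remote_code=True, and its tokenizers are reached through model methods. A minimal standalone sketch of the same loading pattern (the device fallback and the get_visual_tokenizer() call follow the Ovis2 model card and are assumptions, not part of this commit):

import torch
from transformers import AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else "cpu"

# multimodal_max_length is an Ovis-specific kwarg consumed by the model's
# remote code; trust_remote_code=True is required because Ovis2 is not a
# native transformers architecture.
model = AutoModelForCausalLM.from_pretrained(
    "AIDC-AI/Ovis2-1B",
    torch_dtype=torch.bfloat16,
    multimodal_max_length=8192,
    trust_remote_code=True,
).to(device).eval()

text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()  # assumption: per the Ovis2 model card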
@@ -250,6 +251,9 @@ def process_document_stream(
         yield "Please enter a prompt.", ""
         return
 
+    # --- Model-specific inference paths ---
+
+    # Moondream2 has a unique generation method
     if model_name == "Moondream2(vision)":
         image_embeds = moondream.encode_image(image)
         answer = moondream.answer_question(

@@ -259,8 +263,51 @@
         )
         yield answer, answer
         return
+
+    # Ovis2-1B has a custom preprocessing pipeline
+    elif model_name == "AIDC-AI/Ovis2-1B(ovis)":
+        conversations = [{"from": "human", "value": f"<image>\n{prompt_input}"}]
+
+        _, input_ids, pixel_values = model_o2.preprocess_inputs(conversations, [image], max_partition=16)
+        attention_mask = torch.ne(input_ids, text_tokenizer_o2.pad_token_id)
+
+        model_inputs = {
+            "inputs": input_ids.unsqueeze(0).to(device=model_o2.device),
+            "attention_mask": attention_mask.unsqueeze(0).to(device=model_o2.device),
+            "pixel_values": [pixel_values.to(dtype=torch.bfloat16, device=model_o2.device)]
+        }
+
+        streamer = TextIteratorStreamer(text_tokenizer_o2, skip_prompt=True, skip_special_tokens=True)
+
+        generation_kwargs = {
+            **model_inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+            "eos_token_id": model_o2.generation_config.eos_token_id,
+            "pad_token_id": text_tokenizer_o2.pad_token_id,
+            "use_cache": True
+        }
+
+        thread = Thread(target=model_o2.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.01)
+            yield buffer, buffer
+
+        yield buffer, buffer
+        return
 
-    #
+    # --- Standardized inference path for most other models ---
+
+    # Select model and processor
     if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
     elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
     elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
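The Ovis2 branch above uses the standard transformers streaming recipe: model.generate() blocks until generation finishes, so it runs on a worker thread while the foreground loop drains a TextIteratorStreamer and yields the growing buffer back to Gradio (the time.sleep(0.01) merely paces UI updates). The same pattern in isolation, with a small text-only model standing in for Ovis2 (model choice and prompt are illustrative):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() runs to completion on the worker thread; the streamer is the
# channel through which decoded text reaches the consuming loop below.
thread = Thread(target=lm.generate,
                kwargs={**inputs, "streamer": streamer, "max_new_tokens": 50})
thread.start()

buffer = ""
for new_text in streamer:  # blocks until the next decoded chunk arrives
    buffer += new_text
    print(buffer)          # a Gradio handler would `yield buffer` here instead
thread.join()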
@@ -274,7 +321,6 @@ def process_document_stream(
     elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_g, model_g
     elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
     elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
-    elif model_name == "SmolVLM2-500M-Video-Instruct(video)": processor, model = processor_sv, model_sv
     else:
         yield "Invalid model selected.", ""
         return
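Model selection stays a flat elif chain, one line per dropdown label; the Ovis2 branch sidesteps it by returning earlier. A dict keyed by the label would express the same dispatch more compactly (a hypothetical refactor sketch, not what this commit does; the string pairs below stand in for the processor/model globals):

def select_model(model_name, registry):
    """Return the (processor, model) pair for a known label, else None."""
    return registry.get(model_name)

registry = {
    "LFM2-VL-450M(fast)": ("processor_m", "model_m"),  # placeholders, not real objects
    "OCRFlux-3B(ocr)": ("processor_v", "model_v"),
}
assert select_model("OCRFlux-3B(ocr)", registry) == ("processor_v", "model_v")
assert select_model("unknown", registry) is None  # caller yields "Invalid model selected."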
@@ -333,7 +379,7 @@ def create_gradio_interface():
         model_choice = gr.Dropdown(
             choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
                      "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)", "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)", "Nanonets-OCR-s(ocr)",
-                     "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)", "SmolVLM-500M-Instruct(smol)", "SmolVLM2-500M-Video-Instruct(video)"],
+                     "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)", "SmolVLM-500M-Instruct(smol)", "AIDC-AI/Ovis2-1B(ovis)"],
             label="Select Model", value= "LFM2-VL-450M(fast)"
         )
         prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", value="Describe the image!")
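The dropdown value is a plain string that Gradio hands to the event handler, which is what lets process_document_stream dispatch on it. A self-contained sketch of that wiring (the stub handler and component set are illustrative, not the app's real layout):

import gradio as gr

def process(model_name, prompt):
    # The real handler streams partial results; this stub just echoes.
    return f"{model_name}: {prompt}"

with gr.Blocks() as demo:
    model_choice = gr.Dropdown(
        choices=["LFM2-VL-450M(fast)", "AIDC-AI/Ovis2-1B(ovis)"],
        label="Select Model", value="LFM2-VL-450M(fast)",
    )
    prompt_input = gr.Textbox(label="Query Input", value="Describe the image!")
    output = gr.Textbox(label="Output")
    gr.Button("Run").click(process, inputs=[model_choice, prompt_input], outputs=output)

if __name__ == "__main__":
    demo.launch()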