prithivMLmods committed
Commit 93c4cc1 · verified · Parent: 147f26b

Update app.py

Files changed (1)
  1. app.py +33 -19
app.py CHANGED
@@ -24,14 +24,6 @@ from transformers import (
     TextIteratorStreamer,
 )
 
-# It's good practice to ensure compressed_tensors is installed when dealing with such models
-try:
-    from compressed_tensors import save_compressed, load_compressed, BitmaskConfig
-except ImportError:
-    print("compressed_tensors is not installed. Please install it using 'pip install compressed-tensors'")
-    sys.exit(1)
-
-
 from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
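
Note: with the AWQ-quantized checkpoint retired (see the Nanonets model swap below), the hard dependency on compressed_tensors goes away, so the import guard above was removed. If a guard like it is ever needed again, a minimal sketch (assuming the PyPI package compressed-tensors) is:

    # Optional-dependency guard: fail fast with an actionable message
    # when the compressed-tensors package is missing.
    import importlib.util

    if importlib.util.find_spec("compressed_tensors") is None:
        raise SystemExit(
            "compressed_tensors is not installed. "
            "Install it with: pip install compressed-tensors"
        )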
@@ -130,6 +122,37 @@ if torch.cuda.is_available():
 
 print("Using device:", device)
 
+# CACHE_PATH = "./model_cache"
+# if not os.path.exists(CACHE_PATH):
+#     os.makedirs(CACHE_PATH)
+#
+# model_path_d_local = snapshot_download(
+#     repo_id='rednote-hilab/dots.ocr',
+#     local_dir=os.path.join(CACHE_PATH, 'dots.ocr'),
+#     max_workers=20,
+#     local_dir_use_symlinks=False
+# )
+#
+# config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
+#
+# if os.path.exists(config_file_path):
+#     with open(config_file_path, 'r') as f:
+#         input_code = f.read()
+#
+#     lines = input_code.splitlines()
+#     if "class DotsVLProcessor" in input_code and not any("attributes = " in line for line in lines):
+#         output_lines = []
+#         for line in lines:
+#             output_lines.append(line)
+#             if line.strip().startswith("class DotsVLProcessor"):
+#                 output_lines.append("    attributes = [\"image_processor\", \"tokenizer\"]")
+#
+#         with open(config_file_path, 'w') as f:
+#             f.write('\n'.join(output_lines))
+#         print("Patched configuration_dots.py successfully.")
+#
+#sys.path.append(model_path_d_local)
+
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
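
Note: the commented-out block above is the retired bootstrap for dots.ocr: it downloaded rednote-hilab/dots.ocr via snapshot_download, then injected an attributes class variable into DotsVLProcessor inside configuration_dots.py so AutoProcessor could construct the processor, presumably made redundant by the pre-patched prithivMLmods/Dots.OCR-Latest-BF16 checkpoint loaded below. A minimal sketch of the same idempotent-patch idea, with names mirrored from that block:

    # Insert a line after a class header only if no "attributes = "
    # assignment exists yet; returns whether the file was modified.
    def patch_processor_config(path,
                               marker="class DotsVLProcessor",
                               inject='    attributes = ["image_processor", "tokenizer"]'):
        with open(path, "r") as f:
            lines = f.read().splitlines()
        if any("attributes = " in line for line in lines):
            return False  # already patched
        patched = []
        for line in lines:
            patched.append(line)
            if line.strip().startswith(marker):
                patched.append(inject)
        with open(path, "w") as f:
            f.write("\n".join(patched))
        return True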
@@ -146,17 +169,14 @@ model_v = Qwen3VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 
 # Load Nanonets-OCR2-3B
-MODEL_ID_X = "prithivMLmods/Nanonets-OCR2-3B-AWQ-nvfp4"
+MODEL_ID_X = "nanonets/Nanonets-OCR2-3B"
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
-# The fix is to load the model in a supported dtype like float16.
-# The `compressed-tensors` library will handle the dequantization from Float8_e4m3fn.
 model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_X,
     trust_remote_code=True,
-    torch_dtype=torch.float16, # Change "auto" to torch.float16
+    torch_dtype=torch.bfloat16,
 ).to(device).eval()
 
-
 # Load Dots.OCR from the local, patched directory
 MODEL_PATH_D = "prithivMLmods/Dots.OCR-Latest-BF16"
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
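
Note: the dtype flip from float16 to bfloat16 tracks the switch to the full-precision nanonets/Nanonets-OCR2-3B weights. bfloat16 requires Ampere-or-newer NVIDIA GPUs; a hedged sketch for choosing a safe dtype at runtime (the float16 fallback is an assumption, not part of this commit):

    import torch

    # Prefer bfloat16 where the GPU supports it; fall back to float16.
    dtype = (
        torch.bfloat16
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else torch.float16
    )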
@@ -285,10 +305,4 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     )
 
 if __name__ == "__main__":
-    # To run this, you would need to have example images in an "examples" directory
-    # or upload your own images.
-    if not os.path.exists("examples"):
-        os.makedirs("examples")
-        print("Created 'examples' directory. Please add your example images there.")
-
     demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
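
Note: the removed examples-directory scaffolding can be replaced by a single idempotent call if it is ever wanted again; a minimal sketch (directory name taken from the removed lines):

    import os

    # Create the directory only if missing; no pre-check needed.
    os.makedirs("examples", exist_ok=True)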
 