Spaces: Running on Zero

Update app.py
Browse files

app.py CHANGED

@@ -1,224 +1,114 @@
-# Copyright (c) AtlasIA.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import os
-import numpy as np
-from urllib3.exceptions import HTTPError
-os.system('pip install dashscope modelscope oss2 -U')
-
-from argparse import ArgumentParser
-from pathlib import Path
-
-import copy
 import gradio as gr
-import
 import os
-import re
-import secrets
-import tempfile
-import requests
-from http import HTTPStatus
-from dashscope import MultiModalConversation
-import dashscope
-
-API_KEY = os.environ['API_KEY']
-dashscope.api_key = API_KEY
-
-BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
-PUNCTUATION = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
-
-
-def _get_args():
-    parser = ArgumentParser()
-    parser.add_argument("--revision", type=str, default=REVISION)
-    parser.add_argument("--cpu-only", action="store_true", help="Run demo with CPU only")
-
-    parser.add_argument("--share", action="store_true", default=False,
-                        help="Create a publicly shareable link for the interface.")
-    parser.add_argument("--inbrowser", action="store_true", default=False,
-                        help="Automatically launch the interface in a new tab on the default browser.")
-    parser.add_argument("--server-port", type=int, default=7860,
-                        help="Demo server port.")
-    parser.add_argument("--server-name", type=str, default="127.0.0.1",
-                        help="Demo server name.")
-
-    args = parser.parse_args()
-    return args
-
-def _parse_text(text):
-    lines = text.split("\n")
-    lines = [line for line in lines if line != ""]
-    count = 0
-    for i, line in enumerate(lines):
-        if "```" in line:
-            count += 1
-            items = line.split("`")
-            if count % 2 == 1:
-                lines[i] = f'<pre><code class="language-{items[-1]}">'
-            else:
-                lines[i] = f"<br></code></pre>"
-        else:
-            if i > 0:
-                if count % 2 == 1:
-                    line = line.replace("`", r"\`")
-                    line = line.replace("<", "&lt;")
-                    line = line.replace(">", "&gt;")
-                    line = line.replace(" ", "&nbsp;")
-                    line = line.replace("*", "&ast;")
-                    line = line.replace("_", "&lowbar;")
-                    line = line.replace("-", "&#45;")
-                    line = line.replace(".", "&#46;")
-                    line = line.replace("!", "&#33;")
-                    line = line.replace("(", "&#40;")
-                    line = line.replace(")", "&#41;")
-                    line = line.replace("$", "&#36;")
-                lines[i] = "<br>" + line
-    text = "".join(lines)
-    return text
-
-
-def _remove_image_special(text):
-    text = text.replace('<ref>', '').replace('</ref>', '')
-    return re.sub(r'<box>.*?(</box>|$)', '', text)
-
-

-
-
-
 )
-
-
-
-
-
-
-
-        return _chatbot
-    print("User: " + _parse_text(query))
-    history_cp = copy.deepcopy(task_history)
-    full_response = ""
-    messages = []
-    content = []
-    for q, a in history_cp:
-        if isinstance(q, (tuple, list)):
-            content.append({'image': f'file://{q[0]}'})
-        else:
-            content.append({'text': q})
-            messages.append({'role': 'user', 'content': content})
-            messages.append({'role': 'assistant', 'content': [{'text': a}]})
-            content = []
-    messages.pop()
-    responses = MultiModalConversation.call(
-        model='AtlasOCR', messages=messages, stream=True,
-    )
-    for response in responses:
-        if not response.status_code == HTTPStatus.OK:
-            raise HTTPError(f'response.code: {response.code}\nresponse.message: {response.message}')
-        response = response.output.choices[0].message.content
-        response_text = []
-        for ele in response:
-            if 'text' in ele:
-                response_text.append(ele['text'])
-            elif 'box' in ele:
-                response_text.append(ele['box'])
-        response_text = ''.join(response_text)
-        _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(response_text))
-        yield _chatbot
-
-    if len(response) > 1:
-        result_image = response[-1]['result_image']
-        resp = requests.get(result_image)
-        os.makedirs(uploaded_file_dir, exist_ok=True)
-        name = f"tmp{secrets.token_hex(20)}.jpg"
-        filename = os.path.join(uploaded_file_dir, name)
-        with open(filename, 'wb') as f:
-            f.write(resp.content)
-        response = ''.join(r['box'] if 'box' in r else r['text'] for r in response[:-1])
-        _chatbot.append((None, (filename,)))
-    else:
-        response = response[0]['text']
-        _chatbot[-1] = (_parse_text(chat_query), response)
-    full_response = _parse_text(response)
-
-    task_history[-1] = (query, full_response)
-    print("AtlasOCR-Chat: " + _parse_text(full_response))
-    yield _chatbot
-
-
-def regenerate(_chatbot, task_history):
-    if not task_history:
-        return _chatbot
-    item = task_history[-1]
-    if item[1] is None:
-        return _chatbot
-    task_history[-1] = (item[0], None)
-    chatbot_item = _chatbot.pop(-1)
-    if chatbot_item[0] is None:
-        _chatbot[-1] = (_chatbot[-1][0], None)
-    else:
-        _chatbot.append((chatbot_item[0], None))
-    _chatbot_gen = predict(_chatbot, task_history)
-    for _chatbot in _chatbot_gen:
-        yield _chatbot
-
-def add_text(history, task_history, text):
-    task_text = text
-    history = history if history is not None else []
-    task_history = task_history if task_history is not None else []
-    history = history + [(_parse_text(text), None)]
-    task_history = task_history + [(task_text, None)]
-    return history, task_history, ""
-
-def add_file(history, task_history, file):
-    history = history if history is not None else []
-    task_history = task_history if task_history is not None else []
-    history = history + [((file.name,), None)]
-    task_history = task_history + [((file.name,), None)]
-    return history, task_history
-
-def reset_user_input():
-    return gr.update(value="")
-
-def reset_state(task_history):
-    task_history.clear()
-    return []
-
-with gr.Blocks() as demo:
-    gr.Markdown("""<center><font size=3> AtlasOCR Demo </center>""")
-
-    chatbot = gr.Chatbot(label='AtlasOCR', elem_classes="control-height", height=500)
-    query = gr.Textbox(lines=2, label='Input')
-    task_history = gr.State([])
-
-    with gr.Row():
-        addfile_btn = gr.UploadButton("📁 Upload", file_types=["image"])
-        submit_btn = gr.Button("🚀 Submit")
-        regen_btn = gr.Button("🤔️ Regenerate")
-        empty_bin = gr.Button("🧹 Clear History")
-
-    submit_btn.click(add_text, [chatbot, task_history, query], [chatbot, task_history]).then(
-        predict, [chatbot, task_history], [chatbot], show_progress=True
-    )
-    submit_btn.click(reset_user_input, [], [query])
-    empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True)
-    regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
-    addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)
-
-
-demo.queue(default_concurrency_limit=40).launch(
-    share=args.share,
-    # inbrowser=args.inbrowser,
-    # server_port=args.server_port,
-    # server_name=args.server_name,
 )
-
-
-
-
-
-
-
-
-
 import gradio as gr
+import time
+import spaces
+from PIL import Image
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import torch
+import uuid
 import os
+import numpy as np

+# Load model and processor
+# model_name = "NAMAA-Space/Qari-OCR-0.1-VL-2B-Instruct"
+model_name = "NAMAA-Space/Qari-OCR-0.2.2.1-VL-2B-Instruct"
+model = Qwen2VLForConditionalGeneration.from_pretrained(
+    model_name,
+    torch_dtype="auto",
+    device_map="cuda"
+)
+processor = AutoProcessor.from_pretrained(model_name)
+max_tokens = 2000
+
+
+
+@spaces.GPU
+def perform_ocr(image):
+    """Process image and extract text using OCR model."""
+    # np.any() is falsy for a missing or all-zero input, so bail out early.
+    if not np.any(image):
+        return "Error Processing"
+    image = Image.fromarray(image)
+    src = str(uuid.uuid4()) + ".png"
+    prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."
+    image.save(src)
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": f"file://{src}"},
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+
+    # Process inputs
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
     )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
     )
+    inputs = inputs.to("cuda")
+
+    # Generate text
+    generated_ids = model.generate(**inputs, max_new_tokens=max_tokens, use_cache=True)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+
+    # Cleanup
+    os.remove(src)
+    return output_text
+
+# Create Gradio interface
+with gr.Blocks(title="Qari Arabic OCR") as demo:
+    gr.Markdown("# Qari Arabic OCR")
+    gr.Markdown("Upload an image to extract Arabic text in real time. This model is specialized for Arabic document OCR.")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Input image
+            image_input = gr.Image(type="numpy", label="Upload Image")
+
+            # Example gallery
+            gr.Examples(
+                examples=[
+                    ["2.jpg"],
+                    ["3.jpg"]
+                ],
+                inputs=image_input,
+                label="Example Images",
+                examples_per_page=4
+            )
+
+            # Submit button
+            submit_btn = gr.Button("Extract Text")
+
+        with gr.Column(scale=1):
+            # Output text
+            output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)
+
+    # Model details
+    with gr.Accordion("Model Information", open=False):
+        gr.Markdown("""
+        **Model:** Qari-OCR-0.2.2.1-VL-2B-Instruct
+        **Description:** Arabic OCR model based on the Qwen2-VL architecture
+        **Size:** 2B parameters
+        **Max output:** up to 2000 generated tokens
+        """)
+
+    # Set up processing flow
+    submit_btn.click(fn=perform_ocr, inputs=image_input, outputs=output)
+    image_input.change(fn=perform_ocr, inputs=image_input, outputs=output)
+
+demo.launch()
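
A minimal client-side sketch: once this updated app is running, the Space can be queried remotely with gradio_client. The Space id below is hypothetical, and the api_name assumes Gradio's default naming for the perform_ocr handler; both should be checked against the live Space's "Use via API" page.

from gradio_client import Client, handle_file

client = Client("NAMAA-Space/Qari-OCR-Demo")  # hypothetical Space id
result = client.predict(
    handle_file("page.png"),   # local document image to OCR
    api_name="/perform_ocr",   # assumed default endpoint name
)
print(result)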