Spaces:

vithacocf
/

ocr

Running on Zero

App Files Files Community

vithacocf committed on Jul 16

Commit

a5d053e

verified ·

1 Parent(s): 49eb956

Update app.py

Browse files

Files changed (1) hide show

app.py +509 -77

app.py CHANGED Viewed

@@ -48,103 +48,535 @@
 #     demo.launch()
 # Code fix
-import gradio as gr
 from PIL import Image, UnidentifiedImageError
-from transformers import AutoProcessor, BitsAndBytesConfig, TextIteratorStreamer
-from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 import torch
-from threading import Thread
-import time
-# Cấu hình thiết bị
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-torch.cuda.empty_cache()
-# Load mô hình Qwen2.5-VL với quantization 4-bit
-model_id = "prithivMLmods/Camel-Doc-OCR-062825"
-# bnb_config = BitsAndBytesConfig(
-#     load_in_4bit=True,
-#     bnb_4bit_use_double_quant=True,
-#     bnb_4bit_quant_type="nf4",
-#     bnb_4bit_compute_dtype=torch.float16
-# )
-processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    model_id,
-    # quantization_config=bnb_config, Quantization
-    device_map="auto",
-    trust_remote_code=True
-).eval()
-def convert_png_to_jpg(image):
-    if image.mode in ["RGBA", "LA"]:
-        converted = Image.new("RGB", image.size, (255, 255, 255))
-        converted.paste(image, mask=image.split()[-1])
-        return converted
     return image.convert("RGB")
-# Hàm dự đoán
-def predict(image, prompt=""):
-    if image is None:
-        return "=Vui lòng tải lên ảnh hợp lệ."
-    try:
-        image = convert_png_to_jpg(image)
-        prompt = prompt.strip() if prompt else "Please describe the document."
-        # Xây dựng prompt theo định dạng Qwen2.5-VL
         messages = [{
             "role": "user",
             "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": prompt}
             ]
         }]
-        text_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(
-            text=[text_prompt],
-            images=[image],
-            return_tensors="pt",
-            padding=True
-        ).to(model.device)
-        # Dùng streamer để sinh kết quả mượt hơn
-        streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True, skip_prompt=True)
-        generation_kwargs = {
-            **inputs,
-            "streamer": streamer,
-            "max_new_tokens": 512,
-            "do_sample": False,
-            "use_cache": True
-        }
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)
-        return buffer
-    except UnidentifiedImageError:
-        return "Không thể đọc ảnh. Ảnh có thể bị hỏng hoặc sai định dạng."
     except Exception as e:
-        return f"Lỗi khi xử lý ảnh: {str(e)}"
-demo = gr.Interface(
-    fn=predict,
-    inputs=[
-        gr.Image(type="pil", label="Tải ảnh tài liệu lên"),
-        gr.Textbox(label="Gợi ý (tuỳ chọn)", placeholder="VD: Trích số hóa đơn")
-    ],
-    outputs="text",
-    title="Camel-Doc OCR - Trích xuất văn bản từ ảnh"
-)
 if __name__ == "__main__":
-    demo.launch()

 #     demo.launch()
 # Code fix
+import os
+import json
+import re
+import hashlib
+import gc
+from io import BytesIO
+from collections import OrderedDict
 from PIL import Image, UnidentifiedImageError
 import torch
+from transformers import AutoProcessor, BitsAndBytesConfig
+from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
+from pdf2image import convert_from_bytes
+import gradio as gr
+import fitz
+# --- CONFIGURATION ---
+# Model id passed to both AutoProcessor and the Qwen2.5-VL model loader.
+MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
+# Maximum number of inference results kept in the in-memory cache.
+CACHE_MAX_SIZE = 128
+# Resolution passed to PyMuPDF when rendering PDF pages to images.
+DPI = 100
+# NOTE(review): not referenced anywhere in the visible code - confirm it is still needed.
+THREAD_COUNT = 4
+# Bounding box (pixels, both sides) passed to Image.thumbnail before inference.
+IMAGE_MAX_DIM = 1024
+# JPEG quality used when serializing an image to build its cache key.
+JPEG_QUALITY = 75
+GPU_MEMORY_FRACTION = 0.8  # use 80% of GPU memory
+PAD_TOKEN_ID = None  # set later to avoid warnings
+# --- 1. Device & torch settings ---
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch.backends.cudnn.benchmark = True
+if device.type == 'cuda':
+    try:
+        torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=device)
+    except Exception:
+        pass
+# --- 2. Model & tokenizer ---
+bnb = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16
+)
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+# load and compile model
+try:
+    base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        MODEL_ID,
+        quantization_config=bnb,
+        device_map="auto",
+        trust_remote_code=True
+    )
+    model = torch.compile(base.eval())
+except Exception as e:
+    print(f"[Warning] Model compile failed: {e}")
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        MODEL_ID,
+        quantization_config=bnb,
+        device_map="auto",
+        trust_remote_code=True
+    ).eval()
+# avoid padding warnings
+PAD_TOKEN_ID = processor.tokenizer.eos_token_id
+processor.tokenizer.pad_token_id = PAD_TOKEN_ID
+# --- 3. Memory utilities ---
+def cleanup_memory():
+    gc.collect()
+    if device.type == 'cuda':
+        torch.cuda.empty_cache()
+def get_memory_info():
+    if device.type == 'cuda':
+        return {
+            'allocated': torch.cuda.memory_allocated() / (1024**3),
+            'reserved': torch.cuda.memory_reserved() / (1024**3)
+        }
+    return {'allocated': 0, 'reserved': 0}
+# --- 4. LRU Cache for inference ---
+# Insertion-ordered map from cache key to result text. Despite the "_mru" name,
+# cache_get moves hits to the end and cache_set evicts from the front, so the
+# entry that gets dropped is the least recently used one.
+_mru_cache = OrderedDict()
+def cache_get(key):
+    if key in _mru_cache:
+        _mru_cache.move_to_end(key)
+        return _mru_cache[key]
+    return None
+def cache_set(key, value):
+    _mru_cache[key] = value
+    if len(_mru_cache) > CACHE_MAX_SIZE:
+        _mru_cache.popitem(last=False)
+def cache_clear():
+    """Remove every entry from the inference cache."""
+    _mru_cache.clear()
+# --- 5. Image preprocessing ---
+def normalize_image(image: Image.Image) -> Image.Image:
+    if image.mode in ("RGBA", "LA"):
+        bg = Image.new("RGB", image.size, (255,255,255))
+        bg.paste(image, mask=image.split()[-1])
+        image = bg
+    image.thumbnail((IMAGE_MAX_DIM, IMAGE_MAX_DIM), Image.Resampling.LANCZOS)
     return image.convert("RGB")
+# --- 6. Cache key generation ---
+def make_cache_key(image: Image.Image, prompt: str) -> str:
+    bio = BytesIO()
+    image.save(bio, format="JPEG", quality=JPEG_QUALITY)
+    data = bio.getvalue() + prompt.encode('utf-8')
+    return hashlib.md5(data).hexdigest()
+# --- 7. Inference with mixed precision & error handling ---
+def run_inference(image: Image.Image, prompt: str = "") -> str:
+    """Run the model on one image and return the decoded text.
+
+    The image is normalized first, and results are cached under a key built
+    from the normalized image and the prompt. A blank prompt falls back to a
+    default instruction. Failures inside the generation step are returned as
+    "[OOM] ..." / "[Error] ..." strings instead of being raised, and those
+    strings are not cached. Exceptions from normalization or key building
+    (outside the try block) propagate to the caller.
+    """
+    prompt_text = prompt.strip() or "Read information from the document."
+    img = normalize_image(image)
+    key = make_cache_key(img, prompt_text)
+    cached = cache_get(key)
+    if cached is not None:
+        return cached
+    try:
+        # Single-turn chat message carrying the image and the text instruction.
         messages = [{
             "role": "user",
             "content": [
+                {"type": "image", "image": img},
+                {"type": "text",  "text": prompt_text}
             ]
         }]
+        text_prompt = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = processor(text=[text_prompt], images=[img], return_tensors="pt", padding=True)
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        with torch.inference_mode():
+            # Autocast is only enabled when running on CUDA.
+            with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
+                gen = model.generate(
+                    **inputs,
+                    max_new_tokens=512,
+                    do_sample=False,
+                    eos_token_id=processor.tokenizer.eos_token_id
+                )
+        # Drop the leading len(input_ids) tokens of each output so only the
+        # newly generated part is decoded.
+        trimmed = [o[len(i):] for i, o in zip(inputs['input_ids'], gen)]
+        result = processor.tokenizer.batch_decode(
+            trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
+        )[0].strip()
+        cache_set(key, result)
+        cleanup_memory()
+        return result
+    except torch.cuda.OutOfMemoryError:
+        cleanup_memory()
+        return "[OOM] GPU out of memory. Try smaller image."
+    except Exception as e:
+        cleanup_memory()
+        return f"[Error] {str(e)}"
+# --- 8. File handler ---
+import traceback
+def handle_file(file, prompt, extra_prompt, progress=gr.Progress()):
+    try:
+        # Xác định đường dẫn thật sự
+        # file có thể là UploadedFile với .name, hoặc đơn giản là str
+        file_path = file.name if hasattr(file, "name") else file
+        filename = os.path.basename(file_path)
+        ext = filename.lower().split('.')[-1]
+        full_prompt = (prompt + "\n" + extra_prompt).strip() or "Read information from file/image."
+        print(f"[INFO] handle_file → {filename} (.{ext})")
+        # ---- PDF branch ----
+        if ext == "pdf":
+            try:
+                with open(file_path, "rb") as f:
+                    pdf_bytes = f.read()
+                print(f"[INFO] Read PDF bytes: {len(pdf_bytes)} bytes")
+                # Dùng PyMuPDF để convert
+                doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+                pages = []
+                for i, page in enumerate(doc, start=1):
+                    pix = page.get_pixmap(dpi=DPI)
+                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                    pages.append(img)
+                print(f"[INFO] Converted PDF → {len(pages)} pages")
+            except Exception as e:
+                traceback.print_exc()
+                return filename, f"[ERROR] PDF conversion failed: {e}"
+            # Inference trên từng trang
+            outputs = []
+            for idx, img in enumerate(pages, start=1):
+                try:
+                    print(f"[INFO] Inference page {idx}")
+                    out = run_inference(img, full_prompt)
+                except Exception as e:
+                    traceback.print_exc()
+                    out = f"[ERROR] Inference page {idx} failed: {e}"
+                outputs.append(out)
+                if idx % 3 == 0:
+                    cleanup_memory()
+                progress((idx-1)/len(pages), desc=f"Page {idx}/{len(pages)}")
+            result = "\n\n--- Page Break ---\n\n".join(outputs)
+            print("[INFO] handle_file done")
+            return filename, result
+        # ---- Image branch ----
+        else:
+            try:
+                img = Image.open(file_path)
+                print(f"[INFO] Opened image: {img.mode}, {img.size}")
+            except Exception as e:
+                traceback.print_exc()
+                return filename, f"[ERROR] Image open failed: {e}"
+            return filename, run_inference(img, full_prompt)
     except Exception as e:
+        traceback.print_exc()
+        return "error", f"[ERROR] handle_file unexpected: {e}"
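+# --- 9. Prompt templates & JSON export ---
+# prompt_templates maps a document-type label (used for the "Mẫu <label>" buttons
+# in the UI) to an instruction prompt that asks the model to return one fixed XML
+# block. The Vietnamese text inside each tag is a placeholder describing the field.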
+prompt_templates = {
+    "Electrolux": """Extract all structured information from the delivery order document image.
+You must return the result as a valid XML block that strictly follows the structure below.
+STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
+1. Return **ONLY** the XML block – nothing before or after it.
+2. DO NOT add, remove, rename, or reorder any XML tags.
+3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
+4. For every tag, fill in the exact value read from the image.
+   • NEVER copy or repeat the label/placeholder text.
+   • NEVER guess or invent values.
+5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
+6. DO NOT include Vietnamese text or translations inside tag values.
+7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
+8. Dates must be in YYYY-MM-DD format.
+9. Boolean tags must be exactly true or false (lower-case, no quotes).
+   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
+10. **Inside each value**
+    • Replace every internal line-break with “, ” (comma + space).
+    • Trim leading/trailing whitespace.
+    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
+11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
+12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
+13. Ignore any information not represented by the tags below.
+<s_electrolux_form>
+  <document_number>Số lệnh giao nhận hàng</document_number>
+  <order_number>Số đơn hàng</order_number>
+  <customer_code>Mã số khách hàng</customer_code>
+  <customer_order_code>Mã đơn khách hàng</customer_order_code>
+  <customer_order_date>Ngày đặt hàng của khách</customer_order_date>
+  <delivery_date>Ngày giao hàng</delivery_date>
+  <requested_delivery_date>Ngày giao hàng yêu cầu</requested_delivery_date>
+  <invoice_number>Số hóa đơn</invoice_number>
+  <shipper_company_name>Tên công ty gửi hàng</shipper_company_name>
+  <shipper_address>Địa chỉ gửi hàng</shipper_address>
+  <shipper_phone>Số điện thoại</shipper_phone>
+  <shipper_fax>Số fax</shipper_fax>
+  <shipper_tax_code>Mã số thuế</shipper_tax_code>
+  <consignee_customer_code>Mã khách hàng</consignee_customer_code>
+  <consignee_company_name>Tên công ty nhận hàng</consignee_company_name>
+  <shipping_address>Địa chỉ nhận hàng chi tiết</shipping_address>
+  <city_province>Tỉnh/Thành phố</city_province>
+  <postal_code>Mã bưu chính</postal_code>
+  <preparer_name>Họ tên người lập phiếu</preparer_name>
+  <preparer_date>Ngày lập phiếu</preparer_date>
+  <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
+</s_electrolux_form>
+""",
+    "Jotun": """Extract all structured information from the delivery order document.
+You must return the result as a valid XML block that strictly follows the structure below.
+STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
+1. Return **ONLY** the XML block – nothing before or after it.
+2. DO NOT add, remove, rename, or reorder any XML tags.
+3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
+4. For every tag, fill in the exact value read from the image.
+   • NEVER copy or repeat the label/placeholder text.
+   • NEVER guess or invent values.
+5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
+6. DO NOT include Vietnamese text or translations inside tag values.
+7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
+8. Dates must be in YYYY-MM-DD format.
+9. Boolean tags must be exactly true or false (lower-case, no quotes).
+   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
+10. **Inside each value**
+    • Replace every internal line-break with “, ” (comma + space).
+    • Trim leading/trailing whitespace.
+    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
+11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
+12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
+13. Ignore any information not represented by the tags below.
+<s_jotun_form>
+  <document_number>Số lệnh giao hàng</document_number>
+  <delivery_order_code>Số lệnh giao hàng số</delivery_order_code>
+  <customer_code>Mã khách hàng</customer_code>
+  <customer_name>Tên khách hàng</customer_name>
+  <customer_address>Địa chỉ khách hàng</customer_address>
+  <customer_phone>Điện thoại khách hàng</customer_phone>
+  <invoice_receiver_name>Tên người nhận hóa đơn</invoice_receiver_name>
+  <invoice_receiver_address>Địa chỉ người nhận hóa đơn</invoice_receiver_address>
+  <order_code>Số đơn đặt hàng</order_code>
+  <order_date>Ngày đặt hàng</order_date>
+  <order_number>Số đơn hàng</order_number>
+  <delivery_date>Ngày giao hàng</delivery_date>
+  <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
+</s_jotun_form>
+""",
+    "MAWB": """Extract all structured information from the Master Air Waybill (MAWB) document.
+You must return the result as a valid XML block that strictly follows the structure below.
+STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
+1. Return **ONLY** the XML block – nothing before or after it.
+2. DO NOT add, remove, rename, or reorder any XML tags.
+3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
+4. For every tag, fill in the exact value read from the image.
+   • NEVER copy or repeat the label/placeholder text.
+   • NEVER guess or invent values.
+5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
+6. DO NOT include Vietnamese text or translations inside tag values.
+7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
+8. Dates must be in YYYY-MM-DD format.
+9. Boolean tags must be exactly true or false (lower-case, no quotes).
+   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
+10. **Inside each value**
+    • Replace every internal line-break with “, ” (comma + space).
+    • Trim leading/trailing whitespace.
+    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
+11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
+12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
+13. Ignore any information not represented by the tags below.
+<s_mawb_form>
+  <air_waybill_number>Số MAWB</air_waybill_number>
+  <shipper_name>Tên người gửi hàng</shipper_name>
+  <shipper_address>Địa chỉ người gửi hàng</shipper_address>
+  <shipper_account_number>Mã tài khoản người gửi</shipper_account_number>
+  <consignee_name>Tên người nhận hàng</consignee_name>
+  <consignee_address>Địa chỉ người nhận hàng</consignee_address>
+  <consignee_account_number>Mã tài khoản người nhận</consignee_account_number>
+  <dangerous_goods_note>Ghi chú hàng nguy hiểm (true or false)</dangerous_goods_note>
+  <shipper_signature>Chữ ký người gửi</shipper_signature>
+</s_mawb_form>
+""",
+    "Phiếu Cân": """Extract all structured information from the document 'PHIẾU CÂN / SHIPPER’S LETTER OF INSTRUCTIONS'.
+You must return the result as a valid XML block that strictly follows the structure below.
+STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
+1. Return **ONLY** the XML block – nothing before or after it.
+2. DO NOT add, remove, rename, or reorder any XML tags.
+3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
+4. For every tag, fill in the exact value read from the image.
+   • NEVER copy or repeat the label/placeholder text.
+   • NEVER guess or invent values.
+5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
+6. DO NOT include Vietnamese text or translations inside tag values.
+7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
+8. Dates must be in YYYY-MM-DD format.
+9. Boolean tags must be exactly true or false (lower-case, no quotes).
+   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
+10. **Inside each value**
+    • Replace every internal line-break with “, ” (comma + space).
+    • Trim leading/trailing whitespace.
+    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
+11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
+12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
+13. Ignore any information not represented by the tags below.
+<s_weight_ticket>
+  <awb_number>Số AWB</awb_number>
+  <shipper_name>Tên người gửi hàng</shipper_name>
+  <shipper_address>Địa chỉ người gửi hàng</shipper_address>
+  <shipper_contact>Số điện thoại người gửi</shipper_contact>
+  <consignee_name>Tên người nhận hàng</consignee_name>
+  <consignee_address>Địa chỉ người nhận hàng</consignee_address>
+  <cargo_description>Tên hàng hóa</cargo_description>
+  <security_check_complete>Đã kiểm tra an ninh (true/false)</security_check_complete>
+  <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
+  <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
+</s_weight_ticket>
+""",
+    "PC 3U": """Extract all structured information from the PC 3U air cargo instruction document.
+You must return the result as a valid XML block that strictly follows the structure below.
+STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
+1. Return **ONLY** the XML block – nothing before or after it.
+2. DO NOT add, remove, rename, or reorder any XML tags.
+3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
+4. For every tag, fill in the exact value read from the image.
+   • NEVER copy or repeat the label/placeholder text.
+   • NEVER guess or invent values.
+5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
+6. DO NOT include Vietnamese text or translations inside tag values.
+7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
+8. Dates must be in YYYY-MM-DD format.
+9. Boolean tags must be exactly true or false (lower-case, no quotes).
+   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
+10. **Inside each value**
+    • Replace every internal line-break with “, ” (comma + space).
+    • Trim leading/trailing whitespace.
+    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
+11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
+12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
+13. Ignore any information not represented by the tags below.
+<s_pc3u_form>
+  <awb_number>Số AWB</awb_number>
+  <cargo_service_code>Mã dịch vụ</cargo_service_code>
+  <shipper_name>Tên người gửi</shipper_name>
+  <shipper_address>Địa chỉ người gửi</shipper_address>
+  <shipper_contact>Thông tin liên hệ người gửi</shipper_contact>
+  <payer_name>Người thanh toán</payer_name>
+  <payer_tax_code>Mã số thuế người thanh toán</payer_tax_code>
+  <consignee_name>Tên người nhận</consignee_name>
+  <consignee_address>Địa chỉ người nhận</consignee_address>
+  <consignee_contact>Thông tin liên hệ người nhận</consignee_contact>
+  <shipper_signature>Chữ ký người gửi</shipper_signature>
+  <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
+</s_pc3u_form>
+""",
+    "SLIS-AVS DAD": """Extract all structured information from the document 'TỜ KHAI GỬI HÀNG - SHIPPER’S LETTER OF INSTRUCTION'.
+You must return the result as a valid XML block that strictly follows the structure below.
+STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
+1. Return **ONLY** the XML block – nothing before or after it.
+2. DO NOT add, remove, rename, or reorder any XML tags.
+3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
+4. For every tag, fill in the exact value read from the image.
+   • NEVER copy or repeat the label/placeholder text.
+   • NEVER guess or invent values.
+5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
+6. DO NOT include Vietnamese text or translations inside tag values.
+7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
+8. Dates must be in YYYY-MM-DD format.
+9. Boolean tags must be exactly true or false (lower-case, no quotes).
+   ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
+10. **Inside each value**
+    • Replace every internal line-break with “, ” (comma + space).
+    • Trim leading/trailing whitespace.
+    • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
+11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
+12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
+13. Ignore any information not represented by the tags below.
+<s_avs_dad>
+  <air_waybill_number>Số AWB</air_waybill_number>
+  <form_code>Mã biểu mẫu</form_code>
+  <shipper_name>Tên người gửi</shipper_name>
+  <shipper_address>Địa chỉ người gửi</shipper_address>
+  <shipper_phone>Điện thoại người gửi</shipper_phone>
+  <shipper_email>Email người gửi</shipper_email>
+  <shipper_tax_code>Mã số thuế người gửi</shipper_tax_code>
+  <consignee_name>Tên người nhận</consignee_name>
+  <consignee_address>Địa chỉ người nhận</consignee_address>
+  <consignee_phone>Điện thoại người nhận</consignee_phone>
+  <consignee_email>Email người nhận</consignee_email>
+  <departure_airport>Nơi đi</departure_airport>
+  <destination_airport>Nơi đến</destination_airport>
+  <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
+  <acceptance_signature>Chữ ký nhân viên tiếp nhận</acceptance_signature>
+  <acceptance_time>Thời điểm tiếp nhận</acceptance_time>
+  <shipper_signature>Chữ ký người gửi</shipper_signature>
+  <shipper_signature_date>Ngày ký người gửi</shipper_signature_date>
+</s_avs_dad>
+"""
+}
+def insert_template(name):
+    return prompt_templates.get(name, "")
+def sanitize_filename(name):
+    return re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)
+def clean_text(text):
+    text = re.sub(r'<[^<> ]+?>', lambda m: m.group(0).strip(), text)
+    text = re.sub(r'<[^<>]+?>[^<>]*?<[^<>]+?>', lambda m: m.group(0).strip(), text)
+    return text.strip()
+def export_json(image_name, result_text):
+    try:
+        clean_name = sanitize_filename(image_name)
+        content = {"image": image_name, "text_sequence": clean_text(result_text)}
+        path = f"/tmp/{clean_name}.json"
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(content, f, ensure_ascii=False, indent=2)
+        return path, json.dumps(content, ensure_ascii=False, indent=2)
+    except Exception as e:
+        return "", f"[Export JSON Failed]: {e}"
+# --- 10. Gradio UI ---
+with gr.Blocks(title="Camel-Doc-OCR") as demo:
+    gr.Markdown("Camel-Doc-OCR (Qwen2.5-VL, 4-bit)")
+    status_txt = gr.Textbox(label="Status & Memory", interactive=False)
+    cache_txt = gr.Textbox(label="Cache Stats", interactive=False)
+    clear_btn = gr.Button("Clear Cache")
+    clear_btn.click(fn=lambda: (cache_clear(), f"Cache: {len(_mru_cache)}/{CACHE_MAX_SIZE}"), outputs=[cache_txt])
+    file_input = gr.File(label="Tải ảnh hoặc PDF", file_types=[".jpg", ".jpeg", ".png", ".pdf"])
+    prompt_input = gr.Textbox(label="Prompt thuần", lines=2)
+    config_input = gr.Textbox(label="JSON Prompt", lines=12)
+    gr.Markdown("Chọn mẫu prompt:")
+    with gr.Row():
+        for key in prompt_templates:
+            btn = gr.Button(f"Mẫu {key}")
+            btn.click(fn=insert_template, inputs=[gr.State(key)], outputs=config_input)
+    run_btn = gr.Button("Chạy OCR")
+    export_btn = gr.Button("Xuất JSON", visible=False)
+    hidden_name = gr.Textbox(visible=False)
+    result_output = gr.Textbox(label="Kết quả trích xuất", lines=20)
+    json_file = gr.File(label="File JSON", visible=False, file_types=[".json"])
+    json_text = gr.Code(label="JSON Output", language="json", lines=20)
+    # Run inference
+    run_btn.click(
+        fn=handle_file,
+        inputs=[file_input, prompt_input, config_input],
+        outputs=[hidden_name, result_output]
+    )
+    # Update memory status
+    run_btn.click(fn=lambda: get_memory_info(), outputs=[status_txt])
+    run_btn.click(fn=lambda: f"Cache: {len(_mru_cache)}/{CACHE_MAX_SIZE}", outputs=[cache_txt])
+    run_btn.click(fn=lambda: gr.update(visible=True), outputs=[export_btn])
+    # Export
+    export_btn.click(fn=export_json, inputs=[hidden_name, result_output], outputs=[json_file, json_text])
+    export_btn.click(fn=lambda: gr.update(visible=True), outputs=[json_file])
 if __name__ == "__main__":
+    demo.launch(share=True)