|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
import json |
|
|
import re |
|
|
import hashlib |
|
|
import gc |
|
|
from io import BytesIO |
|
|
from collections import OrderedDict |
|
|
from PIL import Image, UnidentifiedImageError |
|
|
import torch |
|
|
from transformers import AutoProcessor, BitsAndBytesConfig |
|
|
from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration |
|
|
from pdf2image import convert_from_bytes |
|
|
import gradio as gr |
|
|
import fitz |
|
|
|
|
|
|
|
|
# --- Configuration -----------------------------------------------------------
# Hugging Face model id handed to the processor/model ``from_pretrained`` calls.
MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
# Maximum number of inference results kept by ``cache_set`` before eviction.
CACHE_MAX_SIZE = 128
# Resolution used when rasterising PDF pages (``page.get_pixmap(dpi=DPI)``).
DPI = 100
# NOTE(review): not referenced anywhere in the visible part of this file.
THREAD_COUNT = 4
# Upper bound (pixels) for both image sides, applied in ``normalize_image``.
IMAGE_MAX_DIM = 1024
# JPEG quality used when encoding an image to build its cache key.
JPEG_QUALITY = 75
# Fraction passed to ``torch.cuda.set_per_process_memory_fraction``.
GPU_MEMORY_FRACTION = 0.8
# Placeholder; assigned once the processor's tokenizer has been loaded.
PAD_TOKEN_ID = None

# --- Device setup ------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True
if device.type == 'cuda':
    try:
        torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=device)
    except Exception as exc:
        # Best effort: the app still works without the cap, but say so instead
        # of hiding the failure.
        print(f"[Warning] Could not cap GPU memory at {GPU_MEMORY_FRACTION}: {exc}")
|
|
|
|
|
|
|
|
# 4-bit NF4 quantisation with double quantisation; matmuls computed in fp16.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load the checkpoint exactly once.  The previous version wrapped loading and
# compiling in one ``try`` and re-loaded the whole model in the ``except``
# branch, so a compile failure doubled load time (and memory while ``base``
# was still alive).
base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb,
    device_map="auto",
    trust_remote_code=True,
).eval()

model = base
try:
    model = torch.compile(base)
except Exception as e:
    # Compilation is an optimisation only; fall back to the eager model.
    print(f"[Warning] Model compile failed: {e}")

# The tokenizer's EOS id doubles as the padding id.
PAD_TOKEN_ID = processor.tokenizer.eos_token_id
processor.tokenizer.pad_token_id = PAD_TOKEN_ID
|
|
|
|
|
|
|
|
def cleanup_memory():
    """Run the garbage collector and, on CUDA, release cached allocator blocks."""
    gc.collect()
    running_on_gpu = device.type == 'cuda'
    if running_on_gpu:
        torch.cuda.empty_cache()
|
|
|
|
|
def get_memory_info():
    """Return allocated/reserved CUDA memory in GiB (both 0 when not on CUDA)."""
    if device.type != 'cuda':
        return {'allocated': 0, 'reserved': 0}

    bytes_per_gib = 1024**3
    allocated_gib = torch.cuda.memory_allocated() / bytes_per_gib
    reserved_gib = torch.cuda.memory_reserved() / bytes_per_gib
    return {'allocated': allocated_gib, 'reserved': reserved_gib}
|
|
|
|
|
|
|
|
# Insertion-ordered store for inference results.  Entries are moved to the end
# when read, so the front of the dict holds the least recently read entry.
_mru_cache = OrderedDict()


def cache_get(key):
    """Return the cached value for ``key`` and mark it as recently used.

    Returns ``None`` when ``key`` is not in the cache.
    """
    try:
        _mru_cache.move_to_end(key)
    except KeyError:
        return None
    return _mru_cache[key]
|
|
|
|
|
def cache_set(key, value):
    """Store ``value`` under ``key`` and evict the oldest entry when over capacity.

    Assigning to a key that already exists keeps its old position in an
    ``OrderedDict``, so the entry is explicitly moved to the end; otherwise a
    freshly overwritten entry could be the next one evicted.
    """
    _mru_cache[key] = value
    _mru_cache.move_to_end(key)
    if len(_mru_cache) > CACHE_MAX_SIZE:
        # ``last=False`` pops from the front, i.e. the least recently used entry.
        _mru_cache.popitem(last=False)
|
|
|
|
|
def cache_clear():
    """Drop every entry from the inference result cache."""
    _mru_cache.clear()
    return None
|
|
|
|
|
|
|
|
def normalize_image(image: Image.Image) -> Image.Image:
    """Return an RGB copy of ``image`` no larger than IMAGE_MAX_DIM on either side.

    Images with an alpha channel (``RGBA``/``LA``) are flattened onto a white
    background.  The input image is never modified: ``thumbnail()`` resizes in
    place, so it is applied only to a copy (the previous version shrank the
    caller's image for non-alpha inputs).  Conversion to RGB happens before
    resizing so the resampling filter operates on RGB pixel data.
    """
    if image.mode in ("RGBA", "LA"):
        rgb = Image.new("RGB", image.size, (255, 255, 255))
        # The last band is the alpha channel; use it as the paste mask.
        rgb.paste(image, mask=image.split()[-1])
    else:
        # convert() returns a new image, even when the mode is already RGB.
        rgb = image.convert("RGB")

    rgb.thumbnail((IMAGE_MAX_DIM, IMAGE_MAX_DIM), Image.Resampling.LANCZOS)
    return rgb
|
|
|
|
|
|
|
|
def make_cache_key(image: Image.Image, prompt: str) -> str:
    """Build a cache key from the image's JPEG encoding followed by the prompt.

    Returns the hex MD5 digest of the JPEG bytes (quality ``JPEG_QUALITY``)
    concatenated with the UTF-8 encoded prompt.
    """
    buffer = BytesIO()
    image.save(buffer, format="JPEG", quality=JPEG_QUALITY)

    digest = hashlib.md5()
    digest.update(buffer.getvalue())
    digest.update(prompt.encode('utf-8'))
    return digest.hexdigest()
|
|
|
|
|
|
|
|
def run_inference(image: Image.Image, prompt: str = "") -> str:
    """Run the vision-language model on one image and return the decoded text.

    Args:
        image: Input image; it is normalised via ``normalize_image`` first.
        prompt: Instruction text.  Empty/blank (or ``None``) falls back to a
            generic "read the document" prompt.

    Returns:
        The generated text with the prompt tokens stripped.  On failure a
        string starting with ``[OOM]`` or ``[Error]`` is returned instead of
        raising; such failures are not cached.
    """
    prompt_text = (prompt or "").strip() or "Read information from the document."
    img = normalize_image(image)
    key = make_cache_key(img, prompt_text)

    cached = cache_get(key)
    if cached is not None:
        return cached

    try:
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": prompt_text},
            ],
        }]
        text_prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(text=[text_prompt], images=[img], return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # torch.autocast replaces the deprecated torch.cuda.amp.autocast;
        # mixed precision stays enabled on CUDA only.
        use_amp = device.type == 'cuda'
        with torch.inference_mode(), torch.autocast(device_type=device.type, enabled=use_amp):
            gen = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                eos_token_id=processor.tokenizer.eos_token_id,
                pad_token_id=processor.tokenizer.pad_token_id,
            )

        # generate() returns prompt + continuation; keep only the new tokens.
        trimmed = [o[len(i):] for i, o in zip(inputs['input_ids'], gen)]
        result = processor.tokenizer.batch_decode(
            trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )[0].strip()

        cache_set(key, result)
        return result

    except torch.cuda.OutOfMemoryError:
        return "[OOM] GPU out of memory. Try smaller image."
    except Exception as e:
        return f"[Error] {str(e)}"
    finally:
        # Runs on success and on both failure paths.
        cleanup_memory()
|
|
|
|
|
|
|
|
import traceback


def _render_pdf_pages(file_path):
    """Render every page of the PDF at ``file_path`` to an RGB PIL image at ``DPI``."""
    with open(file_path, "rb") as f:
        pdf_bytes = f.read()
    print(f"[INFO] Read PDF bytes: {len(pdf_bytes)} bytes")

    pages = []
    # The context manager closes the document; it was previously left open.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=DPI)
            pages.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
    print(f"[INFO] Converted PDF → {len(pages)} pages")
    return pages


def handle_file(file, prompt, extra_prompt, progress=gr.Progress()):
    """Run OCR on an uploaded image or PDF.

    Args:
        file: Uploaded file; either an object with a ``name`` attribute holding
            the path, or the path itself.
        prompt: Free-form prompt text (may be empty or ``None``).
        extra_prompt: Additional prompt text appended on a new line (may be
            empty or ``None``).
        progress: Gradio progress callback, updated once per PDF page.

    Returns:
        ``(filename, text)``.  For PDFs the per-page outputs are joined with a
        ``--- Page Break ---`` separator.  Errors never propagate: they are
        reported in ``text`` prefixed with ``[ERROR]``.
    """
    try:
        if file is None:
            return "error", "[ERROR] No file provided."

        file_path = file.name if hasattr(file, "name") else file
        filename = os.path.basename(file_path)
        # splitext yields "" for names without an extension instead of
        # treating the whole filename as the extension.
        ext = os.path.splitext(filename)[1].lower().lstrip(".")
        full_prompt = (
            ((prompt or "") + "\n" + (extra_prompt or "")).strip()
            or "Read information from file/image."
        )

        print(f"[INFO] handle_file → {filename} (.{ext})")

        if ext == "pdf":
            try:
                pages = _render_pdf_pages(file_path)
            except Exception as e:
                traceback.print_exc()
                return filename, f"[ERROR] PDF conversion failed: {e}"

            outputs = []
            total = len(pages)
            for idx, img in enumerate(pages, start=1):
                try:
                    print(f"[INFO] Inference page {idx}")
                    out = run_inference(img, full_prompt)
                except Exception as e:
                    traceback.print_exc()
                    out = f"[ERROR] Inference page {idx} failed: {e}"
                outputs.append(out)
                # Page ``idx`` is finished here, so report idx/total; the old
                # (idx-1)/total lagged one page and never reached 100%.
                # No extra cleanup_memory() call: run_inference already cleans
                # up on every path.
                progress(idx / total, desc=f"Page {idx}/{total}")

            result = "\n\n--- Page Break ---\n\n".join(outputs)
            print("[INFO] handle_file done")
            return filename, result

        try:
            with Image.open(file_path) as opened:
                print(f"[INFO] Opened image: {opened.mode}, {opened.size}")
                # copy() forces decoding and detaches the pixels from the file
                # handle, which the ``with`` block then closes.
                img = opened.copy()
        except Exception as e:
            traceback.print_exc()
            return filename, f"[ERROR] Image open failed: {e}"
        return filename, run_inference(img, full_prompt)

    except Exception as e:
        traceback.print_exc()
        return "error", f"[ERROR] handle_file unexpected: {e}"
|
|
|
|
|
|
|
|
# Instruction block shared verbatim by every template.  Rule 10 now spells out
# the XML entities (&amp; &lt; &gt;); the previous text read "& → &, < → <,
# > → >", which told the model nothing.
_PROMPT_RULES = """\
You must return the result as a valid XML block that strictly follows the structure below.
STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
1. Return **ONLY** the XML block – nothing before or after it.
2. DO NOT add, remove, rename, or reorder any XML tags.
3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
4. For every tag, fill in the exact value read from the image.
• NEVER copy or repeat the label/placeholder text.
• NEVER guess or invent values.
5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
6. DO NOT include Vietnamese text or translations inside tag values.
7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
8. Dates must be in YYYY-MM-DD format.
9. Boolean tags must be exactly true or false (lower-case, no quotes).
✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
10. **Inside each value**
• Replace every internal line-break with “, ” (comma + space).
• Trim leading/trailing whitespace.
• Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
13. Ignore any information not represented by the tags below.
"""


def _build_prompt(intro, schema):
    """Assemble one template: intro sentence, shared rules, then the XML schema."""
    return f"{intro}\n{_PROMPT_RULES}{schema}"


# Template name (shown on the UI buttons) -> full extraction prompt.  The
# Vietnamese text inside each tag is the field label acting as a placeholder.
prompt_templates = {
    "Electrolux": _build_prompt(
        "Extract all structured information from the delivery order document image.",
        """\
<s_electrolux_form>
<document_number>Số lệnh giao nhận hàng</document_number>
<order_number>Số đơn hàng</order_number>
<customer_code>Mã số khách hàng</customer_code>
<customer_order_code>Mã đơn khách hàng</customer_order_code>
<customer_order_date>Ngày đặt hàng của khách</customer_order_date>
<delivery_date>Ngày giao hàng</delivery_date>
<requested_delivery_date>Ngày giao hàng yêu cầu</requested_delivery_date>
<invoice_number>Số hóa đơn</invoice_number>
<shipper_company_name>Tên công ty gửi hàng</shipper_company_name>
<shipper_address>Địa chỉ gửi hàng</shipper_address>
<shipper_phone>Số điện thoại</shipper_phone>
<shipper_fax>Số fax</shipper_fax>
<shipper_tax_code>Mã số thuế</shipper_tax_code>
<consignee_customer_code>Mã khách hàng</consignee_customer_code>
<consignee_company_name>Tên công ty nhận hàng</consignee_company_name>
<shipping_address>Địa chỉ nhận hàng chi tiết</shipping_address>
<city_province>Tỉnh/Thành phố</city_province>
<postal_code>Mã bưu chính</postal_code>
<preparer_name>Họ tên người lập phiếu</preparer_name>
<preparer_date>Ngày lập phiếu</preparer_date>
<s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
</s_electrolux_form>
""",
    ),
    "Jotun": _build_prompt(
        "Extract all structured information from the delivery order document.",
        """\
<s_jotun_form>
<document_number>Số lệnh giao hàng</document_number>
<delivery_order_code>Số lệnh giao hàng số</delivery_order_code>
<customer_code>Mã khách hàng</customer_code>
<customer_name>Tên khách hàng</customer_name>
<customer_address>Địa chỉ khách hàng</customer_address>
<customer_phone>Điện thoại khách hàng</customer_phone>
<invoice_receiver_name>Tên người nhận hóa đơn</invoice_receiver_name>
<invoice_receiver_address>Địa chỉ người nhận hóa đơn</invoice_receiver_address>
<order_code>Số đơn đặt hàng</order_code>
<order_date>Ngày đặt hàng</order_date>
<order_number>Số đơn hàng</order_number>
<delivery_date>Ngày giao hàng</delivery_date>
<s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
</s_jotun_form>
""",
    ),
    "MAWB": _build_prompt(
        "Extract all structured information from the Master Air Waybill (MAWB) document.",
        """\
<s_mawb_form>
<air_waybill_number>Số MAWB</air_waybill_number>
<shipper_name>Tên người gửi hàng</shipper_name>
<shipper_address>Địa chỉ người gửi hàng</shipper_address>
<shipper_account_number>Mã tài khoản người gửi</shipper_account_number>
<consignee_name>Tên người nhận hàng</consignee_name>
<consignee_address>Địa chỉ người nhận hàng</consignee_address>
<consignee_account_number>Mã tài khoản người nhận</consignee_account_number>
<dangerous_goods_note>Ghi chú hàng nguy hiểm (true or false)</dangerous_goods_note>
<shipper_signature>Chữ ký người gửi</shipper_signature>
</s_mawb_form>
""",
    ),
    "Phiếu Cân": _build_prompt(
        "Extract all structured information from the document 'PHIẾU CÂN / SHIPPER’S LETTER OF INSTRUCTIONS'.",
        """\
<s_weight_ticket>
<awb_number>Số AWB</awb_number>
<shipper_name>Tên người gửi hàng</shipper_name>
<shipper_address>Địa chỉ người gửi hàng</shipper_address>
<shipper_contact>Số điện thoại người gửi</shipper_contact>
<consignee_name>Tên người nhận hàng</consignee_name>
<consignee_address>Địa chỉ người nhận hàng</consignee_address>
<cargo_description>Tên hàng hóa</cargo_description>
<security_check_complete>Đã kiểm tra an ninh (true/false)</security_check_complete>
<acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
<acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
</s_weight_ticket>
""",
    ),
    "PC 3U": _build_prompt(
        "Extract all structured information from the PC 3U air cargo instruction document.",
        """\
<s_pc3u_form>
<awb_number>Số AWB</awb_number>
<cargo_service_code>Mã dịch vụ</cargo_service_code>
<shipper_name>Tên người gửi</shipper_name>
<shipper_address>Địa chỉ người gửi</shipper_address>
<shipper_contact>Thông tin liên hệ người gửi</shipper_contact>
<payer_name>Người thanh toán</payer_name>
<payer_tax_code>Mã số thuế người thanh toán</payer_tax_code>
<consignee_name>Tên người nhận</consignee_name>
<consignee_address>Địa chỉ người nhận</consignee_address>
<consignee_contact>Thông tin liên hệ người nhận</consignee_contact>
<shipper_signature>Chữ ký người gửi</shipper_signature>
<acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
</s_pc3u_form>
""",
    ),
    "SLIS-AVS DAD": _build_prompt(
        "Extract all structured information from the document 'TỜ KHAI GỬI HÀNG - SHIPPER’S LETTER OF INSTRUCTION'.",
        """\
<s_avs_dad>
<air_waybill_number>Số AWB</air_waybill_number>
<form_code>Mã biểu mẫu</form_code>
<shipper_name>Tên người gửi</shipper_name>
<shipper_address>Địa chỉ người gửi</shipper_address>
<shipper_phone>Điện thoại người gửi</shipper_phone>
<shipper_email>Email người gửi</shipper_email>
<shipper_tax_code>Mã số thuế người gửi</shipper_tax_code>
<consignee_name>Tên người nhận</consignee_name>
<consignee_address>Địa chỉ người nhận</consignee_address>
<consignee_phone>Điện thoại người nhận</consignee_phone>
<consignee_email>Email người nhận</consignee_email>
<departure_airport>Nơi đi</departure_airport>
<destination_airport>Nơi đến</destination_airport>
<acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
<acceptance_signature>Chữ ký nhân viên tiếp nhận</acceptance_signature>
<acceptance_time>Thời điểm tiếp nhận</acceptance_time>
<shipper_signature>Chữ ký người gửi</shipper_signature>
<shipper_signature_date>Ngày ký người gửi</shipper_signature_date>
</s_avs_dad>
""",
    ),
}
|
|
|
|
|
def insert_template(name):
    """Return the prompt template registered under ``name``, or "" if unknown."""
    try:
        return prompt_templates[name]
    except KeyError:
        return ""
|
|
|
|
|
def sanitize_filename(name):
    """Replace every character other than ASCII letters, digits, ``_``, ``-``
    and ``.`` with an underscore.
    """
    # With re.ASCII, \w is exactly [a-zA-Z0-9_].
    return re.sub(r"[^\w.\-]", "_", name, flags=re.ASCII)
|
|
|
|
|
def clean_text(text):
    """Return ``text`` with leading and trailing whitespace removed.

    The previous implementation also ran two ``re.sub`` passes whose
    replacement was ``match.strip()``.  Every match of those patterns begins
    with ``<`` and ends with ``>``, so stripping never changed anything; the
    passes were dead work and have been removed.  Output is unchanged.

    NOTE(review): if the intent was to trim whitespace *inside* elements
    (``<tag> value </tag>`` -> ``<tag>value</tag>``), that never happened and
    still does not — confirm the desired behaviour before adding it.
    """
    return text.strip()
|
|
|
|
|
def export_json(image_name, result_text):
    """Write the OCR result to a JSON file in the system temp directory.

    Args:
        image_name: Name of the source file; stored under ``"image"`` and, in
            sanitised form, used as the output file stem.
        result_text: OCR output; stored under ``"text_sequence"`` after
            ``clean_text``.

    Returns:
        ``(path, json_string)`` on success.  On failure ``(None, message)``,
        where ``None`` leaves the file component empty rather than handing it
        an empty-string path.
    """
    import tempfile  # local import: not part of the module's import block

    try:
        # Fall back to a fixed stem so an empty name does not yield ".json".
        clean_name = sanitize_filename(image_name or "") or "result"
        content = {"image": image_name, "text_sequence": clean_text(result_text)}
        # Serialise once and reuse the string for both the file and the UI.
        serialized = json.dumps(content, ensure_ascii=False, indent=2)
        # gettempdir() instead of a hard-coded "/tmp" keeps this portable.
        path = os.path.join(tempfile.gettempdir(), f"{clean_name}.json")
        with open(path, "w", encoding="utf-8") as f:
            f.write(serialized)
        return path, serialized
    except Exception as e:
        return None, f"[Export JSON Failed]: {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Compact styling overrides for the Gradio components used below.
css = """
.gradio-textbox textarea {
    font-size: 13px !important;
    line-height: 1.3 !important;
    padding: 6px 8px !important;
}
.gradio-textbox label {
    font-size: 13px !important;
    font-weight: 600 !important;
    margin-bottom: 4px !important;
}
.gradio-button {
    font-size: 12px !important;
    padding: 4px 8px !important;
    height: 28px !important;
    min-height: 28px !important;
    margin: 2px !important;
}
.gradio-button[data-variant="primary"] {
    height: 36px !important;
    font-size: 13px !important;
    padding: 8px 16px !important;
}
.gradio-file {
    font-size: 13px !important;
}
.gradio-file .file-upload {
    padding: 8px !important;
    min-height: 80px !important;
}
.gradio-markdown h3 {
    font-size: 14px !important;
    margin: 8px 0 4px 0 !important;
}
.gradio-markdown h2 {
    font-size: 18px !important;
    margin: 8px 0 !important;
}
.gradio-code {
    font-size: 12px !important;
}
"""


def _cache_stats():
    """Return the cache occupancy line shown in the "Cache Stats" box."""
    return f"Cache: {len(_mru_cache)}/{CACHE_MAX_SIZE}"


def _clear_cache_and_report():
    """Empty the cache and return the refreshed occupancy line.

    The previous inline lambda returned the tuple ``(cache_clear(), text)``
    for a single output component, so the textbox received a tuple instead of
    the text.
    """
    cache_clear()
    return _cache_stats()


with gr.Blocks(title="Camel-Doc-OCR", css=css) as demo:
    gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL, 4-bit)")

    # --- Status bar ----------------------------------------------------------
    with gr.Row():
        status_txt = gr.Textbox(label="Status & Memory", interactive=False, scale=2)
        cache_txt = gr.Textbox(label="Cache Stats", interactive=False, scale=1)
        clear_btn = gr.Button("Clear Cache", scale=1)
        clear_btn.click(fn=_clear_cache_and_report, outputs=[cache_txt])

    with gr.Row():
        # --- Input column ----------------------------------------------------
        with gr.Column(scale=1):
            gr.Markdown("### 📥 INPUT")

            file_input = gr.File(
                label="📤 Tải ảnh hoặc PDF",
                file_types=[".jpg", ".jpeg", ".png", ".pdf"],
                height=100,
            )

            prompt_input = gr.Textbox(
                label="Prompt thuần",
                lines=2,
                placeholder="Nhập prompt tùy chỉnh...",
                max_lines=3,
            )

            config_input = gr.Textbox(
                label="JSON Prompt",
                lines=6,
                placeholder="Cấu hình JSON sẽ xuất hiện ở đây...",
                max_lines=8,
            )

            gr.Markdown("### 📑 Mẫu:")
            with gr.Row():
                # One button per template.  ``k=key`` binds the current key as
                # a default so each button inserts its own template.
                for key in list(prompt_templates.keys()):
                    gr.Button(f"{key}", size="sm", scale=1).click(
                        fn=lambda *, k=key: insert_template(k),
                        inputs=[],
                        outputs=config_input,
                    )

            run_btn = gr.Button("🚀 Chạy OCR", variant="primary")

        # --- Output column ---------------------------------------------------
        with gr.Column(scale=1):
            gr.Markdown("### 📤 OUTPUT")

            result_output = gr.Textbox(
                label="Kết quả trích xuất",
                lines=10,
                placeholder="Kết quả sẽ hiển thị ở đây sau khi chạy OCR...",
                max_lines=12,
            )

            with gr.Row():
                export_btn = gr.Button("📦 Xuất JSON", visible=False, variant="secondary", size="sm")

            json_text = gr.Code(
                label="JSON Output",
                language="json",
                lines=6,
                visible=False,
            )

            json_file = gr.File(
                label="File JSON để tải",
                visible=False,
                file_types=[".json"],
            )

            # Carries the processed file name from the OCR run to the export.
            hidden_name = gr.Textbox(visible=False)

    # --- Events --------------------------------------------------------------
    run_event = run_btn.click(
        fn=handle_file,
        inputs=[file_input, prompt_input, config_input],
        outputs=[hidden_name, result_output],
    )
    # Follow-ups are chained with .then() so they run after OCR has finished.
    # As separate click handlers they fired at click time, showing memory and
    # cache figures from before the run and revealing the export button early.
    run_event.then(fn=get_memory_info, outputs=[status_txt])
    run_event.then(fn=_cache_stats, outputs=[cache_txt])
    run_event.then(fn=lambda: gr.update(visible=True), outputs=[export_btn])

    export_event = export_btn.click(
        fn=export_json,
        inputs=[hidden_name, result_output],
        outputs=[json_file, json_text],
    )
    # Reveal the export widgets only once their values have been produced.
    export_event.then(fn=lambda: gr.update(visible=True), outputs=[json_file])
    export_event.then(fn=lambda: gr.update(visible=True), outputs=[json_text])

if __name__ == "__main__":
    demo.launch()