|
|
import os |
|
|
import json |
|
|
import re |
|
|
import hashlib |
|
|
import gc |
|
|
from io import BytesIO |
|
|
from collections import OrderedDict |
|
|
from PIL import Image, UnidentifiedImageError |
|
|
import torch |
|
|
from transformers import AutoProcessor, BitsAndBytesConfig |
|
|
from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration |
|
|
from pdf2image import convert_from_bytes |
|
|
import gradio as gr |
|
|
import fitz |
|
|
import spaces |
|
|
|
|
|
|
|
|
# --- Runtime configuration ---------------------------------------------------
# These settings used to be declared twice with conflicting values (DPI 100 vs
# 200, IMAGE_MAX_DIM 1024 vs None, JPEG_QUALITY 75 vs 80). The later
# assignments silently won at import time, so only those effective values are
# kept here, declared exactly once.

# Hugging Face model repository loaded for OCR.
MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"

CACHE_MAX_SIZE = 128

# Resolution setting read by handle_file when rasterising PDF pages.
DPI = 200

THREAD_COUNT = 4

IMAGE_MAX_DIM = None

JPEG_QUALITY = 80

# Passed to torch.cuda.set_per_process_memory_fraction when running on CUDA.
GPU_MEMORY_FRACTION = 0.8

PAD_TOKEN_ID = None
|
|
|
|
|
|
|
|
# Module-level torch setup.
# cuDNN autotuning is switched on globally; the module-level device itself is
# pinned to CPU, so the CUDA memory cap below only takes effect if `device`
# is ever changed to a CUDA device.
torch.backends.cudnn.benchmark = True
device = torch.device("cpu")

_device_is_cuda = device.type == 'cuda'
if _device_is_cuda:
    torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=0)
|
|
|
|
|
|
|
|
from transformers import AutoProcessor, BitsAndBytesConfig |
|
|
from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration |
|
|
|
|
|
# 4-bit NF4 quantisation with double quantisation and float16 compute.
_QUANT_OPTIONS = dict(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
bnb = BitsAndBytesConfig(**_QUANT_OPTIONS)

# Processor and quantised model are loaded once at import time and shared by
# every request.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb,
    device_map="auto",
    trust_remote_code=True,
)
model = model.eval()

# Reuse the end-of-sequence token as the padding token.
_tokenizer = processor.tokenizer
_tokenizer.pad_token_id = _tokenizer.eos_token_id
|
|
|
|
|
|
|
|
import traceback |
|
|
from concurrent.futures import ThreadPoolExecutor |
|
|
|
|
|
|
|
|
import traceback |
|
|
from concurrent.futures import ThreadPoolExecutor |
|
|
|
|
|
def _render_pdf_pages(file_path, dpi, max_dim=3072):
    """Rasterise every page of the PDF at *file_path* into RGB PIL images."""
    with open(file_path, "rb") as f:
        pdf_bytes = f.read()
    print(f"[INFO] Read PDF bytes: {len(pdf_bytes)} bytes")

    # fitz.Matrix takes a zoom factor, not a DPI value. PDF user space is
    # 72 units per inch, so the factor for a target DPI is dpi / 72. Passing
    # the DPI itself would request a bitmap hundreds of times too large.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    pages = []
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        for page in doc:
            pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            # Cap the longest side so very large pages stay manageable.
            if max(img.size) > max_dim:
                img.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
            pages.append(img)
    finally:
        # Always release the document, even if a page fails to render.
        doc.close()

    print(f"[INFO] Converted PDF → {len(pages)} pages")
    return pages


def handle_file(file, prompt, extra_prompt, max_new_tokens, progress=gr.Progress()):
    """Run OCR on an uploaded image or PDF and return ``(filename, text)``.

    Args:
        file: Either a path string or an object whose ``name`` attribute
            holds the path of the uploaded file.
        prompt: Free-form prompt text; may be empty or ``None``.
        extra_prompt: Additional prompt text appended after *prompt* on a new
            line; may be empty or ``None``.
        max_new_tokens: Forwarded unchanged to ``run_inference``.
        progress: Gradio progress tracker, called once per finished PDF page.

    Returns:
        A ``(name, text)`` tuple. For PDFs the per-page outputs are joined
        with a ``--- Page Break ---`` separator. Failures are never raised:
        they are reported inside *text* as ``[ERROR] ...`` messages, and an
        unexpected failure uses ``"error"`` as the name.
    """
    try:
        if file is None:
            # Nothing was uploaded; report it instead of failing on basename().
            return "error", "[ERROR] No file provided"

        file_path = file.name if hasattr(file, "name") else file
        filename = os.path.basename(file_path)
        ext = filename.lower().split('.')[-1]
        # Treat None like an empty string so a missing prompt cannot crash
        # the concatenation.
        full_prompt = ((prompt or "") + "\n" + (extra_prompt or "")).strip()

        print(f"[INFO] handle_file → {filename} (.{ext})")

        if ext == "pdf":
            try:
                pages = _render_pdf_pages(file_path, DPI)
            except Exception as e:
                traceback.print_exc()
                return filename, f"[ERROR] PDF conversion failed: {e}"

            total = len(pages)
            outputs = []
            with ThreadPoolExecutor(max_workers=THREAD_COUNT) as executor:
                futures = [
                    executor.submit(run_inference, img, full_prompt, max_new_tokens)
                    for img in pages
                ]
                # Collect in submission order so the output keeps page order.
                for page_no, future in enumerate(futures, start=1):
                    try:
                        out = future.result()
                    except Exception as e:
                        traceback.print_exc()
                        out = f"[ERROR] Inference page {page_no} failed: {e}"
                    outputs.append(out)
                    # Report completed pages, so the last page reaches 100%.
                    progress(page_no / total, desc=f"Page {page_no}/{total}")

            result = "\n\n--- Page Break ---\n\n".join(outputs)
            print("[INFO] handle_file done")
            return filename, result

        try:
            img = Image.open(file_path)
            print(f"[INFO] Opened image: {img.mode}, {img.size}")
        except Exception as e:
            traceback.print_exc()
            return filename, f"[ERROR] Image open failed: {e}"

        return filename, run_inference(img, full_prompt, max_new_tokens)

    except Exception as e:
        # Last-resort boundary: the UI should always get a message back.
        traceback.print_exc()
        return "error", f"[ERROR] handle_file unexpected: {e}"
|
|
|
|
|
|
|
|
@spaces.GPU
def run_inference(img, prompt="", max_new_tokens=512):
    """Generate text for a single image with the module-level model.

    Args:
        img: PIL image; converted to RGB when it is in another mode.
        prompt: Instruction text sent alongside the image (stripped first).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped.
    """
    model.to("cuda")

    rgb_image = img if img.mode == "RGB" else img.convert("RGB")
    user_text = prompt.strip()

    # Single-turn conversation: one user message carrying image + text.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": rgb_image},
                {"type": "text", "text": user_text},
            ],
        }
    ]

    chat_text = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    batch = processor(
        text=[chat_text],
        images=[rgb_image],
        return_tensors="pt",
        padding=True,
    )
    batch = batch.to("cuda")

    with torch.inference_mode(), torch.cuda.amp.autocast():
        generated = model.generate(
            **batch,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=processor.tokenizer.eos_token_id,
        )

    # The generated sequences start with the prompt tokens; cut them off so
    # only the newly produced tokens are decoded.
    completions = []
    for prompt_ids, output_ids in zip(batch['input_ids'], generated):
        completions.append(output_ids[len(prompt_ids):])

    decoded = processor.tokenizer.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return decoded[0].strip()
|
|
|
|
|
# Rules shared verbatim by every extraction template. They used to be
# copy-pasted into each template; keeping one copy guarantees they stay in
# sync. The escape rule names the real XML entities (&amp;, &lt;, &gt;) --
# the previous text read "& → &, < → <, > → >", which told the model nothing.
_SHARED_TEMPLATE_RULES = """You must return the result as a valid XML block that strictly follows the structure below.
STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
1. Return **ONLY** the XML block – nothing before or after it.
2. DO NOT add, remove, rename, or reorder any XML tags.
3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
4. For every tag, fill in the exact value read from the image.
• NEVER copy or repeat the label/placeholder text.
• NEVER guess or invent values.
5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
6. DO NOT include Vietnamese text or translations inside tag values.
7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
8. Dates must be in YYYY-MM-DD format.
9. Boolean tags must be exactly true or false (lower-case, no quotes).
✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
10. **Inside each value**
• Replace every internal line-break with “, ” (comma + space).
• Trim leading/trailing whitespace.
• Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
13. Ignore any information not represented by the tags below.
"""


def _make_template(task_sentence, xml_skeleton):
    """Join a task sentence, the shared rules and an XML skeleton into one prompt."""
    return task_sentence + "\n" + _SHARED_TEMPLATE_RULES + xml_skeleton


# Prompt templates offered as one-click presets, keyed by the button label.
# Each value is: task sentence, shared rules, then the XML skeleton whose
# placeholder text (Vietnamese field labels) tells the model what to read.
prompt_templates = {
    "Electrolux": _make_template(
        "Extract all structured information from the delivery order document image.",
        """<s_electrolux_form>
<document_number>Số lệnh giao nhận hàng</document_number>
<order_number>Số đơn hàng</order_number>
<customer_code>Mã số khách hàng</customer_code>
<customer_order_code>Mã đơn khách hàng</customer_order_code>
<customer_order_date>Ngày đặt hàng của khách</customer_order_date>
<delivery_date>Ngày giao hàng</delivery_date>
<requested_delivery_date>Ngày giao hàng yêu cầu</requested_delivery_date>
<invoice_number>Số hóa đơn</invoice_number>
<shipper_company_name>Tên công ty gửi hàng</shipper_company_name>
<shipper_address>Địa chỉ gửi hàng</shipper_address>
<shipper_phone>Số điện thoại</shipper_phone>
<shipper_fax>Số fax</shipper_fax>
<shipper_tax_code>Mã số thuế</shipper_tax_code>
<consignee_customer_code>Mã khách hàng</consignee_customer_code>
<consignee_company_name>Tên công ty nhận hàng</consignee_company_name>
<shipping_address>Địa chỉ nhận hàng chi tiết</shipping_address>
<city_province>Tỉnh/Thành phố</city_province>
<postal_code>Mã bưu chính</postal_code>
<preparer_name>Họ tên người lập phiếu</preparer_name>
<preparer_date>Ngày lập phiếu</preparer_date>
<s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
</s_electrolux_form>
""",
    ),

    "Jotun": _make_template(
        "Extract all structured information from the delivery order document.",
        """<s_jotun_form>
<document_number>Số lệnh giao hàng</document_number>
<delivery_order_code>Số lệnh giao hàng số</delivery_order_code>
<customer_code>Mã khách hàng</customer_code>
<customer_name>Tên khách hàng</customer_name>
<customer_address>Địa chỉ khách hàng</customer_address>
<customer_phone>Điện thoại khách hàng</customer_phone>
<invoice_receiver_name>Tên người nhận hóa đơn</invoice_receiver_name>
<invoice_receiver_address>Địa chỉ người nhận hóa đơn</invoice_receiver_address>
<order_code>Số đơn đặt hàng</order_code>
<order_date>Ngày đặt hàng</order_date>
<order_number>Số đơn hàng</order_number>
<delivery_date>Ngày giao hàng</delivery_date>
<s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
</s_jotun_form>
""",
    ),

    "MAWB": _make_template(
        "Extract all structured information from the Master Air Waybill (MAWB) document.",
        """<s_mawb_form>
<air_waybill_number>Số MAWB</air_waybill_number>
<shipper_name>Tên người gửi hàng</shipper_name>
<shipper_address>Địa chỉ người gửi hàng</shipper_address>
<shipper_account_number>Mã tài khoản người gửi</shipper_account_number>
<consignee_name>Tên người nhận hàng</consignee_name>
<consignee_address>Địa chỉ người nhận hàng</consignee_address>
<consignee_account_number>Mã tài khoản người nhận</consignee_account_number>
<dangerous_goods_note>Ghi chú hàng nguy hiểm (true or false)</dangerous_goods_note>
<shipper_signature>Chữ ký người gửi</shipper_signature>
</s_mawb_form>
""",
    ),

    "Phiếu Cân": _make_template(
        "Extract all structured information from the document 'PHIẾU CÂN / SHIPPER’S LETTER OF INSTRUCTIONS'.",
        """<s_weight_ticket>
<awb_number>Số AWB</awb_number>
<shipper_name>Tên người gửi hàng</shipper_name>
<shipper_address>Địa chỉ người gửi hàng</shipper_address>
<shipper_contact>Số điện thoại người gửi</shipper_contact>
<consignee_name>Tên người nhận hàng</consignee_name>
<consignee_address>Địa chỉ người nhận hàng</consignee_address>
<cargo_description>Tên hàng hóa</cargo_description>
<security_check_complete>Đã kiểm tra an ninh (true/false)</security_check_complete>
<acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
<acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
</s_weight_ticket>
""",
    ),

    "PC 3U": _make_template(
        "Extract all structured information from the PC 3U air cargo instruction document.",
        """<s_pc3u_form>
<awb_number>Số AWB</awb_number>
<cargo_service_code>Mã dịch vụ</cargo_service_code>
<shipper_name>Tên người gửi</shipper_name>
<shipper_address>Địa chỉ người gửi</shipper_address>
<shipper_contact>Thông tin liên hệ người gửi</shipper_contact>
<payer_name>Người thanh toán</payer_name>
<payer_tax_code>Mã số thuế người thanh toán</payer_tax_code>
<consignee_name>Tên người nhận</consignee_name>
<consignee_address>Địa chỉ người nhận</consignee_address>
<consignee_contact>Thông tin liên hệ người nhận</consignee_contact>
<shipper_signature>Chữ ký người gửi</shipper_signature>
<acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
</s_pc3u_form>
""",
    ),

    "SLIS-AVS DAD": _make_template(
        "Extract all structured information from the document 'TỜ KHAI GỬI HÀNG - SHIPPER’S LETTER OF INSTRUCTION'.",
        """<s_avs_dad>
<air_waybill_number>Số AWB</air_waybill_number>
<form_code>Mã biểu mẫu</form_code>
<shipper_name>Tên người gửi</shipper_name>
<shipper_address>Địa chỉ người gửi</shipper_address>
<shipper_phone>Điện thoại người gửi</shipper_phone>
<shipper_email>Email người gửi</shipper_email>
<shipper_tax_code>Mã số thuế người gửi</shipper_tax_code>
<consignee_name>Tên người nhận</consignee_name>
<consignee_address>Địa chỉ người nhận</consignee_address>
<consignee_phone>Điện thoại người nhận</consignee_phone>
<consignee_email>Email người nhận</consignee_email>
<departure_airport>Nơi đi</departure_airport>
<destination_airport>Nơi đến</destination_airport>
<acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
<acceptance_signature>Chữ ký nhân viên tiếp nhận</acceptance_signature>
<acceptance_time>Thời điểm tiếp nhận</acceptance_time>
<shipper_signature>Chữ ký người gửi</shipper_signature>
<shipper_signature_date>Ngày ký người gửi</shipper_signature_date>
</s_avs_dad>
""",
    ),
}
|
|
|
|
|
def insert_template(name):
    """Look up the prompt template registered under *name*.

    Returns the template text, or an empty string when no template with
    that name exists.
    """
    if name in prompt_templates:
        return prompt_templates[name]
    return ""
|
|
|
|
|
def sanitize_filename(name):
    """Make *name* safe for use as a file name.

    Every character other than ASCII letters, digits, ``_``, ``-`` and ``.``
    is replaced by an underscore; the length of the string is unchanged.
    """
    unsafe_chars = re.compile(r'[^a-zA-Z0-9_\-\.]')
    return unsafe_chars.sub('_', name)
|
|
|
|
|
# Matches a simple element "<tag ...>value</tag>" whose value holds no nested
# markup. Group 1 is the opening tag (closing and self-closing tags are
# excluded), group 2 the raw value, group 3 the closing tag.
_SIMPLE_ELEMENT_RE = re.compile(
    r'(<[^<>/](?:[^<>]*[^<>/])?>)([^<>]*)(</[^<>]+>)'
)


def clean_text(text):
    """Trim whitespace around tag values and around the whole text.

    The previous implementation ran two substitutions that called
    ``.strip()`` on matches which always began with ``<`` and ended with
    ``>``, so they never changed anything and only the final outer strip had
    an effect. This version performs the trimming those patterns were aiming
    for: ``<a>  value </a>`` becomes ``<a>value</a>``. Whitespace between
    sibling or nested tags is left untouched.

    Args:
        text: Model output, typically an XML-like block.

    Returns:
        The text with each simple element's value stripped and leading and
        trailing whitespace removed.
    """
    def _trim_value(match):
        return match.group(1) + match.group(2).strip() + match.group(3)

    return _SIMPLE_ELEMENT_RE.sub(_trim_value, text).strip()
|
|
|
|
|
def export_json(image_name, result_text):
    """Write the OCR result to a JSON file and return it for download.

    Args:
        image_name: Name of the processed upload; stored in the JSON and used
            (sanitised) as the output file name.
        result_text: Extracted text; cleaned with ``clean_text`` before export.

    Returns:
        ``(path, json_string)`` on success. On failure ``(None, message)``,
        where *message* starts with ``[Export JSON Failed]``. ``None`` is
        used instead of an empty string so the file output is cleared rather
        than handed a non-existent path.
    """
    import tempfile  # local import keeps this function self-contained

    try:
        # Fall back to a fixed stem so an empty name cannot produce ".json".
        clean_name = sanitize_filename(image_name or "") or "result"
        content = {"image": image_name, "text_sequence": clean_text(result_text)}

        # Serialise once; the same text goes to disk and to the preview.
        payload = json.dumps(content, ensure_ascii=False, indent=2)

        path = os.path.join(tempfile.gettempdir(), f"{clean_name}.json")
        with open(path, "w", encoding="utf-8") as f:
            f.write(payload)

        return path, payload
    except Exception as e:
        return None, f"[Export JSON Failed]: {e}"
|
|
|
|
|
|
|
|
|
|
|
# Custom stylesheet passed to gr.Blocks(css=...). It overrides font sizes,
# paddings and heights for the textbox, button, file, markdown and code
# selectors listed below; every declaration is marked !important.
css = """
.gradio-textbox textarea {
font-size: 13px !important;
line-height: 1.3 !important;
padding: 6px 8px !important;
}
.gradio-textbox label {
font-size: 13px !important;
font-weight: 600 !important;
margin-bottom: 4px !important;
}
.gradio-button {
font-size: 12px !important;
padding: 4px 8px !important;
height: 28px !important;
min-height: 28px !important;
margin: 2px !important;
}
.gradio-button[data-variant="primary"] {
height: 36px !important;
font-size: 13px !important;
padding: 8px 16px !important;
}
.gradio-file {
font-size: 13px !important;
}
.gradio-file .file-upload {
padding: 8px !important;
min-height: 80px !important;
}
.gradio-markdown h3 {
font-size: 14px !important;
margin: 8px 0 4px 0 !important;
}
.gradio-markdown h2 {
font-size: 18px !important;
margin: 8px 0 !important;
}
.gradio-code {
font-size: 12px !important;
}
"""
|
|
|
|
|
# Gradio UI: an input column (upload, prompts, token limit, template presets)
# next to an output column (extracted text plus optional JSON export).
with gr.Blocks(title="Camel-Doc-OCR", css=css) as demo:
    gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL, 4-bit)")

    with gr.Row():
        # ---- Input column ----
        with gr.Column(scale=1):
            gr.Markdown("### 📥 INPUT")

            file_input = gr.File(
                label="📤 Tải ảnh hoặc PDF",
                file_types=[".jpg", ".jpeg", ".png", ".pdf"],
                height=100,
            )

            prompt_input = gr.Textbox(
                label="Prompt thuần",
                lines=2,
                placeholder="Nhập prompt tùy chỉnh...",
                max_lines=3,
            )

            config_input = gr.Textbox(
                label="JSON Prompt",
                lines=6,
                placeholder="Cấu hình JSON sẽ xuất hiện ở đây...",
                max_lines=8,
            )

            max_new_tokens_input = gr.Radio(
                choices=[128, 256, 512, 1024, 1536, 2048],
                value=512,
                label="🔢 Chọn max_new_tokens (giới hạn độ dài đầu ra)",
                info="Chọn độ dài tối đa cho đầu ra của mô hình",
            )

            gr.Markdown("### 📑 Mẫu:")
            with gr.Row():
                # One preset button per template. `k=key` binds the current
                # key at definition time so each button keeps its own
                # template (avoids the late-binding closure pitfall).
                for key in prompt_templates:
                    gr.Button(f"{key}", size="sm", scale=1).click(
                        fn=lambda *, k=key: insert_template(k),
                        inputs=[],
                        outputs=config_input,
                    )

            run_btn = gr.Button("🚀 Chạy OCR", variant="primary")

        # ---- Output column ----
        with gr.Column(scale=1):
            gr.Markdown("### 📤 OUTPUT")

            result_output = gr.Textbox(
                label="Kết quả trích xuất",
                lines=10,
                placeholder="Kết quả sẽ hiển thị ở đây sau khi chạy OCR...",
                max_lines=12,
            )

            with gr.Row():
                # Hidden until an OCR run has finished (revealed below).
                export_btn = gr.Button("📦 Xuất JSON", visible=False, variant="secondary", size="sm")

            json_text = gr.Code(
                label="JSON Output",
                language="json",
                lines=6,
                visible=False,
            )

            json_file = gr.File(
                label="File JSON để tải",
                visible=False,
                file_types=[".json"],
            )

    # Carries the processed file name from the OCR step to the export step.
    hidden_name = gr.Textbox(visible=False)

    # Run OCR, then reveal the export button. Previously nothing ever made
    # the button visible, so the JSON export could not be reached at all.
    run_btn.click(
        fn=handle_file,
        inputs=[file_input, prompt_input, config_input, max_new_tokens_input],
        outputs=[hidden_name, result_output],
    ).then(
        fn=lambda: gr.update(visible=True),
        outputs=[export_btn],
    )

    # Export first, then show both JSON widgets in a single follow-up step
    # (replaces three independent click handlers on the same button).
    export_btn.click(
        fn=export_json,
        inputs=[hidden_name, result_output],
        outputs=[json_file, json_text],
    ).then(
        fn=lambda: (gr.update(visible=True), gr.update(visible=True)),
        outputs=[json_file, json_text],
    )
|
|
|
|
|
if __name__ == "__main__":
    # Listen on all interfaces on port 7860 and request a public share link.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)