vithacocf commited on
Commit
a5d053e
·
verified ·
1 Parent(s): 49eb956

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +509 -77
app.py CHANGED
@@ -48,103 +48,535 @@
48
  # demo.launch()
49
 
50
  # Code fix
51
- import gradio as gr
 
 
 
 
 
 
52
  from PIL import Image, UnidentifiedImageError
53
- from transformers import AutoProcessor, BitsAndBytesConfig, TextIteratorStreamer
54
- from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
55
  import torch
56
- from threading import Thread
57
- import time
 
 
 
58
 
59
# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

# Load the Qwen2.5-VL model.
# NOTE: the 4-bit quantization config below is commented out, and so is the
# `quantization_config` argument, so the model is loaded without it.
model_id = "prithivMLmods/Camel-Doc-OCR-062825"

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
# )

processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    # quantization_config=bnb_config, Quantization
    device_map="auto",
    trust_remote_code=True
).eval()
80
-
81
def convert_png_to_jpg(image):
    """Return an RGB version of *image*.

    Images whose mode is ``RGBA`` or ``LA`` are pasted onto a white canvas
    using their last band as the paste mask; every other mode goes through
    ``Image.convert("RGB")``.
    """
    has_alpha = image.mode in ["RGBA", "LA"]
    if not has_alpha:
        return image.convert("RGB")

    canvas = Image.new("RGB", image.size, (255, 255, 255))
    alpha_band = image.split()[-1]
    canvas.paste(image, mask=alpha_band)
    return canvas
87
 
88
# Prediction function
def predict(image, prompt=""):
    """Run the model on one PIL image and return the generated text.

    Args:
        image: PIL image supplied by the Gradio ``Image`` input, or ``None``
            when nothing was uploaded.
        prompt: Optional instruction; a default is used when it is empty or
            contains only whitespace.

    Returns:
        The decoded model output, or a user-facing error message. Errors are
        reported through the return value instead of being raised.
    """
    if image is None:
        return "=Vui lòng tải lên ảnh hợp lệ."

    try:
        image = convert_png_to_jpg(image)
        # A whitespace-only prompt must fall back to the default as well.
        prompt = (prompt or "").strip() or "Please describe the document."

        # Build the prompt in the chat format expected by Qwen2.5-VL.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt}
            ]
        }]
        text_prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        inputs = processor(
            text=[text_prompt],
            images=[image],
            return_tensors="pt",
            padding=True
        ).to(model.device)

        # The whole answer is returned at once, so generate synchronously.
        # The previous streamer + background-thread version blocked forever
        # when generate() raised inside the thread, because the streamer was
        # never closed and the thread was never joined.
        generated = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True
        )

        # Drop the prompt tokens so only the newly generated text is decoded.
        prompt_len = inputs["input_ids"].shape[1]
        new_tokens = generated[:, prompt_len:]
        return processor.tokenizer.batch_decode(
            new_tokens, skip_special_tokens=True
        )[0]

    except UnidentifiedImageError:
        return "Không thể đọc ảnh. Ảnh có thể bị hỏng hoặc sai định dạng."
    except Exception as e:
        return f"Lỗi khi xử lý ảnh: {str(e)}"
138
-
139
# Gradio interface: an image upload plus an optional text prompt go to
# predict(), and its return value is shown as plain text.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Tải ảnh tài liệu lên"),
        gr.Textbox(label="Gợi ý (tuỳ chọn)", placeholder="VD: Trích số hóa đơn")
    ],
    outputs="text",
    title="Camel-Doc OCR - Trích xuất văn bản từ ảnh"
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
 
48
  # demo.launch()
49
 
50
  # Code fix
51
+ import os
52
+ import json
53
+ import re
54
+ import hashlib
55
+ import gc
56
+ from io import BytesIO
57
+ from collections import OrderedDict
58
  from PIL import Image, UnidentifiedImageError
 
 
59
  import torch
60
+ from transformers import AutoProcessor, BitsAndBytesConfig
61
+ from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
62
+ from pdf2image import convert_from_bytes
63
+ import gradio as gr
64
+ import fitz
65
 
66
# --- CONFIGURATION ---
# Hugging Face Hub id of the checkpoint that is loaded below.
MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"

# Rendering / preprocessing.
DPI = 100                  # resolution used when rasterising PDF pages
IMAGE_MAX_DIM = 1024       # longest side allowed after thumbnailing
JPEG_QUALITY = 75          # quality used when encoding an image for the cache key

# Runtime limits.
CACHE_MAX_SIZE = 128       # maximum number of cached inference results
THREAD_COUNT = 4           # not referenced by the code visible in this file
GPU_MEMORY_FRACTION = 0.8  # use 80% of GPU memory

# Placeholder; reassigned once the processor has been loaded.
PAD_TOKEN_ID = None
75
+
76
# --- 1. Device & torch settings ---
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True
if device.type == 'cuda':
    try:
        torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=device)
    except Exception as exc:
        # Capping GPU memory is best effort: keep running without the cap,
        # but say so instead of swallowing the failure silently.
        print(f"[Warning] Could not set GPU memory fraction: {exc}")
84
 
85
# --- 2. Model & tokenizer ---
# 4-bit NF4 quantization with double quantization and float16 compute.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load the weights exactly once. Previously a torch.compile failure triggered
# a second full from_pretrained() call while the first copy was still
# referenced, and a genuine load failure was reported as a compile failure.
base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb,
    device_map="auto",
    trust_remote_code=True
).eval()

# Compilation is optional: fall back to the already-loaded model on failure.
try:
    model = torch.compile(base)
except Exception as e:
    print(f"[Warning] Model compile failed: {e}")
    model = base

# avoid padding warnings: reuse the EOS token as the pad token
PAD_TOKEN_ID = processor.tokenizer.eos_token_id
processor.tokenizer.pad_token_id = PAD_TOKEN_ID
113
 
114
+ # --- 3. Memory utilities ---
115
def cleanup_memory():
    """Run the garbage collector and, on CUDA, empty the CUDA cache."""
    gc.collect()
    running_on_gpu = device.type == 'cuda'
    if not running_on_gpu:
        return
    torch.cuda.empty_cache()
119
+
120
def get_memory_info():
    """Return CUDA memory usage as ``{'allocated': ..., 'reserved': ...}``.

    Both values are the byte counts reported by torch divided by 1024**3.
    When the active device is not CUDA both entries are 0.
    """
    if device.type != 'cuda':
        return {'allocated': 0, 'reserved': 0}

    bytes_per_gib = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gib
    reserved = torch.cuda.memory_reserved() / bytes_per_gib
    return {'allocated': allocated, 'reserved': reserved}
127
+
128
# --- 4. LRU Cache for inference ---
# NOTE: despite the "_mru" name this behaves as a least-recently-used cache:
# the entry at the front of the OrderedDict is the one evicted first.
_mru_cache = OrderedDict()


def cache_get(key):
    """Return the cached value for *key* and mark it as recently used.

    Returns ``None`` when *key* is not cached.
    """
    if key in _mru_cache:
        _mru_cache.move_to_end(key)
        return _mru_cache[key]
    return None


def cache_set(key, value):
    """Store *value* under *key*, evicting the least recently used entry.

    Overwriting an existing key also refreshes its recency. Plain assignment
    keeps the key's old position in an OrderedDict, which previously allowed
    a just-updated entry to be the next one evicted.
    """
    _mru_cache[key] = value
    _mru_cache.move_to_end(key)
    if len(_mru_cache) > CACHE_MAX_SIZE:
        _mru_cache.popitem(last=False)


def cache_clear():
    """Remove every cached entry."""
    _mru_cache.clear()
144
+
145
+ # --- 5. Image preprocessing ---
146
def normalize_image(image: Image.Image) -> Image.Image:
    """Return an RGB copy of *image* no larger than IMAGE_MAX_DIM per side.

    ``RGBA`` / ``LA`` images are flattened onto a white background using
    their last band as the paste mask. The input image is never modified:
    ``Image.thumbnail`` resizes in place, so it is applied only to the new
    RGB image created here. Previously it was applied to the caller's own
    object whenever the mode had no alpha band.
    """
    if image.mode in ("RGBA", "LA"):
        rgb = Image.new("RGB", image.size, (255, 255, 255))
        rgb.paste(image, mask=image.split()[-1])
    else:
        # convert() returns a new image, leaving the caller's object intact.
        rgb = image.convert("RGB")
    rgb.thumbnail((IMAGE_MAX_DIM, IMAGE_MAX_DIM), Image.Resampling.LANCZOS)
    return rgb
153
 
154
+ # --- 6. Cache key generation ---
155
def make_cache_key(image: Image.Image, prompt: str) -> str:
    """Return the MD5 hex digest of the JPEG-encoded image plus the prompt.

    The image is encoded in memory at JPEG_QUALITY and the UTF-8 bytes of
    *prompt* are appended before hashing.
    """
    buffer = BytesIO()
    image.save(buffer, format="JPEG", quality=JPEG_QUALITY)

    digest = hashlib.md5()
    digest.update(buffer.getvalue())
    digest.update(prompt.encode('utf-8'))
    return digest.hexdigest()
160
 
161
+ # --- 7. Inference with mixed precision & error handling ---
162
def run_inference(image: Image.Image, prompt: str = "") -> str:
    """Run the model on one image and return the decoded text.

    Results are cached under a key derived from the normalized image and the
    prompt, so repeating the same request skips generation.

    Args:
        image: Source image; it is normalized before being sent to the model.
        prompt: Instruction text. ``None``, empty, or whitespace-only falls
            back to a default instruction.

    Returns:
        The generated text, or a string starting with ``[OOM]`` / ``[Error]``
        when something fails. Failures are never raised to the caller and
        are never stored in the cache.
    """
    prompt_text = (prompt or "").strip() or "Read information from the document."

    # Preprocessing can fail too (e.g. an unreadable image); report it the
    # same way as an inference failure instead of letting it escape.
    try:
        img = normalize_image(image)
        key = make_cache_key(img, prompt_text)
    except Exception as e:
        return f"[Error] {str(e)}"

    cached = cache_get(key)
    if cached is not None:
        return cached

    try:
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": prompt_text}
            ]
        }]
        text_prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(text=[text_prompt], images=[img], return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.inference_mode():
            # torch.autocast is the device-agnostic replacement for the
            # deprecated torch.cuda.amp.autocast; it stays disabled on CPU.
            with torch.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                gen = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False,
                    eos_token_id=processor.tokenizer.eos_token_id
                )

        # Strip the prompt tokens so only the newly generated part is decoded.
        trimmed = [o[len(i):] for i, o in zip(inputs['input_ids'], gen)]
        result = processor.tokenizer.batch_decode(
            trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )[0].strip()

        cache_set(key, result)
        return result

    except torch.cuda.OutOfMemoryError:
        return "[OOM] GPU out of memory. Try smaller image."
    except Exception as e:
        return f"[Error] {str(e)}"
    finally:
        # Runs after success and after every failure path alike.
        cleanup_memory()
208
+
209
+ # --- 8. File handler ---
210
+ import traceback
211
def handle_file(file, prompt, extra_prompt, progress=gr.Progress()):
    """Run OCR on an uploaded image or PDF.

    Args:
        file: Upload from Gradio: either an object exposing ``.name`` (the
            path on disk) or a plain path string. ``None`` when nothing was
            uploaded.
        prompt: Free-text prompt.
        extra_prompt: Additional prompt text appended on a new line.
        progress: Gradio progress tracker, updated once per PDF page.

    Returns:
        ``(filename, text)``. For PDFs the per-page results are joined with a
        page-break marker. On failure ``text`` starts with ``[ERROR]``; for
        failures before the file name is known the first element is
        ``"error"``.
    """
    try:
        if file is None:
            return "error", "[ERROR] No file uploaded."

        # Resolve the real path on disk.
        file_path = file.name if hasattr(file, "name") else file
        filename = os.path.basename(file_path)
        ext = filename.lower().split('.')[-1]
        # Either textbox may deliver None; treat that as empty text.
        full_prompt = ((prompt or "") + "\n" + (extra_prompt or "")).strip() or "Read information from file/image."

        print(f"[INFO] handle_file → {filename} (.{ext})")

        # ---- Image branch ----
        if ext != "pdf":
            try:
                # copy() loads the pixel data so the file handle can be
                # closed before inference starts.
                with Image.open(file_path) as src:
                    img = src.copy()
                print(f"[INFO] Opened image: {img.mode}, {img.size}")
            except Exception as e:
                traceback.print_exc()
                return filename, f"[ERROR] Image open failed: {e}"
            return filename, run_inference(img, full_prompt)

        # ---- PDF branch ----
        try:
            with open(file_path, "rb") as f:
                pdf_bytes = f.read()
            print(f"[INFO] Read PDF bytes: {len(pdf_bytes)} bytes")

            # Rasterise every page with PyMuPDF; the context manager closes
            # the document once all pages have been rendered.
            pages = []
            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
                for page in doc:
                    pix = page.get_pixmap(dpi=DPI)
                    pages.append(
                        Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    )
            print(f"[INFO] Converted PDF → {len(pages)} pages")

        except Exception as e:
            traceback.print_exc()
            return filename, f"[ERROR] PDF conversion failed: {e}"

        # Run inference page by page; one failing page does not abort the rest.
        outputs = []
        total = len(pages)
        for idx, img in enumerate(pages, start=1):
            try:
                print(f"[INFO] Inference page {idx}")
                out = run_inference(img, full_prompt)
            except Exception as e:
                traceback.print_exc()
                out = f"[ERROR] Inference page {idx} failed: {e}"
            outputs.append(out)
            if idx % 3 == 0:
                cleanup_memory()
            # Page idx is finished here, so report idx/total; the previous
            # (idx-1)/total never reached 100%.
            progress(idx / total, desc=f"Page {idx}/{total}")

        result = "\n\n--- Page Break ---\n\n".join(outputs)
        print("[INFO] handle_file done")
        return filename, result

    except Exception as e:
        traceback.print_exc()
        return "error", f"[ERROR] handle_file unexpected: {e}"
273
+
274
+ # --- 9. Prompt templates & JSON export ---
275
+ prompt_templates = {
276
+ "Electrolux": """Extract all structured information from the delivery order document image.
277
+ You must return the result as a valid XML block that strictly follows the structure below.
278
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
279
+ 1. Return **ONLY** the XML block – nothing before or after it.
280
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
281
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
282
+ 4. For every tag, fill in the exact value read from the image.
283
+ • NEVER copy or repeat the label/placeholder text.
284
+ • NEVER guess or invent values.
285
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
286
+ 6. DO NOT include Vietnamese text or translations inside tag values.
287
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
288
+ 8. Dates must be in YYYY-MM-DD format.
289
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
290
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
291
+ 10. **Inside each value**
292
+ • Replace every internal line-break with “, ” (comma + space).
293
+ • Trim leading/trailing whitespace.
294
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
295
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
296
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
297
+ 13. Ignore any information not represented by the tags below.
298
+ <s_electrolux_form>
299
+ <document_number>Số lệnh giao nhận hàng</document_number>
300
+ <order_number>Số đơn hàng</order_number>
301
+ <customer_code>Mã số khách hàng</customer_code>
302
+ <customer_order_code>Mã đơn khách hàng</customer_order_code>
303
+ <customer_order_date>Ngày đặt hàng của khách</customer_order_date>
304
+ <delivery_date>Ngày giao hàng</delivery_date>
305
+ <requested_delivery_date>Ngày giao hàng yêu cầu</requested_delivery_date>
306
+ <invoice_number>Số hóa đơn</invoice_number>
307
+ <shipper_company_name>Tên công ty gửi hàng</shipper_company_name>
308
+ <shipper_address>Địa chỉ gửi hàng</shipper_address>
309
+ <shipper_phone>Số điện thoại</shipper_phone>
310
+ <shipper_fax>Số fax</shipper_fax>
311
+ <shipper_tax_code>Mã số thuế</shipper_tax_code>
312
+ <consignee_customer_code>Mã khách hàng</consignee_customer_code>
313
+ <consignee_company_name>Tên công ty nhận hàng</consignee_company_name>
314
+ <shipping_address>Địa chỉ nhận hàng chi tiết</shipping_address>
315
+ <city_province>Tỉnh/Thành phố</city_province>
316
+ <postal_code>Mã bưu chính</postal_code>
317
+ <preparer_name>Họ tên người lập phiếu</preparer_name>
318
+ <preparer_date>Ngày lập phiếu</preparer_date>
319
+ <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
320
+ </s_electrolux_form>
321
+ """,
322
+
323
+ "Jotun": """Extract all structured information from the delivery order document.
324
+ You must return the result as a valid XML block that strictly follows the structure below.
325
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
326
+ 1. Return **ONLY** the XML block – nothing before or after it.
327
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
328
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
329
+ 4. For every tag, fill in the exact value read from the image.
330
+ • NEVER copy or repeat the label/placeholder text.
331
+ • NEVER guess or invent values.
332
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
333
+ 6. DO NOT include Vietnamese text or translations inside tag values.
334
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
335
+ 8. Dates must be in YYYY-MM-DD format.
336
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
337
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
338
+ 10. **Inside each value**
339
+ • Replace every internal line-break with “, ” (comma + space).
340
+ • Trim leading/trailing whitespace.
341
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
342
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
343
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
344
+ 13. Ignore any information not represented by the tags below.
345
+ <s_jotun_form>
346
+ <document_number>Số lệnh giao hàng</document_number>
347
+ <delivery_order_code>Số lệnh giao hàng số</delivery_order_code>
348
+ <customer_code>Mã khách hàng</customer_code>
349
+ <customer_name>Tên khách hàng</customer_name>
350
+ <customer_address>Địa chỉ khách hàng</customer_address>
351
+ <customer_phone>Điện thoại khách hàng</customer_phone>
352
+ <invoice_receiver_name>Tên người nhận hóa đơn</invoice_receiver_name>
353
+ <invoice_receiver_address>Địa chỉ người nhận hóa đơn</invoice_receiver_address>
354
+ <order_code>Số đơn đặt hàng</order_code>
355
+ <order_date>Ngày đặt hàng</order_date>
356
+ <order_number>Số đơn hàng</order_number>
357
+ <delivery_date>Ngày giao hàng</delivery_date>
358
+ <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
359
+ </s_jotun_form>
360
+ """,
361
+
362
+ "MAWB": """Extract all structured information from the Master Air Waybill (MAWB) document.
363
+ You must return the result as a valid XML block that strictly follows the structure below.
364
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
365
+ 1. Return **ONLY** the XML block – nothing before or after it.
366
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
367
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
368
+ 4. For every tag, fill in the exact value read from the image.
369
+ • NEVER copy or repeat the label/placeholder text.
370
+ • NEVER guess or invent values.
371
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
372
+ 6. DO NOT include Vietnamese text or translations inside tag values.
373
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
374
+ 8. Dates must be in YYYY-MM-DD format.
375
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
376
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
377
+ 10. **Inside each value**
378
+ • Replace every internal line-break with “, ” (comma + space).
379
+ • Trim leading/trailing whitespace.
380
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
381
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
382
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
383
+ 13. Ignore any information not represented by the tags below.
384
+ <s_mawb_form>
385
+ <air_waybill_number>Số MAWB</air_waybill_number>
386
+ <shipper_name>Tên người gửi hàng</shipper_name>
387
+ <shipper_address>Địa chỉ người gửi hàng</shipper_address>
388
+ <shipper_account_number>Mã tài khoản người gửi</shipper_account_number>
389
+ <consignee_name>Tên người nhận hàng</consignee_name>
390
+ <consignee_address>Địa chỉ người nhận hàng</consignee_address>
391
+ <consignee_account_number>Mã tài khoản người nhận</consignee_account_number>
392
+ <dangerous_goods_note>Ghi chú hàng nguy hiểm (true or false)</dangerous_goods_note>
393
+ <shipper_signature>Chữ ký người gửi</shipper_signature>
394
+ </s_mawb_form>
395
+ """,
396
+
397
+ "Phiếu Cân": """Extract all structured information from the document 'PHIẾU CÂN / SHIPPER’S LETTER OF INSTRUCTIONS'.
398
+ You must return the result as a valid XML block that strictly follows the structure below.
399
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
400
+ 1. Return **ONLY** the XML block – nothing before or after it.
401
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
402
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
403
+ 4. For every tag, fill in the exact value read from the image.
404
+ • NEVER copy or repeat the label/placeholder text.
405
+ • NEVER guess or invent values.
406
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
407
+ 6. DO NOT include Vietnamese text or translations inside tag values.
408
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
409
+ 8. Dates must be in YYYY-MM-DD format.
410
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
411
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
412
+ 10. **Inside each value**
413
+ • Replace every internal line-break with “, ” (comma + space).
414
+ • Trim leading/trailing whitespace.
415
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
416
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
417
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
418
+ 13. Ignore any information not represented by the tags below.
419
+ <s_weight_ticket>
420
+ <awb_number>Số AWB</awb_number>
421
+ <shipper_name>Tên người gửi hàng</shipper_name>
422
+ <shipper_address>Địa chỉ người gửi hàng</shipper_address>
423
+ <shipper_contact>Số điện thoại người gửi</shipper_contact>
424
+ <consignee_name>Tên người nhận hàng</consignee_name>
425
+ <consignee_address>Địa chỉ người nhận hàng</consignee_address>
426
+ <cargo_description>Tên hàng hóa</cargo_description>
427
+ <security_check_complete>Đã kiểm tra an ninh (true/false)</security_check_complete>
428
+ <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
429
+ <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
430
+ </s_weight_ticket>
431
+ """,
432
+
433
+ "PC 3U": """Extract all structured information from the PC 3U air cargo instruction document.
434
+ You must return the result as a valid XML block that strictly follows the structure below.
435
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
436
+ 1. Return **ONLY** the XML block – nothing before or after it.
437
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
438
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
439
+ 4. For every tag, fill in the exact value read from the image.
440
+ • NEVER copy or repeat the label/placeholder text.
441
+ • NEVER guess or invent values.
442
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
443
+ 6. DO NOT include Vietnamese text or translations inside tag values.
444
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
445
+ 8. Dates must be in YYYY-MM-DD format.
446
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
447
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
448
+ 10. **Inside each value**
449
+ • Replace every internal line-break with “, ” (comma + space).
450
+ • Trim leading/trailing whitespace.
451
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
452
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
453
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
454
+ 13. Ignore any information not represented by the tags below.
455
+ <s_pc3u_form>
456
+ <awb_number>Số AWB</awb_number>
457
+ <cargo_service_code>Mã dịch vụ</cargo_service_code>
458
+ <shipper_name>Tên người gửi</shipper_name>
459
+ <shipper_address>Địa chỉ người gửi</shipper_address>
460
+ <shipper_contact>Thông tin liên hệ người gửi</shipper_contact>
461
+ <payer_name>Người thanh toán</payer_name>
462
+ <payer_tax_code>Mã số thuế người thanh toán</payer_tax_code>
463
+ <consignee_name>Tên người nhận</consignee_name>
464
+ <consignee_address>Địa chỉ người nhận</consignee_address>
465
+ <consignee_contact>Thông tin liên hệ người nhận</consignee_contact>
466
+ <shipper_signature>Chữ ký người gửi</shipper_signature>
467
+ <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
468
+ </s_pc3u_form>
469
+ """,
470
+
471
+ "SLIS-AVS DAD": """Extract all structured information from the document 'TỜ KHAI GỬI HÀNG - SHIPPER’S LETTER OF INSTRUCTION'.
472
+ You must return the result as a valid XML block that strictly follows the structure below.
473
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
474
+ 1. Return **ONLY** the XML block – nothing before or after it.
475
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
476
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
477
+ 4. For every tag, fill in the exact value read from the image.
478
+ • NEVER copy or repeat the label/placeholder text.
479
+ • NEVER guess or invent values.
480
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
481
+ 6. DO NOT include Vietnamese text or translations inside tag values.
482
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
483
+ 8. Dates must be in YYYY-MM-DD format.
484
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
485
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
486
+ 10. **Inside each value**
487
+ • Replace every internal line-break with “, ” (comma + space).
488
+ • Trim leading/trailing whitespace.
489
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
490
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
491
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
492
+ 13. Ignore any information not represented by the tags below.
493
+ <s_avs_dad>
494
+ <air_waybill_number>Số AWB</air_waybill_number>
495
+ <form_code>Mã biểu mẫu</form_code>
496
+ <shipper_name>Tên người gửi</shipper_name>
497
+ <shipper_address>Địa chỉ người gửi</shipper_address>
498
+ <shipper_phone>Điện thoại người gửi</shipper_phone>
499
+ <shipper_email>Email người gửi</shipper_email>
500
+ <shipper_tax_code>Mã số thuế người gửi</shipper_tax_code>
501
+ <consignee_name>Tên người nhận</consignee_name>
502
+ <consignee_address>Địa chỉ người nhận</consignee_address>
503
+ <consignee_phone>Điện thoại người nhận</consignee_phone>
504
+ <consignee_email>Email người nhận</consignee_email>
505
+ <departure_airport>Nơi đi</departure_airport>
506
+ <destination_airport>Nơi đến</destination_airport>
507
+ <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
508
+ <acceptance_signature>Chữ ký nhân viên tiếp nhận</acceptance_signature>
509
+ <acceptance_time>Thời điểm tiếp nhận</acceptance_time>
510
+ <shipper_signature>Chữ ký người gửi</shipper_signature>
511
+ <shipper_signature_date>Ngày ký người gửi</shipper_signature_date>
512
+ </s_avs_dad>
513
+ """
514
+ }
515
+
516
def insert_template(name):
    """Return the prompt template stored under *name*, or "" if there is none."""
    try:
        return prompt_templates[name]
    except KeyError:
        return ""
518
+
519
def sanitize_filename(name):
    """Replace every character outside ``a-z A-Z 0-9 _ - .`` with ``_``."""
    unsafe_char = re.compile(r'[^a-zA-Z0-9_\-\.]')
    return unsafe_char.sub('_', name)
521
+
522
def clean_text(text):
    """Return *text* with leading and trailing whitespace removed.

    The previous implementation first ran two ``re.sub`` passes whose
    replacement was ``match.group(0).strip()``. Every match of those patterns
    starts with ``<`` and ends with ``>``, so stripping it changed nothing
    and both passes were no-ops. They are removed; the result is identical
    for every string input.

    NOTE(review): the intent may have been to trim whitespace *inside* tag
    values; that was never implemented and is not added here. Confirm.
    """
    return text.strip()
526
+
527
def export_json(image_name, result_text):
    """Write the OCR result to a JSON file and return it for display.

    Args:
        image_name: Name of the processed file; sanitized to build the output
            file name and stored unchanged under the ``"image"`` key.
        result_text: Extracted text; stored under ``"text_sequence"`` after
            cleaning.

    Returns:
        ``(path, json_string)`` on success. On failure ``(None, message)``:
        ``None`` rather than ``""`` so that the file output receives "no
        file" instead of an empty path.
    """
    import tempfile  # local import: the file's import block is not touched here

    try:
        # Fall back to a fixed stem so an empty name does not yield ".json".
        clean_name = sanitize_filename(image_name) or "result"
        content = {"image": image_name, "text_sequence": clean_text(result_text)}
        # Serialize once and reuse the string for both the file and the UI.
        serialized = json.dumps(content, ensure_ascii=False, indent=2)
        path = os.path.join(tempfile.gettempdir(), f"{clean_name}.json")
        with open(path, "w", encoding="utf-8") as f:
            f.write(serialized)
        return path, serialized
    except Exception as e:
        return None, f"[Export JSON Failed]: {e}"
537
+
538
# --- 10. Gradio UI ---
def _cache_stats():
    """Return the cache occupancy line shown in the "Cache Stats" box."""
    return f"Cache: {len(_mru_cache)}/{CACHE_MAX_SIZE}"


def _clear_cache_and_report():
    """Empty the inference cache and return the refreshed occupancy line."""
    cache_clear()
    return _cache_stats()


def _memory_status():
    """Format get_memory_info() as one readable line for the status box."""
    info = get_memory_info()
    return f"allocated: {info['allocated']:.2f} GiB | reserved: {info['reserved']:.2f} GiB"


with gr.Blocks(title="Camel-Doc-OCR") as demo:
    gr.Markdown("Camel-Doc-OCR (Qwen2.5-VL, 4-bit)")

    status_txt = gr.Textbox(label="Status & Memory", interactive=False)
    cache_txt = gr.Textbox(label="Cache Stats", interactive=False)
    clear_btn = gr.Button("Clear Cache")
    # The previous lambda returned a (None, text) tuple to a single Textbox.
    clear_btn.click(fn=_clear_cache_and_report, outputs=[cache_txt])

    file_input = gr.File(label="Tải ảnh hoặc PDF", file_types=[".jpg", ".jpeg", ".png", ".pdf"])
    prompt_input = gr.Textbox(label="Prompt thuần", lines=2)
    config_input = gr.Textbox(label="JSON Prompt", lines=12)

    gr.Markdown("Chọn mẫu prompt:")
    with gr.Row():
        for key in prompt_templates:
            btn = gr.Button(f"Mẫu {key}")
            # gr.State(key) fixes the template name for this button at
            # creation time.
            btn.click(fn=insert_template, inputs=[gr.State(key)], outputs=config_input)

    run_btn = gr.Button("Chạy OCR")
    export_btn = gr.Button("Xuất JSON", visible=False)

    hidden_name = gr.Textbox(visible=False)
    result_output = gr.Textbox(label="Kết quả trích xuất", lines=20)
    json_file = gr.File(label="File JSON", visible=False, file_types=[".json"])
    json_text = gr.Code(label="JSON Output", language="json", lines=20)

    # Run inference
    run_event = run_btn.click(
        fn=handle_file,
        inputs=[file_input, prompt_input, config_input],
        outputs=[hidden_name, result_output]
    )

    # Refresh the status widgets only after inference has finished. Separate
    # click handlers fire alongside handle_file and would report the state
    # from before the run.
    run_event.then(fn=_memory_status, outputs=[status_txt])
    run_event.then(fn=_cache_stats, outputs=[cache_txt])
    run_event.then(fn=lambda: gr.update(visible=True), outputs=[export_btn])

    # Export
    export_btn.click(fn=export_json, inputs=[hidden_name, result_output], outputs=[json_file, json_text])
    export_btn.click(fn=lambda: gr.update(visible=True), outputs=[json_file])
580
 
581
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    # share=True asks Gradio to also create a public share link.
    demo.launch(share=True)