vithacocf committed on
Commit
5ebc1f7
·
verified ·
1 Parent(s): 775fa37

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -74
app.py CHANGED
@@ -23,85 +23,117 @@ try:
23
  except AttributeError:
24
  RESAMPLE = Image.LANCZOS
25
 
26
- PROMPT_FREIGHT_JSON = """
27
- Please analyze the freight rate tables in the file I provide. This file may contain **multiple airlines' tariffs**. Your task is to extract **one JSON object per airline**, using the following schema:
28
-
29
- [
30
- {
31
- "shipping_line": "...",
32
- "shipping_line_code": "...",
33
- "shipping_line_reason": "Why this carrier is chosen?",
34
- "fee_type": "Air Freight",
35
- "valid_from": ...,
36
- "valid_to": ...,
37
- "charges": [
38
- {
39
- "frequency": "...",
40
- "package_type": "...",
41
- "aircraft_type": "...",
42
- "direction": "Export or Import or null",
43
- "origin": "...",
44
- "destination": "...",
45
- "charge_name": "...",
46
- "charge_code": "...",
47
- "charge_code_reason": "...",
48
- "cargo_type": "...",
49
- "currency": "...",
50
- "transit": "...",
51
- "transit_time": "...",
52
- "weight_breaks": {
53
- "M": ...,
54
- "N": ...,
55
- "+45kg": ...,
56
- "+100kg": ...,
57
- "+300kg": ...,
58
- "+500kg": ...,
59
- "+1000kg": ...,
60
- "other": {
61
- key: value
62
- },
63
- "weight_breaks_reason":"Why chosen weight_breaks?"
64
- },
65
- "remark": "..."
66
- }
67
- ],
68
- "local_charges": [
69
- {
70
- "charge_name": "...",
71
- "charge_code": "...",
72
- "unit": "...",
73
- "amount": ...,
74
- "remark": "..."
75
- }
76
- ]
77
  },
78
- ...
79
- ]
80
-
81
- ### Date rules
82
- - valid_from format:
83
- - `DD/MM/YYYY` (if full date)
84
- - `01/MM/YYYY` (if month+year only)
85
- - `01/01/YYYY` (if year only)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  - `UFN` if missing
87
  - valid_to:
88
  - exact `DD/MM/YYYY` if present
89
  - else `UFN`
90
 
91
- ### STRICT RULES:
92
- - Return a JSON **array** of airline objects (not just one).
93
- - All rates must exactly match the corresponding weight break columns (M,N,45kg, 100kg, 300kg, 500kg, 1000kg, etc.). Set null if N/A. No assumptions or interpolations.
94
- - If the table shows "RQ" or similar, set value as "RQST".
95
- - Group same-price destinations into one record separated by "/".
96
- - Always use IATA code for origin and destination.
97
- - Flight number (e.g. ZH118) is not charge code.
98
- - Frequency: D[1-7]; 'Daily' = D1234567. Join multiple (e.g. D3,D4→D34).
99
- - If local charges exist, list them.
100
- - If validity is missing, set null.
101
- - Direction: Export if origin is Vietnam (SGN, HAN, DAD...), else Import.
102
- - Provide short plain English reasons for "shipping_line_reason" & "charge_code_reason".
103
- - Replace commas in remarks with semicolons.
104
- - **Only return valid JSON. No text explanation. No markdown.**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  """
107
 
@@ -253,7 +285,7 @@ def run_process(file, question, model_choice, temperature, top_p, external_api_u
253
  check_result = check_pdf_structure(file_bytes)
254
  print(f"[PDF Check] {filename}: {check_result}")
255
 
256
- if check_result == "có" and 1=2: # bỏ qua if này test thử prompt nhiều hãng
257
  try:
258
  print("➡️ PDF có nhiều cột/nhiều trang → dùng pdfplumber extract trước rồi Gemini.")
259
  all_dfs = []
 
23
  except AttributeError:
24
  RESAMPLE = Image.LANCZOS
25
 
26
+ PROMPT_MULTI_AIRLINE_JSON = """
27
+ You are an expert in air freight rate extraction and normalization.
28
+
29
+ The document contains rate information for multiple airlines.
30
+ Please analyze all content (tables, headers, notes) and return **a list of JSON objects**, each representing a separate airline.
31
+
32
+ Each airline should follow this schema:
33
+
34
+ {
35
+ "shipping_line": "...",
36
+ "shipping_line_code": "...",
37
+ "shipping_line_reason": "Why this carrier is chosen?",
38
+ "fee_type": "Air Freight",
39
+ "valid_from": "...",
40
+ "valid_to": "...",
41
+ "charges": [ ... ], # List of charge objects (see below)
42
+ "local_charges": [ ... ] # Optional local charges if available
43
+ }
44
+
45
+ Each `charges` object must follow this schema:
46
+
47
+ {
48
+ "frequency": "...",
49
+ "package_type": "...", # e.g. Carton, Pallet, Skid
50
+ "aircraft_type": "...",
51
+ "direction": "Export / Import / null",
52
+ "origin": "...",
53
+ "destination": "...",
54
+ "charge_name": "...",
55
+ "charge_code": "GCR / PER / DGR / etc.",
56
+ "charge_code_reason": "...",
57
+ "cargo_type": "...",
58
+ "currency": "...",
59
+ "transit": "...",
60
+ "transit_time": "...",
61
+ "weight_breaks": {
62
+ "M": ...,
63
+ "N": ...,
64
+ "+45kg": ...,
65
+ "+100kg": ...,
66
+ "+300kg": ...,
67
+ "+500kg": ...,
68
+ "+1000kg": ...,
69
+ "other": { key: value },
70
+ "weight_breaks_reason": "Why chosen weight_breaks?"
 
 
 
 
 
 
71
  },
72
+ "remark": "..."
73
+ }
74
+
75
+ Each `local_charges` object:
76
+
77
+ {
78
+ "charge_name": "...",
79
+ "charge_code": "...",
80
+ "unit": "...",
81
+ "amount": ...,
82
+ "remark": "..."
83
+ }
84
+
85
+ ---
86
+
87
+ ### ✈️ Airline Separation Logic:
88
+ - If multiple airlines are detected in the document, separate each section and return a distinct JSON object per airline.
89
+ - Infer `shipping_line` and `shipping_line_code` from the header (e.g. "AIR CHINA CARGO (CA)" → name = "AIR CHINA CARGO", code = "CA").
90
+ - Each JSON object must include only data relevant to that airline.
91
+
92
+ ---
93
+
94
+ ### 💡 Date rules:
95
+ - valid_from:
96
+ - `DD/MM/YYYY` if exact
97
+ - `01/MM/YYYY` if only month/year
98
+ - `01/01/YYYY` if only year
99
  - `UFN` if missing
100
  - valid_to:
101
  - exact `DD/MM/YYYY` if present
102
  - else `UFN`
103
 
104
+ ---
105
+
106
+ ### 📦 Package and Surcharge Logic:
107
+ Apply these when the remark or note indicates such rules:
108
+
109
+ 1. **Default case**: If no package mentioned `"Carton"` is the default.
110
+ 2. **“Carton = Pallet”**: Duplicate rates with `package_type="Pallet"`.
111
+ 3. **“SKID shipment: add 10 cents (GEN & PER)”**: Add new charges with `+0.10 USD/kg` for GEN/PER, with `package_type="Pallet"` or `"Skid"`.
112
+ 4. **EU vs Non-EU surcharges**: If different pallet surcharges by region → split charges accordingly.
113
+ 5. **“All-in” or “inclusive of MY and SC”**: Record `FSC` and `WSC` as `local_charges` with `"NIL"` amount.
114
+ 6. **Flight number is not a charge code**. Always use standard cargo code (GCR, PER, etc.).
115
+
116
+ ---
117
+
118
+ ### ⚙️ Other Business Rules:
119
+ - RQ / Request → "RQST"
120
+ - Combine same-rate destinations using `/`
121
+ - Always use **IATA code** for origin/destination
122
+ - Direction = Export if origin is in Vietnam (SGN, HAN, DAD), else Import
123
+ - Frequency:
124
+ - D[1-7] = day of week
125
+ - "Daily" = D1234567
126
+ - Remarks: Replace `,` with `;`
127
+ - Add meaningful `"shipping_line_reason"` and `"charge_code_reason"`
128
+
129
+ ---
130
+
131
+ ### 🚨 STRICT OUTPUT:
132
+ - Return **a JSON array**, where each item is a full airline object
133
+ - Do NOT return markdown or explanation
134
+ - All fields must be valid
135
+ - All numbers = numeric types
136
+ - Use `null` if value missing
137
 
138
  """
139
 
 
285
  check_result = check_pdf_structure(file_bytes)
286
  print(f"[PDF Check] {filename}: {check_result}")
287
 
288
+ if check_result == "có" and 1==2: # bỏ qua if này test thử prompt nhiều hãng
289
  try:
290
  print("➡️ PDF có nhiều cột/nhiều trang → dùng pdfplumber extract trước rồi Gemini.")
291
  all_dfs = []