Dusit-P committed on
Commit
c968c6a
·
verified ·
1 Parent(s): 1ade647

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -34
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import os, json, importlib.util, tempfile, traceback, torch
2
  import torch.nn.functional as F
3
  import gradio as gr
4
  import pandas as pd
@@ -54,27 +54,38 @@ def load_model(model_name: str):
54
  def _format_pct(x: float) -> str:
55
  return f"{x*100:.2f}%"
56
 
57
- def _predict_batch(texts, model_name, batch_size=64):
58
- """รับ list[str] คืน list[dict] = review, negative(%), positive(%), label"""
59
- model, tok, cfg = load_model(model_name)
60
- results = []
61
- rows = [str(t) for t in texts if str(t).strip()]
62
- for i in range(0, len(rows), batch_size):
63
- chunk = rows[i:i+batch_size]
64
- enc = tok(chunk, padding=True, truncation=True, max_length=cfg["max_len"], return_tensors="pt")
65
- with torch.no_grad():
66
- logits = model(enc["input_ids"], enc["attention_mask"])
67
- probs = F.softmax(logits, dim=1).cpu().numpy()
68
- for txt, p in zip(chunk, probs):
69
- neg, pos = float(p[0]), float(p[1])
70
- label = "positive" if pos >= neg else "negative"
71
- results.append({
72
- "review": txt,
73
- "negative(%)": _format_pct(neg),
74
- "positive(%)": _format_pct(pos),
75
- "label": label,
76
- })
77
- return results
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  def _detect_cols(df: pd.DataFrame):
80
  """เดาชื่อคอลัมน์รีวิว/ร้านอัตโนมัติ ถ้าไม่พบรีวิว เลือกคอลัมน์ object ตัวแรก"""
@@ -172,13 +183,36 @@ def _shop_summary(out_df: pd.DataFrame, max_shops=15):
172
  )
173
  return fig, table
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  # ---------- API wrappers ----------
176
  def predict_one(text: str, model_choice: str):
177
  try:
178
- if not text.strip():
179
- return {"negative": 0.0, "positive": 0.0}, ""
 
180
  model_name = "baseline" if model_choice == "baseline" else "cnn_bilstm"
181
- out = _predict_batch([text], model_name)[0]
182
  probs = {
183
  "negative": float(out["negative(%)"].rstrip("%"))/100.0,
184
  "positive": float(out["positive(%)"].rstrip("%"))/100.0,
@@ -192,12 +226,19 @@ def predict_one(text: str, model_choice: str):
192
  def predict_many(text_block: str, model_choice: str):
193
  try:
194
  model_name = "baseline" if model_choice == "baseline" else "cnn_bilstm"
195
- lines = [ln.strip() for ln in (text_block or "").splitlines() if ln.strip()]
196
- results = _predict_batch(lines, model_name)
 
 
 
 
 
 
 
197
  df = pd.DataFrame(results, columns=["review","negative(%)","positive(%)","label"])
198
- if len(df) == 0:
199
- return df, go.Figure(), go.Figure(), "No data"
200
  fig_bar, fig_pie, info_md = _make_figures(df)
 
201
  return df, fig_bar, fig_pie, info_md
202
  except Exception as e:
203
  print("ERROR in predict_many:", repr(e))
@@ -219,11 +260,21 @@ def predict_csv(file_obj, model_choice: str, review_col_override: str = "", shop
219
  if rev_col not in df.columns:
220
  raise ValueError(f"ไม่พบคอลัมน์รีวิว '{rev_col}' ใน CSV (columns = {list(df.columns)})")
221
 
222
- results = _predict_batch(df[rev_col].astype(str).tolist(), model_name)
 
 
 
 
 
 
 
 
 
 
223
  out = pd.DataFrame(results, columns=["review","negative(%)","positive(%)","label"])
224
 
225
- if shop_col and shop_col in df.columns:
226
- out.insert(0, "shop", df[shop_col].astype(str).fillna(""))
227
 
228
  # ไฟล์ผลลัพธ์สำหรับดาวน์โหลด
229
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
@@ -234,8 +285,13 @@ def predict_csv(file_obj, model_choice: str, review_col_override: str = "", shop
234
  # กราฟ/ตารางต่อร้าน (ถ้ามี shop)
235
  fig_shop, tbl_shop = _shop_summary(out)
236
 
237
- # แนบข้อความบอกคอลัมน์ที่ใช้
238
- info_md = f"{info_md} \nใช้คอลัมน์รีวิว: {rev_col}" + (f" | คอลัมน์ร้าน: {shop_col}" if ("shop" in out.columns) else " | ไม่มีคอลัมน์ร้าน")
 
 
 
 
 
239
 
240
  return out, tmp.name, fig_bar, fig_pie, fig_shop, tbl_shop, info_md
241
  except Exception as e:
 
1
+ import os, json, importlib.util, tempfile, traceback, torch, re, math
2
  import torch.nn.functional as F
3
  import gradio as gr
4
  import pandas as pd
 
54
def _format_pct(x: float) -> str:
    """Render a fraction as a percentage string with two decimals.

    Args:
        x: Fraction to render, e.g. ``0.5``.

    Returns:
        The value scaled by 100 with a trailing ``%``, e.g. ``"50.00%"``.
    """
    # The "%" presentation type multiplies by 100 and appends the sign itself.
    return f"{x:.2%}"
56
 
57
# ====== Filter for non-review text / empty values / bare symbols ======
# Placeholder tokens that stand for "no review"; compare against lower-cased text.
_INVALID_STRINGS = {"-", "--", "—", "n/a", "na", "null", "none", "nan", ".", "…", ""}
# Require at least one Thai consonant (U+0E01-U+0E2E) or ASCII letter.
# The range deliberately stops at the last consonant: the rest of the Thai
# block holds vowel signs, tone marks, the baht sign and the digits, none of
# which amount to review text on their own.
_RE_HAS_LETTER = re.compile(r"[\u0E01-\u0E2EA-Za-z]")
60
+
61
def _norm_text(v) -> str:
    """Convert any cell value to a stripped string, mapping missing values to "".

    Args:
        v: Arbitrary value, e.g. a CSV cell or a line of user input.

    Returns:
        ``""`` for ``None`` and for any scalar pandas regards as missing
        (float NaN, ``pd.NA``, ``pd.NaT``); otherwise ``str(v)`` with
        surrounding whitespace removed.
    """
    if v is None:
        return ""
    # pd.isna returns an array for list-likes, so only ask it about scalars.
    # Without this, pd.NA / pd.NaT would stringify to "<NA>" / "NaT" and be
    # mistaken for text containing letters.
    if pd.api.types.is_scalar(v) and pd.isna(v):
        return ""
    return str(v).strip()
69
+
70
def _is_substantive_text(s: str, min_chars: int = 2) -> bool:
    """Decide whether *s* is text worth sending to the model.

    Args:
        s: Candidate text. Surrounding whitespace is ignored.
        min_chars: Minimum number of non-whitespace characters required.

    Returns:
        False when the text is empty, is one of the placeholder tokens in
        ``_INVALID_STRINGS`` (case-insensitive), has no match for
        ``_RE_HAS_LETTER``, or has fewer than *min_chars* non-whitespace
        characters; True otherwise.
    """
    if not s:
        return False
    # Strip here as well so the placeholder check does not depend on the
    # caller having normalised the text first (" N/A " must still be rejected).
    text = s.strip()
    if text.lower() in _INVALID_STRINGS:
        return False
    if not _RE_HAS_LETTER.search(text):
        return False
    # split() with no argument drops every kind of whitespace, not just U+0020.
    return len("".join(text.split())) >= min_chars
82
+
83
def _clean_texts(texts):
    """Normalise raw inputs and keep only those usable as reviews.

    Each item goes through ``_norm_text`` and is kept when
    ``_is_substantive_text`` accepts the result.

    Returns:
        A ``(kept, skipped)`` tuple: the normalised texts that were kept, in
        input order, and the number of items that were dropped.
    """
    kept = []
    seen = 0
    for raw in texts:
        seen += 1
        candidate = _norm_text(raw)
        if _is_substantive_text(candidate):
            kept.append(candidate)
    return kept, seen - len(kept)
89
 
90
  def _detect_cols(df: pd.DataFrame):
91
  """เดาชื่อคอลัมน์รีวิว/ร้านอัตโนมัติ ถ้าไม่พบรีวิว เลือกคอลัมน์ object ตัวแรก"""
 
183
  )
184
  return fig, table
185
 
186
+ # ---------- core prediction ----------
187
def _predict_batch(texts, model_name, batch_size=64):
    """Score already-filtered texts with the model named *model_name*.

    No filtering happens here; callers are expected to pass cleaned text.
    The texts are tokenised and scored ``batch_size`` at a time.

    Returns:
        One dict per input, in input order, with the keys ``review``,
        ``negative(%)``, ``positive(%)`` and ``label``.
    """
    model, tok, cfg = load_model(model_name)
    rows = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded = tok(
            batch,
            padding=True,
            truncation=True,
            max_length=cfg["max_len"],
            return_tensors="pt",
        )
        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = model(encoded["input_ids"], encoded["attention_mask"])
            prob_matrix = F.softmax(logits, dim=1).cpu().numpy()
        for review, prob in zip(batch, prob_matrix):
            # Column 0 is reported as negative, column 1 as positive.
            p_neg = float(prob[0])
            p_pos = float(prob[1])
            rows.append({
                "review": review,
                "negative(%)": _format_pct(p_neg),
                "positive(%)": _format_pct(p_pos),
                # Ties go to "positive".
                "label": "positive" if p_pos >= p_neg else "negative",
            })
    return rows
207
+
208
  # ---------- API wrappers ----------
209
  def predict_one(text: str, model_choice: str):
210
  try:
211
+ s = _norm_text(text)
212
+ if not _is_substantive_text(s):
213
+ return {"negative": 0.0, "positive": 0.0}, "invalid"
214
  model_name = "baseline" if model_choice == "baseline" else "cnn_bilstm"
215
+ out = _predict_batch([s], model_name)[0]
216
  probs = {
217
  "negative": float(out["negative(%)"].rstrip("%"))/100.0,
218
  "positive": float(out["positive(%)"].rstrip("%"))/100.0,
 
226
  def predict_many(text_block: str, model_choice: str):
227
  try:
228
  model_name = "baseline" if model_choice == "baseline" else "cnn_bilstm"
229
+ raw_lines = (text_block or "").splitlines()
230
+ trimmed = [_norm_text(ln) for ln in raw_lines if _norm_text(ln)]
231
+ cleaned, skipped = _clean_texts(trimmed)
232
+
233
+ if len(cleaned) == 0:
234
+ empty = pd.DataFrame(columns=["review","negative(%)","positive(%)","label"])
235
+ return empty, go.Figure(), go.Figure(), "No valid text"
236
+
237
+ results = _predict_batch(cleaned, model_name)
238
  df = pd.DataFrame(results, columns=["review","negative(%)","positive(%)","label"])
239
+
 
240
  fig_bar, fig_pie, info_md = _make_figures(df)
241
+ info_md = f"{info_md} \n- Skipped (empty/non-text): {skipped}"
242
  return df, fig_bar, fig_pie, info_md
243
  except Exception as e:
244
  print("ERROR in predict_many:", repr(e))
 
260
  if rev_col not in df.columns:
261
  raise ValueError(f"ไม่พบคอลัมน์รีวิว '{rev_col}' ใน CSV (columns = {list(df.columns)})")
262
 
263
+ # === กรองแถวที่ใช้ได้จริง ===
264
+ reviews_norm = df[rev_col].apply(_norm_text)
265
+ mask_use = reviews_norm.apply(_is_substantive_text)
266
+ skipped = int((~mask_use).sum())
267
+
268
+ used_df = df.loc[mask_use].copy()
269
+ if used_df.empty:
270
+ empty = pd.DataFrame(columns=["review","negative(%)","positive(%)","label"])
271
+ return empty, None, go.Figure(), go.Figure(), go.Figure(), pd.DataFrame(), "ไม่พบรีวิวที่เป็นข้อความ"
272
+
273
+ results = _predict_batch(used_df[rev_col].astype(str).tolist(), model_name)
274
  out = pd.DataFrame(results, columns=["review","negative(%)","positive(%)","label"])
275
 
276
+ if shop_col and shop_col in used_df.columns:
277
+ out.insert(0, "shop", used_df[shop_col].astype(str).fillna(""))
278
 
279
  # ไฟล์ผลลัพธ์สำหรับดาวน์โหลด
280
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
 
285
  # กราฟ/ตารางต่อร้าน (ถ้ามี shop)
286
  fig_shop, tbl_shop = _shop_summary(out)
287
 
288
+ # แนบข้อความบอกคอลัมน์ที่ใช้ + จำนวนแถวที่ถูกข้าม
289
+ info_md = (
290
+ f"{info_md} \n"
291
+ f"ใช้คอลัมน์รีวิว: {rev_col}"
292
+ + (f" | คอลัมน์ร้าน: {shop_col}" if ("shop" in out.columns) else " | ไม่มีคอลัมน์ร้าน")
293
+ + f" \n- Skipped (empty/non-text): {skipped}"
294
+ )
295
 
296
  return out, tmp.name, fig_bar, fig_pie, fig_shop, tbl_shop, info_md
297
  except Exception as e: