Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import os, json, importlib.util, tempfile, traceback, torch
|
| 2 |
import torch.nn.functional as F
|
| 3 |
import gradio as gr
|
| 4 |
import pandas as pd
|
|
@@ -54,27 +54,38 @@ def load_model(model_name: str):
|
|
| 54 |
def _format_pct(x: float) -> str:
|
| 55 |
return f"{x*100:.2f}%"
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
def _detect_cols(df: pd.DataFrame):
|
| 80 |
"""เดาชื่อคอลัมน์รีวิว/ร้านอัตโนมัติ ถ้าไม่พบรีวิว เลือกคอลัมน์ object ตัวแรก"""
|
|
@@ -172,13 +183,36 @@ def _shop_summary(out_df: pd.DataFrame, max_shops=15):
|
|
| 172 |
)
|
| 173 |
return fig, table
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
# ---------- API wrappers ----------
|
| 176 |
def predict_one(text: str, model_choice: str):
|
| 177 |
try:
|
| 178 |
-
|
| 179 |
-
|
|
|
|
| 180 |
model_name = "baseline" if model_choice == "baseline" else "cnn_bilstm"
|
| 181 |
-
out = _predict_batch([
|
| 182 |
probs = {
|
| 183 |
"negative": float(out["negative(%)"].rstrip("%"))/100.0,
|
| 184 |
"positive": float(out["positive(%)"].rstrip("%"))/100.0,
|
|
@@ -192,12 +226,19 @@ def predict_one(text: str, model_choice: str):
|
|
| 192 |
def predict_many(text_block: str, model_choice: str):
|
| 193 |
try:
|
| 194 |
model_name = "baseline" if model_choice == "baseline" else "cnn_bilstm"
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
df = pd.DataFrame(results, columns=["review","negative(%)","positive(%)","label"])
|
| 198 |
-
|
| 199 |
-
return df, go.Figure(), go.Figure(), "No data"
|
| 200 |
fig_bar, fig_pie, info_md = _make_figures(df)
|
|
|
|
| 201 |
return df, fig_bar, fig_pie, info_md
|
| 202 |
except Exception as e:
|
| 203 |
print("ERROR in predict_many:", repr(e))
|
|
@@ -219,11 +260,21 @@ def predict_csv(file_obj, model_choice: str, review_col_override: str = "", shop
|
|
| 219 |
if rev_col not in df.columns:
|
| 220 |
raise ValueError(f"ไม่พบคอลัมน์รีวิว '{rev_col}' ใน CSV (columns = {list(df.columns)})")
|
| 221 |
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
out = pd.DataFrame(results, columns=["review","negative(%)","positive(%)","label"])
|
| 224 |
|
| 225 |
-
if shop_col and shop_col in
|
| 226 |
-
out.insert(0, "shop",
|
| 227 |
|
| 228 |
# ไฟล์ผลลัพธ์สำหรับดาวน์โหลด
|
| 229 |
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
|
|
@@ -234,8 +285,13 @@ def predict_csv(file_obj, model_choice: str, review_col_override: str = "", shop
|
|
| 234 |
# กราฟ/ตารางต่อร้าน (ถ้ามี shop)
|
| 235 |
fig_shop, tbl_shop = _shop_summary(out)
|
| 236 |
|
| 237 |
-
# แนบข้อความบอกคอลัมน์ที่ใช้
|
| 238 |
-
info_md =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
return out, tmp.name, fig_bar, fig_pie, fig_shop, tbl_shop, info_md
|
| 241 |
except Exception as e:
|
|
|
|
| 1 |
+
import os, json, importlib.util, tempfile, traceback, torch, re, math
|
| 2 |
import torch.nn.functional as F
|
| 3 |
import gradio as gr
|
| 4 |
import pandas as pd
|
|
|
|
| 54 |
def _format_pct(x: float) -> str:
|
| 55 |
return f"{x*100:.2f}%"
|
| 56 |
|
| 57 |
+
# ====== ฟิลเตอร์ข้อความที่ไม่ใช่รีวิว / ค่าว่าง / สัญลักษณ์ ======
|
| 58 |
+
_INVALID_STRINGS = {"-", "--", "—", "n/a", "na", "null", "none", "nan", ".", "…", ""} # lower-case
|
| 59 |
+
_RE_HAS_LETTER = re.compile(r"[ก-๙A-Za-z]") # ต้องมีอย่างน้อย 1 ตัวอักษรไทยหรืออังกฤษ
|
| 60 |
+
|
| 61 |
+
def _norm_text(v) -> str:
|
| 62 |
+
"""แปลงค่าให้เป็นสตริงพร้อม trim และกัน NaN/None"""
|
| 63 |
+
if v is None:
|
| 64 |
+
return ""
|
| 65 |
+
if isinstance(v, float) and math.isnan(v):
|
| 66 |
+
return ""
|
| 67 |
+
s = str(v).strip()
|
| 68 |
+
return s
|
| 69 |
+
|
| 70 |
+
def _is_substantive_text(s: str, min_chars: int = 2) -> bool:
    """Return True when *s* looks like text that is worth analysing.

    A value is rejected when it is not a string, is empty after stripping,
    matches one of the placeholder tokens in ``_INVALID_STRINGS``
    (case-insensitive), contains no character matched by ``_RE_HAS_LETTER``,
    or has fewer than *min_chars* non-whitespace characters.
    """
    if not isinstance(s, str):
        return False
    # Strip here as well so the placeholder check does not depend on the
    # caller having normalised the value first (" n/a " must still be rejected).
    text = s.strip()
    if not text or text.lower() in _INVALID_STRINGS:
        return False
    if not _RE_HAS_LETTER.search(text):
        return False
    # Drop every kind of whitespace (tabs, newlines, NBSP), not only ASCII
    # spaces, before measuring the length.
    return len("".join(text.split())) >= min_chars
|
| 82 |
+
|
| 83 |
+
def _clean_texts(texts):
    """Normalise each item of *texts* and keep only the usable ones.

    Returns a ``(kept, skipped)`` tuple: ``kept`` is the list of normalised
    strings accepted by ``_is_substantive_text``, and ``skipped`` is the
    number of items that were dropped.
    """
    kept = []
    total = 0
    for raw in texts:
        total += 1
        candidate = _norm_text(raw)
        if _is_substantive_text(candidate):
            kept.append(candidate)
    return kept, total - len(kept)
|
| 89 |
|
| 90 |
def _detect_cols(df: pd.DataFrame):
|
| 91 |
"""เดาชื่อคอลัมน์รีวิว/ร้านอัตโนมัติ ถ้าไม่พบรีวิว เลือกคอลัมน์ object ตัวแรก"""
|
|
|
|
| 183 |
)
|
| 184 |
return fig, table
|
| 185 |
|
| 186 |
+
# ---------- core prediction ----------
|
| 187 |
+
def _predict_batch(texts, model_name, batch_size=64):
    """Run the named model over already-filtered review strings in mini-batches.

    ``load_model(model_name)`` supplies the model, the tokenizer and a config
    mapping whose ``"max_len"`` entry caps the tokenised length. For every
    input text one dict is returned with the keys ``"review"``,
    ``"negative(%)"``, ``"positive(%)"`` and ``"label"``. Column 0 of the
    softmax output is read as the negative probability and column 1 as the
    positive one; ties are labelled ``"positive"``.
    """
    model, tokenizer, config = load_model(model_name)
    rows = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=config["max_len"],
            return_tensors="pt",
        )
        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = model(encoded["input_ids"], encoded["attention_mask"])
            batch_probs = F.softmax(logits, dim=1).cpu().numpy()
        for review, row in zip(batch, batch_probs):
            p_neg = float(row[0])
            p_pos = float(row[1])
            if p_pos >= p_neg:
                verdict = "positive"
            else:
                verdict = "negative"
            rows.append(
                {
                    "review": review,
                    "negative(%)": _format_pct(p_neg),
                    "positive(%)": _format_pct(p_pos),
                    "label": verdict,
                }
            )
    return rows
|
| 207 |
+
|
| 208 |
# ---------- API wrappers ----------
|
| 209 |
def predict_one(text: str, model_choice: str):
|
| 210 |
try:
|
| 211 |
+
s = _norm_text(text)
|
| 212 |
+
if not _is_substantive_text(s):
|
| 213 |
+
return {"negative": 0.0, "positive": 0.0}, "invalid"
|
| 214 |
model_name = "baseline" if model_choice == "baseline" else "cnn_bilstm"
|
| 215 |
+
out = _predict_batch([s], model_name)[0]
|
| 216 |
probs = {
|
| 217 |
"negative": float(out["negative(%)"].rstrip("%"))/100.0,
|
| 218 |
"positive": float(out["positive(%)"].rstrip("%"))/100.0,
|
|
|
|
| 226 |
def predict_many(text_block: str, model_choice: str):
|
| 227 |
try:
|
| 228 |
model_name = "baseline" if model_choice == "baseline" else "cnn_bilstm"
|
| 229 |
+
raw_lines = (text_block or "").splitlines()
|
| 230 |
+
trimmed = [_norm_text(ln) for ln in raw_lines if _norm_text(ln)]
|
| 231 |
+
cleaned, skipped = _clean_texts(trimmed)
|
| 232 |
+
|
| 233 |
+
if len(cleaned) == 0:
|
| 234 |
+
empty = pd.DataFrame(columns=["review","negative(%)","positive(%)","label"])
|
| 235 |
+
return empty, go.Figure(), go.Figure(), "No valid text"
|
| 236 |
+
|
| 237 |
+
results = _predict_batch(cleaned, model_name)
|
| 238 |
df = pd.DataFrame(results, columns=["review","negative(%)","positive(%)","label"])
|
| 239 |
+
|
|
|
|
| 240 |
fig_bar, fig_pie, info_md = _make_figures(df)
|
| 241 |
+
info_md = f"{info_md} \n- Skipped (empty/non-text): {skipped}"
|
| 242 |
return df, fig_bar, fig_pie, info_md
|
| 243 |
except Exception as e:
|
| 244 |
print("ERROR in predict_many:", repr(e))
|
|
|
|
| 260 |
if rev_col not in df.columns:
|
| 261 |
raise ValueError(f"ไม่พบคอลัมน์รีวิว '{rev_col}' ใน CSV (columns = {list(df.columns)})")
|
| 262 |
|
| 263 |
+
# === กรองแถวที่ใช้ได้จริง ===
|
| 264 |
+
reviews_norm = df[rev_col].apply(_norm_text)
|
| 265 |
+
mask_use = reviews_norm.apply(_is_substantive_text)
|
| 266 |
+
skipped = int((~mask_use).sum())
|
| 267 |
+
|
| 268 |
+
used_df = df.loc[mask_use].copy()
|
| 269 |
+
if used_df.empty:
|
| 270 |
+
empty = pd.DataFrame(columns=["review","negative(%)","positive(%)","label"])
|
| 271 |
+
return empty, None, go.Figure(), go.Figure(), go.Figure(), pd.DataFrame(), "ไม่พบรีวิวที่เป็นข้อความ"
|
| 272 |
+
|
| 273 |
+
results = _predict_batch(used_df[rev_col].astype(str).tolist(), model_name)
|
| 274 |
out = pd.DataFrame(results, columns=["review","negative(%)","positive(%)","label"])
|
| 275 |
|
| 276 |
+
if shop_col and shop_col in used_df.columns:
|
| 277 |
+
out.insert(0, "shop", used_df[shop_col].astype(str).fillna(""))
|
| 278 |
|
| 279 |
# ไฟล์ผลลัพธ์สำหรับดาวน์โหลด
|
| 280 |
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
|
|
|
|
| 285 |
# กราฟ/ตารางต่อร้าน (ถ้ามี shop)
|
| 286 |
fig_shop, tbl_shop = _shop_summary(out)
|
| 287 |
|
| 288 |
+
# แนบข้อความบอกคอลัมน์ที่ใช้ + จำนวนแถวที่ถูกข้าม
|
| 289 |
+
info_md = (
|
| 290 |
+
f"{info_md} \n"
|
| 291 |
+
f"ใช้คอลัมน์รีวิว: {rev_col}"
|
| 292 |
+
+ (f" | คอลัมน์ร้าน: {shop_col}" if ("shop" in out.columns) else " | ไม่มีคอลัมน์ร้าน")
|
| 293 |
+
+ f" \n- Skipped (empty/non-text): {skipped}"
|
| 294 |
+
)
|
| 295 |
|
| 296 |
return out, tmp.name, fig_bar, fig_pie, fig_shop, tbl_shop, info_md
|
| 297 |
except Exception as e:
|