RishiRP committed on
Commit 60a93e1 · verified · 1 Parent(s): 9789731

Update app.py

Files changed (1):
  1. app.py +232 -176

app.py CHANGED
@@ -1,6 +1,8 @@
  # app.py
- # From Talk to Task — Multilingual (EN/FR/DE/IT)
- # Focus: ACCURACY evaluation against UBS Ground Truth + rich diagnostics + downloadable artifacts

  import os
  import io
@@ -18,7 +20,6 @@ import gradio as gr

  DEFAULT_REPO = "swiss-ai/Apertus-8B-Instruct-2509"

- # Default label set (can be overridden by uploading a Rules JSON with {"labels":[...]}).
  DEFAULT_LABEL_SET = [
      "plan_contact",
      "schedule_meeting",
@@ -30,29 +31,58 @@ DEFAULT_LABEL_SET = [
      "update_kyc_total_assets",
  ]
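For reference, a minimal sketch of the Rules JSON that can override this default label set. The filename and the two labels below are hypothetical; only the `{"labels": [...]}` shape is what `read_rules_labels` expects:

```python
import json

# Hypothetical override: restrict the allowed label set to two labels.
rules = {"labels": ["plan_contact", "schedule_meeting"]}

with open("rules.json", "w", encoding="utf-8") as f:
    json.dump(rules, f, ensure_ascii=False, indent=2)
```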

- SYSTEM_INSTRUCTIONS = (
-     "You are a task extraction assistant.\n"
-     "Input transcript language can be English, French, German, or Italian.\n"
-     "Output valid JSON ONLY (no prose) with a single field:\n"
      '"labels": a list of strings chosen ONLY from the allowed label set.\n'
-     "Do not invent other fields. Do not translate labels. Return JSON only."
  )

  CONTEXT_GUIDE = (
-     "- plan_contact: contact without firm date/time\n"
-     "- schedule_meeting: explicit date/time/modality confirmed\n"
      "- update_contact_info_non_postal: email/phone updates\n"
      "- update_contact_info_postal_address: mailing address updates\n"
      "- update_kyc_*: KYC updates (activity, purpose, origin of assets, total assets)\n"
  )

  # --------------------- WRITABLE HF CACHE -----------------------------

  HOME = Path(os.environ.get("HOME", "/home/user"))
  CACHE_DIR = HOME / ".cache" / "huggingface"
  CACHE_DIR.mkdir(parents=True, exist_ok=True)
  os.environ.setdefault("HF_HOME", str(CACHE_DIR))
- os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # faster downloads when supported

  HF_TOKEN = (os.environ.get("HF_TOKEN") or "").strip() or None
@@ -63,21 +93,14 @@ try:
      from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
  except Exception as e:
      raise RuntimeError(
-         "Missing deps. In requirements.txt include: transformers>=4.56.0, torch, accelerate, huggingface_hub, bitsandbytes, gradio"
      ) from e

  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
  GPU_NAME = torch.cuda.get_device_name(0) if DEVICE == "cuda" else "cpu"
- # Force fp16 on CUDA (T4 doesnt support bf16) for stable perf
  DTYPE_FALLBACK = torch.float16 if DEVICE == "cuda" else torch.float32

- # Optional ZeroGPU presence
- try:
-     import spaces  # noqa: F401
-     ON_ZERO_GPU = True
- except Exception:
-     ON_ZERO_GPU = False
-
  # -------------------------- HELPERS ---------------------------------

  RE_DISCLAIMER = re.compile(r"^\s*disclaimer\s*:", re.IGNORECASE)
@@ -115,20 +138,57 @@ def read_rules_labels(file_obj: Optional[gr.File]) -> Optional[List[str]]:
      except Exception:
          return None

- def build_prompt(system: str, context: str, transcript: str) -> str:
      return (
          f"### System\n{system}\n\n"
-         f"### Context\n{context}\n\n"
          f"### Transcript\n{transcript}\n\n"
-         "### Output\nReturn JSON only."
      )

  def prf1_accuracy(pred: List[str], gold: List[str]) -> Tuple[float, float, float, float, Dict[str, int]]:
-     """Micro P/R/F1 + Jaccard-like accuracy (intersection/union)."""
      pset, gset = set(pred), set(gold)
-     tp = len(pset & gset)
-     fp = len(pset - gset)
-     fn = len(gset - pset)
      prec = tp / (tp + fp) if (tp + fp) else 0.0
      rec = tp / (tp + fn) if (tp + fn) else 0.0
      f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0
@@ -137,7 +197,6 @@ def prf1_accuracy(pred: List[str], gold: List[str]) -> Tuple[float, float, float
      return prec, rec, f1, acc, {"tp": tp, "fp": fp, "fn": fn, "pred_total": len(pset), "gold_total": len(gset)}

  def per_label_counts(pred: List[str], gold: List[str], all_labels: List[str]) -> Dict[str, Dict[str, int]]:
-     """TP/FP/FN per label."""
      pset, gset = set(pred), set(gold)
      out = {}
      for lab in all_labels:
@@ -148,48 +207,16 @@ def per_label_counts(pred: List[str], gold: List[str], all_labels: List[str]) ->
      return out

  def hamming_loss(pred: List[str], gold: List[str], all_labels: List[str]) -> float:
-     """Hamming loss over the label universe."""
      pset, gset = set(pred), set(gold)
      wrong = 0
      for lab in all_labels:
          in_p, in_g = (lab in pset), (lab in gset)
-         if in_p != in_g:
-             wrong += 1
      return wrong / max(1, len(all_labels))
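For concreteness, a quick worked example of the metric definitions above (micro P/R/F1, the Jaccard-style "accuracy" from the old docstring, and Hamming loss) on toy label sets; pure Python, no app imports needed:

```python
# Toy prediction vs. ground truth over the 8-label universe above.
pred = {"plan_contact", "schedule_meeting"}
gold = {"schedule_meeting", "update_kyc_total_assets"}

tp = len(pred & gold)   # 1 (schedule_meeting)
fp = len(pred - gold)   # 1 (plan_contact)
fn = len(gold - pred)   # 1 (update_kyc_total_assets)

precision = tp / (tp + fp)                           # 0.5
recall = tp / (tp + fn)                              # 0.5
f1 = 2 * precision * recall / (precision + recall)   # 0.5
jaccard = len(pred & gold) / len(pred | gold)        # 1/3 ≈ 0.333

n_labels = 8                    # size of DEFAULT_LABEL_SET
hamming = (fp + fn) / n_labels  # 2/8 = 0.25: labels where pred and gold disagree
print(precision, recall, f1, jaccard, hamming)
```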

- def read_single_ground_truth(file_obj: Optional[gr.File]) -> Optional[List[str]]:
-     if not file_obj:
-         return None
-     try:
-         data = json.loads(Path(file_obj.name).read_text(encoding="utf-8"))
-         labels = data.get("labels", [])
-         return [lab for lab in labels if isinstance(lab, str)]
-     except Exception:
-         return None
-
- def read_batch_ground_truth_zip(zip_file: Optional[gr.File]) -> Dict[str, List[str]]:
-     out: Dict[str, List[str]] = {}
-     if not zip_file:
-         return out
-     try:
-         with zipfile.ZipFile(zip_file.name) as z:
-             for name in z.namelist():
-                 if not name.lower().endswith(".json"):
-                     continue
-                 try:
-                     data = json.loads(z.read(name).decode("utf-8", errors="replace"))
-                     labs = [lab for lab in data.get("labels", []) if isinstance(lab, str)]
-                     out[Path(name).with_suffix("").name] = labs
-                 except Exception:
-                     pass
-     except Exception:
-         pass
-     return out
-
  def write_csv(path: Path, rows: List[List[str]]):
      with path.open("w", newline="", encoding="utf-8") as f:
-         w = csv.writer(f)
-         w.writerows(rows)

  # -------------------------- MODEL -----------------------------------
@@ -234,10 +261,10 @@ class HFModel:
          self.model = self.model.to(DEVICE)

      @torch.inference_mode()
-     def generate_json(self, prompt: str, max_new_tokens=32) -> Tuple[str, Dict[str, int]]:
          """
-         Deterministic generation, returns (json_text, token_stats)
-         token_stats: dict with prompt_tokens, output_tokens, total_tokens
          """
          tok = self.tokenizer
          mdl = self.model
@@ -246,15 +273,17 @@ class HFModel:
          templated = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
          inputs = tok([templated], return_tensors="pt", add_special_tokens=False).to(mdl.device)

-         out = mdl.generate(
-             **inputs,
              max_new_tokens=max_new_tokens,
-             do_sample=False,  # deterministic for classification
-             temperature=0.0,
-             top_p=1.0,
              pad_token_id=tok.eos_token_id,
              eos_token_id=tok.eos_token_id,
          )

          prompt_tokens = int(inputs.input_ids.shape[-1])
          output_tokens = int(out.shape[-1] - inputs.input_ids.shape[-1])
@@ -290,11 +319,21 @@ def preprocess_text(txt: str, add_header: bool, strip_smalltalk: bool) -> str:
      cleaned = "\n".join(lines[-32768:])
      return f"[EMAIL/MESSAGE SIGNAL]\n{cleaned}" if add_header else cleaned

  def run_single(
      custom_repo_id: str,
      rules_json: Optional[gr.File],
-     system: str,
-     context: str,
      transcript: str,
      soft_token_cap: int,
      preprocess: bool,
@@ -303,95 +342,118 @@ def run_single(
      load_in_4bit: bool,
      hourly_rate: float,
      gt_json_file: Optional[gr.File],
  ):
-     """
-     Returns: repo, revision, json_out, diagnostics_text, metrics_json
-     """
-     t0 = time.perf_counter()
      repo = (custom_repo_id or DEFAULT_REPO).strip()
      revision = "main"
-
-     # Resolve allowed labels
      allowed = read_rules_labels(rules_json) or DEFAULT_LABEL_SET

-     # Preprocess
      effective_len = len(transcript)
      if preprocess:
          transcript = preprocess_text(transcript, add_header, strip_smalltalk)
          effective_len = len(transcript)

-     # Soft cap (~4 chars / token rough heuristic)
      cap_info = ""
      if soft_token_cap and soft_token_cap > 0:
          approx_chars = int(soft_token_cap * 4)
          if len(transcript) > approx_chars:
              transcript = transcript[-approx_chars:]
-             cap_info = f"(soft cap ~{soft_token_cap}t applied)"

-     prompt = build_prompt(system or SYSTEM_INSTRUCTIONS, context or CONTEXT_GUIDE, transcript)

      model = get_model(repo, revision, load_in_4bit)
-     gen_t0 = time.perf_counter()
-     raw_json, tok_stats = model.generate_json(prompt, max_new_tokens=32)
-     gen_latency = time.perf_counter() - gen_t0
      pred_labels = safe_json_labels(raw_json, allowed)

      total_latency = time.perf_counter() - t0
      est_cost = (total_latency / 3600.0) * max(0.0, float(hourly_rate or 0.0))

      # Ground truth
      gt_labels = read_single_ground_truth(gt_json_file)
-     detailed = {}
      pr = rc = f1 = acc = 0.0
-     ham = 0.0
-     missing = []
-     extra = []
-     per_label = {}
-
      if gt_labels is not None:
          pr, rc, f1, acc, counts = prf1_accuracy(pred_labels, gt_labels)
          ham = hamming_loss(pred_labels, gt_labels, allowed)
          per_label = per_label_counts(pred_labels, gt_labels, allowed)
          missing = sorted(list(set(gt_labels) - set(pred_labels)))
-         extra = sorted(list(set(pred_labels) - set(gt_labels)))
-         detailed = {
-             "tp": counts["tp"], "fp": counts["fp"], "fn": counts["fn"],
-             "missing_labels": missing, "extra_labels": extra,
-             "per_label": per_label
-         }
-
-     diagnostics = "\n".join([
-         f"Repo: {repo} | Rev: {revision}",
-         f"Device: {DEVICE} ({GPU_NAME}) | DType: {DTYPE_FALLBACK} | 4bit: {bool(load_in_4bit)}",
-         f"Allowed labels: {allowed}",
-         f"Effective text length (chars): {effective_len} {cap_info}",
-         f"Tokens prompt: {tok_stats['prompt_tokens']} | output: {tok_stats['output_tokens']} | total: {tok_stats['total_tokens']}",
-         f"Latency generation: {gen_latency:.2f}s | total: {total_latency:.2f}s",
-         f"Cost estimate (@{hourly_rate:.4f}/hr): ${est_cost:.6f}",
-     ])
-
-     metrics = {
          "labels_pred": pred_labels,
          "ground_truth_labels": gt_labels,
-         "precision": round(pr, 4),
-         "recall": round(rc, 4),
-         "f1": round(f1, 4),
-         "exact_match": 1.0 if gt_labels is not None and set(pred_labels) == set(gt_labels) else 0.0 if gt_labels is not None else None,
-         "hamming_loss": round(ham, 4) if gt_labels is not None else None,
-         "jaccard": round(prf1_accuracy(pred_labels, gt_labels)[3], 4) if gt_labels is not None else None,
-         "detailed": detailed or None,
          "token_stats": tok_stats,
          "latency_seconds": round(total_latency, 3),
          "estimated_cost_usd": round(est_cost, 6),
      }

-     return repo, revision, json.dumps({"labels": pred_labels}, ensure_ascii=False), diagnostics, json.dumps(metrics, indent=2)

  def run_batch(
      custom_repo_id: str,
      rules_json: Optional[gr.File],
-     system: str,
-     context: str,
      transcripts_zip: Optional[gr.File],
      gt_zip: Optional[gr.File],
      soft_token_cap: int,
@@ -400,46 +462,36 @@ def run_batch(
      strip_smalltalk: bool,
      load_in_4bit: bool,
      hourly_rate: float,
  ):
-     """
-     Batch: transcripts ZIP of *.txt, optional ground-truth ZIP of *.json matching filenames.
-     Returns: repo, revision, csv_text, diagnostics, summary_json, downloads (3 files)
-     """
      repo = (custom_repo_id or DEFAULT_REPO).strip()
      revision = "main"
-
      if not transcripts_zip:
          return repo, revision, "filename,labels\n", "No transcript ZIP provided.", "{}", None, None, None

      allowed = read_rules_labels(rules_json) or DEFAULT_LABEL_SET
-
      try:
          z = zipfile.ZipFile(transcripts_zip.name)
          txt_names = [n for n in z.namelist() if n.lower().endswith(".txt")]
      except Exception as e:
          return repo, revision, "filename,labels\n", f"Bad transcript ZIP: {e}", "{}", None, None, None

-     gt_map = read_batch_ground_truth_zip(gt_zip)  # stem -> labels
      model = get_model(repo, revision, load_in_4bit)

      rows = [["filename","labels"]]
      per_sample_rows = [["filename","pred_labels","gold_labels","precision","recall","f1","exact_match","hamming_loss","missing","extra"]]
-
      totals = {"tp":0,"fp":0,"fn":0,"pred_total":0,"gold_total":0}
      label_global = {lab: {"tp":0,"fp":0,"fn":0} for lab in allowed}

-     total_prompt_tokens = 0
-     total_output_tokens = 0
-     total_secs = 0.0
-     n = 0
-     samples_with_gt = 0

      for name in txt_names:
          try:
              txt = z.read(name).decode("utf-8", errors="replace")
          except Exception:
-             rows.append([name, "[] # unreadable"])
-             continue

          if preprocess:
              txt = preprocess_text(txt, add_header, strip_smalltalk)
@@ -449,37 +501,39 @@ def run_batch(
              if len(txt) > approx_chars:
                  txt = txt[-approx_chars:]

-         prompt = build_prompt(system or SYSTEM_INSTRUCTIONS, context or CONTEXT_GUIDE, txt)

          t0 = time.perf_counter()
-         raw_json, tok_stats = model.generate_json(prompt, max_new_tokens=32)
          total_secs += (time.perf_counter() - t0)
          total_prompt_tokens += tok_stats["prompt_tokens"]
          total_output_tokens += tok_stats["output_tokens"]
          n += 1

-         pred = safe_json_labels(raw_json, allowed)
          rows.append([name, json.dumps(pred, ensure_ascii=False)])

          stem = Path(name).with_suffix("").name
          gold = gt_map.get(stem)
-
          if gold is not None:
-             samples_with_gt += 1
              pr, rc, f1, acc, counts = prf1_accuracy(pred, gold)
              ham = hamming_loss(pred, gold, allowed)
              missing = sorted(list(set(gold) - set(pred)))
              extra = sorted(list(set(pred) - set(gold)))
-
-             # aggregate
              for k in ["tp","fp","fn","pred_total","gold_total"]:
                  totals[k] += counts[k]
-             # per-label global
              pl = per_label_counts(pred, gold, allowed)
              for lab, c in pl.items():
                  for k in ["tp","fp","fn"]:
                      label_global[lab][k] += c[k]
-
              per_sample_rows.append([
                  name,
                  json.dumps(pred, ensure_ascii=False),
@@ -491,16 +545,12 @@ def run_batch(
                  json.dumps(extra, ensure_ascii=False),
              ])

-     # macro summary (micro over totals)
      tp, fp, fn = totals["tp"], totals["fp"], totals["fn"]
      prec = tp / (tp + fp) if (tp + fp) else 0.0
      rec = tp / (tp + fn) if (tp + fn) else 0.0
      f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0

-     hourly_rate = max(0.0, float(hourly_rate or 0.0))
-     est_cost = (total_secs / 3600.0) * hourly_rate
-
-     # coverage: did we ever predict each label at least once?
      coverage = {lab: 0 for lab in allowed}
      for r in rows[1:]:
          try:
@@ -513,7 +563,7 @@ def run_batch(

      summary = {
          "files_processed": n,
-         "files_with_ground_truth": samples_with_gt,
          "labels_allowed": allowed,
          "precision_micro": round(prec, 4),
          "recall_micro": round(rec, 4),
@@ -532,21 +582,24 @@ def run_batch(
          "estimated_cost_usd": round(est_cost, 6),
      }

-     diagnostics = (
-         f"Repo: {repo} | Rev: {revision} | Device: {DEVICE} ({GPU_NAME}) | "
-         f"DType: {DTYPE_FALLBACK} | 4bit: {bool(load_in_4bit)}\n"
-         f"Files processed: {n} (with GT: {samples_with_gt})\n"
-         f"Tokens prompt_total: {total_prompt_tokens} | output_total: {total_output_tokens}\n"
-         f"Latency total: {summary['latency_seconds_total']}s | avg: {summary['avg_latency_seconds']}s\n"
-         f"Cost estimate (@{hourly_rate:.4f}/hr): ${summary['estimated_cost_usd']}\n"
-         f"Allowed labels: {allowed}"
-     )

      # Write artifacts
      tmp_dir = Path("/tmp")
      pred_csv = tmp_dir / "predictions.csv"
      per_sample_csv = tmp_dir / "per_sample_metrics.csv"
      summary_json = tmp_dir / "summary_metrics.json"
      write_csv(pred_csv, rows)
      write_csv(per_sample_csv, per_sample_rows)
      summary_json.write_text(json.dumps(summary, indent=2), encoding="utf-8")
@@ -554,7 +607,7 @@ def run_batch(
      return (
          repo, revision,
          "\n".join([",".join(r) for r in rows]),
-         diagnostics,
          json.dumps(summary, indent=2),
          str(pred_csv), str(per_sample_csv), str(summary_json)
      )
@@ -566,33 +619,33 @@ with gr.Blocks(title="From Talk to Task — Accuracy & Diagnostics") as demo:
      f"""
  # From Talk to Task — Accuracy & Diagnostics (EN/FR/DE/IT)

- **Default model:** `{DEFAULT_REPO}` (recommended with GPU + 4-bit).
- Upload **UBS Ground Truth** to compute **precision / recall / F1 / accuracy** and detailed error analysis.
- Optionally upload a **Rules JSON** (`{{"labels":[...]}}`) to override the default allowed label set.

- **Output schema (model):** `{{"labels": [...]}}`
  """
      )

      with gr.Row():
          custom_repo = gr.Textbox(
-             label="Model repo (leave empty to use default)",
              placeholder="e.g. swiss-ai/Apertus-8B-Instruct-2509"
          )
          load_4bit = gr.Checkbox(value=True, label="Load in 4-bit (GPU only)")

      rules_file = gr.File(label="Rules JSON (optional) — overrides allowed labels", file_types=[".json"])

-     system = gr.Textbox(label="Instructions (System)", value=SYSTEM_INSTRUCTIONS, lines=6)
      context = gr.Textbox(label="Context (User prefix)", value=CONTEXT_GUIDE, lines=6)

      with gr.Row():
          soft_cap = gr.Slider(512, 32768, value=2048, step=1, label="Soft token cap (approx)")
          preprocess = gr.Checkbox(value=True, label="Enable preprocessing")
-     with gr.Row():
          add_header = gr.Checkbox(value=True, label="Add cues header")
          strip_smalltalk = gr.Checkbox(value=False, label="Strip smalltalk")
-         hourly_rate = gr.Number(value=0.40, precision=4, label="Hourly hardware price (USD) for cost estimate")

      with gr.Tabs():
          with gr.Tab("Single Transcript"):
@@ -603,8 +656,11 @@ with gr.Blocks(title="From Talk to Task — Accuracy & Diagnostics") as demo:
              repo_used = gr.Textbox(label="Repo used", interactive=False)
              rev_used = gr.Textbox(label="Revision", interactive=False)
              json_out = gr.Code(label="Predicted JSON", language="json")
-             diag_out = gr.Textbox(label="Diagnostics", lines=12)
-             metrics_out = gr.Code(label="Metrics (PR/RC/F1/Acc, tokens, latency, errors)", language="json")

              def _single(*args):
                  return run_single(*args)
@@ -614,9 +670,9 @@ with gr.Blocks(title="From Talk to Task — Accuracy & Diagnostics") as demo:
                  inputs=[
                      custom_repo, rules_file, system, context, transcript,
                      soft_cap, preprocess, add_header, strip_smalltalk,
-                     load_4bit, hourly_rate, gt_single
                  ],
-                 outputs=[repo_used, rev_used, json_out, diag_out, metrics_out],
              )

          with gr.Tab("Batch (ZIP)"):
@@ -627,10 +683,10 @@ with gr.Blocks(title="From Talk to Task — Accuracy & Diagnostics") as demo:
              repo_used_b = gr.Textbox(label="Repo used", interactive=False)
              rev_used_b = gr.Textbox(label="Revision", interactive=False)
              csv_out = gr.Textbox(label="Predictions CSV (filename,labels)", lines=12)
-             diag_out_b = gr.Textbox(label="Diagnostics", lines=12)
-             metrics_out_b = gr.Code(label="Summary Metrics (micro PR/RC/F1, per-label counts, tokens, latency)", language="json")

-             # Downloadables
              preds_file = gr.File(label="Download predictions.csv")
              per_sample_file = gr.File(label="Download per_sample_metrics.csv")
              summary_file = gr.File(label="Download summary_metrics.json")
@@ -643,9 +699,9 @@ with gr.Blocks(title="From Talk to Task — Accuracy & Diagnostics") as demo:
                  inputs=[
                      custom_repo, rules_file, system, context, zip_in, gt_zip,
                      soft_cap, preprocess, add_header, strip_smalltalk,
-                     load_4bit, hourly_rate
                  ],
-                 outputs=[repo_used_b, rev_used_b, csv_out, diag_out_b, metrics_out_b, preds_file, per_sample_file, summary_file],
              )

      gr.Markdown(
 
  # app.py
+ # From Talk to Task — Accuracy & Diagnostics with user-friendly metric cards
+ # Model: swiss-ai/Apertus-8B-Instruct-2509
+ # Multilingual (EN/FR/DE/IT), writable cache, few-shot prompting, smart fallback,
+ # per-sample & batch metrics, and downloadable artifacts.

  import os
  import io
 

  DEFAULT_REPO = "swiss-ai/Apertus-8B-Instruct-2509"

  DEFAULT_LABEL_SET = [
      "plan_contact",
      "schedule_meeting",
 
      "update_kyc_total_assets",
  ]

+ SYSTEM_INSTRUCTIONS_BASE = (
+     "You are a task extraction assistant. Input transcript language may be English, French, "
+     "German, or Italian. Return ONLY valid JSON with a single field:\n"
      '"labels": a list of strings chosen ONLY from the allowed label set.\n'
+     "Do NOT add other fields or prose. Do NOT translate labels. If multiple labels apply, return all.\n"
+     "If none apply, return an empty list."
  )

  CONTEXT_GUIDE = (
+     "- plan_contact: conversation without a firm date/time\n"
+     "- schedule_meeting: explicit date/time/modality is agreed\n"
      "- update_contact_info_non_postal: email/phone updates\n"
      "- update_contact_info_postal_address: mailing address updates\n"
      "- update_kyc_*: KYC updates (activity, purpose, origin of assets, total assets)\n"
  )

+ # Few-shot exemplars to improve recall/F1 across languages
+ FEW_SHOTS = [
+     # EN
+     {
+         "transcript": "Agent: Can we meet on Friday at 3pm on Teams?\nClient: Yes, Friday 3pm works.\nAgent: Great, I'll send an invite.",
+         "labels": ["schedule_meeting"]
+     },
+     # DE
+     {
+         "transcript": "Kunde: Meine Telefonnummer hat sich geändert: +41 44 000 00 00.\nBerater: Alles klar, ich aktualisiere Ihre Kontaktdaten.",
+         "labels": ["update_contact_info_non_postal"]
+     },
+     # FR
+     {
+         "transcript": "Client: Nous avons acheté un nouvel appartement, l'adresse postale est Avenue X 12, 1200 Genève.\nConseiller: Merci, je mets à jour l'adresse postale.",
+         "labels": ["update_contact_info_postal_address"]
+     },
+     # IT
+     {
+         "transcript": "Cliente: Vorrei chiarire lo scopo del rapporto: gestione patrimoniale a lungo termine.\nConsulente: Perfetto, aggiorno lo scopo KYC.",
+         "labels": ["update_kyc_purpose_of_businessrelation"]
+     },
+     # EN KYC totals
+     {
+         "transcript": "Agent: To confirm, your total assets are 8,000,000 CHF with 3,700,000 in real estate.\nClient: Yes, correct.",
+         "labels": ["update_kyc_total_assets"]
+     },
+ ]
+
  # --------------------- WRITABLE HF CACHE -----------------------------
80
 
81
  HOME = Path(os.environ.get("HOME", "/home/user"))
82
  CACHE_DIR = HOME / ".cache" / "huggingface"
83
  CACHE_DIR.mkdir(parents=True, exist_ok=True)
84
  os.environ.setdefault("HF_HOME", str(CACHE_DIR))
85
+ os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
86
 
87
  HF_TOKEN = (os.environ.get("HF_TOKEN") or "").strip() or None
88
 
 
93
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
94
  except Exception as e:
95
  raise RuntimeError(
96
+ "Missing deps. requirements.txt must include: transformers>=4.56.0, torch, accelerate, huggingface_hub, bitsandbytes, gradio"
97
  ) from e
98
 
99
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
100
  GPU_NAME = torch.cuda.get_device_name(0) if DEVICE == "cuda" else "cpu"
101
+ # T4 doesn't support bf16 use fp16; CPU uses fp32
102
  DTYPE_FALLBACK = torch.float16 if DEVICE == "cuda" else torch.float32
103
 
 
 
 
 
 
 
 
104
  # -------------------------- HELPERS ---------------------------------
105
 
106
  RE_DISCLAIMER = re.compile(r"^\s*disclaimer\s*:", re.IGNORECASE)
 
      except Exception:
          return None

+ def read_single_ground_truth(file_obj: Optional[gr.File]) -> Optional[List[str]]:
+     if not file_obj:
+         return None
+     try:
+         data = json.loads(Path(file_obj.name).read_text(encoding="utf-8"))
+         labels = data.get("labels", [])
+         return [lab for lab in labels if isinstance(lab, str)]
+     except Exception:
+         return None
+
+ def read_batch_ground_truth_zip(zip_file: Optional[gr.File]) -> Dict[str, List[str]]:
+     out: Dict[str, List[str]] = {}
+     if not zip_file:
+         return out
+     try:
+         with zipfile.ZipFile(zip_file.name) as z:
+             for name in z.namelist():
+                 if not name.lower().endswith(".json"):
+                     continue
+                 try:
+                     data = json.loads(z.read(name).decode("utf-8", errors="replace"))
+                     labs = [lab for lab in data.get("labels", []) if isinstance(lab, str)]
+                     out[Path(name).with_suffix("").name] = labs
+                 except Exception:
+                     pass
+     except Exception:
+         pass
+     return out
+
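For orientation, the batch inputs pair by filename stem (`Path(name).with_suffix("").name` above, matched by `gt_map.get(stem)` in `run_batch`); a sketch with hypothetical filenames:

```
transcripts.zip              ground_truth.zip
├── call_001.txt             ├── call_001.json   -> {"labels": ["schedule_meeting"]}
└── call_002.txt             └── call_002.json   -> {"labels": []}
```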
+ def build_fewshot_block(allowed: List[str]) -> str:
+     shots = []
+     for ex in FEW_SHOTS:
+         shots.append(
+             f"- Transcript:\n{ex['transcript']}\n- Correct labels (choose subset from {allowed}): {ex['labels']}\n"
+         )
+     return "\n".join(shots)
+
+ def build_prompt(system: str, context: str, transcript: str, allowed: List[str], use_fewshot: bool) -> str:
+     fewshot_section = f"\n### Examples\n{build_fewshot_block(allowed)}\n" if use_fewshot else ""
      return (
          f"### System\n{system}\n\n"
+         f"### Allowed label set\n{allowed}\n\n"
+         f"### Context\n{context}\n"
+         f"{fewshot_section}\n"
          f"### Transcript\n{transcript}\n\n"
+         "### Output\nReturn JSON only: {\"labels\": [...]}"
      )
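Read top to bottom, the assembled prompt has this shape (angle-bracket placeholders stand in for the actual strings; the section order comes straight from `build_prompt`):

```
### System
<SYSTEM_INSTRUCTIONS_BASE or user override>

### Allowed label set
['plan_contact', 'schedule_meeting', ...]

### Context
<CONTEXT_GUIDE or user override>

### Examples        (only when use_fewshot is True)
- Transcript: ...
- Correct labels (choose subset from [...]): [...]

### Transcript
<preprocessed, soft-capped input text>

### Output
Return JSON only: {"labels": [...]}
```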

  def prf1_accuracy(pred: List[str], gold: List[str]) -> Tuple[float, float, float, float, Dict[str, int]]:
      pset, gset = set(pred), set(gold)
+     tp = len(pset & gset); fp = len(pset - gset); fn = len(gset - pset)
      prec = tp / (tp + fp) if (tp + fp) else 0.0
      rec = tp / (tp + fn) if (tp + fn) else 0.0
      f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0

      return prec, rec, f1, acc, {"tp": tp, "fp": fp, "fn": fn, "pred_total": len(pset), "gold_total": len(gset)}

  def per_label_counts(pred: List[str], gold: List[str], all_labels: List[str]) -> Dict[str, Dict[str, int]]:
      pset, gset = set(pred), set(gold)
      out = {}
      for lab in all_labels:

      return out

  def hamming_loss(pred: List[str], gold: List[str], all_labels: List[str]) -> float:
      pset, gset = set(pred), set(gold)
      wrong = 0
      for lab in all_labels:
          in_p, in_g = (lab in pset), (lab in gset)
+         wrong += int(in_p != in_g)
      return wrong / max(1, len(all_labels))

  def write_csv(path: Path, rows: List[List[str]]):
      with path.open("w", newline="", encoding="utf-8") as f:
+         w = csv.writer(f); w.writerows(rows)

  # -------------------------- MODEL -----------------------------------
 
          self.model = self.model.to(DEVICE)

      @torch.inference_mode()
+     def generate_json(self, prompt: str, max_new_tokens=64, allow_sampling=False) -> Tuple[str, Dict[str, int]]:
          """
+         Deterministic by default. If allow_sampling=True (fallback), we use mild temperature.
+         Returns (json_text, token_stats)
          """
          tok = self.tokenizer
          mdl = self.model

          templated = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
          inputs = tok([templated], return_tensors="pt", add_special_tokens=False).to(mdl.device)

+         kwargs = dict(
              max_new_tokens=max_new_tokens,
              pad_token_id=tok.eos_token_id,
              eos_token_id=tok.eos_token_id,
          )
+         if allow_sampling:
+             kwargs.update(dict(do_sample=True, temperature=0.25, top_p=0.9))
+         else:
+             kwargs.update(dict(do_sample=False, temperature=0.0, top_p=1.0))
+
+         out = mdl.generate(**inputs, **kwargs)

          prompt_tokens = int(inputs.input_ids.shape[-1])
          output_tokens = int(out.shape[-1] - inputs.input_ids.shape[-1])
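The decode policy this enables is "deterministic first, sample only on an empty parse". A minimal sketch of that retry loop, assuming a `model` exposing the `generate_json` above and the app's `safe_json_labels` helper:

```python
def predict_labels(model, prompt: str, allowed: list) -> tuple:
    """Greedy pass first; one mildly-sampled retry if nothing parses."""
    raw, stats = model.generate_json(prompt, max_new_tokens=64, allow_sampling=False)
    labels = safe_json_labels(raw, allowed)
    fallback_used = False
    if not labels:  # empty or unparseable JSON -> retry with temperature 0.25
        raw2, stats2 = model.generate_json(prompt, max_new_tokens=64, allow_sampling=True)
        labels2 = safe_json_labels(raw2, allowed)
        if labels2:
            labels, stats, fallback_used = labels2, stats2, True
    return labels, stats, fallback_used
```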
 
      cleaned = "\n".join(lines[-32768:])
      return f"[EMAIL/MESSAGE SIGNAL]\n{cleaned}" if add_header else cleaned

+ def card_markdown(title: str, value: str, hint: str = "") -> str:
+     hint_md = f"<div style='font-size:12px;opacity:0.8'>{hint}</div>" if hint else ""
+     return f"""
+ <div style="border:1px solid #3a3a3a;border-radius:10px;padding:10px;margin:6px">
+   <div style="font-weight:600">{title}</div>
+   <div style="font-size:20px;margin-top:4px">{value}</div>
+   {hint_md}
+ </div>
+ """
+
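For example, `card_markdown("Precision", "0.875", "Correct positive labels / All predicted positive labels")` (the value is an illustrative placeholder) returns an HTML fragment like:

```
<div style="border:1px solid #3a3a3a;border-radius:10px;padding:10px;margin:6px">
  <div style="font-weight:600">Precision</div>
  <div style="font-size:20px;margin-top:4px">0.875</div>
  <div style='font-size:12px;opacity:0.8'>Correct positive labels / All predicted positive labels</div>
</div>
```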
  def run_single(
      custom_repo_id: str,
      rules_json: Optional[gr.File],
+     system_instructions: str,
+     context_text: str,
      transcript: str,
      soft_token_cap: int,
      preprocess: bool,

      load_in_4bit: bool,
      hourly_rate: float,
      gt_json_file: Optional[gr.File],
+     use_fewshot: bool,
  ):
+     """Returns: repo, revision, predicted_json, metrics_cards_md, diag_cards_md, raw_metrics_json"""
+
      repo = (custom_repo_id or DEFAULT_REPO).strip()
      revision = "main"
      allowed = read_rules_labels(rules_json) or DEFAULT_LABEL_SET

+     # Preprocess + cap
      effective_len = len(transcript)
      if preprocess:
          transcript = preprocess_text(transcript, add_header, strip_smalltalk)
          effective_len = len(transcript)

      cap_info = ""
      if soft_token_cap and soft_token_cap > 0:
          approx_chars = int(soft_token_cap * 4)
          if len(transcript) > approx_chars:
              transcript = transcript[-approx_chars:]
+             cap_info = f"(soft cap ~{soft_token_cap}t)"

+     # Build prompt (few-shot helps recall)
+     system = system_instructions or SYSTEM_INSTRUCTIONS_BASE
+     prompt = build_prompt(system, context_text or CONTEXT_GUIDE, transcript, allowed, use_fewshot)

      model = get_model(repo, revision, load_in_4bit)
+
+     # First pass: deterministic
+     t0 = time.perf_counter()
+     raw_json, tok_stats = model.generate_json(prompt, max_new_tokens=64, allow_sampling=False)
      pred_labels = safe_json_labels(raw_json, allowed)

+     # Fallback: if empty, try mild sampling once
+     fallback_used = False
+     if not pred_labels:
+         raw_json2, tok_stats2 = model.generate_json(prompt, max_new_tokens=64, allow_sampling=True)
+         pred_labels2 = safe_json_labels(raw_json2, allowed)
+         if pred_labels2:
+             pred_labels = pred_labels2
+             tok_stats = tok_stats2
+             fallback_used = True
+
      total_latency = time.perf_counter() - t0
      est_cost = (total_latency / 3600.0) * max(0.0, float(hourly_rate or 0.0))

      # Ground truth
      gt_labels = read_single_ground_truth(gt_json_file)
      pr = rc = f1 = acc = 0.0
+     ham = None
+     missing = []; extra = []; per_label = {}
      if gt_labels is not None:
          pr, rc, f1, acc, counts = prf1_accuracy(pred_labels, gt_labels)
          ham = hamming_loss(pred_labels, gt_labels, allowed)
          per_label = per_label_counts(pred_labels, gt_labels, allowed)
          missing = sorted(list(set(gt_labels) - set(pred_labels)))
+         extra = sorted(list(set(pred_labels) - set(gt_labels)))
+
+     # ------- User-friendly metric cards -------
+     metric_cards = ""
+     metric_cards += card_markdown("Precision", f"{pr:.3f}" if gt_labels is not None else "—", "Correct positive labels / All predicted positive labels")
+     metric_cards += card_markdown("Recall", f"{rc:.3f}" if gt_labels is not None else "—", "Correct positive labels / All actual positive labels")
+     metric_cards += card_markdown("F1 score", f"{f1:.3f}" if gt_labels is not None else "—", "Harmonic mean of Precision and Recall")
+     metric_cards += card_markdown("Exact match", f"{(1.0 if set(pred_labels) == set(gt_labels) else 0.0) if gt_labels is not None else '—'}", "1.0 if predicted labels exactly equal ground truth")
+     metric_cards += card_markdown("Hamming loss", f"{ham:.3f}" if ham is not None else "—", "Fraction of labels where prediction disagrees with truth (lower is better)")
+     metric_cards += card_markdown("Missing labels", json.dumps(missing, ensure_ascii=False) if gt_labels is not None else "—", "Expected but not predicted")
+     metric_cards += card_markdown("Extra labels", json.dumps(extra, ensure_ascii=False) if gt_labels is not None else "—", "Predicted but not expected")
+
+     # ------- Diagnostics cards -------
+     diag_cards = ""
+     diag_cards += card_markdown("Model / Rev", f"{repo} / {revision}")
+     diag_cards += card_markdown("Device", f"{DEVICE} ({GPU_NAME})")
+     diag_cards += card_markdown("Precision dtype", f"{DTYPE_FALLBACK}")
+     diag_cards += card_markdown("4-bit", f"{bool(load_in_4bit)}")
+     diag_cards += card_markdown("Allowed labels", json.dumps(allowed, ensure_ascii=False))
+     diag_cards += card_markdown("Effective text length", f"{effective_len} chars {cap_info}")
+     diag_cards += card_markdown("Tokens", f"prompt={tok_stats['prompt_tokens']}, output={tok_stats['output_tokens']}, total={tok_stats['total_tokens']}", "Token counts help explain latency and cost")
+     diag_cards += card_markdown("Latency", f"{total_latency:.2f} s", "End-to-end time (first run includes caching)")
+     diag_cards += card_markdown("Cost (est.)", f"${est_cost:.6f} @ {hourly_rate:.4f}/hr")
+     if fallback_used:
+         diag_cards += card_markdown("Fallback used", "Yes", "Empty prediction in first pass; retried with mild sampling to improve recall")
+     else:
+         diag_cards += card_markdown("Fallback used", "No")
+
+     raw_metrics = {
          "labels_pred": pred_labels,
          "ground_truth_labels": gt_labels,
+         "precision": round(pr, 4) if gt_labels is not None else None,
+         "recall": round(rc, 4) if gt_labels is not None else None,
+         "f1": round(f1, 4) if gt_labels is not None else None,
+         "exact_match": (1.0 if set(pred_labels) == set(gt_labels) else 0.0) if gt_labels is not None else None,
+         "hamming_loss": round(ham, 4) if ham is not None else None,
+         "missing": missing if gt_labels is not None else None,
+         "extra": extra if gt_labels is not None else None,
+         "per_label": per_label if gt_labels is not None else None,
          "token_stats": tok_stats,
          "latency_seconds": round(total_latency, 3),
          "estimated_cost_usd": round(est_cost, 6),
+         "fallback_used": fallback_used,
      }

+     return (
+         repo, revision,
+         json.dumps({"labels": pred_labels}, ensure_ascii=False),
+         metric_cards, diag_cards,
+         json.dumps(raw_metrics, indent=2)
+     )
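`safe_json_labels` itself is defined outside this diff. A plausible sketch of what such a helper does (parse the first JSON object in the completion, keep only strings from the allowed set); the body below is an assumption for illustration, not the committed implementation:

```python
import json, re

def safe_json_labels_sketch(raw: str, allowed: list) -> list:
    """ASSUMPTION: illustrative stand-in for the app's safe_json_labels."""
    m = re.search(r"\{.*\}", raw, re.DOTALL)  # grab the first {...} blob
    if not m:
        return []
    try:
        labels = json.loads(m.group(0)).get("labels", [])
    except Exception:
        return []
    # keep only known labels, preserving order and dropping duplicates
    seen, out = set(), []
    for lab in labels:
        if isinstance(lab, str) and lab in allowed and lab not in seen:
            seen.add(lab); out.append(lab)
    return out
```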

  def run_batch(
      custom_repo_id: str,
      rules_json: Optional[gr.File],
+     system_instructions: str,
+     context_text: str,
      transcripts_zip: Optional[gr.File],
      gt_zip: Optional[gr.File],
      soft_token_cap: int,

      strip_smalltalk: bool,
      load_in_4bit: bool,
      hourly_rate: float,
+     use_fewshot: bool,
  ):
      repo = (custom_repo_id or DEFAULT_REPO).strip()
      revision = "main"
      if not transcripts_zip:
          return repo, revision, "filename,labels\n", "No transcript ZIP provided.", "{}", None, None, None

      allowed = read_rules_labels(rules_json) or DEFAULT_LABEL_SET
      try:
          z = zipfile.ZipFile(transcripts_zip.name)
          txt_names = [n for n in z.namelist() if n.lower().endswith(".txt")]
      except Exception as e:
          return repo, revision, "filename,labels\n", f"Bad transcript ZIP: {e}", "{}", None, None, None

+     gt_map = read_batch_ground_truth_zip(gt_zip)
      model = get_model(repo, revision, load_in_4bit)

      rows = [["filename","labels"]]
      per_sample_rows = [["filename","pred_labels","gold_labels","precision","recall","f1","exact_match","hamming_loss","missing","extra"]]
      totals = {"tp":0,"fp":0,"fn":0,"pred_total":0,"gold_total":0}
      label_global = {lab: {"tp":0,"fp":0,"fn":0} for lab in allowed}
+     total_prompt_tokens = 0; total_output_tokens = 0; total_secs = 0.0; n = 0; with_gt = 0

+     system = system_instructions or SYSTEM_INSTRUCTIONS_BASE

      for name in txt_names:
          try:
              txt = z.read(name).decode("utf-8", errors="replace")
          except Exception:
+             rows.append([name, "[] # unreadable"]); continue

          if preprocess:
              txt = preprocess_text(txt, add_header, strip_smalltalk)

              if len(txt) > approx_chars:
                  txt = txt[-approx_chars:]

+         prompt = build_prompt(system, context_text or CONTEXT_GUIDE, txt, allowed, use_fewshot)

          t0 = time.perf_counter()
+         raw_json, tok_stats = model.generate_json(prompt, max_new_tokens=64, allow_sampling=False)
+         pred = safe_json_labels(raw_json, allowed)
+         if not pred:
+             raw_json2, tok_stats2 = model.generate_json(prompt, max_new_tokens=64, allow_sampling=True)
+             pred2 = safe_json_labels(raw_json2, allowed)
+             if pred2:
+                 pred = pred2
+                 tok_stats = tok_stats2
+
          total_secs += (time.perf_counter() - t0)
          total_prompt_tokens += tok_stats["prompt_tokens"]
          total_output_tokens += tok_stats["output_tokens"]
          n += 1

          rows.append([name, json.dumps(pred, ensure_ascii=False)])

          stem = Path(name).with_suffix("").name
          gold = gt_map.get(stem)
          if gold is not None:
+             with_gt += 1
              pr, rc, f1, acc, counts = prf1_accuracy(pred, gold)
              ham = hamming_loss(pred, gold, allowed)
              missing = sorted(list(set(gold) - set(pred)))
              extra = sorted(list(set(pred) - set(gold)))
              for k in ["tp","fp","fn","pred_total","gold_total"]:
                  totals[k] += counts[k]
              pl = per_label_counts(pred, gold, allowed)
              for lab, c in pl.items():
                  for k in ["tp","fp","fn"]:
                      label_global[lab][k] += c[k]
              per_sample_rows.append([
                  name,
                  json.dumps(pred, ensure_ascii=False),

                  json.dumps(extra, ensure_ascii=False),
              ])

      tp, fp, fn = totals["tp"], totals["fp"], totals["fn"]
      prec = tp / (tp + fp) if (tp + fp) else 0.0
      rec = tp / (tp + fn) if (tp + fn) else 0.0
      f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0
+     est_cost = (total_secs / 3600.0) * max(0.0, float(hourly_rate or 0.0))

      coverage = {lab: 0 for lab in allowed}
      for r in rows[1:]:
          try:

      summary = {
          "files_processed": n,
+         "files_with_ground_truth": with_gt,
          "labels_allowed": allowed,
          "precision_micro": round(prec, 4),
          "recall_micro": round(rec, 4),

          "estimated_cost_usd": round(est_cost, 6),
      }

+     diag_cards = ""
+     diag_cards += card_markdown("Model / Rev", f"{repo} / {revision}")
+     diag_cards += card_markdown("Device", f"{DEVICE} ({GPU_NAME})")
+     diag_cards += card_markdown("Precision dtype", f"{DTYPE_FALLBACK}")
+     diag_cards += card_markdown("4-bit", f"{bool(load_in_4bit)}")
+     diag_cards += card_markdown("Files processed", f"{n} (with GT: {with_gt})")
+     diag_cards += card_markdown("Tokens (totals)", f"prompt={total_prompt_tokens}, output={total_output_tokens}")
+     diag_cards += card_markdown("Latency", f"total={summary['latency_seconds_total']} s, avg={summary['avg_latency_seconds']} s")
+     diag_cards += card_markdown("Cost (est.)", f"${summary['estimated_cost_usd']} @ {hourly_rate:.4f}/hr")
+     diag_cards += card_markdown("Allowed labels", json.dumps(allowed, ensure_ascii=False))

      # Write artifacts
      tmp_dir = Path("/tmp")
      pred_csv = tmp_dir / "predictions.csv"
      per_sample_csv = tmp_dir / "per_sample_metrics.csv"
      summary_json = tmp_dir / "summary_metrics.json"
+
+     # CSV/text outputs
      write_csv(pred_csv, rows)
      write_csv(per_sample_csv, per_sample_rows)
      summary_json.write_text(json.dumps(summary, indent=2), encoding="utf-8")

      return (
          repo, revision,
          "\n".join([",".join(r) for r in rows]),
+         diag_cards,
          json.dumps(summary, indent=2),
          str(pred_csv), str(per_sample_csv), str(summary_json)
      )
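For reference, `summary_metrics.json` contains at least the fields visible in this diff; the values below are illustrative placeholders (two files at the UI's default $0.40/hr rate), and the committed file has additional keys elided here:

```json
{
  "files_processed": 2,
  "files_with_ground_truth": 2,
  "labels_allowed": ["plan_contact", "schedule_meeting", "..."],
  "precision_micro": 0.75,
  "recall_micro": 0.6,
  "latency_seconds_total": 12.3,
  "avg_latency_seconds": 6.15,
  "estimated_cost_usd": 0.001367
}
```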
 
      f"""
  # From Talk to Task — Accuracy & Diagnostics (EN/FR/DE/IT)

+ **Default model:** `{DEFAULT_REPO}` (GPU + 4-bit recommended).
+ Upload **ground truth** to compute **Precision / Recall / F1 / Exact match / Hamming loss**.
+ You can also upload a **Rules JSON** (`{{"labels":[...]}}`) to override the allowed label set.

+ **Model output schema:** `{{"labels": [...]}}`
  """
      )

      with gr.Row():
          custom_repo = gr.Textbox(
+             label="Model repo (leave empty for default)",
              placeholder="e.g. swiss-ai/Apertus-8B-Instruct-2509"
          )
          load_4bit = gr.Checkbox(value=True, label="Load in 4-bit (GPU only)")
+         use_fewshot = gr.Checkbox(value=True, label="Use few-shot examples (better recall/F1)")

      rules_file = gr.File(label="Rules JSON (optional) — overrides allowed labels", file_types=[".json"])

+     system = gr.Textbox(label="Instructions (System)", value=SYSTEM_INSTRUCTIONS_BASE, lines=6)
      context = gr.Textbox(label="Context (User prefix)", value=CONTEXT_GUIDE, lines=6)

      with gr.Row():
          soft_cap = gr.Slider(512, 32768, value=2048, step=1, label="Soft token cap (approx)")
          preprocess = gr.Checkbox(value=True, label="Enable preprocessing")
          add_header = gr.Checkbox(value=True, label="Add cues header")
          strip_smalltalk = gr.Checkbox(value=False, label="Strip smalltalk")
+     hourly_rate = gr.Number(value=0.40, precision=4, label="Hourly hardware price (USD) for cost estimate")

      with gr.Tabs():
          with gr.Tab("Single Transcript"):

              repo_used = gr.Textbox(label="Repo used", interactive=False)
              rev_used = gr.Textbox(label="Revision", interactive=False)
              json_out = gr.Code(label="Predicted JSON", language="json")
+
+             # Metric & Diagnostic cards (rendered as HTML)
+             metric_cards_md = gr.HTML(label="Metrics (cards)")
+             diag_cards_md = gr.HTML(label="Diagnostics (cards)")
+             raw_metrics = gr.Code(label="Raw metrics JSON", language="json")

              def _single(*args):
                  return run_single(*args)

                  inputs=[
                      custom_repo, rules_file, system, context, transcript,
                      soft_cap, preprocess, add_header, strip_smalltalk,
+                     load_4bit, hourly_rate, gt_single, use_fewshot
                  ],
+                 outputs=[repo_used, rev_used, json_out, metric_cards_md, diag_cards_md, raw_metrics],
              )

          with gr.Tab("Batch (ZIP)"):

              repo_used_b = gr.Textbox(label="Repo used", interactive=False)
              rev_used_b = gr.Textbox(label="Revision", interactive=False)
              csv_out = gr.Textbox(label="Predictions CSV (filename,labels)", lines=12)

+             diag_cards_b = gr.HTML(label="Diagnostics (cards)")
+             metrics_out_b = gr.Code(label="Summary metrics JSON", language="json")
+
              preds_file = gr.File(label="Download predictions.csv")
              per_sample_file = gr.File(label="Download per_sample_metrics.csv")
              summary_file = gr.File(label="Download summary_metrics.json")

                  inputs=[
                      custom_repo, rules_file, system, context, zip_in, gt_zip,
                      soft_cap, preprocess, add_header, strip_smalltalk,
+                     load_4bit, hourly_rate, use_fewshot
                  ],
+                 outputs=[repo_used_b, rev_used_b, csv_out, diag_cards_b, metrics_out_b, preds_file, per_sample_file, summary_file],
              )

      gr.Markdown(