Spaces:

RishiRP
/

Talk2TaskDemo1

Sleeping

App Files Files Community

RishiRP committed on Sep 25

Commit

9949cc9

verified ·

1 Parent(s): 6acd2cc

Update app.py

Browse files

Files changed (1) hide show

app.py +155 -89

app.py CHANGED Viewed

@@ -1,15 +1,121 @@
-Allowed Labels (canonical; use only these):
-{allowed_labels_list}
-Context cues (keywords/phrases that often indicate each label):
-{keyword_context}
-Instructions:
-1) Identify EVERY concrete task implied by the conversation.
-2) Choose ONE label from Allowed Labels for each task (or none if truly inapplicable).
-3) Return STRICT JSON only in the exact schema described by the system prompt.
-"""
 # =========================
 # Utilities
@@ -55,8 +161,7 @@ def restrict_to_allowed(pred: Dict[str, Any], allowed: List[str]) -> Dict[str, A
             continue
         k = str(t.get("label", "")).strip().lower()
         if k in allowed_map:
-            new_t = dict(t)
-            new_t["label"] = allowed_map[k]
             filt_tasks.append(new_t)
     merged = normalize_labels(list(set(filt_labels) | {tt["label"] for tt in filt_tasks}))
     out["labels"] = merged
@@ -64,10 +169,8 @@ def restrict_to_allowed(pred: Dict[str, Any], allowed: List[str]) -> Dict[str, A
     return out
 # =========================
-# Default pre-processing
 # =========================
-# These are conservative; they remove boilerplate that appears in many files
-# and does not affect tasks. You can toggle this in the UI.
 _DISCLAIMER_PATTERNS = [
     r"(?is)^\s*(?:disclaimer|legal notice|confidentiality notice).+?(?:\n{2,}|$)",
     r"(?is)^\s*the information contained.+?(?:\n{2,}|$)",
@@ -87,8 +190,7 @@ def clean_transcript(text: str) -> str:
     if not text:
         return text
     s = text
-    # Remove common timestamps and speaker prefixes (line-wise)
     lines = []
     for ln in s.splitlines():
         ln2 = ln
@@ -96,16 +198,13 @@ def clean_transcript(text: str) -> str:
             ln2 = re.sub(pat, "", ln2, flags=re.IGNORECASE)
         lines.append(ln2)
     s = "\n".join(lines)
-    # Remove top disclaimers
     for pat in _DISCLAIMER_PATTERNS:
         s = re.sub(pat, "", s).strip()
-    # Remove trailing footers/signatures
     for pat in _FOOTER_PATTERNS:
         s = re.sub(pat, "", s)
-    # Collapse repeated whitespace
     s = re.sub(r"[ \t]+", " ", s)
     s = re.sub(r"\n{3,}", "\n\n", s).strip()
     return s
@@ -194,8 +293,7 @@ _MODEL_CACHE: Dict[str, ModelWrapper] = {}
 def get_model(repo_id: str, hf_token: Optional[str], load_in_4bit: bool) -> ModelWrapper:
     key = f"{repo_id}::{'4bit' if (load_in_4bit and DEVICE=='cuda') else 'full'}"
     if key not in _MODEL_CACHE:
-        m = ModelWrapper(repo_id, hf_token, load_in_4bit)
-        m.load()
         _MODEL_CACHE[key] = m
     return _MODEL_CACHE[key]
@@ -211,7 +309,6 @@ def evaluate_predictions(y_true: List[List[str]], y_pred: List[List[str]]) -> fl
     def _process_sample_labels(sample_labels: List[str], sample_name: str) -> List[str]:
         if not isinstance(sample_labels, list):
             raise ValueError(f"{sample_name} must be a list of strings, got {type(sample_labels)}")
-        # dedupe
         seen, uniq = set(), []
         for label in sample_labels:
             if not isinstance(label, str):
@@ -219,7 +316,6 @@ def evaluate_predictions(y_true: List[List[str]], y_pred: List[List[str]]) -> fl
             if label in seen:
                 raise ValueError(f"{sample_name} contains duplicate label: '{label}'")
             seen.add(label); uniq.append(label)
-        # validity
         valid = []
         for label in uniq:
             if label not in ALLOWED_LABELS:
@@ -257,10 +353,7 @@ def build_keyword_context(allowed: List[str]) -> str:
     parts = []
     for lab in allowed:
         kws = LABEL_KEYWORDS.get(lab, [])
-        if kws:
-            parts.append(f"- {lab}: " + ", ".join(kws))
-        else:
-            parts.append(f"- {lab}: (no default cues)")
     return "\n".join(parts)
 def run_single(
@@ -276,29 +369,23 @@ def run_single(
     t0 = _now_ms()
-    # Get transcript
     raw_text = read_text_from_file(transcript_file) if transcript_file else (transcript_text or "")
     raw_text = (raw_text or "").strip()
     if not raw_text:
         return "", "", "No transcript provided.", json.dumps({"labels": [], "tasks": []}, indent=2)
-    # Cleaning
     text = clean_transcript(raw_text) if use_cleaning else raw_text
-    # Allowed labels
     user_allowed = [ln.strip() for ln in (allowed_labels_text or "").splitlines() if ln.strip()]
     allowed = normalize_labels(user_allowed or OFFICIAL_LABELS)
-    # Model
     try:
         model = get_model(model_repo, (hf_token or "").strip() or None, use_4bit)
     except Exception as e:
         return "", "", f"Model load failed: {e}", json.dumps({"labels": [], "tasks": []}, indent=2)
-    # Truncate
     trunc = truncate_tokens(model.tokenizer, text, max_input_tokens)
-    # Build prompt
     allowed_list_str = "\n".join(f"- {l}" for l in allowed)
     keyword_ctx = build_keyword_context(allowed)
     user_prompt = USER_PROMPT_TEMPLATE.format(
@@ -307,7 +394,6 @@ def run_single(
         keyword_context=keyword_ctx,
     )
-    # Generate
     t1 = _now_ms()
     try:
         out = model.generate(SYSTEM_PROMPT, user_prompt)
@@ -315,11 +401,9 @@ def run_single(
         return "", "", f"Generation error: {e}", json.dumps({"labels": [], "tasks": []}, indent=2)
     t2 = _now_ms()
-    # Parse + filter
     parsed = robust_json_extract(out)
     filtered = restrict_to_allowed(parsed, allowed)
-    # Diagnostics
     diag = "\n".join([
         f"Device: {DEVICE} (4-bit: {'Yes' if (use_4bit and DEVICE=='cuda') else 'No'})",
         f"Model: {model_repo}",
@@ -329,7 +413,6 @@ def run_single(
         f"Allowed labels: {', '.join(allowed)}",
     ])
-    # Summary
     labs = filtered.get("labels", [])
     tasks = filtered.get("tasks", [])
     summary = "Detected labels:\n" + ("\n".join(f"- {l}" for l in labs) if labs else "(none)")
@@ -350,11 +433,7 @@ def read_zip(fileobj: io.BytesIO, exdir: Path) -> List[Path]:
     exdir.mkdir(parents=True, exist_ok=True)
     with zipfile.ZipFile(fileobj) as zf:
         zf.extractall(exdir)
-    out = []
-    for p in exdir.rglob("*"):
-        if p.is_file():
-            out.append(p)
-    return out
 def run_batch(
     zip_file: gr.File,
@@ -364,25 +443,27 @@ def run_batch(
     max_input_tokens: int,
     hf_token: str,
     limit_files: int,
-) -> Tuple[str, str, str, pd.DataFrame, str]:
     if not zip_file:
-        return ("No ZIP provided.", "", "", pd.DataFrame(), "")
     work = Path("/tmp/batch")
     if work.exists():
-        for p in work.rglob("*"):
-            try: p.unlink()
-            except Exception: pass
-        try: work.rmdir()
-        except Exception: pass
     work.mkdir(parents=True, exist_ok=True)
-    # Unzip
     data = zip_file.read()
     files = read_zip(io.BytesIO(data), work)
-    # Gather pairs by stem
     txts: Dict[str, Path] = {}
     gts: Dict[str, Path] = {}
     for p in files:
@@ -395,15 +476,14 @@ def run_batch(
     if limit_files > 0:
         stems = stems[:limit_files]
     if not stems:
-        return ("No .txt transcripts found in ZIP.", "", "", pd.DataFrame(), "")
-    # Model
     try:
         model = get_model(model_repo, (hf_token or "").strip() or None, use_4bit)
     except Exception as e:
-        return (f"Model load failed: {e}", "", "", pd.DataFrame(), "")
-    allowed = OFFICIAL_LABELS[:]  # fixed for scoring
     allowed_list_str = "\n".join(f"- {l}" for l in allowed)
     keyword_ctx = build_keyword_context(allowed)
@@ -431,20 +511,17 @@ def run_batch(
         pred_labels = filtered.get("labels", [])
         y_pred.append(pred_labels)
-        # Ground truth (optional)
         gt_labels = []
         if stem in gts:
             try:
                 gt_obj = json.loads(gts[stem].read_text(encoding="utf-8", errors="ignore"))
-                if isinstance(gt_obj, dict) and "labels" in gt_obj and isinstance(gt_obj["labels"], list):
                     gt_labels = [x for x in gt_obj["labels"] if x in OFFICIAL_LABELS]
             except Exception:
                 pass
         y_true.append(gt_labels)
-        # FP/FN counts for table
-        gt_set = set(gt_labels)
-        pr_set = set(pred_labels)
         tp = sorted(gt_set & pr_set)
         fp = sorted(pr_set - gt_set)
         fn = sorted(gt_set - pr_set)
@@ -457,8 +534,6 @@ def run_batch(
             "gen_ms": t1 - t0
         })
-    # Metrics
-    # If there is no ground truth in the ZIP, we still compute a table and skip score.
     have_truth = any(len(v) > 0 for v in y_true)
     score = evaluate_predictions(y_true, y_pred) if have_truth else None
@@ -472,7 +547,6 @@ def run_batch(
         f"Batch time: {_now_ms()-t_start} ms",
     ]
     if have_truth and score is not None:
-        # Simple derived metrics
         total_tp = int(df["TP"].sum())
         total_fp = int(df["FP"].sum())
         total_fn = int(df["FN"].sum())
@@ -486,12 +560,11 @@ def run_batch(
         ]
     diag_str = "\n".join(diag)
-    # CSV preview and data URL
-    csv_buf = io.StringIO()
-    df.to_csv(csv_buf, index=False)
-    csv_data = csv_buf.getvalue()
-    return ("Batch done.", diag_str, csv_data, df, csv_data)
 # =========================
 # UI
@@ -505,10 +578,8 @@ MODEL_CHOICES = [
 with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
     gr.Markdown("# Talk2Task — Task Extraction (UBS Challenge)")
     gr.Markdown(
-        "This tool extracts challenge labels from transcripts. "
-        "Use **Single** for quick tests; use **Batch** to score a ZIP with transcripts + truths. "
-        "_Note: False negatives are penalised twice as much as false positives in the official metric; "
-        "we bias for recall._"
     )
     with gr.Tab("Single transcript"):
@@ -520,9 +591,12 @@ with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
                     type="filepath",
                 )
                 text = gr.Textbox(label="Or paste transcript", lines=14)
-                use_cleaning = gr.Checkbox(label="Apply default cleaning (remove disclaimers, timestamps, footers)", value=True)
                 labels_text = gr.Textbox(
-                    label="Allowed Labels (one per line; leave empty to use official list)",
                     value="",
                     lines=8,
                 )
@@ -561,23 +635,15 @@ with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
         with gr.Row():
             status = gr.Textbox(label="Status", lines=1)
-            diag_b = gr.Textbox(label="Batch diagnostics & metrics", lines=10)
-        with gr.Row():
-            df_out = gr.Dataframe(label="Per-file results (TP/FP/FN, times)", interactive=False)
-        csv_out = gr.File(label="Download CSV (click to save)", interactive=False)
-        def _save_csv(csv_text: str) -> str:
-            if not csv_text:
-                return ""
-            out_path = Path("/tmp/batch_results.csv")
-            out_path.write_text(csv_text, encoding="utf-8")
-            return str(out_path)
         run_batch_btn.click(
             fn=run_batch,
             inputs=[zip_in, use_cleaning_b, repo_b, use_4bit_b, max_tokens_b, hf_token_b, limit_files],
-            outputs=[status, diag_b, csv_out, df_out, gr.Textbox(visible=False)],
         )
 if __name__ == "__main__":

+# app.py
+import os
+import re
+import io
+import json
+import time
+import zipfile
+from pathlib import Path
+from typing import List, Dict, Any, Tuple, Optional
+import numpy as np
+import pandas as pd
+import gradio as gr
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    BitsAndBytesConfig,
+    GenerationConfig,
+)
+# =========================
+# Global config
+# =========================
+SPACE_CACHE = Path.home() / ".cache" / "huggingface"
+SPACE_CACHE.mkdir(parents=True, exist_ok=True)
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+GEN_CONFIG = GenerationConfig(
+    temperature=0.2,
+    top_p=0.9,
+    do_sample=False,
+    max_new_tokens=256,
+)
+# Official UBS label set (strict)
+OFFICIAL_LABELS = [
+    "plan_contact",
+    "schedule_meeting",
+    "update_contact_info_non_postal",
+    "update_contact_info_postal_address",
+    "update_kyc_activity",
+    "update_kyc_origin_of_assets",
+    "update_kyc_purpose_of_businessrelation",
+    "update_kyc_total_assets",
+]
+# Per-label keyword cues (static prompt context to improve recall)
+LABEL_KEYWORDS: Dict[str, List[str]] = {
+    "plan_contact": [
+        "call back", "follow up", "reach out", "contact later", "check-in",
+        "email them", "touch base", "remind", "send a note"
+    ],
+    "schedule_meeting": [
+        "book a meeting", "set up a meeting", "schedule a call",
+        "appointment", "calendar", "meeting next week", "meet on", "time slot"
+    ],
+    "update_contact_info_non_postal": [
+        "phone change", "new phone", "email change", "new email",
+        "update contact details", "update mobile", "alternate phone"
+    ],
+    "update_contact_info_postal_address": [
+        "moved to", "new address", "postal address", "mailing address",
+        "change of address", "residential address"
+    ],
+    "update_kyc_activity": [
+        "activity update", "economic activity", "employment status",
+        "occupation", "job change", "business activity"
+    ],
+    "update_kyc_origin_of_assets": [
+        "source of funds", "origin of assets", "where money comes from",
+        "inheritance", "salary", "business income", "asset origin"
+    ],
+    "update_kyc_purpose_of_businessrelation": [
+        "purpose of relationship", "why the account", "reason for banking",
+        "investment purpose", "relationship purpose"
+    ],
+    "update_kyc_total_assets": [
+        "total assets", "net worth", "assets under ownership",
+        "portfolio size", "how much you own"
+    ],
+}
+# =========================
+# Instructions (string-safe; concatenated)
+# =========================
+SYSTEM_PROMPT = (
+    "You are a precise banking assistant that extracts ACTIONABLE TASKS from "
+    "client–advisor transcripts. Be conservative with hallucinations but "
+    "prioritise RECALL: if unsure and the transcript plausibly implies an "
+    "action, include the label and explain briefly.\n\n"
+    "Output STRICT JSON only:\n\n"
+    "{\n"
+    '  "labels": ["<Label1>", "..."],\n'
+    '  "tasks": [\n'
+    '    {"label": "<Label1>", "explanation": "<why>", "evidence": "<quoted text/snippet>"}\n'
+    "  ]\n"
+    "}\n\n"
+    "Rules:\n"
+    "- Use ONLY allowed labels supplied to you. Case-insensitive during reasoning, "
+    "  but output the canonical label text exactly.\n"
+    "- If none truly apply, return empty lists.\n"
+    "- Keep explanations concise; put the minimal evidence snippet that justifies the task.\n"
+)
+USER_PROMPT_TEMPLATE = (
+    "Transcript (cleaned):\n"
+    "```\n{transcript}\n```\n\n"
+    "Allowed Labels (canonical; use only these):\n"
+    "{allowed_labels_list}\n\n"
+    "Context cues (keywords/phrases that often indicate each label):\n"
+    "{keyword_context}\n\n"
+    "Instructions:\n"
+    "- Identify EVERY concrete task implied by the conversation.\n"
+    "- Choose ONE label from Allowed Labels for each task (or none if truly inapplicable).\n"
+    "- Return STRICT JSON only in the exact schema described by the system prompt.\n"
+)
 # =========================
 # Utilities
             continue
         k = str(t.get("label", "")).strip().lower()
         if k in allowed_map:
+            new_t = dict(t); new_t["label"] = allowed_map[k]
             filt_tasks.append(new_t)
     merged = normalize_labels(list(set(filt_labels) | {tt["label"] for tt in filt_tasks}))
     out["labels"] = merged
     return out
 # =========================
+# Default pre-processing (toggleable)
 # =========================
 _DISCLAIMER_PATTERNS = [
     r"(?is)^\s*(?:disclaimer|legal notice|confidentiality notice).+?(?:\n{2,}|$)",
     r"(?is)^\s*the information contained.+?(?:\n{2,}|$)",
     if not text:
         return text
     s = text
+    # remove timestamps/speaker prefixes line-wise
     lines = []
     for ln in s.splitlines():
         ln2 = ln
             ln2 = re.sub(pat, "", ln2, flags=re.IGNORECASE)
         lines.append(ln2)
     s = "\n".join(lines)
+    # remove top disclaimers
     for pat in _DISCLAIMER_PATTERNS:
         s = re.sub(pat, "", s).strip()
+    # remove trailing footers
     for pat in _FOOTER_PATTERNS:
         s = re.sub(pat, "", s)
+    # collapse whitespace
     s = re.sub(r"[ \t]+", " ", s)
     s = re.sub(r"\n{3,}", "\n\n", s).strip()
     return s
 def get_model(repo_id: str, hf_token: Optional[str], load_in_4bit: bool) -> ModelWrapper:
     key = f"{repo_id}::{'4bit' if (load_in_4bit and DEVICE=='cuda') else 'full'}"
     if key not in _MODEL_CACHE:
+        m = ModelWrapper(repo_id, hf_token, load_in_4bit); m.load()
         _MODEL_CACHE[key] = m
     return _MODEL_CACHE[key]
     def _process_sample_labels(sample_labels: List[str], sample_name: str) -> List[str]:
         if not isinstance(sample_labels, list):
             raise ValueError(f"{sample_name} must be a list of strings, got {type(sample_labels)}")
         seen, uniq = set(), []
         for label in sample_labels:
             if not isinstance(label, str):
             if label in seen:
                 raise ValueError(f"{sample_name} contains duplicate label: '{label}'")
             seen.add(label); uniq.append(label)
         valid = []
         for label in uniq:
             if label not in ALLOWED_LABELS:
     parts = []
     for lab in allowed:
         kws = LABEL_KEYWORDS.get(lab, [])
+        parts.append(f"- {lab}: " + (", ".join(kws) if kws else "(no default cues)"))
     return "\n".join(parts)
 def run_single(
     t0 = _now_ms()
     raw_text = read_text_from_file(transcript_file) if transcript_file else (transcript_text or "")
     raw_text = (raw_text or "").strip()
     if not raw_text:
         return "", "", "No transcript provided.", json.dumps({"labels": [], "tasks": []}, indent=2)
     text = clean_transcript(raw_text) if use_cleaning else raw_text
     user_allowed = [ln.strip() for ln in (allowed_labels_text or "").splitlines() if ln.strip()]
     allowed = normalize_labels(user_allowed or OFFICIAL_LABELS)
     try:
         model = get_model(model_repo, (hf_token or "").strip() or None, use_4bit)
     except Exception as e:
         return "", "", f"Model load failed: {e}", json.dumps({"labels": [], "tasks": []}, indent=2)
     trunc = truncate_tokens(model.tokenizer, text, max_input_tokens)
     allowed_list_str = "\n".join(f"- {l}" for l in allowed)
     keyword_ctx = build_keyword_context(allowed)
     user_prompt = USER_PROMPT_TEMPLATE.format(
         keyword_context=keyword_ctx,
     )
     t1 = _now_ms()
     try:
         out = model.generate(SYSTEM_PROMPT, user_prompt)
         return "", "", f"Generation error: {e}", json.dumps({"labels": [], "tasks": []}, indent=2)
     t2 = _now_ms()
     parsed = robust_json_extract(out)
     filtered = restrict_to_allowed(parsed, allowed)
     diag = "\n".join([
         f"Device: {DEVICE} (4-bit: {'Yes' if (use_4bit and DEVICE=='cuda') else 'No'})",
         f"Model: {model_repo}",
         f"Allowed labels: {', '.join(allowed)}",
     ])
     labs = filtered.get("labels", [])
     tasks = filtered.get("tasks", [])
     summary = "Detected labels:\n" + ("\n".join(f"- {l}" for l in labs) if labs else "(none)")
     exdir.mkdir(parents=True, exist_ok=True)
     with zipfile.ZipFile(fileobj) as zf:
         zf.extractall(exdir)
+    return [p for p in exdir.rglob("*") if p.is_file()]
 def run_batch(
     zip_file: gr.File,
     max_input_tokens: int,
     hf_token: str,
     limit_files: int,
+) -> Tuple[str, str, pd.DataFrame, str]:
     if not zip_file:
+        return ("No ZIP provided.", "", pd.DataFrame(), "")
     work = Path("/tmp/batch")
     if work.exists():
+        for p in sorted(work.rglob("*"), reverse=True):
+            try:
+                p.unlink()
+            except Exception:
+                pass
+        try:
+            work.rmdir()
+        except Exception:
+            pass
     work.mkdir(parents=True, exist_ok=True)
     data = zip_file.read()
     files = read_zip(io.BytesIO(data), work)
     txts: Dict[str, Path] = {}
     gts: Dict[str, Path] = {}
     for p in files:
     if limit_files > 0:
         stems = stems[:limit_files]
     if not stems:
+        return ("No .txt transcripts found in ZIP.", "", pd.DataFrame(), "")
     try:
         model = get_model(model_repo, (hf_token or "").strip() or None, use_4bit)
     except Exception as e:
+        return (f"Model load failed: {e}", "", pd.DataFrame(), "")
+    allowed = OFFICIAL_LABELS[:]
     allowed_list_str = "\n".join(f"- {l}" for l in allowed)
     keyword_ctx = build_keyword_context(allowed)
         pred_labels = filtered.get("labels", [])
         y_pred.append(pred_labels)
         gt_labels = []
         if stem in gts:
             try:
                 gt_obj = json.loads(gts[stem].read_text(encoding="utf-8", errors="ignore"))
+                if isinstance(gt_obj, dict) and isinstance(gt_obj.get("labels"), list):
                     gt_labels = [x for x in gt_obj["labels"] if x in OFFICIAL_LABELS]
             except Exception:
                 pass
         y_true.append(gt_labels)
+        gt_set, pr_set = set(gt_labels), set(pred_labels)
         tp = sorted(gt_set & pr_set)
         fp = sorted(pr_set - gt_set)
         fn = sorted(gt_set - pr_set)
             "gen_ms": t1 - t0
         })
     have_truth = any(len(v) > 0 for v in y_true)
     score = evaluate_predictions(y_true, y_pred) if have_truth else None
         f"Batch time: {_now_ms()-t_start} ms",
     ]
     if have_truth and score is not None:
         total_tp = int(df["TP"].sum())
         total_fp = int(df["FP"].sum())
         total_fn = int(df["FN"].sum())
         ]
     diag_str = "\n".join(diag)
+    # save CSV for download
+    out_csv = Path("/tmp/batch_results.csv")
+    df.to_csv(out_csv, index=False, encoding="utf-8")
+    return ("Batch done.", diag_str, df, str(out_csv))
 # =========================
 # UI
 with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
     gr.Markdown("# Talk2Task — Task Extraction (UBS Challenge)")
     gr.Markdown(
+        "Extract challenge labels from transcripts. False negatives are penalised 2× more than false positives "
+        "in the official score, so the app biases for recall."
     )
     with gr.Tab("Single transcript"):
                     type="filepath",
                 )
                 text = gr.Textbox(label="Or paste transcript", lines=14)
+                use_cleaning = gr.Checkbox(
+                    label="Apply default cleaning (remove disclaimers, timestamps, speakers, footers)",
+                    value=True,
+                )
                 labels_text = gr.Textbox(
+                    label="Allowed Labels (one per line; empty = official list)",
                     value="",
                     lines=8,
                 )
         with gr.Row():
             status = gr.Textbox(label="Status", lines=1)
+            diag_b = gr.Textbox(label="Batch diagnostics & metrics", lines=12)
+        df_out = gr.Dataframe(label="Per-file results (TP/FP/FN, latency)", interactive=False)
+        csv_out = gr.File(label="Download CSV", interactive=False)
         run_batch_btn.click(
             fn=run_batch,
             inputs=[zip_in, use_cleaning_b, repo_b, use_4bit_b, max_tokens_b, hf_token_b, limit_files],
+            outputs=[status, diag_b, df_out, csv_out],
         )
 if __name__ == "__main__":