Spaces:

RishiRP
/

Talk2TaskDemo1

Sleeping

App Files Files Community

RishiRP committed on Sep 22

Commit

f066995

verified ·

1 Parent(s): 28f5fab

Create app.py

Browse files

Files changed (1) hide show

app.py +546 -0

app.py ADDED Viewed

	@@ -0,0 +1,546 @@

+import os, io, re, sys, time, json, zipfile, statistics
+from pathlib import Path
+from dataclasses import dataclass
+from typing import List, Dict, Tuple
+import gradio as gr
+import pandas as pd
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+# ---------------- Constants / Labels ----------------
# Closed set of task labels; the list order defines each label's index.
ALLOWED_LABELS = [
    "plan_contact",
    "schedule_meeting",
    "update_contact_info_non_postal",
    "update_contact_info_postal_address",
    "update_kyc_activity",
    "update_kyc_origin_of_assets",
    "update_kyc_purpose_of_businessrelation",
    "update_kyc_total_assets",
]

# Reverse lookup: label -> position in ALLOWED_LABELS.
LABEL_TO_IDX = dict(zip(ALLOWED_LABELS, range(len(ALLOWED_LABELS))))

# Error weights: a missed label (false negative) costs twice as much as a
# spurious one (false positive).
FN_PENALTY = 2.0
FP_PENALTY = 1.0
+# ---------------- Helpers ----------------
def safe_json_load(s: str):
    """Parse model output into a dict, tolerating surrounding chatter.

    Strategy, in order:
      1. Parse the whole string as JSON. A dict is returned as-is; a bare
         list is wrapped as ``{"labels": <list>}`` so callers can always use
         ``.get("labels")``.
      2. Otherwise scan for the first ``{`` from which a complete JSON object
         can be decoded (trailing text after the object is ignored).
      3. Otherwise return a fallback dict with empty ``labels`` and a warning
         in ``notes``.

    Always returns a dict; never raises for malformed or non-string input.
    """
    try:
        parsed = json.loads(s)
    except (TypeError, ValueError, RecursionError):
        parsed = None
    if isinstance(parsed, dict):
        return parsed
    if isinstance(parsed, list):
        return {"labels": parsed}

    if isinstance(s, str):
        decoder = json.JSONDecoder()
        start = s.find("{")
        while start != -1:
            try:
                # raw_decode stops at the end of the first complete value, so
                # text following the object does not invalidate it.
                candidate, _ = decoder.raw_decode(s[start:])
            except (ValueError, RecursionError):
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = s.find("{", start + 1)

    return {"labels": [], "notes": "WARN: model output not valid JSON; fallback used"}
+def _coerce_labels_list(x):
+    if isinstance(x, list):
+        out = []
+        for it in x:
+            if isinstance(it, str): out.append(it)
+            elif isinstance(it, dict):
+                for k in ("label","value","task","category","name"):
+                    v = it.get(k)
+                    if isinstance(v, str):
+                        out.append(v); break
+                else:
+                    if isinstance(it.get("labels"), list):
+                        out += [s for s in it["labels"] if isinstance(s, str)]
+        # dedupe
+        seen=set(); norm=[]
+        for s in out:
+            if s not in seen:
+                norm.append(s); seen.add(s)
+        return norm
+    if isinstance(x, dict):
+        for k in ("expected_labels","labels","targets","y_true"):
+            if k in x: return _coerce_labels_list(x[k])
+        if "one_hot" in x and isinstance(x["one_hot"], dict):
+            return [k for k,v in x["one_hot"].items() if v]
+    return []
def classic_metrics(pred_labels, exp_labels):
    """Compute set-based multi-label metrics for one sample.

    Both inputs may be ``None`` or any iterable; only str/int/float/bool items
    are considered, and each is compared by its ``str()`` form.

    Returns a 5-tuple ``(exact, precision, recall, f1, hamming)``:
      * ``exact``     - True when the predicted and expected sets are equal.
      * ``precision`` - |pred ∩ gold| / |pred| (0.0 when nothing was predicted).
      * ``recall``    - |pred ∩ gold| / |gold| (0.0 when nothing was expected).
      * ``f1``        - 2·|pred ∩ gold| / (|pred| + |gold|).
      * ``hamming``   - |pred ∩ gold| / |pred ∪ gold|. NOTE: despite the name
        this is the Jaccard index, not a Hamming score over the label space.

    When both sets are empty every metric is reported as perfect.
    """
    scalar_types = (str, int, float)  # bool is a subclass of int
    pred = {str(x) for x in (pred_labels or []) if isinstance(x, scalar_types)}
    gold = {str(x) for x in (exp_labels or []) if isinstance(x, scalar_types)}

    if not pred and not gold:
        return True, 1.0, 1.0, 1.0, 1.0

    hits = len(pred & gold)
    precision = hits / len(pred) if pred else 0.0
    recall = hits / len(gold) if gold else 0.0
    # At least one set is non-empty here, so both denominators below are > 0
    # and no epsilon is needed; a perfect match yields exactly 1.0.
    f1 = 2 * hits / (len(pred) + len(gold))
    jaccard = hits / len(pred | gold)
    return pred == gold, precision, recall, f1, jaccard
def ubs_score_one(true_labels, pred_labels) -> float:
    """Score one sample as 1 minus its weighted error relative to the worst case.

    Labels outside ALLOWED_LABELS are ignored on both sides. A missed label
    costs FN_PENALTY and a spurious label costs FP_PENALTY. The worst case is
    missing every true label while predicting every other allowed label.
    The result is clamped to [0.0, 1.0].
    """
    gold = {lab for lab in (true_labels or []) if lab in LABEL_TO_IDX}
    pred = {lab for lab in (pred_labels or []) if lab in LABEL_TO_IDX}

    missed = len(gold - pred)
    spurious = len(pred - gold)
    penalty = FN_PENALTY * missed + FP_PENALTY * spurious

    worst = FN_PENALTY * len(gold) + FP_PENALTY * (len(ALLOWED_LABELS) - len(gold))
    if worst == 0:
        return 1.0

    raw = 1.0 - (penalty / worst)
    return float(max(0.0, min(1.0, raw)))
+# ---------------- Preprocess ----------------
# Cue detectors (all case-insensitive).
EMAIL_RX = re.compile(
    r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b',
    re.IGNORECASE,
)
TIME_RX = re.compile(
    r'\b(\d{1,2}:\d{2}\b|\b\d{1,2}\s?(am|pm)\b|\bafternoon\b|\bmorning\b|\bevening\b)',
    re.IGNORECASE,
)
DATE_RX = re.compile(
    r'\b(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)\b|\b\d{1,2}[/-]\d{1,2}([/-]\d{2,4})?\b|\b20\d{2}\b',
    re.IGNORECASE,
)
MEET_RX = re.compile(
    r'\b(meet(ing)?|call|appointment|schedule|invite|agenda|online|in[- ]?person|phone|zoom|teams)\b',
    re.IGNORECASE,
)
MODAL_RX = re.compile(
    r'\b(online|in[- ]?person|phone|zoom|teams)\b',
    re.IGNORECASE,
)
# Matches a speaker-prefixed line that opens with a courtesy phrase.
SMALLTALK_RX = re.compile(
    r'^\s*(user|advisor):\s*(thanks( you)?|thank you|anything else|have a great day|you too)\b',
    re.IGNORECASE,
)

# (compiled pattern, replacement) pairs applied in order by normalize_text.
TYPO_FIXES = [
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r'\bschedulin\s*g\b', 'scheduling'),
        (r'\beeting\b', 'meeting'),
        (r'\bdi?i?gtal\b', 'digital'),
        (r'\bdigi\s+tal\b', 'digital'),
        (r'\bspread\s*sheet\b', 'spreadsheet'),
        (r'\bseats\b', 'sheets'),
        (r'\bver(s|z)ion meters\b', 'version metrics'),
    )
]
def normalize_text(text: str, fix_typos: bool = True) -> str:
    """Canonicalise a raw transcript.

    Converts CRLF to LF, rewrites line-leading ``Speaker 1:`` / ``Speaker 2:``
    prefixes to ``USER: `` / ``ADVISOR: ``, collapses runs of spaces/tabs to a
    single space, limits consecutive newlines to two, optionally applies
    TYPO_FIXES in order, and strips leading/trailing whitespace.
    """
    per_line = re.I | re.M
    rewrites = (
        (r'^\s*Speaker\s*1\s*:\s*', 'USER: ', per_line),
        (r'^\s*Speaker\s*2\s*:\s*', 'ADVISOR: ', per_line),
        (r'[ \t]+', ' ', 0),
        (r'\n{3,}', '\n\n', 0),
    )

    result = text.replace('\r\n', '\n')
    for pattern, replacement, flags in rewrites:
        result = re.sub(pattern, replacement, result, flags=flags)

    if fix_typos:
        for typo_rx, correction in TYPO_FIXES:
            result = typo_rx.sub(correction, result)

    return result.strip()
def extract_cues(text: str):
    """Pull simple email / meeting signals out of a transcript.

    Returns a dict with:
      * ``email_new`` / ``email_old`` - the last and second-to-last e-mail
        addresses found in ``text`` (``None`` when absent).
      * ``contact_pref`` - ``"EMAIL"`` when any address was found, else ``None``.
      * ``meeting_time_fragment`` - the first TIME_RX match, verbatim, or ``None``.
      * ``meeting_modality`` - the first MODAL_RX match upper-cased, with every
        spelling of "in person" ("in person", "in-person", "inperson")
        normalised to ``IN_PERSON``; ``None`` when absent.
      * ``meeting_confirmed`` - True when a meeting keyword occurs together
        with a time or a date anywhere in ``text``.
    """
    emails = EMAIL_RX.findall(text)
    email_new = emails[-1] if emails else None
    email_old = emails[-2] if len(emails) >= 2 else None

    # Search once and reuse the match for both the flag and the fragment.
    time_match = TIME_RX.search(text)
    has_when = bool(time_match or DATE_RX.search(text))

    modality = None
    modality_match = MODAL_RX.search(text)
    if modality_match:
        # MODAL_RX accepts "in person", "in-person" and "inperson"; map all
        # three to the same token.
        modality = re.sub(r'IN[- ]?PERSON', 'IN_PERSON', modality_match.group(0).upper())

    return {
        "email_new": email_new,
        "email_old": email_old,
        "contact_pref": "EMAIL" if email_new else None,
        "meeting_time_fragment": time_match.group(0) if time_match else None,
        "meeting_modality": modality,
        "meeting_confirmed": bool(MEET_RX.search(text)) and has_when,
    }
def build_cues_header(cues: dict) -> str:
    """Render detected cues as a ``[DETECTED_CUES]`` ... ``[/DETECTED_CUES]`` block.

    Returns an empty string when none of ``email_new``, ``email_old``,
    ``contact_pref`` or ``meeting_confirmed`` is truthy. Otherwise emits one
    line per truthy field, in that order; the meeting line combines the time
    fragment and modality (either may be missing).
    """
    trigger_keys = ("email_new", "email_old", "contact_pref", "meeting_confirmed")
    if not any(cues.get(key) for key in trigger_keys):
        return ""

    simple_fields = (
        ("email_new", "EMAIL_NEW"),
        ("email_old", "EMAIL_OLD"),
        ("contact_pref", "CONTACT_PREF"),
    )
    body = [f"{tag}: {cues[key]}" for key, tag in simple_fields if cues.get(key)]

    if cues.get("meeting_confirmed"):
        when = cues.get("meeting_time_fragment") or ""
        how = cues.get("meeting_modality") or ""
        body.append(f"MEETING: {(when + ' ' + how).strip()} CONFIRMED")

    return "\n".join(["[DETECTED_CUES]", *body, "[/DETECTED_CUES]"])
def find_cue_lines(lines):
    """Return the ascending indices of lines that carry a cue.

    A line is a cue when it contains an e-mail address, or when it contains a
    meeting keyword together with a time or a date.
    """
    def is_cue(line):
        if EMAIL_RX.search(line):
            return True
        if not MEET_RX.search(line):
            return False
        return bool(TIME_RX.search(line) or DATE_RX.search(line))

    return [pos for pos, line in enumerate(lines) if is_cue(line)]
def prune_by_window(lines, cue_idx, window=3, strip_smalltalk=False):
    """Keep only lines within ``window`` positions of any index in ``cue_idx``.

    Original order is preserved. When ``strip_smalltalk`` is true, kept lines
    matching SMALLTALK_RX are dropped as well.
    """
    last = len(lines) - 1
    wanted = set()
    for center in cue_idx:
        first_kept = max(0, center - window)
        last_kept = min(last, center + window)
        wanted.update(range(first_kept, last_kept + 1))

    return [
        ln
        for pos, ln in enumerate(lines)
        if pos in wanted and not (strip_smalltalk and SMALLTALK_RX.search(ln))
    ]
+# ---------------- HF Model wrapper ----------------
class HFModel:
    """Thin wrapper around a Hugging Face causal LM and its tokenizer.

    Attributes:
        repo_id: Hub repository id the model was loaded from.
        tokenizer: tokenizer loaded via ``AutoTokenizer.from_pretrained``.
        model: model loaded via ``AutoModelForCausalLM.from_pretrained`` with
            ``device_map="auto"``.
        max_context: ``max_position_embeddings`` or ``max_sequence_length``
            from the model config, falling back to 8192.
    """

    def __init__(self, repo_id: str, load_4bit: bool, dtype: str, trust_remote_code: bool):
        """Load tokenizer and model.

        Args:
            repo_id: Hub repository id.
            load_4bit: when true, load with a 4-bit NF4 ``BitsAndBytesConfig``
                (double quantisation, compute dtype = ``dtype``).
            dtype: ``"bfloat16"``, ``"float16"`` or ``"float32"``; any other
                value falls back to bfloat16.
            trust_remote_code: forwarded to both ``from_pretrained`` calls.
        """
        self.repo_id = repo_id
        self.tokenizer = AutoTokenizer.from_pretrained(
            repo_id, use_fast=True, trust_remote_code=trust_remote_code
        )
        torch_dtype = {
            "bfloat16": torch.bfloat16,
            "float16": torch.float16,
            "float32": torch.float32,
        }.get(dtype, torch.bfloat16)

        load_kwargs = {
            "device_map": "auto",
            "trust_remote_code": trust_remote_code,
            "torch_dtype": torch_dtype,
        }
        if load_4bit:
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch_dtype,
                bnb_4bit_quant_type="nf4",
            )
        self.model = AutoModelForCausalLM.from_pretrained(repo_id, **load_kwargs)

        config = self.model.config
        self.max_context = (
            getattr(config, "max_position_embeddings", None)
            or getattr(config, "max_sequence_length", None)
            or 8192
        )

    def encode_len(self, text: str) -> int:
        """Return the number of tokens in ``text`` (no special tokens added)."""
        return len(self.tokenizer(text, return_tensors=None, add_special_tokens=False).input_ids)

    def apply_chat_template(self, system_text: str, user_text: str) -> str:
        """Build the prompt string for one system + user exchange.

        Uses the tokenizer's chat template when it has one. If the template
        rejects the system/user message list, the system text is folded into
        a single user message and the template is applied again. Without a
        chat template a plain ``### System / ### User / ### Assistant`` layout
        is used.
        """
        if getattr(self.tokenizer, "chat_template", None):
            messages = [
                {"role": "system", "content": system_text},
                {"role": "user", "content": user_text},
            ]
            try:
                return self.tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
            except Exception:
                # Best-effort: some chat templates only accept alternating
                # user/assistant turns and raise on a system role. Retry with
                # the instructions merged into the user turn.
                merged = [{
                    "role": "user",
                    "content": system_text.strip() + "\n\n" + user_text,
                }]
                return self.tokenizer.apply_chat_template(
                    merged, tokenize=False, add_generation_prompt=True
                )
        return ("### System\n" + system_text.strip() + "\n\n" +
                "### User\n" + user_text.strip() + "\n\n" +
                "### Assistant\n")

    @torch.inference_mode()
    def generate_json(self, system_text: str, user_text: str, max_new_tokens: int = 256):
        """Greedy-decode a completion for the given system/user texts.

        Returns:
            ``(latency_ms, text, prompt)`` where ``latency_ms`` times only the
            ``generate`` call, ``text`` is the decoded completion *without*
            the prompt, and ``prompt`` is the exact prompt string used.
        """
        prompt = self.apply_chat_template(system_text, user_text)

        # A chat-template-rendered prompt already contains its special tokens;
        # letting the tokenizer add them again would duplicate e.g. BOS.
        templated = bool(getattr(self.tokenizer, "chat_template", None))
        inputs = self.tokenizer(
            prompt, return_tensors="pt", add_special_tokens=not templated
        ).to(self.model.device)
        prompt_len = inputs["input_ids"].shape[-1]

        # Fall back to EOS for padding when the tokenizer defines no pad token.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = self.tokenizer.eos_token_id

        t0 = time.perf_counter()
        out = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=None,
            top_p=None,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=pad_id,
        )
        latency_ms = int((time.perf_counter() - t0) * 1000)

        # The generated sequence starts with the prompt tokens; slice them off
        # by length. Comparing decoded strings is unreliable because decoding
        # with skip_special_tokens=True does not reproduce the prompt text.
        text = self.tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
        return latency_ms, text, prompt
+# ---------------- Core pipeline ----------------
def shrink_to_token_cap_by_lines(text: str, soft_cap_tokens: int, tokenizer,
                                 min_lines_keep: int = 30,
                                 apply_only_if_ratio: float = 1.15) -> str:
    """Drop low-signal lines when ``text`` is well over the soft token cap.

    ``text`` is returned unchanged when its token count is at most
    ``soft_cap_tokens * apply_only_if_ratio``, or when it has no more than
    ``min_lines_keep`` lines. Otherwise:

    1. Keep cues-header lines and lines matching any of the email / meeting /
       date / time patterns. If that leaves fewer than ``min_lines_keep``
       lines, add the earliest non-cue lines until the minimum is reached.
       Kept lines stay in their original transcript order.
    2. If the result is still over the threshold, also build a window of lines
       centred on the middle of the text and use whichever candidate has
       fewer tokens.
    3. If the chosen candidate has fewer than ``min_lines_keep`` lines, give
       up and return ``text`` unchanged.

    The cap is soft: the returned text may still exceed it.

    Args:
        tokenizer: callable accepting ``(text, return_tensors=None,
            add_special_tokens=False)`` and returning an object with
            ``input_ids``.
    """
    def count_tokens(chunk: str) -> int:
        return len(tokenizer(chunk, return_tensors=None, add_special_tokens=False).input_ids)

    threshold = int(soft_cap_tokens * apply_only_if_ratio)
    if count_tokens(text) <= threshold:
        return text

    parts = text.splitlines()
    if len(parts) <= min_lines_keep:
        return text

    header_prefixes = ("[DETECTED_CUES]", "[/DETECTED_CUES]", "EMAIL_", "CONTACT_", "MEETING:")
    keep_idx = [
        i for i, ln in enumerate(parts)
        if ln.startswith(header_prefixes)
        or EMAIL_RX.search(ln) or MEET_RX.search(ln)
        or DATE_RX.search(ln) or TIME_RX.search(ln)
    ]

    if len(keep_idx) < min_lines_keep:
        pad_needed = min_lines_keep - len(keep_idx)
        kept = set(keep_idx)
        filler = [i for i in range(len(parts)) if i not in kept][:pad_needed]
        # Merge by index so padding lines sit where they occurred in the
        # transcript instead of being tacked on after the cue lines.
        keep_idx = sorted(kept.union(filler))

    candidate = "\n".join(parts[i] for i in keep_idx)
    cand_tokens = count_tokens(candidate)

    if cand_tokens > threshold:
        mid = len(parts) // 2
        half = max(min_lines_keep // 2, 50)
        middle_window = "\n".join(parts[max(0, mid - half): min(len(parts), mid + half)])
        if count_tokens(middle_window) < cand_tokens:
            candidate = middle_window

    if len(candidate.splitlines()) < min_lines_keep:
        return text
    return candidate
# Vocabulary that must appear in the transcript for a KYC-activity label to stand.
_KYC_RX = re.compile(
    r'\b(kyc|aml|compliance|employer|occupation|purpose of (relationship|account)|source of (wealth|funds)|net worth|total assets)\b',
    re.I,
)


def enforce_rules(labels, transcript_text):
    """Post-correct predicted labels with transcript-based heuristics.

    ``labels`` may be ``None``, a single string, or any iterable; non-string
    items are ignored. Rules applied:

    * A meeting keyword plus a time or date in the transcript forces
      ``schedule_meeting`` and removes ``plan_contact``.
    * An e-mail address plus an update-style keyword adds
      ``update_contact_info_non_postal``.
    * ``update_kyc_activity`` is removed unless the transcript contains KYC
      vocabulary.

    Returns the resulting labels as a sorted list.
    """
    if isinstance(labels, str):
        # A lone string is one label; set("abc") would split it into characters.
        candidates = [labels]
    else:
        try:
            candidates = list(labels or [])
        except TypeError:
            candidates = []  # not iterable (e.g. a number)
    # Non-string entries (nested dicts/lists, numbers) are not labels and could
    # be unhashable, so drop them before building the set.
    label_set = {lab for lab in candidates if isinstance(lab, str)}

    has_when = TIME_RX.search(transcript_text) or DATE_RX.search(transcript_text)
    if has_when and MEET_RX.search(transcript_text):
        label_set.add("schedule_meeting")
        label_set.discard("plan_contact")

    if EMAIL_RX.search(transcript_text) and re.search(
        r'\b(update|new|set|change|confirm(ed)?|for all communication)\b',
        transcript_text, re.I,
    ):
        label_set.add("update_contact_info_non_postal")

    if "update_kyc_activity" in label_set and not _KYC_RX.search(transcript_text):
        label_set.discard("update_kyc_activity")

    return sorted(label_set)
+# ---------------- Gradio app logic ----------------
# Loaded models keyed by repo id (at most one per repo).
MODEL_CACHE: Dict[str, "HFModel"] = {}
# Load settings each cached model was built with: (load_4bit, dtype, trust_remote_code).
_MODEL_CACHE_SETTINGS: Dict[str, Tuple[bool, str, bool]] = {}


def get_model(repo_id: str, load_4bit: bool, dtype: str, trust_remote_code: bool):
    """Return a cached HFModel for ``repo_id``, loading it when necessary.

    The cached instance is reused only when it was loaded with the same
    ``load_4bit`` / ``dtype`` / ``trust_remote_code`` settings. When the
    settings differ, the stale entry is dropped and the model is loaded again
    with the requested settings, so changed options are never silently ignored.
    """
    settings = (bool(load_4bit), str(dtype), bool(trust_remote_code))
    cached = MODEL_CACHE.get(repo_id)
    # An entry with no recorded settings (inserted directly into MODEL_CACHE)
    # is treated as matching.
    if cached is not None and _MODEL_CACHE_SETTINGS.get(repo_id, settings) == settings:
        return cached

    # Release our references to the stale model before loading its replacement
    # so the cache does not keep two copies of the same repo alive.
    MODEL_CACHE.pop(repo_id, None)
    _MODEL_CACHE_SETTINGS.pop(repo_id, None)
    cached = None

    model = HFModel(repo_id, load_4bit=load_4bit, dtype=dtype, trust_remote_code=trust_remote_code)
    MODEL_CACHE[repo_id] = model
    _MODEL_CACHE_SETTINGS[repo_id] = settings
    return model
def parse_zip(zip_bytes: "bytes | str | os.PathLike | IO[bytes]") -> Dict[str, Tuple[str, List[str]]]:
    """Read transcript/label pairs from a ZIP archive.

    Args:
        zip_bytes: the archive as raw bytes, a filesystem path, an object
            whose ``name`` attribute is the path of an existing file, or a
            binary file-like object.

    Returns:
        Mapping ``sample_id -> (transcript_text, expected_labels)`` where
        ``sample_id`` is the file stem. Pairs are ``<id>.txt`` and
        ``<id>.json``; the JSON is optional and an unparsable one yields no
        expected labels. Directory entries, anything under ``__MACOSX`` and
        ``._*`` files are skipped. Files are decoded as UTF-8 (a leading BOM
        is dropped; undecodable bytes are replaced).

    Raises:
        zipfile.BadZipFile / OSError: when the archive cannot be opened.
    """
    if isinstance(zip_bytes, (bytes, bytearray, memoryview)):
        source = io.BytesIO(bytes(zip_bytes))
    elif isinstance(zip_bytes, (str, os.PathLike)):
        source = zip_bytes
    else:
        # Upload widgets may hand over a temp-file wrapper; prefer its path.
        path = getattr(zip_bytes, "name", None)
        source = path if isinstance(path, str) and os.path.isfile(path) else zip_bytes

    collected: Dict[str, list] = {}
    with zipfile.ZipFile(source) as zf:
        for info in zf.infolist():
            if info.is_dir():
                continue
            p = Path(info.filename)
            # Archiver metadata is not sample data.
            if "__MACOSX" in p.parts or p.name.startswith("._"):
                continue
            suffix = p.suffix.lower()
            if suffix not in (".txt", ".json"):
                continue

            content = zf.read(info).decode("utf-8-sig", "replace")
            entry = collected.setdefault(p.stem, ["", []])
            if suffix == ".txt":
                entry[0] = content
            else:
                try:
                    js = json.loads(content)
                except (ValueError, RecursionError):
                    js = []  # best-effort: treat unreadable JSON as "no labels"
                entry[1] = _coerce_labels_list(js)

    return {sample_id: (text, labels) for sample_id, (text, labels) in collected.items()}
def _preprocess_transcript(transcript_text, tokenizer, soft_cap, pre_window, add_cues, strip_smalltalk):
    """Normalise a transcript, prune it around cue lines, optionally prepend the cues header, then apply the soft token cap."""
    t_norm = normalize_text(transcript_text, fix_typos=True)
    lines = [ln.strip() for ln in t_norm.splitlines() if ln.strip()]
    cue_lines = find_cue_lines(lines)
    if cue_lines:
        lines_kept = prune_by_window(lines, cue_lines, window=pre_window, strip_smalltalk=strip_smalltalk)
    else:
        lines_kept = [ln for ln in lines if not (strip_smalltalk and SMALLTALK_RX.search(ln))]
    t_kept = "\n".join(lines_kept)
    header = build_cues_header(extract_cues(t_kept)) if add_cues else ""
    proc_text = (header + "\n\n" + t_kept).strip() if header else t_kept
    proc_text = shrink_to_token_cap_by_lines(proc_text, soft_cap, tokenizer)
    # Very short results fall back to the full normalised transcript.
    if len(proc_text.splitlines()) < 30:
        return t_norm
    return proc_text


def _extract_pred_labels(parsed):
    """Pull a flat list of label strings out of parsed model output of any shape."""
    raw = parsed.get("labels", []) if isinstance(parsed, dict) else parsed
    if isinstance(raw, str):
        return [raw]
    return _coerce_labels_list(raw)


def _score_columns(pred_labels, exp_labels):
    """Build the label and metric columns shared by per-run and summary rows."""
    exact, prec, rec, f1, ham = classic_metrics(pred_labels, exp_labels)
    ubs = ubs_score_one(exp_labels, pred_labels)
    return {
        "pred_labels": json.dumps(pred_labels or [], ensure_ascii=False),
        "exp_labels": json.dumps(exp_labels or [], ensure_ascii=False),
        "exact_match": exact,
        "precision": round(prec, 6),
        "recall": round(rec, 6),
        "f1": round(f1, 6),
        "hamming": round(ham, 6),
        "ubs_score": round(ubs, 6),
    }


def run_batch_ui(models_str, instructions_text, context_text, dataset_zip,
                 soft_cap, preprocess, pre_window, add_cues, strip_smalltalk,
                 repeats, max_total_runs, load_4bit, dtype, trust_remote_code):
    """Run every model over every transcript and collect metrics and artifacts.

    For each model (comma-separated in ``models_str``) and each non-empty
    transcript in ``dataset_zip``, the transcript is optionally preprocessed
    once, then the model is queried ``repeats`` times. Each run appends a
    result row and writes PREPROCESSED.txt, MODEL_OUTPUT.raw.txt and
    FINAL.json into an in-memory artifacts ZIP. After the runs for a sample a
    summary row is appended with the median latency and the metrics of the
    last run. Processing stops once ``max_total_runs`` model calls were made.

    Returns:
        On success ``(DataFrame, ("results.csv", csv_bytes), artifacts_zip_bytes)``.
        On a usage/input error ``(empty DataFrame, None, message)``.
    """
    if not dataset_zip:
        return pd.DataFrame(), None, "Please upload a ZIP with *.txt (+ optional matching *.json)."
    models = [m.strip() for m in (models_str or "").split(",") if m.strip()]
    if not models:
        return pd.DataFrame(), None, "Please enter at least one model repo id (e.g., mistralai/Mistral-7B-Instruct-v0.2)."
    try:
        samples = parse_zip(dataset_zip)
    except Exception as e:
        return pd.DataFrame(), None, f"Failed to read ZIP: {e}"

    # UI widgets may deliver numeric values as floats; range() and the token
    # cap need ints.
    repeats = int(repeats)
    max_total_runs = int(max_total_runs)
    pre_window = int(pre_window)
    soft_cap = int(soft_cap)

    system_text = (instructions_text or "").strip()
    context_prefix = (context_text or "").strip()
    config_columns = {
        "preprocess": preprocess,
        "pre_window": pre_window,
        "add_cues_header": add_cues,
        "strip_smalltalk": strip_smalltalk,
        "soft_cap": soft_cap,
    }

    rows = []
    total_runs = 0
    all_artifacts = io.BytesIO()
    # The context manager finalises the archive even if a model call raises.
    with zipfile.ZipFile(all_artifacts, "w", zipfile.ZIP_DEFLATED) as zout:
        for repo_id in models:
            if total_runs >= max_total_runs:
                break  # budget spent: do not load further models
            hf = get_model(repo_id, load_4bit=load_4bit, dtype=dtype, trust_remote_code=trust_remote_code)
            for sample_id, (transcript_text, exp_labels) in samples.items():
                if total_runs >= max_total_runs:
                    break
                if not transcript_text.strip():
                    continue

                # Preprocessing depends only on the transcript and settings,
                # so do it once per sample rather than once per repeat.
                before_tok = hf.encode_len(transcript_text)
                if preprocess:
                    proc_text = _preprocess_transcript(
                        transcript_text, hf.tokenizer, soft_cap,
                        pre_window, add_cues, strip_smalltalk,
                    )
                else:
                    proc_text = transcript_text
                after_tok = hf.encode_len(proc_text)
                user_text = context_prefix + "\n\nTRANSCRIPT\n" + proc_text.strip()

                latencies = []
                last_pred = None
                for r in range(1, repeats + 1):
                    if total_runs >= max_total_runs:
                        break
                    t0 = time.perf_counter()
                    _, raw_text, _ = hf.generate_json(system_text, user_text, max_new_tokens=256)
                    latency_ms = int((time.perf_counter() - t0) * 1000)  # includes tokenization overhead

                    pred_labels = enforce_rules(_extract_pred_labels(safe_json_load(raw_text)), proc_text)
                    rows.append({
                        "timestamp": pd.Timestamp.now().isoformat(timespec="seconds"),
                        "sample_id": sample_id,
                        "model": repo_id,
                        "is_summary": False,
                        "run_index": r,
                        **config_columns,
                        "latency_ms": latency_ms,
                        "token_before": before_tok,
                        "token_after": after_tok,
                        "model_calls": 1,
                        **_score_columns(pred_labels, exp_labels),
                    })

                    # artifacts
                    base = f"{repo_id.replace('/','_')}/{sample_id}/pre{int(preprocess)}_win{pre_window}_cues{int(add_cues)}_small{int(strip_smalltalk)}_cap{soft_cap}_r{r}"
                    zout.writestr(base + "/PREPROCESSED.txt", proc_text)
                    zout.writestr(base + "/MODEL_OUTPUT.raw.txt", raw_text)
                    final_json = {
                        "labels": pred_labels,
                        "diagnostics": {
                            "model_name": repo_id,
                            "latency_ms": latency_ms,
                            "token_in_est_before": before_tok,
                            "token_in_est_after": after_tok,
                            "preprocess": preprocess,
                            "pre_window": pre_window,
                            "pre_add_cues_header": add_cues if preprocess else False,
                            "pre_strip_smalltalk": strip_smalltalk if preprocess else False,
                            "pre_soft_token_cap": soft_cap if preprocess else None,
                            "model_calls": 1
                        }
                    }
                    zout.writestr(base + "/FINAL.json", json.dumps(final_json, ensure_ascii=False, indent=2))

                    latencies.append(latency_ms)
                    last_pred = pred_labels
                    total_runs += 1

                if latencies:
                    # Summary row: median latency, metrics of the last run.
                    rows.append({
                        "timestamp": pd.Timestamp.now().isoformat(timespec="seconds"),
                        "sample_id": sample_id,
                        "model": repo_id,
                        "is_summary": True,
                        "run_index": None,
                        **config_columns,
                        "median_latency_ms": int(statistics.median(latencies)),
                        "latency_ms": None,
                        "token_before": None,
                        "token_after": None,
                        "model_calls": None,
                        **_score_columns(last_pred, exp_labels),
                    })

    if not rows:
        return pd.DataFrame(), None, "No non-empty transcripts (*.txt) were found in the ZIP."

    df = pd.DataFrame(rows)
    csv_bytes = df.to_csv(index=False).encode("utf-8")
    return df, ("results.csv", csv_bytes), all_artifacts.getvalue()
+# ---------------- Gradio UI ----------------
with gr.Blocks(title="From Talk to Task — HF Space") as demo:
    gr.Markdown("# From Talk to Task — Batch Task Extraction (Hugging Face Space)")
    with gr.Row():
        models = gr.Textbox(label="Models (comma-separated HF repo IDs)", value="mistralai/Mistral-7B-Instruct-v0.2")
    with gr.Row():
        instructions = gr.Textbox(label="Instructions (System)", lines=8, value=(
            "You are a task extraction assistant. "
            "Always output valid JSON with a field \"labels\" (list of strings). "
            "Use only from this set: "
            + json.dumps(ALLOWED_LABELS)
            + ". Return JSON only."
        ))
    with gr.Row():
        context = gr.Textbox(label="Context (User prefix before transcript)", lines=6, value=(
            "- plan_contact: conversation without a concrete meeting (no date/time)\n"
            "- schedule_meeting: explicit date/time/modality confirmation\n"
            "- update_contact_info_non_postal: changes to email/phone\n"
            "- update_contact_info_postal_address: changes to mailing address\n"
            "- update_kyc_*: KYC updates (activity, purpose, origin of assets, total assets)"
        ))
    with gr.Row():
        # type="binary" delivers the upload as bytes, which is what
        # run_batch_ui passes on to parse_zip.
        dataset_zip = gr.File(
            label="Upload ZIP of transcripts (*.txt) + expected (*.json)",
            file_types=[".zip"],
            type="binary",
        )
    gr.Markdown("### Parameters")
    with gr.Row():
        soft_cap = gr.Slider(1024, 32768, value=8192, step=512, label="Soft token cap")
        preprocess = gr.Checkbox(value=True, label="Enable preprocessing")
        pre_window = gr.Slider(0, 6, value=3, step=1, label="Window ± lines around cues")
        add_cues = gr.Checkbox(value=True, label="Add cues header")
        strip_smalltalk = gr.Checkbox(value=False, label="Strip smalltalk")
    with gr.Row():
        repeats = gr.Slider(1, 6, value=4, step=1, label="Repeats per config")
        max_total_runs = gr.Slider(1, 200, value=40, step=1, label="Max total runs")
    gr.Markdown("### Model loading")
    with gr.Row():
        load_4bit = gr.Checkbox(value=True, label="Load in 4-bit (bitsandbytes, GPU)")
        dtype = gr.Dropdown(choices=["bfloat16","float16","float32"], value="bfloat16", label="Compute dtype")
        # SECURITY NOTE(review): with this enabled, model repositories named in
        # the "Models" box can execute their own Python code at load time.
        # Consider defaulting to False unless a model requires it.
        trust_remote_code = gr.Checkbox(value=True, label="Trust remote code")
    run_btn = gr.Button("Run Batch")
    with gr.Row():
        table = gr.Dataframe(label="Results", interactive=False, wrap=True, height=400)
    with gr.Row():
        csv_dl = gr.File(label="Download CSV", interactive=False)
        zip_dl = gr.File(label="Download Artifacts ZIP", interactive=False)
    status = gr.Markdown("")

    def _run(*args):
        """Adapt run_batch_ui's result to the four output components.

        Returns ``(dataframe, csv_path, zip_path, status_text)``. On failure
        the table is empty, both downloads are cleared and the status carries
        the error message.
        """
        import tempfile

        df, csv_pair, third = run_batch_ui(*args)
        succeeded = isinstance(df, pd.DataFrame) and not df.empty and bool(csv_pair)
        if not succeeded:
            # On failure run_batch_ui puts its message in the third slot.
            message = third if isinstance(third, str) else "No results were produced."
            return pd.DataFrame(), None, None, message

        # File outputs are given filesystem paths, so write the payloads to a
        # fresh temporary directory.
        csv_name, csv_data = csv_pair
        out_dir = Path(tempfile.mkdtemp(prefix="talk2task_"))
        csv_path = out_dir / csv_name
        csv_path.write_bytes(csv_data)
        zip_path = out_dir / "artifacts.zip"
        zip_path.write_bytes(third)
        return df, str(csv_path), str(zip_path), "Done."

    run_btn.click(
        _run,
        inputs=[models, instructions, context, dataset_zip,
                soft_cap, preprocess, pre_window, add_cues, strip_smalltalk,
                repeats, max_total_runs, load_4bit, dtype, trust_remote_code],
        outputs=[table, csv_dl, zip_dl, status]
    )

# Launch only when executed as a script so importing this module (for tests or
# tooling) does not start a web server.
if __name__ == "__main__":
    demo.queue().launch()