RishiRP committed on
Commit 62c9ed8 · verified · 1 Parent(s): f214cbe

Update app.py

Files changed (1)
  1. app.py +353 -880
app.py CHANGED
@@ -1,937 +1,410 @@
1
- # app.py — From Talk to Task (robust snapshot loader, revision pinning)
2
- # Keeps your full feature set: Single + Batch, preprocessing, metrics, UBS score, artifacts.
3
- # Key fix: models are downloaded atomically via snapshot_download at a pinned revision
4
- # and then loaded from local dir to avoid partial shard errors (e.g., *-00003-of-00003.safetensors).
5
-
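For reference, the snapshot-first loading that the removed header describes follows a standard huggingface_hub pattern. A minimal sketch, assuming huggingface_hub and transformers are installed; the repo id and revision below are placeholders:

    from huggingface_hub import snapshot_download
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Fetch every required file at a pinned revision first, so loading never sees
    # a half-downloaded shard, then point transformers at the local directory.
    local_dir = snapshot_download(
        repo_id="Qwen/Qwen2.5-7B-Instruct",   # placeholder repo id
        revision="main",                      # pin a commit hash here to freeze the weights
        allow_patterns=["*.json", "*.safetensors", "*.model", "tokenizer.*"],
    )
    tok = AutoTokenizer.from_pretrained(local_dir)
    model = AutoModelForCausalLM.from_pretrained(local_dir, device_map="auto")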
6
- import os, io, re, sys, time, json, zipfile, statistics
 
7
  from pathlib import Path
8
- from typing import List, Dict, Tuple, Union, Optional
9
 
10
  import gradio as gr
11
- import pandas as pd
12
- import torch
13
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
14
- from huggingface_hub import snapshot_download # <-- robust model fetch
15
 
16
- # ========= ZeroGPU support =========
17
- try:
18
- import spaces # available on HF Spaces
19
- except Exception:
20
- class _DummySpaces:
21
- def GPU(self, *args, **kwargs):
22
- def deco(f): return f
23
- return deco
24
- spaces = _DummySpaces()
25
-
26
- # ========= Persistent cache for Spaces =========
27
- # Ensures model files survive restarts and prevents re-downloading shards.
28
- os.environ.setdefault("HF_HOME", "/data/.cache/huggingface")
29
-
30
- # ========= Auth token =========
31
- HF_TOKEN = (
32
- os.getenv("HF_TOKEN")
33
- or os.getenv("HUGGINGFACE_HUB_TOKEN")
34
- or os.getenv("HUGGINGFACEHUB_API_TOKEN")
35
- )
36
 
37
- # Console warning at startup (helps when logs are open)
38
- if not HF_TOKEN:
39
- print(
40
- "[WARN] HF_TOKEN is not set. Gated models will fail. "
41
- "Set it in Space → Settings → Variables and secrets.",
42
- file=sys.stderr
 
43
  )
44
 
45
- # ========= Labels & metrics =========
46
- ALLOWED_LABELS = [
47
- "plan_contact",
48
- "schedule_meeting",
49
- "update_contact_info_non_postal",
50
- "update_contact_info_postal_address",
51
- "update_kyc_activity",
52
- "update_kyc_origin_of_assets",
53
- "update_kyc_purpose_of_businessrelation",
54
- "update_kyc_total_assets",
55
- ]
56
- LABEL_TO_IDX = {l: i for i, l in enumerate(ALLOWED_LABELS)}
57
- FN_PENALTY = 2.0
58
- FP_PENALTY = 1.0
59
 
60
- def safe_json_load(s: str):
61
- """Best-effort JSON extractor; returns {'labels': []} shape on fallback."""
62
- try:
63
- return json.loads(s)
64
- except Exception:
65
- pass
66
- m = re.search(r"\{.*\}", s, re.S)
67
- if m:
68
- try:
69
- return json.loads(m.group(0))
70
- except Exception:
71
- pass
72
- return {"labels": [], "notes": "WARN: model output not valid JSON; fallback used"}
73
-
74
- def _coerce_labels_list(x):
75
- if isinstance(x, list):
76
- out = []
77
- for it in x:
78
- if isinstance(it, str): out.append(it)
79
- elif isinstance(it, dict):
80
- for k in ("label", "value", "task", "category", "name"):
81
- v = it.get(k)
82
- if isinstance(v, str):
83
- out.append(v); break
84
- else:
85
- if isinstance(it.get("labels"), list):
86
- out += [s for s in it["labels"] if isinstance(s, str)]
87
- # dedupe keep order
88
- seen = set(); norm = []
89
- for s in out:
90
- if s not in seen:
91
- norm.append(s); seen.add(s)
92
- return norm
93
- if isinstance(x, dict):
94
- for k in ("expected_labels", "labels", "targets", "y_true"):
95
- if k in x: return _coerce_labels_list(x[k])
96
- if "one_hot" in x and isinstance(x["one_hot"], dict):
97
- return [k for k, v in x["one_hot"].items() if v]
98
- return []
99
-
100
- def classic_metrics(pred_labels, exp_labels):
101
- pred = set([str(x) for x in (pred_labels or []) if isinstance(x, (str,int,float,bool))])
102
- gold = set([str(x) for x in (exp_labels or []) if isinstance(x, (str,int,float,bool))])
103
- if not pred and not gold:
104
- return True, 1.0, 1.0, 1.0, 1.0
105
- inter = pred & gold; union = pred | gold
106
- exact = (sorted(pred) == sorted(gold))
107
- precision = (len(inter) / (len(pred) if pred else 1e-9))
108
- recall = (len(inter) / (len(gold) if gold else 1e-9))
109
- f1 = 0.0 if len(inter) == 0 else 2*len(inter) / (len(pred)+len(gold)+1e-9)
110
- hamming = (len(inter) / (len(union) if union else 1e-9))
111
- return exact, precision, recall, f1, hamming
112
-
113
- def ubs_score_one(true_labels, pred_labels) -> float:
114
- tset = [l for l in (true_labels or []) if l in LABEL_TO_IDX]
115
- pset = [l for l in (pred_labels or []) if l in LABEL_TO_IDX]
116
- n_labels = len(ALLOWED_LABELS)
117
- tpos = set(tset); ppos = set(pset)
118
- fn = sum(1 for l in ALLOWED_LABELS if (l in tpos and l not in ppos))
119
- fp = sum(1 for l in ALLOWED_LABELS if (l not in tpos and l in ppos))
120
- weighted = FN_PENALTY*fn + FP_PENALTY*fp
121
- t_count = len(tpos)
122
- max_err = FN_PENALTY*t_count + FP_PENALTY*(n_labels - t_count)
123
- score = 1.0 if max_err == 0 else (1.0 - (weighted / max_err))
124
- return float(max(0.0, min(1.0, score)))
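A quick worked example of the score above, using two labels from ALLOWED_LABELS; the arithmetic simply mirrors the penalties defined earlier (FN_PENALTY=2.0, FP_PENALTY=1.0, 8 allowed labels):

    # One missed gold label (FN) and one spurious prediction (FP).
    true_labels = ["schedule_meeting", "update_kyc_total_assets"]   # t_count = 2
    pred_labels = ["schedule_meeting", "plan_contact"]

    weighted = 2.0 * 1 + 1.0 * 1         # FN_PENALTY*fn + FP_PENALTY*fp = 3.0
    max_err  = 2.0 * 2 + 1.0 * (8 - 2)   # worst case for this gold set = 10.0
    score    = 1.0 - weighted / max_err  # 0.7
    assert abs(ubs_score_one(true_labels, pred_labels) - 0.7) < 1e-9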
125
-
126
- # ========= Lightweight preprocessing =========
127
- EMAIL_RX = re.compile(r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b', re.I)
128
- TIME_RX = re.compile(r'\b(\d{1,2}:\d{2}\b|\b\d{1,2}\s?(am|pm)\b|\bafternoon\b|\bmorning\b|\bevening\b)', re.I)
129
- DATE_RX = re.compile(r'\b(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)\b|\b\d{1,2}[/-]\d{1,2}([/-]\d{2,4})?\b|\b20\d{2}\b', re.I)
130
- MEET_RX = re.compile(r'\b(meet(ing)?|call|appointment|schedule|invite|agenda|online|in[- ]?person|phone|zoom|teams)\b', re.I)
131
- MODAL_RX = re.compile(r'\b(online|in[- ]?person|phone|zoom|teams)\b', re.I)
132
- SMALLTALK_RX = re.compile(r'^\s*(user|advisor):\s*(thanks( you)?|thank you|anything else|have a great day|you too)\b', re.I)
133
-
134
- TYPO_FIXES = [
135
- (re.compile(r'\bschedulin\s*g\b', re.I), 'scheduling'),
136
- (re.compile(r'\beeting\b', re.I), 'meeting'),
137
- (re.compile(r'\bdi?i?gtal\b', re.I), 'digital'),
138
- (re.compile(r'\bdigi\s+tal\b', re.I), 'digital'),
139
- (re.compile(r'\bspread\s*sheet\b', re.I), 'spreadsheet'),
140
- (re.compile(r'\bseats\b', re.I), 'sheets'),
141
- (re.compile(r'\bver(s|z)ion meters\b', re.I), 'version metrics'),
142
  ]
143
 
144
- def normalize_text(text: str, fix_typos: bool = True) -> str:
145
- t = text.replace('\r\n', '\n')
146
- t = re.sub(r'^\s*Speaker\s*1\s*:\s*', 'USER: ', t, flags=re.I | re.M)
147
- t = re.sub(r'^\s*Speaker\s*2\s*:\s*', 'ADVISOR: ', t, flags=re.I | re.M)
148
- t = re.sub(r'[ \t]+', ' ', t)
149
- t = re.sub(r'\n{3,}', '\n\n', t)
150
- if fix_typos:
151
- for rx, rep in TYPO_FIXES:
152
- t = rx.sub(rep, t)
153
- return t.strip()
154
-
155
- def extract_cues(text: str):
156
- emails = EMAIL_RX.findall(text)
157
- email_new, email_old = (emails[-1], emails[-2]) if len(emails)>=2 else ((emails[-1], None) if emails else (None, None))
158
- has_time = bool(TIME_RX.search(text))
159
- has_date = bool(DATE_RX.search(text))
160
- has_meet = bool(MEET_RX.search(text))
161
- modality = None
162
- m = MODAL_RX.search(text)
163
- if m:
164
- modality = m.group(0).upper().replace('IN PERSON','IN_PERSON').replace('IN-PERSON','IN_PERSON')
165
- meeting_confirmed = (has_meet and (has_time or has_date))
166
- tm = TIME_RX.search(text)
167
- norm_tm = tm.group(0) if tm else None
168
- return {
169
- "email_new": email_new,
170
- "email_old": email_old,
171
- "contact_pref": "EMAIL" if email_new else None,
172
- "meeting_time_fragment": norm_tm,
173
- "meeting_modality": modality,
174
- "meeting_confirmed": meeting_confirmed
175
- }
176
-
177
- def build_cues_header(cues: dict) -> str:
178
- has_any = any([cues.get("email_new"), cues.get("email_old"), cues.get("contact_pref"), cues.get("meeting_confirmed")])
179
- if not has_any:
180
- return ""
181
- lines = ["[DETECTED_CUES]"]
182
- if cues.get("email_new"): lines.append(f"EMAIL_NEW: {cues['email_new']}")
183
- if cues.get("email_old"): lines.append(f"EMAIL_OLD: {cues['email_old']}")
184
- if cues.get("contact_pref"): lines.append(f"CONTACT_PREF: {cues['contact_pref']}")
185
- if cues.get("meeting_confirmed"):
186
- mod = cues.get("meeting_modality") or ""
187
- tm = cues.get("meeting_time_fragment") or ""
188
- lines.append(f"MEETING: {(tm + ' ' + mod).strip()} CONFIRMED")
189
- lines.append("[/DETECTED_CUES]")
190
- return "\n".join(lines)
191
-
192
- def find_cue_lines(lines):
193
- idx = set()
194
- for i, ln in enumerate(lines):
195
- if EMAIL_RX.search(ln) or (MEET_RX.search(ln) and (TIME_RX.search(ln) or DATE_RX.search(ln))):
196
- idx.add(i)
197
- return sorted(idx)
198
-
199
- def prune_by_window(lines, cue_idx, window=3, strip_smalltalk=False):
200
- n = len(lines); keep = set()
201
- for k in cue_idx:
202
- lo, hi = max(0, k-window), min(n-1, k+window)
203
- keep.update(range(lo,hi+1))
204
- out=[]
205
- for i, ln in enumerate(lines):
206
- if i in keep:
207
- if strip_smalltalk and SMALLTALK_RX.search(ln): continue
208
- out.append(ln)
209
- return out
210
-
211
- def shrink_to_token_cap_by_lines(text: str, soft_cap_tokens: int, tokenizer,
212
- min_lines_keep: int = 30,
213
- apply_only_if_ratio: float = 1.15) -> str:
214
- ids = tokenizer(text, return_tensors=None, add_special_tokens=False).input_ids
215
- est = len(ids)
216
- threshold = int(soft_cap_tokens * apply_only_if_ratio)
217
- if est <= threshold: return text
218
- parts = text.splitlines()
219
- if len(parts) <= min_lines_keep: return text
220
-
221
- keep_flags=[]
222
- for ln in parts:
223
- is_header = ln.startswith("[DETECTED_CUES]") or ln.startswith("[/DETECTED_CUES]") \
224
- or ln.startswith("EMAIL_") or ln.startswith("CONTACT_") or ln.startswith("MEETING:")
225
- is_cue = bool(EMAIL_RX.search(ln) or MEET_RX.search(ln) or DATE_RX.search(ln) or TIME_RX.search(ln))
226
- keep_flags.append(is_header or is_cue)
227
-
228
- pruned = [ln for ln, keep in zip(parts, keep_flags) if keep]
229
- if len(pruned) < min_lines_keep:
230
- pad_needed = min_lines_keep - len(pruned)
231
- non_cue_lines = [ln for ln, keep in zip(parts, keep_flags) if not keep]
232
- pruned = pruned + non_cue_lines[:pad_needed]
233
-
234
- candidate = "\n".join(pruned)
235
- cand_tokens = len(tokenizer(candidate, return_tensors=None, add_special_tokens=False).input_ids)
236
- if cand_tokens > threshold:
237
- mid = len(parts)//2
238
- half = max(min_lines_keep//2, 50)
239
- slice_parts = parts[max(0, mid-half): min(len(parts), mid+half)]
240
- candidate2 = "\n".join(slice_parts)
241
- candidate2_tokens = len(tokenizer(candidate2, return_tensors=None, add_special_tokens=False).input_ids)
242
- candidate = candidate if cand_tokens <= candidate2_tokens else candidate2
243
-
244
- if len(candidate.splitlines()) < min_lines_keep: return text
245
- return candidate
246
-
247
- def enforce_rules(labels, transcript_text):
248
- labels = set(labels or [])
249
- if (TIME_RX.search(transcript_text) or DATE_RX.search(transcript_text)) and MEET_RX.search(transcript_text):
250
- labels.add("schedule_meeting"); labels.discard("plan_contact")
251
- if EMAIL_RX.search(transcript_text) and re.search(r'\b(update|new|set|change|confirm(ed)?|for all communication)\b', transcript_text, re.I):
252
- labels.add("update_contact_info_non_postal")
253
- kyc_rx = re.compile(r'\b(kyc|aml|compliance|employer|occupation|purpose of (relationship|account)|source of (wealth|funds)|net worth|total assets)\b', re.I)
254
- if "update_kyc_activity" in labels and not kyc_rx.search(transcript_text):
255
- labels.discard("update_kyc_activity")
256
- return sorted(labels)
257
-
258
- # ========= Revision pinning =========
259
- # Map repo_id -> default revision (None -> "main").
260
- MODEL_REVISIONS: Dict[str, Optional[str]] = {
261
- "mistralai/Mistral-7B-Instruct-v0.2": None, # set an env var to pin a commit if desired
262
- "Qwen/Qwen2.5-7B-Instruct": None,
263
  "HuggingFaceH4/zephyr-7b-beta": None,
 
264
  "tiiuae/falcon-7b-instruct": None,
 
265
  }
266
 
267
- def _slug_repo_id(repo_id: str) -> str:
268
- return re.sub(r"[^A-Za-z0-9]", "_", repo_id).upper()
269
-
270
- def resolve_revision(repo_id: str) -> str:
271
- """Order: env var MODEL_REVISION__<ORG_MODEL> > dict default > 'main'."""
272
- env_key = f"MODEL_REVISION__{_slug_repo_id(repo_id)}"
273
- env_rev = os.getenv(env_key, "").strip()
274
- if env_rev:
275
- return env_rev
276
- default_rev = MODEL_REVISIONS.get(repo_id)
277
- return (default_rev.strip() if isinstance(default_rev, str) and default_rev.strip() else "main")
278
-
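For illustration, the override key produced by _slug_repo_id for one of the presets, with a placeholder commit hash; the second assert shows the "main" fallback when neither an env var nor a dict default is set:

    import os

    # "HuggingFaceH4/zephyr-7b-beta" slugs to HUGGINGFACEH4_ZEPHYR_7B_BETA.
    os.environ["MODEL_REVISION__HUGGINGFACEH4_ZEPHYR_7B_BETA"] = "0123456789abcdef"  # placeholder hash
    assert resolve_revision("HuggingFaceH4/zephyr-7b-beta") == "0123456789abcdef"
    assert resolve_revision("tiiuae/falcon-7b-instruct") == "main"  # no pin anywhere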
279
- def ensure_local_dir(repo_id: str) -> str:
280
- """Download a pinned snapshot to cache and return its local path."""
281
- rev = resolve_revision(repo_id)
282
- local_dir = snapshot_download(
283
- repo_id=repo_id,
284
- revision=rev,
285
- allow_patterns=[
286
- "*.json", "*.safetensors", "*.bin", "*.model",
287
- "tokenizer.*", "config.json", "generation_config.json", "*.py"
288
- ],
289
- resume_download=True,
290
- local_dir=None, # use HF cache under HF_HOME
291
- local_dir_use_symlinks=False,
292
- token=HF_TOKEN,
293
  )
294
- return local_dir
295
 
296
- # ========= HF model wrapper (loads from local snapshot) =========
297
  class HFModel:
298
- def __init__(self, repo_id: str, load_4bit: bool, dtype: str, trust_remote_code: bool):
299
  self.repo_id = repo_id
300
- self.revision = resolve_revision(repo_id)
301
- # Always load from a complete local snapshot to avoid partial shards
302
- self.local_dir = ensure_local_dir(repo_id)
303
 
304
- self.tokenizer = AutoTokenizer.from_pretrained(
305
- self.local_dir, use_fast=True, trust_remote_code=trust_remote_code, token=HF_TOKEN
306
- )
307
- torch_dtype = {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32}.get(dtype, torch.bfloat16)
308
 
309
- self.model = None
310
- if load_4bit:
311
- try:
312
- q = BitsAndBytesConfig(
313
- load_in_4bit=True, bnb_4bit_use_double_quant=True,
314
- bnb_4bit_compute_dtype=torch_dtype, bnb_4bit_quant_type="nf4"
315
- )
316
- self.model = AutoModelForCausalLM.from_pretrained(
317
- self.local_dir, device_map="auto", trust_remote_code=trust_remote_code,
318
- quantization_config=q, torch_dtype=torch_dtype, token=HF_TOKEN
319
- )
320
- except Exception as e:
321
- print(f"[WARN] 4-bit load failed for {repo_id}@{self.revision}: {e}\nFalling back to normal load...", file=sys.stderr)
322
- if self.model is None:
323
  self.model = AutoModelForCausalLM.from_pretrained(
324
- self.local_dir, device_map="auto", trust_remote_code=trust_remote_code,
325
- torch_dtype=torch_dtype, token=HF_TOKEN
326
  )
327
-
328
- self.max_context = getattr(self.model.config, "max_position_embeddings", None) \
329
- or getattr(self.model.config, "max_sequence_length", None) or 8192
330
-
331
- def apply_chat_template(self, system_text: str, user_text: str) -> str:
332
- if getattr(self.tokenizer, "chat_template", None):
333
- messages = [{"role":"system","content":system_text},
334
- {"role":"user","content":user_text}]
335
- return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
336
- return ("### System\n" + system_text.strip() + "\n\n" +
337
- "### User\n" + user_text.strip() + "\n\n" +
338
- "### Assistant\n")
339
 
340
  @torch.inference_mode()
341
- def generate_json(self, system_text: str, user_text: str, max_new_tokens: int = 256):
342
- prompt = self.apply_chat_template(system_text, user_text)
343
- inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
344
- t0 = time.perf_counter()
345
- out = self.model.generate(
346
- **inputs, max_new_tokens=max_new_tokens,
347
- do_sample=False, temperature=None, top_p=None,
348
- eos_token_id=self.tokenizer.eos_token_id
349
  )
350
- latency_ms = int((time.perf_counter() - t0) * 1000)
351
- text = self.tokenizer.decode(out[0], skip_special_tokens=True)
352
- if text.startswith(prompt): text = text[len(prompt):]
353
- return latency_ms, text, prompt
354
-
355
- # Cache now includes revision implicitly via HFModel (we also add revision to key)
356
- MODEL_CACHE: Dict[Tuple[str, bool, str, bool, str], HFModel] = {}
357
- def get_model(repo_id: str, load_4bit: bool, dtype: str, trust_remote_code: bool):
358
- rev = resolve_revision(repo_id)
359
- key = (repo_id, bool(load_4bit), dtype, bool(trust_remote_code), rev)
360
- if key not in MODEL_CACHE:
361
- MODEL_CACHE[key] = HFModel(repo_id, load_4bit=load_4bit, dtype=dtype, trust_remote_code=trust_remote_code)
362
- return MODEL_CACHE[key]
363
-
364
- # ========= ZeroGPU functions =========
365
- @spaces.GPU(duration=180, secrets=["HF_TOKEN"]) # pass token into ZeroGPU job
366
- def gpu_generate(repo_id: str, system_text: str, user_text: str,
367
- load_4bit: bool, dtype: str, trust_remote_code: bool):
368
- token_seen = bool(os.getenv("HF_TOKEN"))
369
- hf = get_model(repo_id, load_4bit=load_4bit, dtype=dtype, trust_remote_code=trust_remote_code)
370
- lat, txt, prmpt = hf.generate_json(system_text.strip(), user_text.strip(), max_new_tokens=256)
371
- return lat, txt, prmpt, token_seen
372
-
373
- @spaces.GPU(duration=15, secrets=["HF_TOKEN"])
374
- def gpu_check_token():
375
- return bool(os.getenv("HF_TOKEN"))
376
-
377
- # ========= ZIP helpers =========
378
- def _read_zip_bytes(dataset_zip: Union[bytes, str, dict, None]) -> bytes:
379
- if dataset_zip is None: raise ValueError("No ZIP provided")
380
- if isinstance(dataset_zip, bytes): return dataset_zip
381
- if isinstance(dataset_zip, str):
382
- with open(dataset_zip, "rb") as f: return f.read()
383
- if isinstance(dataset_zip, dict) and "path" in dataset_zip:
384
- with open(dataset_zip["path"], "rb") as f: return f.read()
385
- path = getattr(dataset_zip, "name", None)
386
- if path and os.path.exists(path):
387
- with open(path, "rb") as f: return f.read()
388
- raise ValueError("Unsupported file object from Gradio")
389
-
390
- def parse_zip(zip_bytes: bytes) -> Dict[str, Tuple[str, List[str]]]:
391
- zf = zipfile.ZipFile(io.BytesIO(zip_bytes))
392
- samples = {}
393
- for n in zf.namelist():
394
- p = Path(n)
395
- if p.suffix.lower() == ".txt":
396
- samples.setdefault(p.stem, ["", []])[0] = zf.read(n).decode("utf-8", "replace")
397
- elif p.suffix.lower() == ".json":
398
- try:
399
- js = json.loads(zf.read(n).decode("utf-8", "replace"))
400
- except Exception:
401
- js = []
402
- samples.setdefault(p.stem, ["", []])[1] = _coerce_labels_list(js)
403
- return samples
404
-
405
- # ========= Prompts =========
406
- DEFAULT_SYSTEM = (
407
- "You are a task extraction assistant. "
408
- "Always output valid JSON with a field \"labels\" (list of strings). "
409
- "Use only from this set: " + json.dumps(ALLOWED_LABELS) + ". "
410
- "Return JSON only."
411
- )
412
- DEFAULT_CONTEXT = (
413
- "- plan_contact: conversation without a concrete meeting (no date/time)\n"
414
- "- schedule_meeting: explicit date/time/modality confirmation\n"
415
- "- update_contact_info_non_postal: changes to email/phone\n"
416
- "- update_contact_info_postal_address: changes to mailing address\n"
417
- "- update_kyc_*: KYC updates (activity, purpose, origin of assets, total assets)"
418
- )
419
-
420
- # ========= Preprocess + build input =========
421
- def prepare_input_text(raw_txt: str, soft_cap: int, preprocess: bool, pre_window: int,
422
- add_cues: bool, strip_smalltalk: bool, tokenizer) -> Tuple[str, int, int]:
423
- before = len(tokenizer(raw_txt, return_tensors=None, add_special_tokens=False).input_ids)
424
- proc_text = raw_txt
425
- if preprocess:
426
- t_norm = normalize_text(proc_text, fix_typos=True)
427
- lines = [ln.strip() for ln in t_norm.splitlines() if ln.strip()]
428
- cue_lines = find_cue_lines(lines)
429
- if cue_lines:
430
- kept = prune_by_window(lines, cue_lines, window=pre_window, strip_smalltalk=strip_smalltalk)
431
- else:
432
- kept = [ln for ln in lines if not (strip_smalltalk and SMALLTALK_RX.search(ln))]
433
- t_kept = "\n".join(kept)
434
- cues = extract_cues(t_kept)
435
- header = build_cues_header(cues) if add_cues else ""
436
- proc_text = (header + "\n\n" + t_kept).strip() if header else t_kept
437
- proc_text = shrink_to_token_cap_by_lines(proc_text, soft_cap, tokenizer)
438
- if len(proc_text.splitlines()) < 30:
439
- proc_text = t_norm
440
- after = len(tokenizer(proc_text, return_tensors=None, add_special_tokens=False).input_ids)
441
- return proc_text, before, after
442
-
443
- def explain_params_markdown() -> str:
444
- return (
445
- "**Parameter help** \n"
446
- "- **Soft token cap**: target max input size; we prune long transcripts toward this size to save latency. \n"
447
- "- **Enable preprocessing**: normalizes speaker tags, fixes obvious typos, and focuses on cue lines. \n"
448
- "- **Window ± lines around cues**: how many lines we keep around detected cues (dates/emails/‘meeting’, etc.). \n"
449
- "- **Add cues header**: inserts a short summary block (email, meeting signal) above the transcript to guide the model. \n"
450
- "- **Strip smalltalk**: removes lines like ‘thanks, bye’ to keep only useful content. \n"
451
- "- **Load in 4-bit (GPU only)**: memory-saving quantization; has no effect on CPU Spaces."
452
  )
453
 
454
- # ========= Single mode =========
455
- def single_mode(
456
- preset_model: str, custom_model: str,
457
- system_text: str, context_text: str,
458
- transcript_text: str, transcript_file,
459
- expected_labels_json,
460
- soft_cap: int, preprocess: bool, pre_window: int, add_cues: bool, strip_smalltalk: bool,
461
- load_4bit: bool, dtype: str, trust_remote_code: bool
462
- ):
463
- repo_id = custom_model.strip() or preset_model.strip()
464
- if not repo_id:
465
- return "Please choose a model.", "", "", "", None, None, None, ""
466
-
467
- txt = (transcript_text or "").strip()
468
- if transcript_file and hasattr(transcript_file, "name") and os.path.exists(transcript_file.name):
469
- with open(transcript_file.name, "r", encoding="utf-8", errors="replace") as f:
470
- txt = f.read()
471
- if not txt:
472
- return "Please paste a transcript or upload a .txt file.", "", "", "", None, None, None, ""
473
-
474
- exp = []
475
- if expected_labels_json and hasattr(expected_labels_json, "name") and os.path.exists(expected_labels_json.name):
476
- try:
477
- with open(expected_labels_json.name, "r", encoding="utf-8", errors="replace") as f:
478
- exp = _coerce_labels_list(json.load(f))
479
- except Exception:
480
- exp = []
481
-
482
- # tokenizer for preprocessing — from local snapshot to avoid streaming
483
  try:
484
- local_dir = ensure_local_dir(repo_id)
485
- dummy_tok = AutoTokenizer.from_pretrained(local_dir, use_fast=True, trust_remote_code=trust_remote_code, token=HF_TOKEN)
486
  except Exception as e:
487
- msg = (f"Failed to load tokenizer for `{repo_id}`. "
488
- "If gated, accept license and set HF_TOKEN in Space → Settings → Secrets.\n\nError: " + str(e))
489
- return msg, "", "", "", None, None, None, banner_text()
490
-
491
- proc_text, tok_before, tok_after = prepare_input_text(
492
- txt, soft_cap, preprocess, pre_window, add_cues, strip_smalltalk, dummy_tok
493
- )
494
- system = (system_text or DEFAULT_SYSTEM).strip()
495
- user = (context_text or DEFAULT_CONTEXT).strip() + "\n\nTRANSCRIPT\n" + proc_text.strip()
496
 
497
  try:
498
- latency_ms, raw_text, _prompt, gpu_token_seen = gpu_generate(
499
- repo_id, system, user, load_4bit, dtype, trust_remote_code
500
- )
501
  except Exception as e:
502
- msg = (f"Failed to run `{repo_id}`. If gated, accept license and set HF_TOKEN.\n\nError: {e}")
503
- return msg, "", "", "", None, None, None, banner_text()
504
-
505
- out = safe_json_load(raw_text)
506
- pred_labels = enforce_rules(out.get("labels", []), proc_text)
507
- exact, prec, rec, f1, ham = classic_metrics(pred_labels, exp)
508
- ubs = ubs_score_one(exp, pred_labels) if exp else None
509
-
510
- kpi1 = f"**F1**\n\n{f1:.3f}" if exp else "**F1**\n\n—"
511
- kpi2 = f"**UBS score**\n\n{ubs:.3f}" if ubs is not None else "**UBS score**\n\n—"
512
- kpi3 = f"**Latency (ms)**\n\n{latency_ms}"
513
-
514
- zbuf = io.BytesIO()
515
- with zipfile.ZipFile(zbuf, "w", zipfile.ZIP_DEFLATED) as zout:
516
- zout.writestr("PREPROCESSED.txt", proc_text)
517
- zout.writestr("MODEL_OUTPUT.raw.txt", raw_text)
518
- final_json = {
519
- "labels": pred_labels,
520
- "diagnostics": {
521
- "model_name": repo_id,
522
- "latency_ms": latency_ms,
523
- "token_in_est_before": tok_before,
524
- "token_in_est_after": tok_after,
525
- "preprocess": preprocess,
526
- "pre_window": pre_window,
527
- "pre_add_cues_header": add_cues if preprocess else False,
528
- "pre_strip_smalltalk": strip_smalltalk if preprocess else False,
529
- "pre_soft_token_cap": soft_cap if preprocess else None,
530
- "model_calls": 1
531
- },
532
- "evaluation": None if not exp else {
533
- "exact_match": exact, "precision": prec, "recall": rec,
534
- "f1": f1, "hamming": ham, "ubs_score": ubs
535
- }
536
- }
537
- zout.writestr("FINAL.json", json.dumps(final_json, ensure_ascii=False, indent=2))
538
- zbuf.seek(0); zbuf.name = "artifacts_single.zip"
539
-
540
- row = pd.DataFrame([{
541
- "model": repo_id,
542
- "latency_ms": latency_ms,
543
- "token_before": tok_before,
544
- "token_after": tok_after,
545
- "model_calls": 1,
546
- "pred_labels": json.dumps(pred_labels, ensure_ascii=False),
547
- "exp_labels": json.dumps(exp, ensure_ascii=False),
548
- "exact_match": exact if exp else None,
549
- "precision": round(prec,6) if exp else None,
550
- "recall": round(rec,6) if exp else None,
551
- "f1": round(f1,6) if exp else None,
552
- "hamming": round(ham,6) if exp else None,
553
- "ubs_score": round(ubs,6) if ubs is not None else None
554
- }])
555
-
556
- csv_buf = io.BytesIO(row.to_csv(index=False).encode("utf-8")); csv_buf.name = "results_single.csv"
557
-
558
- return (
559
- "Done.",
560
- kpi1, kpi2, kpi3,
561
- row, csv_buf, zbuf,
562
- banner_text(gpu_token_seen)
563
- )
564
-
565
- # ========= Batch mode =========
566
- def run_batch_ui(models_list, custom_models_str, instructions_text, context_text, dataset_zip,
567
- soft_cap, preprocess, pre_window, add_cues, strip_smalltalk,
568
- repeats, max_total_runs, load_4bit, dtype, trust_remote_code):
569
-
570
- models = [m for m in (models_list or [])]
571
- models += [m.strip() for m in (custom_models_str or "").split(",") if m.strip()]
572
- if not models:
573
- return pd.DataFrame(), None, None, "Please pick at least one model.", banner_text()
574
-
575
- if not dataset_zip:
576
- return pd.DataFrame(), None, None, "Please upload a ZIP with *.txt (+ optional matching *.json).", banner_text()
577
 
578
  try:
579
- zip_bytes = _read_zip_bytes(dataset_zip)
580
- samples = parse_zip(zip_bytes)
581
  except Exception as e:
582
- return pd.DataFrame(), None, None, f"Failed to read ZIP: {e}", banner_text()
583
-
584
- rows = []; total_runs = 0
585
- all_artifacts = io.BytesIO()
586
- zout = zipfile.ZipFile(all_artifacts, "w", zipfile.ZIP_DEFLATED)
587
- last_gpu_token_seen = None
588
 
589
- for repo_id in models:
590
- # tokenizer for preprocessing (auth check) — also from local snapshot
591
  try:
592
- local_dir = ensure_local_dir(repo_id)
593
- dummy_tok = AutoTokenizer.from_pretrained(local_dir, use_fast=True, trust_remote_code=trust_remote_code, token=HF_TOKEN)
594
  except Exception as e:
595
- # gated or missing token; record a summary row and continue
596
- rows.append({
597
- "timestamp": pd.Timestamp.now().isoformat(timespec="seconds"),
598
- "sample_id": None,
599
- "model": repo_id,
600
- "is_summary": True,
601
- "run_index": None,
602
- "preprocess": preprocess,
603
- "pre_window": pre_window,
604
- "add_cues_header": add_cues,
605
- "strip_smalltalk": strip_smalltalk,
606
- "soft_cap": soft_cap,
607
- "median_latency_ms": None,
608
- "latency_ms": None,
609
- "token_before": None,
610
- "token_after": None,
611
- "model_calls": None,
612
- "pred_labels": "[]",
613
- "exp_labels": "[]",
614
- "exact_match": None,
615
- "precision": None,
616
- "recall": None,
617
- "f1": None,
618
- "hamming": None,
619
- "ubs_score": None,
620
- })
621
- continue
622
-
623
- for sample_id, (transcript_text, exp_labels) in samples.items():
624
- if not transcript_text.strip(): continue
625
- latencies = []; last_pred = None
626
- for r in range(1, repeats+1):
627
- if total_runs >= max_total_runs: break
628
- proc_text, before_tok, after_tok = prepare_input_text(
629
- transcript_text, soft_cap, preprocess, pre_window, add_cues, strip_smalltalk, dummy_tok
630
- )
631
- system_text = (instructions_text or DEFAULT_SYSTEM).strip()
632
- user_text = (context_text or DEFAULT_CONTEXT).strip() + "\n\nTRANSCRIPT\n" + proc_text.strip()
633
-
634
- try:
635
- latency_ms, raw_text, _prompt, token_seen = gpu_generate(
636
- repo_id, system_text, user_text, load_4bit, dtype, trust_remote_code
637
- )
638
- last_gpu_token_seen = token_seen
639
- except Exception as e:
640
- base = f"{repo_id.replace('/','_')}/{sample_id}/error_r{r}"
641
- zout.writestr(base + "/ERROR.txt", f"Failed to run model via @spaces.GPU. If gated, accept license and set HF_TOKEN.\n\n{e}")
642
- total_runs += 1
643
- continue
644
-
645
- out = safe_json_load(raw_text)
646
- pred_labels = enforce_rules(out.get("labels", []), proc_text)
647
-
648
- exact, prec, rec, f1, ham = classic_metrics(pred_labels, exp_labels)
649
- ubs = ubs_score_one(exp_labels, pred_labels)
650
-
651
- rows.append({
652
- "timestamp": pd.Timestamp.now().isoformat(timespec="seconds"),
653
- "sample_id": sample_id,
654
- "model": repo_id,
655
- "is_summary": False,
656
- "run_index": r,
657
- "preprocess": preprocess,
658
- "pre_window": pre_window,
659
- "add_cues_header": add_cues,
660
- "strip_smalltalk": strip_smalltalk,
661
- "soft_cap": soft_cap,
662
- "latency_ms": latency_ms,
663
- "token_before": before_tok,
664
- "token_after": after_tok,
665
- "model_calls": 1,
666
- "pred_labels": json.dumps(pred_labels, ensure_ascii=False),
667
- "exp_labels": json.dumps(exp_labels, ensure_ascii=False),
668
- "exact_match": exact,
669
- "precision": round(prec, 6),
670
- "recall": round(rec, 6),
671
- "f1": round(f1, 6),
672
- "hamming": round(ham, 6),
673
- "ubs_score": round(ubs, 6),
674
- })
675
-
676
- base = f"{repo_id.replace('/','_')}/{sample_id}/pre{int(preprocess)}_win{pre_window}_cues{int(add_cues)}_small{int(strip_smalltalk)}_cap{soft_cap}_r{r}"
677
- zout.writestr(base + "/PREPROCESSED.txt", proc_text)
678
- zout.writestr(base + "/MODEL_OUTPUT.raw.txt", raw_text)
679
- final_json = {
680
- "labels": pred_labels,
681
- "diagnostics": {
682
- "model_name": repo_id,
683
- "latency_ms": latency_ms,
684
- "token_in_est_before": before_tok,
685
- "token_in_est_after": after_tok,
686
- "preprocess": preprocess,
687
- "pre_window": pre_window,
688
- "pre_add_cues_header": add_cues if preprocess else False,
689
- "pre_strip_smalltalk": strip_smalltalk if preprocess else False,
690
- "pre_soft_token_cap": soft_cap if preprocess else None,
691
- "model_calls": 1
692
- }
693
- }
694
- zout.writestr(base + "/FINAL.json", json.dumps(final_json, ensure_ascii=False, indent=2))
695
-
696
- latencies.append(latency_ms)
697
- last_pred = pred_labels
698
- total_runs += 1
699
-
700
- if latencies:
701
- med = int(statistics.median(latencies))
702
- exact, prec, rec, f1, ham = classic_metrics(last_pred, exp_labels) if last_pred is not None else (None,)*5
703
- ubs = ubs_score_one(exp_labels, last_pred) if last_pred is not None else None
704
- rows.append({
705
- "timestamp": pd.Timestamp.now().isoformat(timespec="seconds"),
706
- "sample_id": sample_id,
707
- "model": repo_id,
708
- "is_summary": True,
709
- "run_index": None,
710
- "preprocess": preprocess,
711
- "pre_window": pre_window,
712
- "add_cues_header": add_cues,
713
- "strip_smalltalk": strip_smalltalk,
714
- "soft_cap": soft_cap,
715
- "median_latency_ms": med,
716
- "latency_ms": None,
717
- "token_before": None,
718
- "token_after": None,
719
- "model_calls": None,
720
- "pred_labels": json.dumps(last_pred or [], ensure_ascii=False),
721
- "exp_labels": json.dumps(exp_labels or [], ensure_ascii=False),
722
- "exact_match": exact,
723
- "precision": round(prec, 6) if prec is not None else None,
724
- "recall": round(rec, 6) if rec is not None else None,
725
- "f1": round(f1, 6) if f1 is not None else None,
726
- "hamming": round(ham, 6) if ham is not None else None,
727
- "ubs_score": round(ubs, 6) if ubs is not None else None,
728
- })
729
-
730
- if total_runs >= max_total_runs:
731
- break
732
-
733
- zout.close()
734
- df = pd.DataFrame(rows)
735
- if df.empty:
736
- return pd.DataFrame(), None, None, "No runs executed (empty dataset / exceeded cap / gated models).", banner_text(last_gpu_token_seen)
737
-
738
- csv_pair = ("results.csv", df.to_csv(index=False).encode("utf-8"))
739
- zip_pair = ("artifacts.zip", all_artifacts.getvalue())
740
- return df, csv_pair, zip_pair, "Done.", banner_text(last_gpu_token_seen)
741
-
742
- # ========= UI helpers =========
743
- OPEN_MODEL_PRESETS = [
744
- "mistralai/Mistral-7B-Instruct-v0.2",
745
- "Qwen/Qwen2.5-7B-Instruct",
746
- "HuggingFaceH4/zephyr-7b-beta",
747
- "tiiuae/falcon-7b-instruct",
748
- ]
749
 
750
- def banner_text(gpu_token_seen: bool | None = None) -> str:
751
- app_seen = bool(HF_TOKEN)
752
- lines = []
753
- if not app_seen:
754
- lines.append("🟡 **HF_TOKEN not detected in App** — gated models will fail unless you set it in **Settings → Variables and secrets**.")
755
- else:
756
- lines.append("🟢 **HF_TOKEN detected in App**.")
757
- if gpu_token_seen is None:
758
- lines.append("ℹ️ ZeroGPU token status: click **Run** or **Check ZeroGPU token** to verify.")
759
- else:
760
- lines.append("🟢 **HF_TOKEN detected inside ZeroGPU job.**" if gpu_token_seen else "🔴 **HF_TOKEN missing inside ZeroGPU job** (add `secrets=[\"HF_TOKEN\"]` to @spaces.GPU).")
761
- lines.append("✅ Tip: use **Open models** (no license gating): " + ", ".join(OPEN_MODEL_PRESETS))
762
- # Show pin info for transparency
763
- try:
764
- revs = [f"{m}@{resolve_revision(m)}" for m in OPEN_MODEL_PRESETS]
765
- lines.append("📌 Pinned revisions: " + ", ".join(revs))
766
- except Exception:
767
- pass
768
- return "\n\n".join(lines)
769
-
770
- # ========= UI (dark red) =========
771
- DARK_RED_CSS = """
772
- :root, .gradio-container {
773
- --color-background: #0b0b0d;
774
- --color-foreground: #e6e6e6;
775
- --color-primary: #e11d48;
776
- --color-secondary: #111216;
777
- --color-border: #1f2024;
778
- --color-muted: #9ca3af;
779
- }
780
- .gradio-container { background: var(--color-background) !important; color: var(--color-foreground) !important; }
781
- .gr-box, .gr-panel, .gr-group, .gr-form, .wrap.svelte-1ipelgc {
782
- background: var(--color-secondary) !important;
783
- border: 1px solid var(--color-border) !important;
784
- border-radius: 10px !important;
785
- }
786
- button, .gr-button {
787
- border-radius: 10px !important;
788
- border: 1px solid var(--color-primary) !important;
789
- background: linear-gradient(180deg, #b91c1c, #7f1d1d) !important;
790
- color: white !important;
791
- }
792
- .kpi {
793
- border: 1px solid #e11d48; border-radius: 10px; padding: 12px; text-align: center;
794
- background: #1a0f10; font-size: 18px;
795
- }
796
- """
797
 
798
- with gr.Blocks(title="From Talk to Task — HF Space", css=DARK_RED_CSS) as demo:
799
- gr.Markdown("## 🟥 From Talk to Task — Batch & Single Task Extraction")
800
- help_md = (
801
- "This tool extracts **task labels** from transcripts using Hugging Face models. \n"
802
- "1) Pick a model (or paste a custom repo id). \n"
803
- "2) Provide **Instructions** and **Context**, then supply a transcript (single) or a ZIP (batch). \n"
804
- "3) Adjust parameters (soft token cap, preprocessing). \n"
805
- "4) Run and review **latency**, **precision/recall/F1**, **UBS score**, and download artifacts."
806
  )
807
- gr.Markdown(help_md)
808
 
809
- # Status banner (token presence + revisions)
810
- banner = gr.Markdown(banner_text())
811
 
812
- check_btn = gr.Button("Check ZeroGPU token")
813
- def _check_token():
814
- try:
815
- present = gpu_check_token()
816
- except Exception:
817
- present = None
818
- return banner_text(present)
819
- check_btn.click(_check_token, outputs=banner)
 
 
 
820
 
821
  with gr.Tabs():
822
- # Single
823
- with gr.TabItem("Single Transcript (default)"):
824
- with gr.Row():
825
- with gr.Column():
826
- preset_model = gr.Dropdown(choices=OPEN_MODEL_PRESETS, value=OPEN_MODEL_PRESETS[0],
827
- label="Model (Open presets — no gating)")
828
- custom_model = gr.Textbox(label="Custom model repo id (overrides preset)",
829
- placeholder="e.g. meta-llama/Meta-Llama-3-8B-Instruct")
830
- instructions = gr.Textbox(label="Instructions (System)", lines=8, value=DEFAULT_SYSTEM)
831
- context = gr.Textbox(label="Context (User prefix before transcript)", lines=6, value=DEFAULT_CONTEXT)
832
- with gr.Column():
833
- transcript_text = gr.Textbox(label="Paste transcript text", lines=14, placeholder="Paste your transcript here...")
834
- transcript_file = gr.File(label="...or upload a single transcript .txt", file_types=[".txt"], file_count="single", type="filepath")
835
- expected_labels_json = gr.File(label="(Optional) Expected labels JSON for metrics", file_types=[".json"], file_count="single", type="filepath")
836
-
837
- with gr.Row():
838
- with gr.Column():
839
- soft_cap_s = gr.Slider(1024, 32768, value=8192, step=512, label="Soft token cap")
840
- preprocess_s = gr.Checkbox(value=True, label="Enable preprocessing")
841
- pre_window_s = gr.Slider(0, 6, value=3, step=1, label="Window ± lines around cues")
842
- add_cues_s = gr.Checkbox(value=True, label="Add cues header")
843
- strip_smalltalk_s = gr.Checkbox(value=False, label="Strip smalltalk")
844
- gr.Markdown(explain_params_markdown())
845
- with gr.Column():
846
- load_4bit_s = gr.Checkbox(value=False, label="Load in 4-bit (GPU only)")
847
- dtype_s = gr.Dropdown(choices=["bfloat16","float16","float32"], value="bfloat16", label="Compute dtype")
848
- trust_remote_code_s = gr.Checkbox(value=True, label="Trust remote code")
849
-
850
- run_single_btn = gr.Button("Run (Single)")
851
- kpi1 = gr.Markdown(elem_classes=["kpi"]); kpi2 = gr.Markdown(elem_classes=["kpi"]); kpi3 = gr.Markdown(elem_classes=["kpi"])
852
- single_table = gr.Dataframe(label="Single run — metrics & diagnostics", interactive=False)
853
- single_csv = gr.File(label="Download CSV", interactive=False)
854
- single_zip = gr.File(label="Download Artifacts ZIP", interactive=False)
855
- single_status = gr.Markdown("")
856
 
857
  def _run_single(*args):
858
- status, m1, m2, m3, df, csv_buf, zip_buf, btxt = single_mode(*args)
859
- return m1 or "", m2 or "", m3 or "", (df if isinstance(df, pd.DataFrame) else pd.DataFrame()), csv_buf, zip_buf, (status or ""), (btxt or banner_text())
860
 
861
- run_single_btn.click(
862
  _run_single,
863
- inputs=[preset_model, custom_model, instructions, context,
864
- transcript_text, transcript_file, expected_labels_json,
865
- soft_cap_s, preprocess_s, pre_window_s, add_cues_s, strip_smalltalk_s,
866
- load_4bit_s, dtype_s, trust_remote_code_s],
867
- outputs=[kpi1, kpi2, kpi3, single_table, single_csv, single_zip, single_status, banner]
868
  )
869
 
870
- # Batch
871
- with gr.TabItem("Batch (ZIP of many transcripts)"):
872
- with gr.Row():
873
- with gr.Column():
874
- models_list = gr.Checkboxgroup(
875
- choices=OPEN_MODEL_PRESETS, value=[OPEN_MODEL_PRESETS[0]],
876
- label="Models (Open presets — select one or more)"
877
- )
878
- custom_models = gr.Textbox(label="Custom model repo ids (comma-separated)",
879
- placeholder="e.g. meta-llama/Meta-Llama-3-8B-Instruct, Qwen/Qwen2.5-7B-Instruct")
880
- instructions_b = gr.Textbox(label="Instructions (System)", lines=8, value=DEFAULT_SYSTEM)
881
- context_b = gr.Textbox(label="Context (User prefix before transcript)", lines=6, value=DEFAULT_CONTEXT)
882
- with gr.Column():
883
- dataset_zip = gr.File(
884
- label="Upload ZIP of transcripts (*.txt) + expected (*.json)",
885
- file_types=[".zip"], file_count="single", type="filepath"
886
- )
887
- gr.Markdown("Zip must contain pairs like `ID.txt` and optional `ID.json` with expected labels (same base filename).")
888
-
889
- with gr.Row():
890
- with gr.Column():
891
- soft_cap = gr.Slider(1024, 32768, value=8192, step=512, label="Soft token cap")
892
- preprocess = gr.Checkbox(value=True, label="Enable preprocessing")
893
- pre_window = gr.Slider(0, 6, value=3, step=1, label="Window ± lines around cues")
894
- add_cues = gr.Checkbox(value=True, label="Add cues header")
895
- strip_smalltalk = gr.Checkbox(value=False, label="Strip smalltalk")
896
- gr.Markdown(explain_params_markdown())
897
- with gr.Column():
898
- repeats = gr.Slider(1, 6, value=3, step=1, label="Repeats per config")
899
- max_total_runs = gr.Slider(1, 200, value=40, step=1, label="Max total runs")
900
- load_4bit = gr.Checkbox(value=False, label="Load in 4-bit (GPU only)")
901
- dtype = gr.Dropdown(choices=["bfloat16","float16","float32"], value="bfloat16", label="Compute dtype")
902
- trust_remote_code = gr.Checkbox(value=True, label="Trust remote code")
903
-
904
- run_btn = gr.Button("Run Batch")
905
- kpi_b1 = gr.Markdown(elem_classes=["kpi"]); kpi_b2 = gr.Markdown(elem_classes=["kpi"]); kpi_b3 = gr.Markdown(elem_classes=["kpi"])
906
- table = gr.Dataframe(label="Batch results (per run + summary rows)", interactive=False)
907
- csv_dl = gr.File(label="Download CSV", interactive=False)
908
- zip_dl = gr.File(label="Download Artifacts ZIP", interactive=False)
909
- status = gr.Markdown("")
910
 
911
  def _run_batch(*args):
912
- df, csv_pair, zip_pair, msg, btxt = run_batch_ui(*args)
913
- m1 = m2 = m3 = ""
914
- if isinstance(df, pd.DataFrame) and not df.empty:
915
- summaries = df[df["is_summary"] == True]
916
- if not summaries.empty:
917
- last = summaries.iloc[-1]
918
- f1 = last.get("f1"); ubs = last.get("ubs_score"); med = last.get("median_latency_ms")
919
- m1 = f"**F1 (last summary)**\n\n{f1:.3f}" if pd.notna(f1) else "**F1 (last summary)**\n\n—"
920
- m2 = f"**UBS (last summary)**\n\n{ubs:.3f}" if pd.notna(ubs) else "**UBS (last summary)**\n\n—"
921
- m3 = f"**Median latency (ms)**\n\n{int(med) if pd.notna(med) else '—'}"
922
- csv_buf = zip_buf = None
923
- if isinstance(csv_pair, tuple):
924
- name, data = csv_pair; csv_buf = io.BytesIO(data); csv_buf.name = name
925
- if isinstance(zip_pair, tuple):
926
- name, data = zip_pair; zip_buf = io.BytesIO(data); zip_buf.name = name
927
- return m1, m2, m3, (df if isinstance(df, pd.DataFrame) else pd.DataFrame()), csv_buf, zip_buf, (msg or ""), (btxt or banner_text())
928
 
929
- run_btn.click(
930
  _run_batch,
931
- inputs=[models_list, custom_models, instructions_b, context_b, dataset_zip,
932
- soft_cap, preprocess, pre_window, add_cues, strip_smalltalk,
933
- repeats, max_total_runs, load_4bit, dtype, trust_remote_code],
934
- outputs=[kpi_b1, kpi_b2, kpi_b3, table, csv_dl, zip_dl, status, banner]
 
935
  )
936
 
937
- demo.launch()
1
+ # app.py
2
+ # From Talk to Task Batch & Single Task Extraction
3
+ # Works on CPU / GPU / ZeroGPU. Uses a writable HF cache path (no /data).
4
+ # If you want to use gated models (e.g., mistralai/Mistral-7B-Instruct-v0.2),
5
+ # accept the license on HF and set HF_TOKEN in Space → Settings → Secrets.
6
+
7
+ import os
8
+ import io
9
+ import re
10
+ import sys
11
+ import time
12
+ import json
13
+ import zipfile
14
  from pathlib import Path
15
+ from typing import List, Dict, Tuple, Optional
16
 
17
  import gradio as gr
18
 
19
+ # ====== Robust, writable HF cache ======
20
+ # Avoid /data (read-only in Spaces). Prefer $HOME or /tmp.
21
+ HOME = Path(os.environ.get("HOME", "/home/user"))
22
+ CACHE_DIR = HOME / ".cache" / "huggingface"
23
+ CACHE_DIR.mkdir(parents=True, exist_ok=True)
24
+ os.environ.setdefault("HF_HOME", str(CACHE_DIR))
25
+ os.environ.setdefault("TRANSFORMERS_CACHE", str(CACHE_DIR))
26
+ os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # faster downloads when available
27
+
28
+ HF_TOKEN = os.environ.get("HF_TOKEN", "").strip() or None
29
 
30
+ # ====== Transformers safe import ======
31
+ try:
32
+ import torch
33
+ from transformers import (
34
+ AutoTokenizer,
35
+ AutoModelForCausalLM,
36
+ BitsAndBytesConfig,
37
  )
38
+ except Exception as e:
39
+ raise RuntimeError(
40
+ "Failed to import transformers/torch. "
41
+ "Make sure requirements.txt includes: transformers>=4.41, torch, accelerate"
42
+ ) from e
43
 
44
+ DTYPE_FALLBACK = torch.float32
45
+ if torch.cuda.is_available():
46
+ DTYPE_FALLBACK = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
47
 
48
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
49
+
50
+ # ====== ZeroGPU (optional) ======
51
+ # If you’re running on ZeroGPU, Spaces injects a runtime; we keep a safe shim.
52
+ try:
53
+ import spaces # noqa: F401
54
+ ON_ZERO_GPU = True
55
+ except Exception:
56
+ ON_ZERO_GPU = False
57
+
58
+ # ====== UI presets ======
59
+ OPEN_MODEL_PRESETS = [
60
+ # choose truly open, ungated options first
61
+ "HuggingFaceH4/zephyr-7b-beta",
62
+ "Qwen/Qwen2.5-7B-Instruct",
63
+ "tiiuae/falcon-7b-instruct",
64
+ # You can still type a custom gated model repo id below if you have access.
65
  ]
66
 
67
+ PINNED_REVISIONS = {
68
+ # None means "main"
69
  "HuggingFaceH4/zephyr-7b-beta": None,
70
+ "Qwen/Qwen2.5-7B-Instruct": None,
71
  "tiiuae/falcon-7b-instruct": None,
72
+ # "mistralai/Mistral-7B-Instruct-v0.2": None, # gated — use only if token + license ok
73
  }
74
 
75
+ SYSTEM_INSTRUCTIONS = (
76
+ "You are a task extraction assistant. Always output valid JSON with a field "
77
+ '"labels" (list of strings). Use only from this set: '
78
+ '["plan_contact","schedule_meeting","update_contact_info_non_postal",'
79
+ '"update_contact_info_postal_address","update_kyc_activity","update_kyc_origin_of_assets",'
80
+ '"update_kyc_purpose_of_businessrelation","update_kyc_total_assets"]. '
81
+ "Return JSON only."
82
+ )
83
+
84
+ CONTEXT_GUIDE = """\
85
+ - plan_contact: conversation without a concrete meeting (no date/time)
86
+ - schedule_meeting: explicit date/time/modality confirmation
87
+ - update_contact_info_non_postal: changes to email/phone
88
+ - update_contact_info_postal_address: changes to mailing address
89
+ - update_kyc_*: KYC updates (activity, purpose, origin of assets, total assets)
90
+ """
91
+
92
+ # ====== Utility ======
93
+ def _json_only(text: str) -> str:
94
+ """
95
+ Try to extract the first JSON object from text.
96
+ """
97
+ text = text.strip()
98
+ if text.startswith("{") and text.endswith("}"):
99
+ return text
100
+ m = re.search(r"\{.*\}", text, re.DOTALL)
101
+ return m.group(0) if m else '{"labels": []}'
102
+
103
+ def safe_json_loads(s: str) -> dict:
104
+ try:
105
+ return json.loads(s)
106
+ except Exception:
107
+ return {"labels": []}
108
+
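A small usage sketch of the two helpers above, showing the fallback on chatty model output; the example strings are made up:

    raw = 'Sure! Here is the result: {"labels": ["schedule_meeting"]} Hope this helps.'
    print(safe_json_loads(_json_only(raw)))             # {'labels': ['schedule_meeting']}
    print(safe_json_loads(_json_only("no json here")))  # {'labels': []}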
109
+ def build_prompt(system: str, context: str, transcript: str) -> str:
110
+ return (
111
+ f"### System:\n{system}\n\n"
112
+ f"### Context:\n{context}\n\n"
113
+ f"### Transcript:\n{transcript}\n\n"
114
+ "### Output:\nReturn JSON only."
115
  )
 
116
 
117
+ # ====== Model wrapper ======
118
  class HFModel:
119
+ def __init__(
120
+ self,
121
+ repo_id: str,
122
+ revision: Optional[str] = None,
123
+ load_in_4bit: bool = False,
124
+ trust_remote_code: bool = True,
125
+ dtype: Optional[torch.dtype] = None,
126
+ token: Optional[str] = None,
127
+ ) -> None:
128
  self.repo_id = repo_id
129
+ self.revision = revision or "main"
130
+ self.trust_remote_code = trust_remote_code
131
+ self.token = token
132
+ self.dtype = dtype or DTYPE_FALLBACK
133
+ self.load_in_4bit = load_in_4bit and (DEVICE == "cuda")
134
+ self.tokenizer = None
135
+ self.model = None
136
 
137
+ def load(self):
138
+ quant_cfg = None
139
+ if self.load_in_4bit:
140
+ quant_cfg = BitsAndBytesConfig(load_in_4bit=True)
141
+ try:
142
+ self.tokenizer = AutoTokenizer.from_pretrained(
143
+ self.repo_id,
144
+ revision=self.revision,
145
+ token=self.token,
146
+ cache_dir=str(CACHE_DIR),
147
+ trust_remote_code=self.trust_remote_code,
148
+ use_fast=True,
149
+ )
150
+ except Exception as e:
151
+ raise RuntimeError(
152
+ f"Failed to load tokenizer for {self.repo_id} "
153
+ "(If gated, accept license and set HF_TOKEN in Space → Settings → Secrets)."
154
+ ) from e
155
 
156
+ try:
157
  self.model = AutoModelForCausalLM.from_pretrained(
158
+ self.repo_id,
159
+ revision=self.revision,
160
+ token=self.token,
161
+ cache_dir=str(CACHE_DIR),
162
+ trust_remote_code=self.trust_remote_code,
163
+ torch_dtype=self.dtype,
164
+ device_map="auto" if DEVICE == "cuda" else None,
165
+ quantization_config=quant_cfg,
166
+ low_cpu_mem_usage=True,
167
  )
168
+ if DEVICE == "cpu":
169
+ self.model = self.model.to(DEVICE)
170
+ except Exception as e:
171
+ raise RuntimeError(
172
+ f"Failed to load model weights for {self.repo_id}. "
173
+ "Check license, token, and hardware availability."
174
+ ) from e
175
 
176
  @torch.inference_mode()
177
+ def generate(self, prompt: str, max_new_tokens: int = 256, temperature: float = 0.1) -> str:
178
+ tok = self.tokenizer
179
+ mdl = self.model
180
+ if tok.pad_token is None:
181
+ tok.pad_token = tok.eos_token
182
+
183
+ inputs = tok(prompt, return_tensors="pt").to(mdl.device)
184
+ out = mdl.generate(
185
+ **inputs,
186
+ max_new_tokens=max_new_tokens,
187
+ do_sample=temperature > 0,
188
+ temperature=temperature,
189
+ top_p=0.9,
190
+ pad_token_id=tok.eos_token_id,
191
+ eos_token_id=tok.eos_token_id,
192
  )
193
+ text = tok.decode(out[0], skip_special_tokens=True)
194
+ gen = text[len(prompt):].strip() if text.startswith(prompt) else text
195
+ return _json_only(gen)
196
+
197
+ # ====== Model cache (per Space worker) ======
198
+ _MODEL_CACHE: Dict[Tuple[str, Optional[str], bool], HFModel] = {}
199
+
200
+ def get_model(repo_id: str, revision: Optional[str], load_in_4bit: bool) -> HFModel:
201
+ key = (repo_id, revision, load_in_4bit)
202
+ if key in _MODEL_CACHE:
203
+ return _MODEL_CACHE[key]
204
+ model = HFModel(
205
+ repo_id=repo_id,
206
+ revision=revision,
207
+ load_in_4bit=load_in_4bit,
208
+ token=HF_TOKEN,
209
  )
210
+ model.load()
211
+ _MODEL_CACHE[key] = model
212
+ return model
213
+
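For reference, the cached loader above can be exercised outside the UI roughly like this; the repo id is one of the open presets and the printed labels are only an illustrative possibility:

    # One-off smoke test of get_model + generate.
    m = get_model("HuggingFaceH4/zephyr-7b-beta", revision=None, load_in_4bit=False)
    prompt = build_prompt(SYSTEM_INSTRUCTIONS, CONTEXT_GUIDE, "USER: can we meet Tuesday at 10am?")
    print(safe_json_loads(m.generate(prompt)))  # e.g. {'labels': ['schedule_meeting']}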
214
+ # ====== Single transcript inference ======
215
+ def run_single(
216
+ model_choice: str,
217
+ custom_repo_id: str,
218
+ system: str,
219
+ context: str,
220
+ transcript: str,
221
+ soft_token_cap: int,
222
+ preprocess: bool,
223
+ lines_window: int,
224
+ add_header: bool,
225
+ strip_smalltalk: bool,
226
+ load_in_4bit: bool,
227
+ ) -> Tuple[str, str, str, str]:
228
+ """
229
+ Returns (repo_id_used, revision, raw_json, debug_log)
230
+ """
231
+ debug = []
232
+ t0 = time.perf_counter()
233
+
234
+ repo = (custom_repo_id or model_choice).strip()
235
+ rev = PINNED_REVISIONS.get(repo, None)
236
+ debug.append(f"Repo: {repo} | Revision: {rev or 'main'} | 4bit: {load_in_4bit} | Device: {DEVICE}")
237
+
238
+ # Lightweight "preprocess"
239
+ if preprocess:
240
+ # basic cleanup
241
+ lines = [ln.rstrip() for ln in transcript.splitlines()]
242
+ if strip_smalltalk:
243
+ lines = [ln for ln in lines if not re.search(r"\b(thanks?|bye|ok(ay)?)\b", ln, re.I)]
244
+ transcript = "\n".join(lines[-32768:]) # hard cap
245
+ if add_header:
246
+ transcript = f"[EMAIL/MESSAGE SIGNAL]\n{transcript}"
247
+
248
+ # Soft token cap (truncate by char approximation)
249
+ if soft_token_cap and soft_token_cap > 0:
250
+ approx_chars = int(soft_token_cap * 4) # naive 4 chars/token
251
+ if len(transcript) > approx_chars:
252
+ transcript = transcript[-approx_chars:]
253
+
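The 4-chars-per-token cut above is only an approximation. A sketch of an exact variant, assuming a loaded tokenizer were available at this point (in this file the tokenizer is only created later, inside get_model):

    # Hypothetical exact cap: keep only the last `soft_token_cap` tokens.
    ids = tokenizer(transcript, add_special_tokens=False).input_ids
    if len(ids) > soft_token_cap:
        transcript = tokenizer.decode(ids[-soft_token_cap:])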
254
+ prompt = build_prompt(system or SYSTEM_INSTRUCTIONS, context or CONTEXT_GUIDE, transcript)
255
 
256
  try:
257
+ model = get_model(repo, rev, load_in_4bit)
258
+ raw = model.generate(prompt, max_new_tokens=256, temperature=0.1)
259
+ data = safe_json_loads(raw)
260
+ out_json = json.dumps(data, ensure_ascii=False)
261
+ debug.append(f"Generation OK in {time.perf_counter()-t0:.2f}s")
262
+ return repo, (rev or "main"), out_json, "\n".join(debug)
263
  except Exception as e:
264
+ debug.append(f"ERROR: {e}")
265
+ return repo, (rev or "main"), json.dumps({"labels": []}), "\n".join(debug)
266
+
267
+ # ====== Batch (ZIP of many .txt files) ======
268
+ def run_batch(
269
+ model_choice: str,
270
+ custom_repo_id: str,
271
+ system: str,
272
+ context: str,
273
+ zip_file: Optional[io.BytesIO],
274
+ soft_token_cap: int,
275
+ preprocess: bool,
276
+ lines_window: int,
277
+ add_header: bool,
278
+ strip_smalltalk: bool,
279
+ load_in_4bit: bool,
280
+ ) -> Tuple[str, str, str, str]:
281
+ """
282
+ Accepts a ZIP of .txt files. Returns (repo_id, revision, csv_like, debug)
283
+ """
284
+ debug = []
285
+ repo = (custom_repo_id or model_choice).strip()
286
+ rev = PINNED_REVISIONS.get(repo, None)
287
+
288
+ if not zip_file:
289
+ return repo, (rev or "main"), "filename,labels\n", "No ZIP provided."
290
 
291
  try:
292
+ z = zipfile.ZipFile(zip_file)
293
+ names = [n for n in z.namelist() if n.lower().endswith(".txt")]
294
+ debug.append(f"Files detected: {len(names)}")
295
  except Exception as e:
296
+ return repo, (rev or "main"), "filename,labels\n", f"Bad ZIP: {e}"
297
 
298
  try:
299
+ model = get_model(repo, rev, load_in_4bit)
 
300
  except Exception as e:
301
+ return repo, (rev or "main"), "filename,labels\n", f"Model load error: {e}"
302
 
303
+ rows = ["filename,labels"]
304
+ for name in names:
305
  try:
306
+ txt = z.read(name).decode("utf-8", errors="replace")
307
+ _, _, labels_json, _ = run_single(
308
+ model_choice, custom_repo_id, system, context, txt,
309
+ soft_token_cap, preprocess, lines_window, add_header,
310
+ strip_smalltalk, load_in_4bit
311
+ )
312
+ labels = safe_json_loads(labels_json).get("labels", [])
313
+ rows.append(f"{name},{json.dumps(labels, ensure_ascii=False)}")
314
  except Exception as e:
315
+ rows.append(f"{name},[] # error: {e}")
316
 
317
+ return repo, (rev or "main"), "\n".join(rows), "\n".join(debug)
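Note that the labels column above is itself a JSON list containing commas, so the joined string is CSV-like rather than strictly valid CSV. A sketch of a stricter variant using Python's csv module; `results` here is a hypothetical list of (filename, labels) pairs:

    import csv, io, json

    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(["filename", "labels"])
    for name, labels in results:  # hypothetical (filename, labels) pairs
        writer.writerow([name, json.dumps(labels, ensure_ascii=False)])
    csv_text = buf.getvalue()  # properly quoted CSV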
318
 
319
+ # ====== Gradio UI ======
320
+ with gr.Blocks(title="From Talk to Task — Batch & Single Task Extraction") as demo:
321
+ gr.Markdown(
322
+ """
323
+ # From Talk to Task Batch & Single Task Extraction
324
+
325
+ **Tip:** Use **open models** first (no gating). If you pick a gated model, make sure
326
+ you have accepted its license _and_ set `HF_TOKEN` in **Settings → Secrets**.
327
+
328
+ **Pinned revisions:** {}
329
+ """.format(
330
+ ", ".join([f"{k}@{v or 'main'}" for k, v in PINNED_REVISIONS.items()])
331
+ )
332
  )
 
333
 
334
+ with gr.Row():
335
+ model_choice = gr.Dropdown(
336
+ OPEN_MODEL_PRESETS,
337
+ label="Model (Open presets — no gating)",
338
+ value=OPEN_MODEL_PRESETS[0],
339
+ )
340
+ custom_repo_id = gr.Textbox(
341
+ label="Custom model repo id (overrides preset)",
342
+ placeholder="e.g. mistralai/Mistral-7B-Instruct-v0.2 (requires license + HF_TOKEN)"
343
+ )
344
 
345
+ system = gr.Textbox(label="Instructions (System)", value=SYSTEM_INSTRUCTIONS, lines=5)
346
+ context = gr.Textbox(label="Context (User prefix before transcript)", value=CONTEXT_GUIDE, lines=6)
347
+
348
+ with gr.Row():
349
+ soft_cap = gr.Slider(1024, 32768, value=8192, step=1, label="Soft token cap")
350
+ preprocess = gr.Checkbox(value=True, label="Enable preprocessing")
351
+ lines_window = gr.Slider(0, 6, value=3, step=1, label="Window ± lines around cues")
352
+ with gr.Row():
353
+ add_header = gr.Checkbox(value=True, label="Add cues header")
354
+ strip_smalltalk = gr.Checkbox(value=False, label="Strip smalltalk")
355
+ load_4bit = gr.Checkbox(value=False, label="Load in 4-bit (GPU only)")
356
 
357
  with gr.Tabs():
358
+ with gr.Tab("Single Transcript (default)"):
359
+ transcript = gr.Textbox(label="Paste transcript text", lines=12, placeholder="Paste your transcript here...")
360
+ run_btn = gr.Button("Run (Single)", variant="primary")
361
+ repo_used = gr.Textbox(label="Repo used", interactive=False)
362
+ rev_used = gr.Textbox(label="Revision", interactive=False)
363
+ json_out = gr.Code(label="JSON Output", language="json")
364
+ debug_out = gr.Textbox(label="Diagnostics", lines=6)
365
 
366
  def _run_single(*args):
367
+ r, v, j, d = run_single(*args)
368
+ return r, v, j, d
369
 
370
+ run_btn.click(
371
  _run_single,
372
+ inputs=[
373
+ model_choice, custom_repo_id, system, context, transcript,
374
+ soft_cap, preprocess, lines_window, add_header, strip_smalltalk, load_4bit
375
+ ],
376
+ outputs=[repo_used, rev_used, json_out, debug_out],
377
  )
378
 
379
+ with gr.Tab("Batch (ZIP of many transcripts)"):
380
+ zip_in = gr.File(label="Upload ZIP of .txt transcripts", file_types=[".zip"])
381
+ run_batch_btn = gr.Button("Run (Batch)", variant="primary")
382
+ repo_used_b = gr.Textbox(label="Repo used", interactive=False)
383
+ rev_used_b = gr.Textbox(label="Revision", interactive=False)
384
+ csv_out = gr.Code(label="CSV (filename,labels)", language="text")
385
+ debug_out_b = gr.Textbox(label="Diagnostics", lines=6)
386
 
387
  def _run_batch(*args):
388
+ r, v, c, d = run_batch(*args)
389
+ return r, v, c, d
390
 
391
+ run_batch_btn.click(
392
  _run_batch,
393
+ inputs=[
394
+ model_choice, custom_repo_id, system, context, zip_in,
395
+ soft_cap, preprocess, lines_window, add_header, strip_smalltalk, load_4bit
396
+ ],
397
+ outputs=[repo_used_b, rev_used_b, csv_out, debug_out_b],
398
  )
399
 
400
+ gr.Markdown(
401
+ f"""
402
+ - **HF_TOKEN detected:** {"✅ yes" if HF_TOKEN else "⚠️ no (only needed for gated models)"}
403
+ - **Device:** {DEVICE}
404
+ - **Cache dir:** `{CACHE_DIR}`
405
+ """
406
+ )
407
+
408
+ if __name__ == "__main__":
409
+ # Gradio 5 default port/host are fine in Spaces; keep `debug` false for speed
410
+ demo.launch()