Spaces:

RishiRP
/

Talk2TaskDemo1

Sleeping

App Files Files Community

RishiRP committed on Sep 23

Commit

cecfb15

verified ·

1 Parent(s): 38169c5

Update app.py

Browse files

Files changed (1) hide show

app.py +398 -831

app.py CHANGED Viewed

@@ -1,873 +1,440 @@
-import os, io, re, sys, time, json, zipfile, statistics
-from pathlib import Path
-from typing import List, Dict, Tuple, Union
 import gradio as gr
-import pandas as pd
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-# ========= ZeroGPU support =========
 try:
-    import spaces  # available on HF Spaces
 except Exception:
-    class _DummySpaces:
-        def GPU(self, *args, **kwargs):
-            def deco(f): return f
-            return deco
-    spaces = _DummySpaces()
-# ========= Auth token =========
-HF_TOKEN = (
-    os.getenv("HF_TOKEN")
-    or os.getenv("HUGGINGFACE_HUB_TOKEN")
-    or os.getenv("HUGGINGFACEHUB_API_TOKEN")
 )
-# Console warning at startup (helps when logs are open)
-if not HF_TOKEN:
-    print(
-        "[WARN] HF_TOKEN is not set. Gated models will fail. "
-        "Set it in Space → Settings → Variables and secrets.",
-        file=sys.stderr
-    )
-# ========= Labels & metrics =========
-ALLOWED_LABELS = [
-    "plan_contact",
-    "schedule_meeting",
-    "update_contact_info_non_postal",
-    "update_contact_info_postal_address",
-    "update_kyc_activity",
-    "update_kyc_origin_of_assets",
-    "update_kyc_purpose_of_businessrelation",
-    "update_kyc_total_assets",
-]
-LABEL_TO_IDX = {l: i for i, l in enumerate(ALLOWED_LABELS)}
-FN_PENALTY = 2.0
-FP_PENALTY = 1.0
-def safe_json_load(s: str):
-    try:
-        return json.loads(s)
-    except Exception:
-        pass
-    m = re.search(r"\{.*\}", s, re.S)
-    if m:
-        try:
-            return json.loads(m.group(0))
-        except Exception:
-            pass
-    return {"labels": [], "notes": "WARN: model output not valid JSON; fallback used"}
-def _coerce_labels_list(x):
-    if isinstance(x, list):
-        out = []
-        for it in x:
-            if isinstance(it, str): out.append(it)
-            elif isinstance(it, dict):
-                for k in ("label", "value", "task", "category", "name"):
-                    v = it.get(k)
-                    if isinstance(v, str):
-                        out.append(v); break
-                else:
-                    if isinstance(it.get("labels"), list):
-                        out += [s for s in it["labels"] if isinstance(s, str)]
-        # dedupe keep order
-        seen = set(); norm = []
-        for s in out:
-            if s not in seen:
-                norm.append(s); seen.add(s)
-        return norm
-    if isinstance(x, dict):
-        for k in ("expected_labels", "labels", "targets", "y_true"):
-            if k in x: return _coerce_labels_list(x[k])
-        if "one_hot" in x and isinstance(x["one_hot"], dict):
-            return [k for k, v in x["one_hot"].items() if v]
-    return []
-def classic_metrics(pred_labels, exp_labels):
-    pred = set([str(x) for x in (pred_labels or []) if isinstance(x, (str,int,float,bool))])
-    gold = set([str(x) for x in (exp_labels  or []) if isinstance(x, (str,int,float,bool))])
-    if not pred and not gold:
-        return True, 1.0, 1.0, 1.0, 1.0
-    inter = pred & gold; union = pred | gold
-    exact = (sorted(pred) == sorted(gold))
-    precision = (len(inter) / (len(pred) if pred else 1e-9))
-    recall    = (len(inter) / (len(gold) if gold else 1e-9))
-    f1 = 0.0 if len(inter) == 0 else 2*len(inter) / (len(pred)+len(gold)+1e-9)
-    hamming = (len(inter) / (len(union) if union else 1e-9))
-    return exact, precision, recall, f1, hamming
-def ubs_score_one(true_labels, pred_labels) -> float:
-    tset = [l for l in (true_labels or []) if l in LABEL_TO_IDX]
-    pset = [l for l in (pred_labels or []) if l in LABEL_TO_IDX]
-    n_labels = len(ALLOWED_LABELS)
-    tpos = set(tset); ppos = set(pset)
-    fn = sum(1 for l in ALLOWED_LABELS if (l in tpos and l not in ppos))
-    fp = sum(1 for l in ALLOWED_LABELS if (l not in tpos and l in ppos))
-    weighted = FN_PENALTY*fn + FP_PENALTY*fp
-    t_count = len(tpos)
-    max_err = FN_PENALTY*t_count + FP_PENALTY*(n_labels - t_count)
-    score = 1.0 if max_err == 0 else (1.0 - (weighted / max_err))
-    return float(max(0.0, min(1.0, score)))
-# ========= Lightweight preprocessing =========
-EMAIL_RX   = re.compile(r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b', re.I)
-TIME_RX    = re.compile(r'\b(\d{1,2}:\d{2}\b|\b\d{1,2}\s?(am|pm)\b|\bafternoon\b|\bmorning\b|\bevening\b)', re.I)
-DATE_RX    = re.compile(r'\b(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)\b|\b\d{1,2}[/-]\d{1,2}([/-]\d{2,4})?\b|\b20\d{2}\b', re.I)
-MEET_RX    = re.compile(r'\b(meet(ing)?|call|appointment|schedule|invite|agenda|online|in[- ]?person|phone|zoom|teams)\b', re.I)
-MODAL_RX   = re.compile(r'\b(online|in[- ]?person|phone|zoom|teams)\b', re.I)
-SMALLTALK_RX = re.compile(r'^\s*(user|advisor):\s*(thanks( you)?|thank you|anything else|have a great day|you too)\b', re.I)
-TYPO_FIXES = [
-    (re.compile(r'\bschedulin\s*g\b', re.I), 'scheduling'),
-    (re.compile(r'\beeting\b', re.I), 'meeting'),
-    (re.compile(r'\bdi?i?gtal\b', re.I), 'digital'),
-    (re.compile(r'\bdigi\s+tal\b', re.I), 'digital'),
-    (re.compile(r'\bspread\s*sheet\b', re.I), 'spreadsheet'),
-    (re.compile(r'\bseats\b', re.I), 'sheets'),
-    (re.compile(r'\bver(s|z)ion meters\b', re.I), 'version metrics'),
-]
-def normalize_text(text: str, fix_typos: bool = True) -> str:
-    t = text.replace('\r\n', '\n')
-    t = re.sub(r'^\s*Speaker\s*1\s*:\s*', 'USER: ', t, flags=re.I | re.M)
-    t = re.sub(r'^\s*Speaker\s*2\s*:\s*', 'ADVISOR: ', t, flags=re.I | re.M)
-    t = re.sub(r'[ \t]+', ' ', t)
-    t = re.sub(r'\n{3,}', '\n\n', t)
-    if fix_typos:
-        for rx, rep in TYPO_FIXES:
-            t = rx.sub(rep, t)
-    return t.strip()
-def extract_cues(text: str):
-    emails = EMAIL_RX.findall(text)
-    email_new, email_old = (emails[-1], emails[-2]) if len(emails)>=2 else ((emails[-1], None) if emails else (None, None))
-    has_time = bool(TIME_RX.search(text))
-    has_date = bool(DATE_RX.search(text))
-    has_meet = bool(MEET_RX.search(text))
-    modality = None
-    m = MODAL_RX.search(text)
-    if m:
-        modality = m.group(0).upper().replace('IN PERSON','IN_PERSON').replace('IN-PERSON','IN_PERSON')
-    meeting_confirmed = (has_meet and (has_time or has_date))
-    tm = TIME_RX.search(text)
-    norm_tm = tm.group(0) if tm else None
-    return {
-        "email_new": email_new,
-        "email_old": email_old,
-        "contact_pref": "EMAIL" if email_new else None,
-        "meeting_time_fragment": norm_tm,
-        "meeting_modality": modality,
-        "meeting_confirmed": meeting_confirmed
-    }
-def build_cues_header(cues: dict) -> str:
-    has_any = any([cues.get("email_new"), cues.get("email_old"), cues.get("contact_pref"), cues.get("meeting_confirmed")])
-    if not has_any:
-        return ""
-    lines = ["[DETECTED_CUES]"]
-    if cues.get("email_new"): lines.append(f"EMAIL_NEW: {cues['email_new']}")
-    if cues.get("email_old"): lines.append(f"EMAIL_OLD: {cues['email_old']}")
-    if cues.get("contact_pref"): lines.append(f"CONTACT_PREF: {cues['contact_pref']}")
-    if cues.get("meeting_confirmed"):
-        mod = cues.get("meeting_modality") or ""
-        tm  = cues.get("meeting_time_fragment") or ""
-        lines.append(f"MEETING: {(tm + ' ' + mod).strip()} CONFIRMED")
-    lines.append("[/DETECTED_CUES]")
-    return "\n".join(lines)
-def find_cue_lines(lines):
-    idx = set()
-    for i, ln in enumerate(lines):
-        if EMAIL_RX.search(ln) or (MEET_RX.search(ln) and (TIME_RX.search(ln) or DATE_RX.search(ln))):
-            idx.add(i)
-    return sorted(idx)
-def prune_by_window(lines, cue_idx, window=3, strip_smalltalk=False):
-    n = len(lines); keep = set()
-    for k in cue_idx:
-        lo, hi = max(0, k-window), min(n-1, k+window)
-        keep.update(range(lo,hi+1))
-    out=[]
-    for i, ln in enumerate(lines):
-        if i in keep:
-            if strip_smalltalk and SMALLTALK_RX.search(ln): continue
-            out.append(ln)
-    return out
-def shrink_to_token_cap_by_lines(text: str, soft_cap_tokens: int, tokenizer,
-                                 min_lines_keep: int = 30,
-                                 apply_only_if_ratio: float = 1.15) -> str:
-    ids = tokenizer(text, return_tensors=None, add_special_tokens=False).input_ids
-    est = len(ids)
-    threshold = int(soft_cap_tokens * apply_only_if_ratio)
-    if est <= threshold: return text
-    parts = text.splitlines()
-    if len(parts) <= min_lines_keep: return text
-    keep_flags=[]
-    for ln in parts:
-        is_header = ln.startswith("[DETECTED_CUES]") or ln.startswith("[/DETECTED_CUES]") \
-                    or ln.startswith("EMAIL_") or ln.startswith("CONTACT_") or ln.startswith("MEETING:")
-        is_cue = bool(EMAIL_RX.search(ln) or MEET_RX.search(ln) or DATE_RX.search(ln) or TIME_RX.search(ln))
-        keep_flags.append(is_header or is_cue)
-    pruned = [ln for ln, keep in zip(parts, keep_flags) if keep]
-    if len(pruned) < min_lines_keep:
-        pad_needed = min_lines_keep - len(pruned)
-        non_cue_lines = [ln for ln, keep in zip(parts, keep_flags) if not keep]
-        pruned = pruned + non_cue_lines[:pad_needed]
-    candidate = "\n".join(pruned)
-    cand_tokens = len(tokenizer(candidate, return_tensors=None, add_special_tokens=False).input_ids)
-    if cand_tokens > threshold:
-        mid = len(parts)//2
-        half = max(min_lines_keep//2, 50)
-        slice_parts = parts[max(0, mid-half): min(len(parts), mid+half)]
-        candidate2 = "\n".join(slice_parts)
-        candidate2_tokens = len(tokenizer(candidate2, return_tensors=None, add_special_tokens=False).input_ids)
-        candidate = candidate if cand_tokens <= candidate2_tokens else candidate2
-    if len(candidate.splitlines()) < min_lines_keep: return text
-    return candidate
-def enforce_rules(labels, transcript_text):
-    labels = set(labels or [])
-    if (TIME_RX.search(transcript_text) or DATE_RX.search(transcript_text)) and MEET_RX.search(transcript_text):
-        labels.add("schedule_meeting"); labels.discard("plan_contact")
-    if EMAIL_RX.search(transcript_text) and re.search(r'\b(update|new|set|change|confirm(ed)?|for all communication)\b', transcript_text, re.I):
-        labels.add("update_contact_info_non_postal")
-    kyc_rx = re.compile(r'\b(kyc|aml|compliance|employer|occupation|purpose of (relationship|account)|source of (wealth|funds)|net worth|total assets)\b', re.I)
-    if "update_kyc_activity" in labels and not kyc_rx.search(transcript_text):
-        labels.discard("update_kyc_activity")
-    return sorted(labels)
-# ========= HF model wrapper =========
 class HFModel:
-    def __init__(self, repo_id: str, load_4bit: bool, dtype: str, trust_remote_code: bool):
         self.repo_id = repo_id
         self.tokenizer = AutoTokenizer.from_pretrained(
-            repo_id, use_fast=True, trust_remote_code=trust_remote_code, token=HF_TOKEN
         )
-        torch_dtype = {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32}.get(dtype, torch.bfloat16)
         self.model = None
-        if load_4bit:
             try:
-                q = BitsAndBytesConfig(
-                    load_in_4bit=True, bnb_4bit_use_double_quant=True,
-                    bnb_4bit_compute_dtype=torch_dtype, bnb_4bit_quant_type="nf4"
                 )
                 self.model = AutoModelForCausalLM.from_pretrained(
-                    repo_id, device_map="auto", trust_remote_code=trust_remote_code,
-                    quantization_config=q, torch_dtype=torch_dtype, token=HF_TOKEN
                 )
             except Exception as e:
-                print(f"[WARN] 4-bit load failed for {repo_id}: {e}\nFalling back to normal load...", file=sys.stderr)
         if self.model is None:
             self.model = AutoModelForCausalLM.from_pretrained(
-                repo_id, device_map="auto", trust_remote_code=trust_remote_code,
-                torch_dtype=torch_dtype, token=HF_TOKEN
             )
         self.max_context = getattr(self.model.config, "max_position_embeddings", None) \
-                           or getattr(self.model.config, "max_sequence_length", None) or 8192
-    def apply_chat_template(self, system_text: str, user_text: str) -> str:
-        if getattr(self.tokenizer, "chat_template", None):
-            messages = [{"role":"system","content":system_text},
-                        {"role":"user","content":user_text}]
-            return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        return ("### System\n" + system_text.strip() + "\n\n" +
-                "### User\n" + user_text.strip() + "\n\n" +
-                "### Assistant\n")
     @torch.inference_mode()
-    def generate_json(self, system_text: str, user_text: str, max_new_tokens: int = 256):
-        prompt = self.apply_chat_template(system_text, user_text)
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
-        t0 = time.perf_counter()
-        out = self.model.generate(
-            **inputs, max_new_tokens=max_new_tokens,
-            do_sample=False, temperature=None, top_p=None,
-            eos_token_id=self.tokenizer.eos_token_id
         )
-        latency_ms = int((time.perf_counter() - t0) * 1000)
-        text = self.tokenizer.decode(out[0], skip_special_tokens=True)
-        if text.startswith(prompt): text = text[len(prompt):]
-        return latency_ms, text, prompt
-MODEL_CACHE: Dict[str, HFModel] = {}
-def get_model(repo_id: str, load_4bit: bool, dtype: str, trust_remote_code: bool):
-    if repo_id not in MODEL_CACHE:
-        MODEL_CACHE[repo_id] = HFModel(repo_id, load_4bit=load_4bit, dtype=dtype, trust_remote_code=trust_remote_code)
-    return MODEL_CACHE[repo_id]
-# ========= ZeroGPU functions =========
-@spaces.GPU(duration=180, secrets=["HF_TOKEN"])  # pass token into ZeroGPU job
-def gpu_generate(repo_id: str, system_text: str, user_text: str,
-                 load_4bit: bool, dtype: str, trust_remote_code: bool):
-    token_seen = bool(os.getenv("HF_TOKEN"))
-    hf = get_model(repo_id, load_4bit=load_4bit, dtype=dtype, trust_remote_code=trust_remote_code)
-    lat, txt, prmpt = hf.generate_json(system_text.strip(), user_text.strip(), max_new_tokens=256)
-    return lat, txt, prmpt, token_seen
-@spaces.GPU(duration=15, secrets=["HF_TOKEN"])
-def gpu_check_token():
-    return bool(os.getenv("HF_TOKEN"))
-# ========= ZIP helpers =========
-def _read_zip_bytes(dataset_zip: Union[bytes, str, dict, None]) -> bytes:
-    if dataset_zip is None: raise ValueError("No ZIP provided")
-    if isinstance(dataset_zip, bytes): return dataset_zip
-    if isinstance(dataset_zip, str):
-        with open(dataset_zip, "rb") as f: return f.read()
-    if isinstance(dataset_zip, dict) and "path" in dataset_zip:
-        with open(dataset_zip["path"], "rb") as f: return f.read()
-    path = getattr(dataset_zip, "name", None)
-    if path and os.path.exists(path):
-        with open(path, "rb") as f: return f.read()
-    raise ValueError("Unsupported file object from Gradio")
-def parse_zip(zip_bytes: bytes) -> Dict[str, Tuple[str, List[str]]]:
-    zf = zipfile.ZipFile(io.BytesIO(zip_bytes))
-    samples = {}
-    for n in zf.namelist():
-        p = Path(n)
-        if p.suffix.lower() == ".txt":
-            samples.setdefault(p.stem, ["", []])[0] = zf.read(n).decode("utf-8", "replace")
-        elif p.suffix.lower() == ".json":
             try:
-                js = json.loads(zf.read(n).decode("utf-8", "replace"))
             except Exception:
-                js = []
-            samples.setdefault(p.stem, ["", []])[1] = _coerce_labels_list(js)
-    return samples
-# ========= Prompts =========
-DEFAULT_SYSTEM = (
-    "You are a task extraction assistant. "
-    "Always output valid JSON with a field \"labels\" (list of strings). "
-    "Use only from this set: " + json.dumps(ALLOWED_LABELS) + ". "
-    "Return JSON only."
-)
-DEFAULT_CONTEXT = (
-    "- plan_contact: conversation without a concrete meeting (no date/time)\n"
-    "- schedule_meeting: explicit date/time/modality confirmation\n"
-    "- update_contact_info_non_postal: changes to email/phone\n"
-    "- update_contact_info_postal_address: changes to mailing address\n"
-    "- update_kyc_*: KYC updates (activity, purpose, origin of assets, total assets)"
-)
-# ========= Preprocess + build input =========
-def prepare_input_text(raw_txt: str, soft_cap: int, preprocess: bool, pre_window: int,
-                       add_cues: bool, strip_smalltalk: bool, tokenizer) -> Tuple[str, int, int]:
-    before = len(tokenizer(raw_txt, return_tensors=None, add_special_tokens=False).input_ids)
-    proc_text = raw_txt
-    if preprocess:
-        t_norm = normalize_text(proc_text, fix_typos=True)
-        lines = [ln.strip() for ln in t_norm.splitlines() if ln.strip()]
-        cue_lines = find_cue_lines(lines)
-        if cue_lines:
-            kept = prune_by_window(lines, cue_lines, window=pre_window, strip_smalltalk=strip_smalltalk)
         else:
-            kept = [ln for ln in lines if not (strip_smalltalk and SMALLTALK_RX.search(ln))]
-        t_kept = "\n".join(kept)
-        cues = extract_cues(t_kept)
-        header = build_cues_header(cues) if add_cues else ""
-        proc_text = (header + "\n\n" + t_kept).strip() if header else t_kept
-        proc_text = shrink_to_token_cap_by_lines(proc_text, soft_cap, tokenizer)
-        if len(proc_text.splitlines()) < 30:
-            proc_text = t_norm
-    after = len(tokenizer(proc_text, return_tensors=None, add_special_tokens=False).input_ids)
-    return proc_text, before, after
-def explain_params_markdown() -> str:
-    return (
-        "**Parameter help**  \n"
-        "- **Soft token cap**: target max input size; we prune long transcripts toward this size to save latency.  \n"
-        "- **Enable preprocessing**: normalizes speaker tags, fixes obvious typos, and focuses on cue lines.  \n"
-        "- **Window ± lines around cues**: how many lines we keep around detected cues (dates/emails/‘meeting’, etc.).  \n"
-        "- **Add cues header**: inserts a short summary block (email, meeting signal) above the transcript to guide the model.  \n"
-        "- **Strip smalltalk**: removes lines like ‘thanks, bye’ to keep only useful content.  \n"
-        "- **Load in 4-bit (GPU only)**: memory-saving quantization; has no effect on CPU Spaces."
-    )
-# ========= Single mode =========
-def single_mode(
-    preset_model: str, custom_model: str,
-    system_text: str, context_text: str,
-    transcript_text: str, transcript_file,
-    expected_labels_json,
-    soft_cap: int, preprocess: bool, pre_window: int, add_cues: bool, strip_smalltalk: bool,
-    load_4bit: bool, dtype: str, trust_remote_code: bool
-):
-    repo_id = custom_model.strip() or preset_model.strip()
-    if not repo_id:
-        return "Please choose a model.", "", "", "", None, None, None, ""
-    txt = (transcript_text or "").strip()
-    if transcript_file and hasattr(transcript_file, "name") and os.path.exists(transcript_file.name):
-        with open(transcript_file.name, "r", encoding="utf-8", errors="replace") as f:
-            txt = f.read()
-    if not txt:
-        return "Please paste a transcript or upload a .txt file.", "", "", "", None, None, None, ""
-    exp = []
-    if expected_labels_json and hasattr(expected_labels_json, "name") and os.path.exists(expected_labels_json.name):
-        try:
-            with open(expected_labels_json.name, "r", encoding="utf-8", errors="replace") as f:
-                exp = _coerce_labels_list(json.load(f))
-        except Exception:
-            exp = []
-    # tokenizer for preprocessing
-    try:
-        dummy_tok = AutoTokenizer.from_pretrained(repo_id, use_fast=True, trust_remote_code=trust_remote_code, token=HF_TOKEN)
-    except Exception as e:
-        msg = (f"Failed to load tokenizer for `{repo_id}`. "
-               "If gated, accept license and set HF_TOKEN in Space → Settings → Secrets.\n\nError: " + str(e))
-        return msg, "", "", "", None, None, None, banner_text()
-    proc_text, tok_before, tok_after = prepare_input_text(
-        txt, soft_cap, preprocess, pre_window, add_cues, strip_smalltalk, dummy_tok
-    )
-    system = (system_text or DEFAULT_SYSTEM).strip()
-    user = (context_text or DEFAULT_CONTEXT).strip() + "\n\nTRANSCRIPT\n" + proc_text.strip()
-    try:
-        latency_ms, raw_text, _prompt, gpu_token_seen = gpu_generate(
-            repo_id, system, user, load_4bit, dtype, trust_remote_code
-        )
-    except Exception as e:
-        msg = (f"Failed to run `{repo_id}`. If gated, accept license and set HF_TOKEN.\n\nError: {e}")
-        return msg, "", "", "", None, None, None, banner_text()
-    out = safe_json_load(raw_text)
-    pred_labels = enforce_rules(out.get("labels", []), proc_text)
-    exact, prec, rec, f1, ham = classic_metrics(pred_labels, exp)
-    ubs = ubs_score_one(exp, pred_labels) if exp else None
-    kpi1 = f"**F1**\n\n{f1:.3f}" if exp else "**F1**\n\n—"
-    kpi2 = f"**UBS score**\n\n{ubs:.3f}" if ubs is not None else "**UBS score**\n\n—"
-    kpi3 = f"**Latency (ms)**\n\n{latency_ms}"
-    zbuf = io.BytesIO()
-    with zipfile.ZipFile(zbuf, "w", zipfile.ZIP_DEFLATED) as zout:
-        zout.writestr("PREPROCESSED.txt", proc_text)
-        zout.writestr("MODEL_OUTPUT.raw.txt", raw_text)
-        final_json = {
-            "labels": pred_labels,
-            "diagnostics": {
-                "model_name": repo_id,
-                "latency_ms": latency_ms,
-                "token_in_est_before": tok_before,
-                "token_in_est_after": tok_after,
-                "preprocess": preprocess,
-                "pre_window": pre_window,
-                "pre_add_cues_header": add_cues if preprocess else False,
-                "pre_strip_smalltalk": strip_smalltalk if preprocess else False,
-                "pre_soft_token_cap": soft_cap if preprocess else None,
-                "model_calls": 1
-            },
-            "evaluation": None if not exp else {
-                "exact_match": exact, "precision": prec, "recall": rec,
-                "f1": f1, "hamming": ham, "ubs_score": ubs
-            }
-        }
-        zout.writestr("FINAL.json", json.dumps(final_json, ensure_ascii=False, indent=2))
-    zbuf.seek(0); zbuf.name = "artifacts_single.zip"
-    row = pd.DataFrame([{
-        "model": repo_id,
-        "latency_ms": latency_ms,
-        "token_before": tok_before,
-        "token_after": tok_after,
-        "model_calls": 1,
-        "pred_labels": json.dumps(pred_labels, ensure_ascii=False),
-        "exp_labels": json.dumps(exp, ensure_ascii=False),
-        "exact_match": exact if exp else None,
-        "precision": round(prec,6) if exp else None,
-        "recall": round(rec,6) if exp else None,
-        "f1": round(f1,6) if exp else None,
-        "hamming": round(ham,6) if exp else None,
-        "ubs_score": round(ubs,6) if ubs is not None else None
-    }])
-    csv_buf = io.BytesIO(row.to_csv(index=False).encode("utf-8")); csv_buf.name = "results_single.csv"
-    return (
-        "Done.",
-        kpi1, kpi2, kpi3,
-        row, csv_buf, zbuf,
-        banner_text(gpu_token_seen)
-    )
-# ========= Batch mode =========
-def run_batch_ui(models_list, custom_models_str, instructions_text, context_text, dataset_zip,
-                 soft_cap, preprocess, pre_window, add_cues, strip_smalltalk,
-                 repeats, max_total_runs, load_4bit, dtype, trust_remote_code):
-    models = [m for m in (models_list or [])]
-    models += [m.strip() for m in (custom_models_str or "").split(",") if m.strip()]
-    if not models:
-        return pd.DataFrame(), None, None, "Please pick at least one model.", banner_text()
-    if not dataset_zip:
-        return pd.DataFrame(), None, None, "Please upload a ZIP with *.txt (+ optional matching *.json).", banner_text()
     try:
-        zip_bytes = _read_zip_bytes(dataset_zip)
-        samples = parse_zip(zip_bytes)
-    except Exception as e:
-        return pd.DataFrame(), None, None, f"Failed to read ZIP: {e}", banner_text()
-    rows = []; total_runs = 0
-    all_artifacts = io.BytesIO()
-    zout = zipfile.ZipFile(all_artifacts, "w", zipfile.ZIP_DEFLATED)
-    last_gpu_token_seen = None
-    for repo_id in models:
-        # tokenizer for preprocessing (auth check)
-        try:
-            dummy_tok = AutoTokenizer.from_pretrained(repo_id, use_fast=True, trust_remote_code=trust_remote_code, token=HF_TOKEN)
-        except Exception as e:
-            # gated or missing token; record a summary row and continue
-            rows.append({
-                "timestamp": pd.Timestamp.now().isoformat(timespec="seconds"),
-                "sample_id": None,
-                "model": repo_id,
-                "is_summary": True,
-                "run_index": None,
-                "preprocess": preprocess,
-                "pre_window": pre_window,
-                "add_cues_header": add_cues,
-                "strip_smalltalk": strip_smalltalk,
-                "soft_cap": soft_cap,
-                "median_latency_ms": None,
-                "latency_ms": None,
-                "token_before": None,
-                "token_after": None,
-                "model_calls": None,
-                "pred_labels": "[]",
-                "exp_labels": "[]",
-                "exact_match": None,
-                "precision": None,
-                "recall": None,
-                "f1": None,
-                "hamming": None,
-                "ubs_score": None,
-            })
-            continue
-        for sample_id, (transcript_text, exp_labels) in samples.items():
-            if not transcript_text.strip(): continue
-            latencies = []; last_pred = None
-            for r in range(1, repeats+1):
-                if total_runs >= max_total_runs: break
-                proc_text, before_tok, after_tok = prepare_input_text(
-                    transcript_text, soft_cap, preprocess, pre_window, add_cues, strip_smalltalk, dummy_tok
                 )
-                system_text = (instructions_text or DEFAULT_SYSTEM).strip()
-                user_text = (context_text or DEFAULT_CONTEXT).strip() + "\n\nTRANSCRIPT\n" + proc_text.strip()
-                try:
-                    latency_ms, raw_text, _prompt, token_seen = gpu_generate(
-                        repo_id, system_text, user_text, load_4bit, dtype, trust_remote_code
-                    )
-                    last_gpu_token_seen = token_seen
-                except Exception as e:
-                    base = f"{repo_id.replace('/','_')}/{sample_id}/error_r{r}"
-                    zout.writestr(base + "/ERROR.txt", f"Failed to run model via @spaces.GPU. If gated, accept license and set HF_TOKEN.\n\n{e}")
-                    total_runs += 1
-                    continue
-                out = safe_json_load(raw_text)
-                pred_labels = enforce_rules(out.get("labels", []), proc_text)
-                exact, prec, rec, f1, ham = classic_metrics(pred_labels, exp_labels)
-                ubs = ubs_score_one(exp_labels, pred_labels)
-                rows.append({
-                    "timestamp": pd.Timestamp.now().isoformat(timespec="seconds"),
-                    "sample_id": sample_id,
-                    "model": repo_id,
-                    "is_summary": False,
-                    "run_index": r,
-                    "preprocess": preprocess,
-                    "pre_window": pre_window,
-                    "add_cues_header": add_cues,
-                    "strip_smalltalk": strip_smalltalk,
-                    "soft_cap": soft_cap,
-                    "latency_ms": latency_ms,
-                    "token_before": before_tok,
-                    "token_after": after_tok,
-                    "model_calls": 1,
-                    "pred_labels": json.dumps(pred_labels, ensure_ascii=False),
-                    "exp_labels": json.dumps(exp_labels, ensure_ascii=False),
-                    "exact_match": exact,
-                    "precision": round(prec, 6),
-                    "recall": round(rec, 6),
-                    "f1": round(f1, 6),
-                    "hamming": round(ham, 6),
-                    "ubs_score": round(ubs, 6),
-                })
-                base = f"{repo_id.replace('/','_')}/{sample_id}/pre{int(preprocess)}_win{pre_window}_cues{int(add_cues)}_small{int(strip_smalltalk)}_cap{soft_cap}_r{r}"
-                zout.writestr(base + "/PREPROCESSED.txt", proc_text)
-                zout.writestr(base + "/MODEL_OUTPUT.raw.txt", raw_text)
-                final_json = {
-                    "labels": pred_labels,
-                    "diagnostics": {
-                        "model_name": repo_id,
-                        "latency_ms": latency_ms,
-                        "token_in_est_before": before_tok,
-                        "token_in_est_after": after_tok,
-                        "preprocess": preprocess,
-                        "pre_window": pre_window,
-                        "pre_add_cues_header": add_cues if preprocess else False,
-                        "pre_strip_smalltalk": strip_smalltalk if preprocess else False,
-                        "pre_soft_token_cap": soft_cap if preprocess else None,
-                        "model_calls": 1
-                    }
-                }
-                zout.writestr(base + "/FINAL.json", json.dumps(final_json, ensure_ascii=False, indent=2))
-                latencies.append(latency_ms)
-                last_pred = pred_labels
-                total_runs += 1
-            if latencies:
-                med = int(statistics.median(latencies))
-                exact, prec, rec, f1, ham = classic_metrics(last_pred, exp_labels) if last_pred is not None else (None,)*5
-                ubs = ubs_score_one(exp_labels, last_pred) if last_pred is not None else None
-                rows.append({
-                    "timestamp": pd.Timestamp.now().isoformat(timespec="seconds"),
-                    "sample_id": sample_id,
-                    "model": repo_id,
-                    "is_summary": True,
-                    "run_index": None,
-                    "preprocess": preprocess,
-                    "pre_window": pre_window,
-                    "add_cues_header": add_cues,
-                    "strip_smalltalk": strip_smalltalk,
-                    "soft_cap": soft_cap,
-                    "median_latency_ms": med,
-                    "latency_ms": None,
-                    "token_before": None,
-                    "token_after": None,
-                    "model_calls": None,
-                    "pred_labels": json.dumps(last_pred or [], ensure_ascii=False),
-                    "exp_labels": json.dumps(exp_labels or [], ensure_ascii=False),
-                    "exact_match": exact,
-                    "precision": round(prec, 6) if prec is not None else None,
-                    "recall": round(rec, 6) if rec is not None else None,
-                    "f1": round(f1, 6) if f1 is not None else None,
-                    "hamming": round(ham, 6) if ham is not None else None,
-                    "ubs_score": round(ubs, 6) if ubs is not None else None,
-                })
-        if total_runs >= max_total_runs:
-            break
-    zout.close()
-    df = pd.DataFrame(rows)
-    if df.empty:
-        return pd.DataFrame(), None, None, "No runs executed (empty dataset / exceeded cap / gated models).", banner_text(last_gpu_token_seen)
-    csv_pair = ("results.csv", df.to_csv(index=False).encode("utf-8"))
-    zip_pair = ("artifacts.zip", all_artifacts.getvalue())
-    return df, csv_pair, zip_pair, "Done.", banner_text(last_gpu_token_seen)
-# ========= UI helpers =========
-OPEN_MODEL_PRESETS = [
-    "mistralai/Mistral-7B-Instruct-v0.2",
-    "Qwen/Qwen2.5-7B-Instruct",
-    "HuggingFaceH4/zephyr-7b-beta",
-    "tiiuae/falcon-7b-instruct",
-]
-def banner_text(gpu_token_seen: bool | None = None) -> str:
-    app_seen = bool(HF_TOKEN)
-    lines = []
-    if not app_seen:
-        lines.append("🟡 **HF_TOKEN not detected in App** — gated models will fail unless you set it in **Settings → Variables and secrets**.")
-    else:
-        lines.append("🟢 **HF_TOKEN detected in App**.")
-    if gpu_token_seen is None:
-        lines.append("ℹ️ ZeroGPU token status: click **Run** or **Check ZeroGPU token** to verify.")
-    else:
-        lines.append("🟢 **HF_TOKEN detected inside ZeroGPU job.**" if gpu_token_seen else "🔴 **HF_TOKEN missing inside ZeroGPU job** (add `secrets=[\"HF_TOKEN\"]` to @spaces.GPU).")
-    lines.append("✅ Tip: use **Open models** (no license gating): " + ", ".join(OPEN_MODEL_PRESETS))
-    return "\n\n".join(lines)
-# ========= UI (dark red) =========
-DARK_RED_CSS = """
-:root, .gradio-container {
-  --color-background: #0b0b0d;
-  --color-foreground: #e6e6e6;
-  --color-primary: #e11d48;
-  --color-secondary: #111216;
-  --color-border: #1f2024;
-  --color-muted: #9ca3af;
-}
-.gradio-container { background: var(--color-background) !important; color: var(--color-foreground) !important; }
-.gr-box, .gr-panel, .gr-group, .gr-form, .wrap.svelte-1ipelgc {
-  background: var(--color-secondary) !important;
-  border: 1px solid var(--color-border) !important;
-  border-radius: 10px !important;
-}
-button, .gr-button {
-  border-radius: 10px !important;
-  border: 1px solid var(--color-primary) !important;
-  background: linear-gradient(180deg, #b91c1c, #7f1d1d) !important;
-  color: white !important;
-}
-.kpi {
-  border: 1px solid #e11d48; border-radius: 10px; padding: 12px; text-align: center;
-  background: #1a0f10; font-size: 18px;
-}
-"""
-with gr.Blocks(title="From Talk to Task — HF Space", css=DARK_RED_CSS) as demo:
-    gr.Markdown("## 🟥 From Talk to Task — Batch & Single Task Extraction")
-    help_md = (
-        "This tool extracts **task labels** from transcripts using Hugging Face models.  \n"
-        "1) Pick a model (or paste a custom repo id).  \n"
-        "2) Provide **Instructions** and **Context**, then supply a transcript (single) or a ZIP (batch).  \n"
-        "3) Adjust parameters (soft token cap, preprocessing).  \n"
-        "4) Run and review **latency**, **precision/recall/F1**, **UBS score**, and download artifacts."
-    )
-    gr.Markdown(help_md)
-    # Status banner (token presence info)
-    banner = gr.Markdown(banner_text())
-    check_btn = gr.Button("Check ZeroGPU token")
-    def _check_token():
-        try:
-            present = gpu_check_token()
-        except Exception:
-            present = None
-        return banner_text(present)
-    check_btn.click(_check_token, outputs=banner)
-    with gr.Tabs():
-        # Single
-        with gr.TabItem("Single Transcript (default)"):
-            with gr.Row():
-                with gr.Column():
-                    preset_model = gr.Dropdown(choices=OPEN_MODEL_PRESETS, value=OPEN_MODEL_PRESETS[0],
-                                               label="Model (Open presets — no gating)")
-                    custom_model = gr.Textbox(label="Custom model repo id (overrides preset)",
-                                              placeholder="e.g. meta-llama/Meta-Llama-3-8B-Instruct")
-                    instructions = gr.Textbox(label="Instructions (System)", lines=8, value=DEFAULT_SYSTEM)
-                    context = gr.Textbox(label="Context (User prefix before transcript)", lines=6, value=DEFAULT_CONTEXT)
-                with gr.Column():
-                    transcript_text = gr.Textbox(label="Paste transcript text", lines=14, placeholder="Paste your transcript here...")
-                    transcript_file = gr.File(label="...or upload a single transcript .txt", file_types=[".txt"], file_count="single", type="filepath")
-                    expected_labels_json = gr.File(label="(Optional) Expected labels JSON for metrics", file_types=[".json"], file_count="single", type="filepath")
-            with gr.Row():
-                with gr.Column():
-                    soft_cap_s = gr.Slider(1024, 32768, value=8192, step=512, label="Soft token cap")
-                    preprocess_s = gr.Checkbox(value=True, label="Enable preprocessing")
-                    pre_window_s = gr.Slider(0, 6, value=3, step=1, label="Window ± lines around cues")
-                    add_cues_s = gr.Checkbox(value=True, label="Add cues header")
-                    strip_smalltalk_s = gr.Checkbox(value=False, label="Strip smalltalk")
-                    gr.Markdown(explain_params_markdown())
-                with gr.Column():
-                    load_4bit_s = gr.Checkbox(value=False, label="Load in 4-bit (GPU only)")
-                    dtype_s = gr.Dropdown(choices=["bfloat16","float16","float32"], value="bfloat16", label="Compute dtype")
-                    trust_remote_code_s = gr.Checkbox(value=True, label="Trust remote code")
-            run_single_btn = gr.Button("Run (Single)")
-            kpi1 = gr.Markdown(elem_classes=["kpi"]); kpi2 = gr.Markdown(elem_classes=["kpi"]); kpi3 = gr.Markdown(elem_classes=["kpi"])
-            single_table = gr.Dataframe(label="Single run — metrics & diagnostics", interactive=False)
-            single_csv = gr.File(label="Download CSV", interactive=False)
-            single_zip = gr.File(label="Download Artifacts ZIP", interactive=False)
-            single_status = gr.Markdown("")
-            def _run_single(*args):
-                status, m1, m2, m3, df, csv_buf, zip_buf, btxt = single_mode(*args)
-                return m1 or "", m2 or "", m3 or "", (df if isinstance(df, pd.DataFrame) else pd.DataFrame()), csv_buf, zip_buf, (status or ""), (btxt or banner_text())
-            run_single_btn.click(
-                _run_single,
-                inputs=[preset_model, custom_model, instructions, context,
-                        transcript_text, transcript_file, expected_labels_json,
-                        soft_cap_s, preprocess_s, pre_window_s, add_cues_s, strip_smalltalk_s,
-                        load_4bit_s, dtype_s, trust_remote_code_s],
-                outputs=[kpi1, kpi2, kpi3, single_table, single_csv, single_zip, single_status, banner]
-            )
-        # Batch
-        with gr.TabItem("Batch (ZIP of many transcripts)"):
-            with gr.Row():
-                with gr.Column():
-                    models_list = gr.Checkboxgroup(
-                        choices=OPEN_MODEL_PRESETS, value=[OPEN_MODEL_PRESETS[0]],
-                        label="Models (Open presets — select one or more)"
-                    )
-                    custom_models = gr.Textbox(label="Custom model repo ids (comma-separated)",
-                                               placeholder="e.g. meta-llama/Meta-Llama-3-8B-Instruct, Qwen/Qwen2.5-7B-Instruct")
-                    instructions_b = gr.Textbox(label="Instructions (System)", lines=8, value=DEFAULT_SYSTEM)
-                    context_b = gr.Textbox(label="Context (User prefix before transcript)", lines=6, value=DEFAULT_CONTEXT)
-                with gr.Column():
-                    dataset_zip = gr.File(
-                        label="Upload ZIP of transcripts (*.txt) + expected (*.json)",
-                        file_types=[".zip"], file_count="single", type="filepath"
-                    )
-                    gr.Markdown("Zip must contain pairs like `ID.txt` and optional `ID.json` with expected labels (same base filename).")
-            with gr.Row():
-                with gr.Column():
-                    soft_cap = gr.Slider(1024, 32768, value=8192, step=512, label="Soft token cap")
-                    preprocess = gr.Checkbox(value=True, label="Enable preprocessing")
-                    pre_window = gr.Slider(0, 6, value=3, step=1, label="Window ± lines around cues")
-                    add_cues = gr.Checkbox(value=True, label="Add cues header")
-                    strip_smalltalk = gr.Checkbox(value=False, label="Strip smalltalk")
-                    gr.Markdown(explain_params_markdown())
-                with gr.Column():
-                    repeats = gr.Slider(1, 6, value=3, step=1, label="Repeats per config")
-                    max_total_runs = gr.Slider(1, 200, value=40, step=1, label="Max total runs")
-                    load_4bit = gr.Checkbox(value=False, label="Load in 4-bit (GPU only)")
-                    dtype = gr.Dropdown(choices=["bfloat16","float16","float32"], value="bfloat16", label="Compute dtype")
-                    trust_remote_code = gr.Checkbox(value=True, label="Trust remote code")
-            run_btn = gr.Button("Run Batch")
-            kpi_b1 = gr.Markdown(elem_classes=["kpi"]); kpi_b2 = gr.Markdown(elem_classes=["kpi"]); kpi_b3 = gr.Markdown(elem_classes=["kpi"])
-            table = gr.Dataframe(label="Batch results (per run + summary rows)", interactive=False)
-            csv_dl = gr.File(label="Download CSV", interactive=False)
-            zip_dl = gr.File(label="Download Artifacts ZIP", interactive=False)
-            status = gr.Markdown("")
-            def _run_batch(*args):
-                df, csv_pair, zip_pair, msg, btxt = run_batch_ui(*args)
-                m1 = m2 = m3 = ""
-                if isinstance(df, pd.DataFrame) and not df.empty:
-                    summaries = df[df["is_summary"] == True]
-                    if not summaries.empty:
-                        last = summaries.iloc[-1]
-                        f1 = last.get("f1"); ubs = last.get("ubs_score"); med = last.get("median_latency_ms")
-                        m1 = f"**F1 (last summary)**\n\n{f1:.3f}" if pd.notna(f1) else "**F1 (last summary)**\n\n—"
-                        m2 = f"**UBS (last summary)**\n\n{ubs:.3f}" if pd.notna(ubs) else "**UBS (last summary)**\n\n—"
-                        m3 = f"**Median latency (ms)**\n\n{int(med) if pd.notna(med) else '—'}"
-                csv_buf = zip_buf = None
-                if isinstance(csv_pair, tuple):
-                    name, data = csv_pair; csv_buf = io.BytesIO(data); csv_buf.name = name
-                if isinstance(zip_pair, tuple):
-                    name, data = zip_pair; zip_buf = io.BytesIO(data); zip_buf.name = name
-                return m1, m2, m3, (df if isinstance(df, pd.DataFrame) else pd.DataFrame()), csv_buf, zip_buf, (msg or ""), (btxt or banner_text())
-            run_btn.click(
-                _run_batch,
-                inputs=[models_list, custom_models, instructions_b, context_b, dataset_zip,
-                        soft_cap, preprocess, pre_window, add_cues, strip_smalltalk,
-                        repeats, max_total_runs, load_4bit, dtype, trust_remote_code],
-                outputs=[kpi_b1, kpi_b2, kpi_b3, table, csv_dl, zip_dl, status, banner]
-            )
-demo.launch()

+# app.py
+# ---------------------------------------------------------------------------
+# Talk2Task Demo (single-file, Spaces-friendly, robust model loader)
+# - Loads open-source chat/instruct models (default: mistralai/Mistral-7B-Instruct-v0.2)
+# - Pins model files locally via snapshot_download to avoid corrupt/partial shards
+# - Optional 4-bit quant for small GPU / ZeroGPU
+# - Simple "transcript -> actions JSON" generation with guardrails
+# - Compact but well-commented for easy maintenance
+# ---------------------------------------------------------------------------
+import os
+import sys
+import json
+import time
+import re
+from typing import Dict, Optional, Tuple
 import gradio as gr
+# NOTE: On Spaces, 'spaces' is available. We use the GPU decorator if present.
 try:
+    from spaces import GPU  # type: ignore
 except Exception:
+    # Fallback shim if not running on Spaces — decorator becomes a no-op
+    def GPU(*args, **kwargs):
+        def deco(fn):
+            return fn
+        return deco
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    BitsAndBytesConfig
 )
+from huggingface_hub import snapshot_download
+# ------------------------------
+# 1) “Hardcoded revision” strategy
+# ------------------------------
+# We let you "hardcode" a revision per repo in this mapping. If empty/None, we default to "main".
+# Best practice: keep this dict and optionally override via environment variables without editing code.
+# For example, set env var MODEL_REVISION__MISTRALAI_MISTRAL_7B_INSTRUCT_V0_2="<commit-hash>"
+# in the Space's "Variables and secrets".
+# Preset registry: dropdown label -> {"repo_id": ..., "revision": ...}.
+# run_inference() looks a preset up by its label and passes "revision" through
+# resolve_revision(), so None here means "env override if set, else main".
+PRESET_MODELS: Dict[str, Dict[str, Optional[str]]] = {
+    # Key is a human-readable label for the dropdown
+    "Mistral 7B Instruct v0.2": {
+        "repo_id": "mistralai/Mistral-7B-Instruct-v0.2",
+        "revision": None  # leave None to use "main" by default (or override via env)
+    },
+    "Qwen2 7B Instruct": {
+        "repo_id": "Qwen/Qwen2-7B-Instruct",
+        "revision": None
+    },
+    "Zephyr 7B Beta": {
+        "repo_id": "HuggingFaceH4/zephyr-7b-beta",
+        "revision": None
+    },
+    "Falcon 7B Instruct": {
+        "repo_id": "tiiuae/falcon-7b-instruct",
+        "revision": None
+    },
+}
+# You can add/replace presets above. The loader below will:
+# - Look up env var MODEL_REVISION__<REPO_ID_SLUG> if set
+# - Else use the dict's "revision"
+# - Else use "main"
+def _slug_repo_id(repo_id: str) -> str:
+    """Turn 'org/model-name' into 'ORG_MODEL_NAME' for clean env var keys."""
+    return re.sub(r"[^A-Za-z0-9]", "_", repo_id).upper()
+def resolve_revision(repo_id: str, default_revision: Optional[str]) -> str:
+    """
+    Priority order for revision:
+      1) Env var   MODEL_REVISION__<ORG_MODEL_SLUG>
+      2) Given     default_revision
+      3) Fallback  "main"
+    """
+    env_key = f"MODEL_REVISION__{_slug_repo_id(repo_id)}"
+    env_rev = os.getenv(env_key, "").strip()
+    if env_rev:
+        return env_rev
+    if default_revision and default_revision.strip():
+        return default_revision.strip()
+    return "main"
+# ------------------------------
+# 2) Space/Runtime-safe defaults
+# ------------------------------
+# Use the persistent storage on Spaces so model files survive restarts.
+# NOTE(review): this runs after gradio/transformers/huggingface_hub have been imported
+# above. If those libraries resolve HF_HOME at import time, this default has no effect
+# in this process — TODO confirm, and move it above the third-party imports if so.
+os.environ.setdefault("HF_HOME", "/data/.cache/huggingface")
+# Optional token if you plan to use gated/private models in future.
+HF_TOKEN = os.getenv("HF_TOKEN", None)
+# ------------------------------
+# 3) Minimal model wrapper with caching
+# ------------------------------
+# Loaded models keyed by (repo_id, load_4bit, dtype_str, trust_remote_code, revision).
+# get_or_load_model() fills this dict; nothing in this file removes entries.
+MODEL_CACHE: Dict[Tuple[str, bool, str, bool, str], "HFModel"] = {}
 class HFModel:
+    """
+    A small helper that:
+     - Downloads the full model snapshot at a specific *pinned* revision into the HF cache
+     - Loads tokenizer and model (optionally 4-bit)
+     - Exposes a simple generate_json() tailored for "Talk2Task" style outputs
+    """
+    def __init__(self,
+                 repo_id: str,
+                 revision: str,
+                 load_4bit: bool = True,
+                 dtype_str: str = "bfloat16",
+                 trust_remote_code: bool = True):
         self.repo_id = repo_id
+        self.revision = revision
+        self.load_4bit = bool(load_4bit)
+        self.trust_remote_code = bool(trust_remote_code)
+        # Map dtype string to torch dtype (default to bfloat16 if unknown)
+        self.torch_dtype = {
+            "bfloat16": torch.bfloat16,
+            "float16": torch.float16,
+            "float32": torch.float32
+        }.get(dtype_str, torch.bfloat16)
+        # 3a) Materialize a clean local copy of the exact revision (no flaky shard streaming)
+        #     'allow_patterns' narrows downloads to typical files we need.
+        self.local_dir = snapshot_download(
+            repo_id=self.repo_id,
+            revision=self.revision,
+            allow_patterns=[
+                "*.json", "*.safetensors", "*.bin", "*.model",
+                "tokenizer.*", "config.json", "generation_config.json", "*.py"
+            ],
+            resume_download=True,
+            local_dir=None,  # keep in HF cache path
+            local_dir_use_symlinks=False,
+            token=HF_TOKEN,
+        )
+        # 3b) Load tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(
+            self.local_dir,
+            use_fast=True,
+            trust_remote_code=self.trust_remote_code,
+            token=HF_TOKEN,
         )
+        # 3c) Load model (try 4-bit, fall back to normal if unavailable)
         self.model = None
+        if self.load_4bit:
             try:
+                qconf = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type="nf4",
+                    bnb_4bit_compute_dtype=self.torch_dtype,
                 )
                 self.model = AutoModelForCausalLM.from_pretrained(
+                    self.local_dir,
+                    device_map="auto",
+                    trust_remote_code=self.trust_remote_code,
+                    quantization_config=qconf,
+                    torch_dtype=self.torch_dtype,
+                    token=HF_TOKEN,
                 )
             except Exception as e:
+                print(f"[WARN] 4-bit load failed for {self.repo_id}@{self.revision}: {e}\n"
+                      f"Falling back to standard load...", file=sys.stderr)
         if self.model is None:
             self.model = AutoModelForCausalLM.from_pretrained(
+                self.local_dir,
+                device_map="auto",
+                trust_remote_code=self.trust_remote_code,
+                torch_dtype=self.torch_dtype,
+                token=HF_TOKEN,
             )
+        # Useful to bound inputs for very long transcripts
         self.max_context = getattr(self.model.config, "max_position_embeddings", None) \
+            or getattr(self.model.config, "max_sequence_length", None) or 8192
+    def _chat_prompt(self, system_text: str, user_text: str) -> str:
+        """
+        Builds a simple chat-style prompt for instruct models.
+        Uses a generic format that works decently across Mistral/Qwen/Zephyr/Falcon.
+        """
+        # Keep system concise; we’ll ask for strict JSON to simplify parsing.
+        sys_part = (system_text or "").strip()
+        usr_part = (user_text or "").strip()
+        # A light structure that improves JSON-likeness across models:
+        prompt = (
+            f"<s>[SYSTEM]\n{sys_part}\n"
+            f"[/SYSTEM]\n"
+            f"[USER]\n{usr_part}\n[/USER]\n"
+            f"[ASSISTANT]\n"
+        )
+        return prompt
     @torch.inference_mode()
+    def generate_json(self,
+                      system_text: str,
+                      user_text: str,
+                      max_new_tokens: int = 256,
+                      temperature: float = 0.2,
+                      top_p: float = 0.9) -> Tuple[float, Dict, str]:
+        """
+        Run generation and return (latency_secs, parsed_json, raw_prompt).
+        The JSON schema we request is:
+        {
+          "actions": [{"type": "...","details":"..."}],
+          "followups": [{"question":"..."}],
+          "implied_actions": [{"hypothesis":"..."}]
+        }
+        """
+        raw_prompt = self._chat_prompt(system_text, user_text)
+        inputs = self.tokenizer(
+            raw_prompt,
+            return_tensors="pt",
+            truncation=True,
+            max_length=min(4096, self.max_context - max_new_tokens - 8),
+        ).to(self.model.device)
+        t0 = time.time()
+        output_ids = self.model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=(temperature > 0),
+            temperature=temperature,
+            top_p=top_p,
+            pad_token_id=self.tokenizer.eos_token_id or 0,
+            eos_token_id=self.tokenizer.eos_token_id,
         )
+        latency = time.time() - t0
+        full_text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        # Heuristics to extract JSON from the assistant tail
+        # 1) Try last {...} block
+        maybe_json = None
+        m = re.findall(r"\{(?:[^{}]|(?R))*\}", full_text, flags=re.DOTALL)
+        if m:
+            maybe_json = m[-1]
+        else:
+            # 2) Attempt bracket capture if model used markdown code fences
+            m2 = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", full_text, flags=re.DOTALL | re.IGNORECASE)
+            if m2:
+                maybe_json = m2.group(1)
+        parsed = {}
+        if maybe_json:
             try:
+                parsed = json.loads(maybe_json)
             except Exception:
+                # Light cleanup if trailing commas/comments sneak in
+                cleaned = re.sub(r"//.*?$", "", maybe_json, flags=re.MULTILINE)
+                cleaned = re.sub(r",\s*}", "}", cleaned)
+                cleaned = re.sub(r",\s*]", "]", cleaned)
+                try:
+                    parsed = json.loads(cleaned)
+                except Exception:
+                    parsed = {"_raw": full_text.strip()}
         else:
+            parsed = {"_raw": full_text.strip()}
+        return latency, parsed, raw_prompt
+# ------------------------------
+# 4) High-level helpers
+# ------------------------------
+def get_or_load_model(repo_id: str,
+                      revision: str,
+                      load_4bit: bool,
+                      dtype_str: str,
+                      trust_remote_code: bool) -> HFModel:
+    key = (repo_id, bool(load_4bit), dtype_str, bool(trust_remote_code), revision)
+    if key not in MODEL_CACHE:
+        MODEL_CACHE[key] = HFModel(
+            repo_id=repo_id,
+            revision=revision,
+            load_4bit=load_4bit,
+            dtype_str=dtype_str,
+            trust_remote_code=trust_remote_code
+        )
+    return MODEL_CACHE[key]
+# ------------------------------
+# 5) “Business” prompt
+# ------------------------------
+# System prompt, passed to the model verbatim by run_inference(). It is never run
+# through str.format(), so the literal braces in the schema lines are safe.
+SYSTEM_PROMPT = """You are an assistant that extracts structured actions from client transcripts.
+Return STRICT JSON with keys: "actions", "followups", "implied_actions".
+- "actions": list of { "type": string, "details": string }
+- "followups": list of { "question": string }
+- "implied_actions": list of { "hypothesis": string }
+NO extra commentary. NO markdown fences. Plain JSON ONLY.
+"""
+# User prompt template; run_inference() fills the single {transcript} placeholder via
+# str.format(), so any other literal brace added here would have to be doubled.
+USER_GUIDE_TEMPLATE = """Transcript:
+{transcript}
+Extract concrete "actions" (e.g., "Schedule meeting with John on Friday 3pm CET"; "Send portfolio summary"; "Open a ticket").
+Extract clarifying "followups" as questions for the advisor.
+Infer 1–3 "implied_actions" (what the client might want next).
+Respond as JSON only.
+"""
+# ------------------------------
+# 6) Inference functions (Spaces GPU aware)
+# ------------------------------
+@GPU(duration=180, enable_queue=True)  # On Spaces with ZeroGPU/GPU; safe no-op elsewhere
+def run_inference(preset_label: str,
+                  transcript: str,
+                  max_new_tokens: int,
+                  temperature: float,
+                  top_p: float,
+                  load_4bit: bool,
+                  dtype_str: str,
+                  trust_remote_code: bool) -> Tuple[str, str, str]:
+    """
+    Main generation entry:
+    - Resolves repo_id + revision
+    - Loads (or reuses) a cached HFModel
+    - Runs generate_json()
+    - Returns pretty JSON, latency, and a minimal prompt echo for debugging
+    """
+    if not transcript.strip():
+        return "{}", "0.00s", "(no prompt)"
+    # Resolve repo + revision from preset
+    preset = PRESET_MODELS.get(preset_label)
+    if not preset:
+        return json.dumps({"error": f"Unknown preset: {preset_label}"}), "0.00s", "(no prompt)"
+    repo_id = preset["repo_id"]  # type: ignore
+    revision = resolve_revision(repo_id, preset.get("revision"))  # type: ignore
+    # Build user prompt
+    user_text = USER_GUIDE_TEMPLATE.format(transcript=transcript.strip())
+    # Load model and run
+    model = get_or_load_model(
+        repo_id=repo_id,
+        revision=revision,
+        load_4bit=load_4bit,
+        dtype_str=dtype_str,
+        trust_remote_code=trust_remote_code
+    )
+    latency, parsed, prompt = model.generate_json(
+        system_text=SYSTEM_PROMPT,
+        user_text=user_text,
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p
+    )
+    # Pretty-print JSON for UI
     try:
+        pretty = json.dumps(parsed, indent=2, ensure_ascii=False)
+    except Exception:
+        pretty = json.dumps({"_raw": str(parsed)}, indent=2, ensure_ascii=False)
+    return pretty, f"{latency:.2f}s", f"repo={repo_id}@{revision} | dtype={dtype_str} | 4bit={load_4bit}"
+# ------------------------------
+# 7) Gradio UI
+# ------------------------------
+def build_ui() -> gr.Blocks:
+    """
+    Build the Gradio Blocks app and return it without launching.
+    Layout: a two-column row. The left column holds the model preset, the transcript
+    box, the run button and an "Advanced" accordion. The right column holds the JSON
+    output plus latency and model-info boxes. The button is wired to run_inference().
+    """
+    with gr.Blocks(title="Talk2Task Demo", fill_height=True) as demo:
+        gr.Markdown(
+            """
+            # Talk2Task Demo
+            Paste a short client transcript. The model will extract structured **Actions JSON**.
+            - **Model** is pinned via snapshot download for reliability on Spaces.
+            - Use the advanced options if you want to try different sampling or 4-bit.
+            """
+        )
+        with gr.Row():
+            # Left column: inputs
+            with gr.Column(scale=1):
+                preset = gr.Dropdown(
+                    label="Model preset",
+                    choices=list(PRESET_MODELS.keys()),
+                    value="Mistral 7B Instruct v0.2"
                 )
+                transcript = gr.Textbox(
+                    label="Transcript",
+                    lines=10,
+                    placeholder="Paste a client conversation or notes here…"
+                )
+                run_btn = gr.Button("Extract Actions", variant="primary")
+                # Generation and loading options, collapsed by default
+                with gr.Accordion("Advanced", open=False):
+                    max_new = gr.Slider(64, 512, value=256, step=16, label="Max new tokens")
+                    temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="Temperature")
+                    top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
+                    load_4bit = gr.Checkbox(value=True, label="Load in 4-bit (fallback to full if not available)")
+                    dtype = gr.Radio(choices=["bfloat16", "float16", "float32"], value="bfloat16", label="Torch dtype")
+                    trust_rc = gr.Checkbox(value=True, label="trust_remote_code (required by some repos)")
+            # Right column: outputs
+            with gr.Column(scale=1):
+                out_json = gr.Code(label="Actions JSON", language="json", interactive=False)
+                with gr.Row():
+                    latency = gr.Textbox(label="Latency", interactive=False)
+                    meta = gr.Textbox(label="Model info", interactive=False)
+        # Wire up the click; the input order must match run_inference()'s parameter order
+        run_btn.click(
+            fn=run_inference,
+            inputs=[preset, transcript, max_new, temperature, top_p, load_4bit, dtype, trust_rc],
+            outputs=[out_json, latency, meta]
+        )
+        gr.Markdown(
+            """
+            **Tips**
+            - To pin a *specific* commit without editing code, set an env var in Space settings like:
+              `MODEL_REVISION__MISTRALAI_MISTRAL_7B_INSTRUCT_V0_2 = <commit-hash>`
+            - If you later add a gated/private model, set a secret **HF_TOKEN** as well.
+            """
+        )
+    return demo
+# Script entry point: build the UI and serve it on all interfaces. The port is read
+# from the PORT env var and defaults to 7860.
+if __name__ == "__main__":
+    build_ui().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))