RishiRP committed
Commit 5f0642c · verified · 1 Parent(s): aa5f588

Update app.py

Files changed (1)
  1. app.py +50 -48
app.py CHANGED
@@ -1,4 +1,3 @@
-# app.py
 import os
 import re
 import io
@@ -28,15 +27,13 @@ SPACE_CACHE.mkdir(parents=True, exist_ok=True)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

-# Deterministic, compact outputs
 GEN_CONFIG = GenerationConfig(
     temperature=0.0,
     top_p=1.0,
     do_sample=False,
-    max_new_tokens=128, # raise if your JSON truncates
+    max_new_tokens=128, # raise if JSON truncates
 )

-# Canonical labels (UBS)
 OFFICIAL_LABELS = [
     "plan_contact",
     "schedule_meeting",
@@ -72,7 +69,7 @@ DEFAULT_LABEL_GLOSSARY = {
     "update_kyc_total_assets": "Discussion/confirmation of total assets/net worth.",
 }

-# Minimal multilingual fallback rules (optional)
+# Tiny multilingual fallback rules (optional) to avoid empty outputs
 DEFAULT_FALLBACK_CUES = {
     "plan_contact": [
         r"\b(get|got|will|we'?ll|i'?ll)\s+back to you\b", r"\bfollow\s*up\b", r"\breach out\b", r"\btouch base\b",
@@ -250,14 +247,15 @@ def truncate_tokens(tokenizer, text: str, max_tokens: int) -> str:
     return tokenizer.decode(toks[-max_tokens:], skip_special_tokens=True)

 # =========================
-# HF model wrapper (robust loader + fast→slow tokenizer fallback)
+# HF model wrapper (robust: fast→slow tokenizer + load fallbacks)
 # =========================
 class ModelWrapper:
-    def __init__(self, repo_id: str, hf_token: Optional[str], load_in_4bit: bool, use_sdpa: bool):
+    def __init__(self, repo_id: str, hf_token: Optional[str], load_in_4bit: bool, use_sdpa: bool, force_tok_redownload: bool):
         self.repo_id = repo_id
         self.hf_token = hf_token
         self.load_in_4bit = load_in_4bit
         self.use_sdpa = use_sdpa
+        self.force_tok_redownload = force_tok_redownload
         self.tokenizer = None
         self.model = None
         self.load_path = "uninitialized"
@@ -265,18 +263,21 @@ class ModelWrapper:
     def _load_tokenizer(self):
         fast_err = None
         tok = None
+        common = dict(
+            pretrained_model_name_or_path=self.repo_id,
+            token=self.hf_token,
+            cache_dir=str(SPACE_CACHE),
+            trust_remote_code=True,
+            local_files_only=False,
+            force_download=True if self.force_tok_redownload else False,
+            revision=None,
+        )
         try:
-            tok = AutoTokenizer.from_pretrained(
-                self.repo_id, token=self.hf_token, cache_dir=str(SPACE_CACHE),
-                trust_remote_code=True, use_fast=True
-            )
+            tok = AutoTokenizer.from_pretrained(use_fast=True, **common)
         except Exception as e:
             fast_err = e
         if tok is None:
-            tok = AutoTokenizer.from_pretrained(
-                self.repo_id, token=self.hf_token, cache_dir=str(SPACE_CACHE),
-                trust_remote_code=True, use_fast=False
-            )
+            tok = AutoTokenizer.from_pretrained(use_fast=False, **common)
         if tok.pad_token is None and tok.eos_token:
             tok.pad_token = tok.eos_token
         return tok, fast_err
@@ -372,10 +373,10 @@ class ModelWrapper:
         return self.tokenizer.decode(out_ids[0], skip_special_tokens=True)

 _MODEL_CACHE: Dict[str, ModelWrapper] = {}
-def get_model(repo_id: str, hf_token: Optional[str], load_in_4bit: bool, use_sdpa: bool) -> ModelWrapper:
-    key = f"{repo_id}::{'4bit' if (load_in_4bit and DEVICE=='cuda') else 'full'}::{'sdpa' if use_sdpa else 'nosdpa'}"
+def get_model(repo_id: str, hf_token: Optional[str], load_in_4bit: bool, use_sdpa: bool, force_tok_redownload: bool) -> ModelWrapper:
+    key = f"{repo_id}::{'4bit' if (load_in_4bit and DEVICE=='cuda') else 'full'}::{'sdpa' if use_sdpa else 'nosdpa'}::{'force' if force_tok_redownload else 'cache'}"
     if key not in _MODEL_CACHE:
-        m = ModelWrapper(repo_id, hf_token, load_in_4bit, use_sdpa)
+        m = ModelWrapper(repo_id, hf_token, load_in_4bit, use_sdpa, force_tok_redownload)
         m.load()
         _MODEL_CACHE[key] = m
     return _MODEL_CACHE[key]
@@ -425,7 +426,7 @@ def evaluate_predictions(y_true: List[List[str]], y_pred: List[List[str]]) -> float:
     return float(max(0.0, min(1.0, np.mean(per_sample))))

 # =========================
-# Multilingual regex fallback
+# Multilingual regex fallback (optional)
 # =========================
 def multilingual_fallback(text: str, allowed: List[str], cues: Dict[str, List[str]]) -> Dict[str, Any]:
     low = text.lower()
@@ -452,10 +453,10 @@ def multilingual_fallback(text: str, allowed: List[str], cues: Dict[str, List[str]]) -> Dict[str, Any]:
 def build_glossary_str(glossary: Dict[str, str], allowed: List[str]) -> str:
     return "\n".join([f"- {lab}: {glossary.get(lab, '')}" for lab in allowed])

-def warmup_model(model_repo: str, use_4bit: bool, use_sdpa: bool, hf_token: str) -> str:
+def warmup_model(model_repo: str, use_4bit: bool, use_sdpa: bool, hf_token: str, force_tok_redownload: bool) -> str:
     t0 = _now_ms()
     try:
-        model = get_model(model_repo, (hf_token or "").strip() or None, use_4bit, use_sdpa)
+        model = get_model(model_repo, (hf_token or "").strip() or None, use_4bit, use_sdpa, force_tok_redownload)
         _ = model.generate("Return JSON only.", '{"labels": [], "tasks": []}')
         return f"Warm-up complete in {_now_ms() - t0} ms. Load path: {model.load_path}"
     except Exception as e:
@@ -477,11 +478,11 @@ def run_single(
     use_sdpa: bool,
     max_input_tokens: int,
     hf_token: str,
+    force_tok_redownload: bool,
 ) -> Tuple[str, str, str, str, str, str, str, str, str]:

     t0 = _now_ms()

-    # Transcript
     raw_text = ""
     if transcript_file:
         raw_text = read_text_file_any(transcript_file)
@@ -491,36 +492,29 @@

     text = clean_transcript(raw_text) if use_cleaning else raw_text

-    # Allowed labels
     user_allowed = [ln.strip() for ln in (allowed_labels_text or "").splitlines() if ln.strip()]
     allowed = normalize_labels(user_allowed or OFFICIAL_LABELS)

-    # Editable configs
     try:
         sys_instructions = (sys_instructions_text or DEFAULT_SYSTEM_INSTRUCTIONS).strip() or DEFAULT_SYSTEM_INSTRUCTIONS
     except Exception:
         sys_instructions = DEFAULT_SYSTEM_INSTRUCTIONS
-
     try:
         label_glossary = json.loads(glossary_json_text) if glossary_json_text else DEFAULT_LABEL_GLOSSARY
     except Exception:
         label_glossary = DEFAULT_LABEL_GLOSSARY
-
     try:
         fallback_cues = json.loads(fallback_json_text) if fallback_json_text else DEFAULT_FALLBACK_CUES
     except Exception:
         fallback_cues = DEFAULT_FALLBACK_CUES

-    # Model
     try:
-        model = get_model(model_repo, (hf_token or "").strip() or None, use_4bit, use_sdpa)
+        model = get_model(model_repo, (hf_token or "").strip() or None, use_4bit, use_sdpa, force_tok_redownload)
     except Exception as e:
         return "", "", f"Model load failed: {e}", "", "", "", "", "", ""

-    # Truncate
     trunc = truncate_tokens(model.tokenizer, text, max_input_tokens)

-    # Build prompt
     glossary_str = build_glossary_str(label_glossary, allowed)
     allowed_list_str = "\n".join(f"- {l}" for l in allowed)
     user_prompt = USER_PROMPT_TEMPLATE.format(
@@ -529,13 +523,11 @@
         glossary=glossary_str,
     )

-    # Token info + prompt preview
     transcript_tokens = len(model.tokenizer(trunc, add_special_tokens=False)["input_ids"])
     prompt_tokens = len(model.tokenizer(user_prompt, add_special_tokens=False)["input_ids"])
     token_info_text = f"Transcript tokens: {transcript_tokens} | Prompt tokens: {prompt_tokens} | Load path: {model.load_path}"
     prompt_preview_text = "```\n" + user_prompt[:4000] + ("\n... (truncated)" if len(user_prompt) > 4000 else "") + "\n```"

-    # Generate
     t1 = _now_ms()
     try:
         out = model.generate(sys_instructions, user_prompt)
@@ -546,7 +538,6 @@
     parsed = robust_json_extract(out)
     filtered = restrict_to_allowed(parsed, allowed)

-    # Fallback merge for recall
     if use_fallback:
         fb = multilingual_fallback(trunc, allowed, fallback_cues)
         if fb["labels"]:
@@ -555,7 +546,6 @@
             merged_tasks = filtered.get("tasks", []) + [t for t in fb["tasks"] if t["label"] not in existing]
             filtered = {"labels": merged_labels, "tasks": merged_tasks}

-    # Diagnostics
     diag = "\n".join([
         f"Device: {DEVICE} (4-bit: {'Yes' if (use_4bit and DEVICE=='cuda') else 'No'})",
         f"Model: {model_repo}",
@@ -567,7 +557,6 @@
         f"Allowed labels: {', '.join(allowed)}",
     ])

-    # Summaries
     labs = filtered.get("labels", [])
     tasks = filtered.get("tasks", [])
     summary = "Detected labels:\n" + ("\n".join(f"- {l}" for l in labs) if labs else "(none)")
@@ -580,7 +569,6 @@
         summary += "\n\nTasks: (none)"
     json_out = json.dumps(filtered, indent=2, ensure_ascii=False)

-    # Single-file metrics if GT provided
     metrics = ""
     if gt_json_file or (gt_json_text and gt_json_text.strip()):
         truth_obj = None
@@ -613,9 +601,8 @@
         else:
             metrics = "Ground truth JSON missing or invalid; expected {'labels': [...]}."

-    # Previews
-    context_preview = "### Label Glossary (used)\n" + "\n".join(f"- {k}: {v}" for k, v in label_glossary.items() if k in allowed)
-    instructions_preview = "```\n" + sys_instructions + "\n```"
+    context_preview = "### Label Glossary (used)\n" + "\n".join(f"- {k}: {v}" for k, v in DEFAULT_LABEL_GLOSSARY.items() if k in allowed)
+    instructions_preview = "```\n" + (sys_instructions_text or DEFAULT_SYSTEM_INSTRUCTIONS) + "\n```"

     return summary, json_out, diag, out.strip(), context_preview, instructions_preview, metrics, prompt_preview_text, token_info_text

@@ -642,6 +629,7 @@ def run_batch(
     use_sdpa: bool,
     max_input_tokens: int,
     hf_token: str,
+    force_tok_redownload: bool,
     limit_files: int,
 ) -> Tuple[str, str, pd.DataFrame, str]:

@@ -661,7 +649,6 @@
     except Exception:
         fallback_cues = DEFAULT_FALLBACK_CUES

-    # Workspace
     work = Path("/tmp/batch")
     if work.exists():
         for p in sorted(work.rglob("*"), reverse=True):
@@ -686,9 +673,8 @@
     if not stems:
         return ("No .txt transcripts found in ZIP.", "", pd.DataFrame(), "")

-    # Model
     try:
-        model = get_model(model_repo, (hf_token or "").strip() or None, use_4bit, use_sdpa)
+        model = get_model(model_repo, (hf_token or "").strip() or None, use_4bit, use_sdpa, force_tok_redownload)
     except Exception as e:
         return (f"Model load failed: {e}", "", pd.DataFrame(), "")

@@ -787,12 +773,11 @@
 # UI
 # =========================
 MODEL_CHOICES = [
-    "swiss-ai/Apertus-8B-Instruct-2509", # multilingual
+    "swiss-ai/Apertus-8B-Instruct-2509",
     "meta-llama/Meta-Llama-3-8B-Instruct",
     "mistralai/Mistral-7B-Instruct-v0.3",
 ]

-# White, modern UI (no purple)
 custom_css = """
 :root { --radius: 14px; }
 .gradio-container { font-family: Inter, ui-sans-serif, system-ui; background: #ffffff; color: #111827; }
@@ -806,7 +791,7 @@ a, .prose a { color: #0ea5e9; }

 with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, fill_height=True) as demo:
     gr.Markdown("<div class='header'>Talk2Task — Multilingual Task Extraction (UBS Challenge)</div>")
-    gr.Markdown("<div class='subtle'>Single-pass multilingual extraction (DE/FR/IT/EN) with compact prompts. Optional rule fallback ensures recall. Batch evaluation & scoring included.</div>")
+    gr.Markdown("<div class='subtle'>Single-pass multilingual extraction (DE/FR/IT/EN). Optional rules fallback for recall. Batch evaluation included.</div>")

     with gr.Tab("Single transcript"):
         with gr.Row():
@@ -850,6 +835,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, fill_height=True) as demo:
             repo = gr.Dropdown(label="Model", choices=MODEL_CHOICES, value=MODEL_CHOICES[0])
             use_4bit = gr.Checkbox(label="Use 4-bit (GPU only)", value=True)
             use_sdpa = gr.Checkbox(label="Use SDPA attention (faster on many GPUs)", value=True)
+            force_tok_redownload = gr.Checkbox(label="Force fresh tokenizer download", value=False)
             max_tokens = gr.Slider(label="Max input tokens", minimum=1024, maximum=8192, step=512, value=2048)
             hf_token = gr.Textbox(label="HF_TOKEN (only for gated models)", type="password", value=os.environ.get("HF_TOKEN",""))
             warm_btn = gr.Button("Warm up model (load & compile kernels)")
@@ -875,8 +861,13 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, fill_height=True) as demo:

         # Reset labels
         reset_btn.click(fn=lambda: OFFICIAL_LABELS_TEXT, inputs=None, outputs=labels_text)
+
         # Warm-up
-        warm_btn.click(fn=warmup_model, inputs=[repo, use_4bit, use_sdpa, hf_token], outputs=diag)
+        warm_btn.click(
+            fn=warmup_model,
+            inputs=[repo, use_4bit, use_sdpa, hf_token, force_tok_redownload],
+            outputs=diag
+        )

         def _pack_context_md(glossary_json, allowed_text):
             try:
@@ -894,7 +885,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, fill_height=True) as demo:
             inputs=[
                 text, file, gt_text, gt_file, use_cleaning, use_fallback,
                 labels_text, sys_instr_tb, glossary_tb, fallback_tb,
-                repo, use_4bit, use_sdpa, max_tokens, hf_token
+                repo, use_4bit, use_sdpa, max_tokens, hf_token, force_tok_redownload
             ],
             outputs=[summary, json_out, diag, raw, context_md, instr_md, gr.Textbox(visible=False), prompt_preview, token_info],
         )
@@ -912,6 +903,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, fill_height=True) as demo:
             repo_b = gr.Dropdown(label="Model", choices=MODEL_CHOICES, value=MODEL_CHOICES[0])
             use_4bit_b = gr.Checkbox(label="Use 4-bit (GPU only)", value=True)
             use_sdpa_b = gr.Checkbox(label="Use SDPA attention (faster on many GPUs)", value=True)
+            force_tok_redownload_b = gr.Checkbox(label="Force fresh tokenizer download", value=False)
             max_tokens_b = gr.Slider(label="Max input tokens", minimum=1024, maximum=8192, step=512, value=2048)
             hf_token_b = gr.Textbox(label="HF_TOKEN (only for gated models)", type="password", value=os.environ.get("HF_TOKEN",""))
             sys_instr_tb_b = gr.Textbox(label="System Instructions (editable for batch)", value=DEFAULT_SYSTEM_INSTRUCTIONS, lines=4)
@@ -934,10 +926,20 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, fill_height=True) as demo:
             inputs=[
                 zip_in, use_cleaning_b, use_fallback_b,
                 sys_instr_tb_b, glossary_tb_b, fallback_tb_b,
-                repo_b, use_4bit_b, use_sdpa_b, max_tokens_b, hf_token_b, limit_files
+                repo_b, use_4bit_b, use_sdpa_b, max_tokens_b, hf_token_b, force_tok_redownload_b, limit_files
             ],
             outputs=[status, diag_b, df_out, csv_out],
         )

 if __name__ == "__main__":
+    # Optional: print environment info to logs
+    try:
+        print("Torch version:", torch.__version__)
+        print("CUDA available:", torch.cuda.is_available())
+        if torch.cuda.is_available():
+            print("CUDA (compiled):", torch.version.cuda)
+            print("Device:", torch.cuda.get_device_name(0))
+    except Exception as _:
+        pass
+
     demo.launch()
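
Reviewer note: below is a minimal, hypothetical sketch of how the new force_tok_redownload flag threads through the updated get_model / warmup_model signatures outside the Gradio UI. The module import name (app), the repo choice, and the token handling are illustrative assumptions, not part of this commit.

# Hypothetical smoke test for the updated signatures (not part of this commit).
import os
import app  # assumes app.py is importable from the working directory

repo = "mistralai/Mistral-7B-Instruct-v0.3"  # any entry from MODEL_CHOICES
token = (os.environ.get("HF_TOKEN") or "").strip() or None

# warmup_model now takes force_tok_redownload as its final argument.
print(app.warmup_model(repo, use_4bit=False, use_sdpa=True,
                       hf_token=token or "", force_tok_redownload=False))

# get_model caches one ModelWrapper per (repo, precision, attention, tokenizer-download mode),
# so toggling the flag yields a separate cache entry.
wrapper = app.get_model(repo, token, load_in_4bit=False, use_sdpa=True,
                        force_tok_redownload=True)
print(wrapper.load_path)

Importing app builds the Gradio Blocks at import time (launch only runs under __main__), and both calls download model weights, so treat this as a quick signature check rather than something to run in CI.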