RishiRP committed
Commit c99502f · verified · 1 Parent(s): 41b65ed

Update app.py

Files changed (1): app.py (+258 -154)
app.py CHANGED
@@ -1,19 +1,44 @@
 import os
 import json
 import gradio as gr
 import torch
-from typing import Optional, Tuple, Dict, Any
 from transformers import AutoTokenizer, AutoModelForCausalLM

 # =========================
-# Runtime / Model Defaults
 # =========================
-# Small, ungated default to avoid permission/download issues.
-# You can switch at runtime via the dropdown or set MODEL_ID env var.
-DEFAULT_MODEL_ID = os.environ.get("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

 def _has_bnb_and_cuda() -> bool:
-    if not torch.cuda.is_available():
         return False
     try:
         import bitsandbytes as _bnb  # noqa: F401
@@ -22,10 +47,9 @@ def _has_bnb_and_cuda() -> bool:
         return False

 USE_BNB = _has_bnb_and_cuda()
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

 # =========================
-# Model Load (safe + flexible)
 # =========================
 _tokenizer: Optional[AutoTokenizer] = None
 _model: Optional[AutoModelForCausalLM] = None
@@ -33,8 +57,8 @@ _current_model_id: Optional[str] = None

 def load_model(model_id: str) -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
     """
-    Loads (or reuses) a model/tokenizer. Uses bitsandbytes 4-bit only if
-    CUDA is available AND bnb is installed. Otherwise plain CPU/GPU.
     """
     global _tokenizer, _model, _current_model_id

@@ -64,9 +88,9 @@ def load_model(model_id: str) -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
     _tokenizer, _model, _current_model_id = tokenizer, model, model_id
     return tokenizer, model

-# ======================================
-# Helpers: Ingest TXT/JSON from Tabs box
-# ======================================
 def read_file(file_obj: Optional[gr.File]) -> Optional[str]:
     if not file_obj:
         return None
@@ -77,20 +101,81 @@ def read_file(file_obj: Optional[gr.File]) -> Optional[str]:
         return None

 def normalize_txt_input(paste_txt: str, upload_file: Optional[gr.File]) -> str:
-    file_text = read_file(upload_file)
-    if paste_txt and paste_txt.strip():
-        return paste_txt
-    return file_text or ""

 def normalize_json_input(paste_json: str, upload_file: Optional[gr.File]) -> str:
-    file_text = read_file(upload_file)
-    candidate = paste_json.strip() if paste_json else ""
-    if not candidate and file_text:
-        candidate = file_text
-    return candidate

 # =========================
-# Core Extraction (placeholder)
 # =========================
 def run_extraction(
     model_choice: str,
@@ -104,58 +189,66 @@ def run_extraction(
     max_new_tokens: int,
     temperature: float,
     top_p: float,
 ) -> Tuple[str, str, str, str, str]:
-    """
-    Wire your real extraction here.
-    Returns:
-        tasks_out, entities_out, cleaned_out, summary_out, diagnostics
-    """
     diagnostics_lines = []

-    # Resolve inputs from single-box Tab controls
     input_txt = normalize_txt_input(txt_paste, txt_upload)
     input_json_raw = normalize_json_input(json_paste, json_upload)

-    diagnostics_lines.append(f"Model: {model_choice}")
-    diagnostics_lines.append(f"Params: {params_checked}")
-    diagnostics_lines.append(f"Instructions length: {len(instructions_text)} chars")
-    diagnostics_lines.append(f"Context length: {len(context_text)} chars")
-    diagnostics_lines.append(f"TXT length: {len(input_txt)} chars")
-
-    # Try parse JSON (optional)
     parsed_json: Dict[str, Any] = {}
     if input_json_raw:
         try:
             parsed_json = json.loads(input_json_raw)
-            diagnostics_lines.append("JSON: parsed successfully")
         except Exception as e:
             diagnostics_lines.append(f"JSON parse error: {e}")

-    # Load selected model (safe)
     try:
         tokenizer, model = load_model(model_choice)
     except Exception as e:
-        # If model fails to load, still return diagnostics
-        diag = "\n".join(diagnostics_lines + [f"Model load failed: {e}"])
         return "", "", "", "", diag

-    # ---------- Dummy generation (replace with your real prompts) ----------
-    # Build a prompt from inputs (very basic)
     user_prompt = (
-        "You are an assistant that extracts tasks and entities.\n"
         f"Instructions: {instructions_text}\n"
         f"Context: {context_text}\n"
         "----\n"
         f"TEXT:\n{input_txt[:4000]}\n"
         "----\n"
         f"JSON:\n{json.dumps(parsed_json)[:2000]}\n"
-        "Extract:\n- Tasks list\n- Entities list\n- Cleaned text (sanitized)\n- 1-2 line summary\n"
     )

     try:
         inputs = tokenizer(user_prompt, return_tensors="pt").to(DEVICE)
         with torch.no_grad():
-            outputs = _model.generate(
                 **inputs,
                 max_new_tokens=max_new_tokens,
                 do_sample=True,
@@ -165,184 +258,195 @@ def run_extraction(
             )
         full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
-        diag = "\n".join(diagnostics_lines + [f"Inference failed: {e}"])
         return "", "", "", "", diag

-    # Very naive post-split (replace with your own structured parsing)
-    tasks_out = "• Task 1\n• Task 2\n(Replace with your parser)"
-    entities_out = "• Entity A\n• Entity B\n(Replace with your parser)"
-    cleaned_out = "Cleaned text here… (Replace with your cleaning pipeline)"
-    summary_out = "Short summary here… (Replace with your summarizer)"
-
-    diagnostics_lines.append("Generation completed successfully.")
     diagnostics = "\n".join(diagnostics_lines)

     return tasks_out, entities_out, cleaned_out, summary_out, diagnostics

 # =========================
-# UI (Gradio Blocks)
 # =========================
 THEME_CSS = """
-/* Global colors: white background, black text */
 :root {
   --body-background-fill: #ffffff !important;
   --body-text-color: #111111 !important;
-  --link-text-color: #0b63ce !important; /* blue */
-  --shadow-spread: 0px;
-}
-
-/* Ensure all text is readable (black-ish) */
-.gradio-container, .prose, .prose * {
-  color: #111111 !important;
 }
-
-/* Accent elements in blue (no purple) */
-label, .tabitem .label-wrap, .wrap .label-wrap {
-  color: #0b63ce !important;
-}
-
-/* Cards / Boxes */
-.gr-box, .gr-panel, .gr-group, .gr-accordion {
-  border: 1px solid #e5e7eb !important; /* light gray border */
-  border-radius: 14px !important;
-}
-
-/* Red run button */
 button#run-btn {
   background: #e11900 !important;
-  color: #ffffff !important;
   border: 1px solid #b50f00 !important;
 }
-button#run-btn:hover {
-  filter: brightness(0.95);
-}
-
-/* Inputs layout polish */
-.input-card {
-  padding: 10px;
-}
 """

 def build_interface() -> gr.Blocks:
     with gr.Blocks(title="Talk2Task Demo", css=THEME_CSS) as demo:
-        # 1) MODEL SELECTION (full width) + checklist embedded
         with gr.Group():
-            gr.Markdown("### Model & Parameters", elem_id="model-header")
-            with gr.Row(equal_height=True):
                 model_choice = gr.Dropdown(
                     label="Model",
-                    choices=[
-                        DEFAULT_MODEL_ID,
-                        "mistralai/Mistral-7B-Instruct-v0.2",
-                        "meta-llama/Llama-3.1-8B-Instruct",  # if accessible
-                    ],
                     value=DEFAULT_MODEL_ID,
-                    scale=3
                 )
                 params_checked = gr.CheckboxGroup(
                     label="Options",
                     choices=[
                         "Default cleaning",
                         "Remove PII",
-                        "Allow 4-bit (if available)",
                         "Detect language",
                     ],
-                    value=["Default cleaning"],
-                    scale=2
                 )
             with gr.Row():
-                # generation controls (kept compact)
                 temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="Temperature")
                 top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
-                max_new_tokens = gr.Slider(32, 1024, value=200, step=8, label="Max new tokens")

-        # 2) SINGLE “BOX” PER TYPE via Tabs (Paste OR Drag & Drop) — side-by-side
-        gr.Markdown("### Input", elem_id="input-header")
         with gr.Row(equal_height=True):
-            with gr.Group(elem_classes=["input-card"]):
-                gr.Markdown("**TXT Input** (Paste or Drag & Drop)", elem_id="txt-box-title")
                 with gr.Tabs():
                     with gr.TabItem("Paste"):
-                        txt_paste = gr.TextArea(
-                            label="Paste TXT",
-                            placeholder="Paste raw transcript or text here...",
-                            lines=12,
-                        )
-                    with gr.TabItem("Drag & Drop"):
-                        txt_upload = gr.File(
-                            label="Upload .txt file",
-                            file_types=[".txt"],
-                        )
-
-            with gr.Group(elem_classes=["input-card"]):
-                gr.Markdown("**JSON Input** (Paste or Drag & Drop)", elem_id="json-box-title")
                 with gr.Tabs():
                     with gr.TabItem("Paste"):
-                        json_paste = gr.Code(
-                            label="Paste JSON",
-                            language="json",
-                            value="{\n  \"example\": true\n}",
-                            lines=12,
-                        )
-                    with gr.TabItem("Drag & Drop"):
-                        json_upload = gr.File(
-                            label="Upload .json file",
-                            file_types=[".json"],
-                        )
-
-        # 3) RUN BUTTON (red), then collapsible Instructions & Context
-        run_btn = gr.Button("Run Extraction", elem_id="run-btn", variant="primary")

         with gr.Row():
             with gr.Accordion("Instructions (editable)", open=False):
                 instructions_text = gr.TextArea(
-                    label="Instructions",
                     value=(
-                        "Extract tasks, entities, and a short summary. "
-                        "Apply default cleaning unless unchecked."
                     ),
-                    lines=5,
                 )
             with gr.Accordion("Context (editable)", open=False):
                 context_text = gr.TextArea(
-                    label="Context",
                     value=(
-                        "Use banking/consulting context if relevant. "
-                        "Prefer concise actionable phrasing."
                     ),
-                    lines=5,
                 )

-        # 4) OUTPUT LAYOUT — symmetrical boxes
-        gr.Markdown("### Results", elem_id="results-header")
         with gr.Row(equal_height=True):
-            tasks_out = gr.TextArea(label="Tasks", lines=10)
-            entities_out = gr.TextArea(label="Entities", lines=10)
         with gr.Row(equal_height=True):
-            cleaned_out = gr.TextArea(label="Cleaned Text", lines=10)
-            summary_out = gr.TextArea(label="Summary", lines=10)

-        gr.Markdown("### Diagnostics", elem_id="diagnostics-header")
-        diagnostics = gr.TextArea(label="Diagnostics / Logs", lines=10)

-        # Wire up button
         run_inputs = [
             model_choice, params_checked, instructions_text, context_text,
             txt_paste, txt_upload, json_paste, json_upload,
-            max_new_tokens, temperature, top_p
         ]
         run_outputs = [tasks_out, entities_out, cleaned_out, summary_out, diagnostics]
-
-        run_btn.click(
-            fn=run_extraction,
-            inputs=run_inputs,
-            outputs=run_outputs
-        )

     return demo

 demo = build_interface()

 if __name__ == "__main__":
-    # Let Gradio/Spaces choose host & port; this keeps local runs easy too.
     demo.launch()
 
app.py (new version, full file; added lines marked "+", elided unchanged regions marked "…"):

 import os
 import json
+from typing import Optional, Tuple, Dict, Any, List
+
 import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from langdetect import detect, DetectorFactory
+
+# Make langdetect deterministic
+DetectorFactory.seed = 7

 # =========================
+# Challenge: allowed labels (from UBS repo)
+# =========================
+# Source: GitHub repo "From-Talk-to-Task-Insights-from-Client-Conversations"
+ALLOWED_LABELS = [
+    "plan_contact",
+    "schedule_meeting",
+    "update_contact_info_non_postal",
+    "update_contact_info_postal_address",
+    "update_kyc_activity",
+    "update_kyc_origin_of_assets",
+    "update_kyc_purpose_of_businessrelation",
+    "update_kyc_total_assets",
+]
+
 # =========================
+# Models / Defaults
+# =========================
+DEFAULT_MODEL_ID = os.environ.get("MODEL_ID", "Apertus/Apertus-8B")
+SUPPORTED_MODELS = [
+    "Apertus/Apertus-8B",
+    "meta-llama/Meta-Llama-3-8B-Instruct",
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+]
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

 def _has_bnb_and_cuda() -> bool:
+    if DEVICE != "cuda":
         return False
     try:
         import bitsandbytes as _bnb  # noqa: F401
…
         return False

 USE_BNB = _has_bnb_and_cuda()

 # =========================
+# Model cache
 # =========================
 _tokenizer: Optional[AutoTokenizer] = None
 _model: Optional[AutoModelForCausalLM] = None
…

 def load_model(model_id: str) -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
     """
+    Loads (or reuses) a model/tokenizer.
+    Uses bitsandbytes 4-bit only if CUDA + bnb available; otherwise standard load.
     """
     global _tokenizer, _model, _current_model_id
…
     _tokenizer, _model, _current_model_id = tokenizer, model, model_id
     return tokenizer, model

+# =========================
+# Helpers
+# =========================
 def read_file(file_obj: Optional[gr.File]) -> Optional[str]:
     if not file_obj:
         return None
…
         return None

 def normalize_txt_input(paste_txt: str, upload_file: Optional[gr.File]) -> str:
+    return paste_txt.strip() if (paste_txt and paste_txt.strip()) else (read_file(upload_file) or "")

 def normalize_json_input(paste_json: str, upload_file: Optional[gr.File]) -> str:
+    if paste_json and paste_json.strip():
+        return paste_json
+    return read_file(upload_file) or ""
+
+def safe_lang_detect(text: str) -> str:
+    try:
+        if not text or not text.strip():
+            return "unknown"
+        return detect(text)
+    except Exception:
+        return "unknown"
+
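# Example (illustrative, not part of the commit): because DetectorFactory.seed
# is fixed above, langdetect gives stable results across runs. A call such as
# safe_lang_detect("Bonjour, comment allez-vous ?") should return "fr", while
# empty or undetectable input falls back to "unknown".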
+def count_tokens(tokenizer: AutoTokenizer, text: str) -> int:
+    try:
+        return len(tokenizer(text, return_tensors=None).get("input_ids", []))
+    except Exception:
+        # Fallback rough estimate if tokenizer path fails
+        return max(1, len(text.split()))
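
# Worked example (not part of the commit): count_tokens feeds the cost estimate
# in run_extraction, est_cost = (total_tokens / 1000.0) * usd_per_1k_tokens.
# At the default $0.002 per 1k tokens, 1,500 total tokens come to
# 1.5 * 0.002 = $0.003.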

 # =========================
+# Evaluation function (from repo)
+# =========================
+# Source: UBS GitHub README "Evaluation" snippet (weighted FN/FP, custom penalties)
+def evaluate_predictions(y_true: List[List[str]], y_pred: List[List[str]]) -> float:
+    import numpy as np
+
+    LABEL_TO_IDX = {label: idx for idx, label in enumerate(ALLOWED_LABELS)}
+    FN_PENALTY = 2.0
+    FP_PENALTY = 1.0
+
+    if len(y_true) != len(y_pred):
+        raise ValueError(f"y_true and y_pred must have same length. Got {len(y_true)} vs {len(y_pred)}")
+
+    n_samples = len(y_true)
+    n_labels = len(ALLOWED_LABELS)
+
+    y_true_binary = np.zeros((n_samples, n_labels), dtype=int)
+    y_pred_binary = np.zeros((n_samples, n_labels), dtype=int)
+
+    def _process(sample_labels: List[str], sample_name: str) -> List[str]:
+        if not isinstance(sample_labels, list):
+            raise ValueError(f"{sample_name} must be a list of strings, got {type(sample_labels)}")
+        seen = set()
+        valid = []
+        for lbl in sample_labels:
+            if not isinstance(lbl, str):
+                raise ValueError(f"{sample_name} contains non-string label: {lbl}")
+            if lbl in seen:
+                raise ValueError(f"{sample_name} contains duplicate label: '{lbl}'")
+            seen.add(lbl)
+            if lbl not in ALLOWED_LABELS:
+                raise ValueError(f"{sample_name} contains invalid label: '{lbl}'. Allowed: {ALLOWED_LABELS}")
+            valid.append(lbl)
+        return valid
+
+    for i, lbls in enumerate(y_true):
+        for lbl in _process(lbls, f"y_true[{i}]"):
+            y_true_binary[i, LABEL_TO_IDX[lbl]] = 1
+
+    for i, lbls in enumerate(y_pred):
+        for lbl in _process(lbls, f"y_pred[{i}]"):
+            y_pred_binary[i, LABEL_TO_IDX[lbl]] = 1
+
+    false_negatives = np.sum((y_true_binary == 1) & (y_pred_binary == 0), axis=1)
+    false_positives = np.sum((y_true_binary == 0) & (y_pred_binary == 1), axis=1)
+    weighted_errors = FN_PENALTY * false_negatives + FP_PENALTY * false_positives
+    max_errors_per_sample = FN_PENALTY * np.sum(y_true_binary, axis=1) + FP_PENALTY * (n_labels - np.sum(y_true_binary, axis=1))
+    per_sample_scores = np.where(max_errors_per_sample > 0, 1.0 - (weighted_errors / max_errors_per_sample), 1.0)
+    return float(np.mean(per_sample_scores))
+
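# Worked example (not part of the commit), using the penalties above
# (FN_PENALTY=2.0, FP_PENALTY=1.0, n_labels=8):
#
#   y_true = [["plan_contact"], ["schedule_meeting", "update_kyc_total_assets"]]
#   y_pred = [["plan_contact"], ["schedule_meeting"]]
#
# Sample 1 matches exactly (score 1.0). Sample 2 has one false negative, so its
# score is 1 - (2*1) / (2*2 + 1*6) = 0.8; evaluate_predictions returns the
# mean, 0.9.
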
+# =========================
+# Core Extraction
 # =========================
 def run_extraction(
     model_choice: str,
…
     max_new_tokens: int,
     temperature: float,
     top_p: float,
+    usd_per_1k_tokens: float,
 ) -> Tuple[str, str, str, str, str]:
     diagnostics_lines = []

+    # Resolve inputs from the unified boxes
     input_txt = normalize_txt_input(txt_paste, txt_upload)
     input_json_raw = normalize_json_input(json_paste, json_upload)

+    # Language detection & JSON parse
+    lang = safe_lang_detect(input_txt)
     parsed_json: Dict[str, Any] = {}
+    json_parse_ok = False
     if input_json_raw:
         try:
             parsed_json = json.loads(input_json_raw)
+            json_parse_ok = True
         except Exception as e:
             diagnostics_lines.append(f"JSON parse error: {e}")

+    # Load model
     try:
         tokenizer, model = load_model(model_choice)
     except Exception as e:
+        diag = "\n".join([
+            f"Model: {model_choice}",
+            f"Params: {params_checked}",
+            f"Language detected: {lang}",
+            f"TXT length: {len(input_txt)}",
+            f"JSON parsed: {json_parse_ok}",
+            f"Model load failed: {e}"
+        ])
         return "", "", "", "", diag

+    # Token counts & rough cost estimate
+    in_tokens = count_tokens(tokenizer, input_txt) + count_tokens(tokenizer, json.dumps(parsed_json) if parsed_json else "")
+    # Build multilingual-aware prompt (summary in English; extraction language-agnostic)
     user_prompt = (
+        "You analyze client-conversation transcripts.\n"
+        "Transcripts may be multilingual. Detect the language automatically. "
+        "Extract tasks and entities correctly regardless of language. "
+        "Always write the short summary in English.\n"
+        "Include only information present in the inputs; avoid hallucinations.\n"
         f"Instructions: {instructions_text}\n"
         f"Context: {context_text}\n"
         "----\n"
         f"TEXT:\n{input_txt[:4000]}\n"
         "----\n"
         f"JSON:\n{json.dumps(parsed_json)[:2000]}\n"
+        "Output:\n"
+        "- Tasks list (use allowed labels where possible)\n"
+        "- Entities list\n"
+        "- Cleaned text\n"
+        "- Short summary (English)\n"
     )
+    prompt_tokens = count_tokens(tokenizer, user_prompt)

     try:
         inputs = tokenizer(user_prompt, return_tensors="pt").to(DEVICE)
         with torch.no_grad():
+            outputs = model.generate(
                 **inputs,
                 max_new_tokens=max_new_tokens,
                 do_sample=True,
…
             )
         full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
+        diag = "\n".join([
+            f"Model: {model_choice}",
+            f"Params: {params_checked}",
+            f"Language detected: {lang}",
+            f"TXT length: {len(input_txt)}",
+            f"JSON parsed: {json_parse_ok}",
+            f"Inference failed: {e}"
+        ])
         return "", "", "", "", diag

+    # (Replace this with your structured parser that maps to ALLOWED_LABELS)
+    # For now, placeholders to keep UI working:
+    tasks_out = "• plan_contact\n• schedule_meeting"
+ tasks_out = "• plan_contact\n• schedule_meeting"
274
+ entities_out = " Client: John Doe\n• Product: Mortgage"
275
+ cleaned_out = "Cleaned transcript text here…"
276
+ summary_out = "A short English summary of the conversation."
277
+
278
+ # Output token count and cost
279
+ out_tokens = count_tokens(tokenizer, full_text)
280
+ total_tokens = in_tokens + prompt_tokens + out_tokens
281
+ est_cost = (total_tokens / 1000.0) * max(0.0, float(usd_per_1k_tokens))
282
+
283
+ diagnostics_lines.extend([
284
+ f"Model: {model_choice}",
285
+ f"Params: {params_checked}",
286
+ f"Language detected: {lang}",
287
+ f"TXT length: {len(input_txt)}",
288
+ f"JSON parsed: {json_parse_ok}",
289
+ f"Input tokens (txt+json): {in_tokens}",
290
+ f"Prompt tokens: {prompt_tokens}",
291
+ f"Output tokens: {out_tokens}",
292
+ f"Total tokens (approx): {total_tokens}",
293
+ f"Est. cost @ ${usd_per_1k_tokens:.4f}/1k toks: ${est_cost:.6f}",
294
+ "Generation completed successfully.",
295
+ ])
296
  diagnostics = "\n".join(diagnostics_lines)
297
 
298
  return tasks_out, entities_out, cleaned_out, summary_out, diagnostics
299
 
300
  # =========================
301
+ # Evaluation handler (JSON arrays or files)
302
+ # =========================
303
+ def evaluate_ui(y_true_text: str, y_true_file: Optional[gr.File], y_pred_text: str, y_pred_file: Optional[gr.File]) -> str:
304
+ """
305
+ Accepts pasted JSON (e.g., [["plan_contact"], ["schedule_meeting", ...], ...])
306
+ or uploaded .json files for y_true and y_pred. Returns the score or an error.
307
+ """
308
+ def _load_json(text: str, file_obj: Optional[gr.File]) -> Any:
309
+ if text and text.strip():
310
+ return json.loads(text)
311
+ ftxt = read_file(file_obj)
312
+ if ftxt:
313
+ return json.loads(ftxt)
314
+ raise ValueError("Missing JSON input")
315
+
316
+ try:
317
+ y_true = _load_json(y_true_text, y_true_file)
318
+ y_pred = _load_json(y_pred_text, y_pred_file)
319
+ score = evaluate_predictions(y_true, y_pred)
320
+ return f"Evaluation score: {score:.4f} (higher is better; weighted FN>FP)"
321
+ except Exception as e:
322
+ return f"Evaluation error: {e}"
323
+
324
+ # =========================
325
+ # UI Styling (black text on white; blue accents; red Run)
326
  # =========================
327
  THEME_CSS = """
 
328
  :root {
329
  --body-background-fill: #ffffff !important;
330
  --body-text-color: #111111 !important;
331
+ --link-text-color: #0b63ce !important;
 
 
 
 
 
 
332
  }
333
+ .gradio-container, .prose, .prose * { color: #111111 !important; }
334
+ label { color: #0b63ce !important; }
 
 
 
 
 
 
 
 
 
 
 
335
  button#run-btn {
336
  background: #e11900 !important;
337
+ color: #fff !important;
338
  border: 1px solid #b50f00 !important;
339
  }
 
 
 
 
 
 
 
 
340
  """
341
 
342
+ # =========================
343
+ # UI Layout
344
+ # =========================
345
  def build_interface() -> gr.Blocks:
346
  with gr.Blocks(title="Talk2Task Demo", css=THEME_CSS) as demo:
347
+ # Model selection (full width) with checklist + sliders + price input
348
  with gr.Group():
349
+ gr.Markdown("### Model & Parameters")
350
+ with gr.Row():
351
  model_choice = gr.Dropdown(
352
  label="Model",
353
+ choices=SUPPORTED_MODELS,
 
 
 
 
354
  value=DEFAULT_MODEL_ID,
355
+ scale=3,
356
  )
357
  params_checked = gr.CheckboxGroup(
358
  label="Options",
359
  choices=[
360
  "Default cleaning",
361
  "Remove PII",
 
362
  "Detect language",
363
+ "Use 4-bit if available",
364
  ],
365
+ value=["Default cleaning", "Detect language"],
366
+ scale=2,
367
  )
368
  with gr.Row():
369
+ max_new_tokens = gr.Slider(64, 1024, value=200, step=16, label="Max new tokens")
370
  temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="Temperature")
371
  top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
372
+ usd_per_1k_tokens = gr.Number(value=0.002, label="Est. $ per 1k tokens (edit)")
373
 
374
+ # Single boxes for TXT and JSON via Tabs (left/right)
375
+ gr.Markdown("### Input")
376
  with gr.Row(equal_height=True):
377
+ with gr.Group():
378
+ gr.Markdown("**TXT Input** (Paste or Upload)")
379
  with gr.Tabs():
380
  with gr.TabItem("Paste"):
381
+ txt_paste = gr.TextArea(label="Paste TXT", lines=12, placeholder="Paste transcript here (any language)…")
382
+ with gr.TabItem("Upload"):
383
+ txt_upload = gr.File(label="Upload TXT", file_types=[".txt"])
384
+ with gr.Group():
385
+ gr.Markdown("**JSON Input** (Paste or Upload)")
 
 
 
 
 
 
 
 
386
  with gr.Tabs():
387
  with gr.TabItem("Paste"):
388
+ json_paste = gr.Code(label="Paste JSON", language="json", value="{\n \"example\": true\n}", lines=12)
389
+ with gr.TabItem("Upload"):
390
+ json_upload = gr.File(label="Upload JSON", file_types=[".json"])
 
 
 
 
 
 
 
 
 
 
 
391
 
392
+ # Red run button
393
+ run_btn = gr.Button("Run Extraction", elem_id="run-btn")
394
+
395
+ # Collapsible instructions/context (defaults aligned to brief)
396
  with gr.Row():
397
  with gr.Accordion("Instructions (editable)", open=False):
398
  instructions_text = gr.TextArea(
 
399
  value=(
400
+ "Extract key tasks (use allowed labels when applicable), entities, cleaned text, and a short summary.\n"
401
+ "Be robust to noisy or incomplete data. Avoid hallucinations."
402
  ),
403
+ lines=5
404
  )
405
  with gr.Accordion("Context (editable)", open=False):
406
  context_text = gr.TextArea(
 
407
  value=(
408
+ "Client-advisor banking context. Assume transcripts may include multiple languages; "
409
+ "summaries must be in English."
410
  ),
411
+ lines=5
412
  )
413
 
414
+ # Outputs (symmetrical)
415
+ gr.Markdown("### Results")
416
  with gr.Row(equal_height=True):
417
+ tasks_out = gr.TextArea(label="Tasks", lines=8)
418
+ entities_out = gr.TextArea(label="Entities", lines=8)
419
  with gr.Row(equal_height=True):
420
+ cleaned_out = gr.TextArea(label="Cleaned Text", lines=8)
421
+ summary_out = gr.TextArea(label="Summary (English)", lines=8)
422
+
423
+ gr.Markdown("### Diagnostics / Metrics")
424
+ diagnostics = gr.TextArea(label="Diagnostics", lines=12)
425
 
426
+ # Evaluation accordion (cost-accuracy comparison support)
427
+ with gr.Accordion("Evaluation (paste or upload y_true / y_pred arrays)", open=False):
428
+ with gr.Row():
429
+ y_true_text = gr.Code(label="y_true (JSON)", language="json", lines=10)
430
+ y_pred_text = gr.Code(label="y_pred (JSON)", language="json", lines=10)
431
+ with gr.Row():
432
+ y_true_file = gr.File(label="Upload y_true.json", file_types=[".json"])
433
+ y_pred_file = gr.File(label="Upload y_pred.json", file_types=[".json"])
434
+ eval_btn = gr.Button("Compute Official Score")
435
+ eval_result = gr.Textbox(label="Evaluation Result")
436
+ eval_btn.click(evaluate_ui, inputs=[y_true_text, y_true_file, y_pred_text, y_pred_file], outputs=eval_result)
437
 
438
+ # Wire main run
439
  run_inputs = [
440
  model_choice, params_checked, instructions_text, context_text,
441
  txt_paste, txt_upload, json_paste, json_upload,
442
+ max_new_tokens, temperature, top_p, usd_per_1k_tokens
443
  ]
444
  run_outputs = [tasks_out, entities_out, cleaned_out, summary_out, diagnostics]
445
+ run_btn.click(fn=run_extraction, inputs=run_inputs, outputs=run_outputs)
 
 
 
 
 
446
 
447
  return demo
448
 
449
  demo = build_interface()
450
 
451
  if __name__ == "__main__":
 
452
  demo.launch()
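
The commit leaves the structured parser as a placeholder ("Replace this with
your structured parser that maps to ALLOWED_LABELS"). A minimal sketch, assuming
the model is prompted to echo allowed label names verbatim (parse_labels is a
hypothetical helper, not part of this commit; it relies on the module's
ALLOWED_LABELS and the typing import already present):

def parse_labels(generated_text: str) -> List[str]:
    # Keep each allowed label at most once and in a fixed order;
    # evaluate_predictions rejects duplicates and unknown labels.
    return [lbl for lbl in ALLOWED_LABELS if lbl in generated_text]

Constraining predictions to ALLOWED_LABELS this way keeps them scoreable by
evaluate_predictions, which raises on any out-of-vocabulary or duplicate label.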