Spaces:
Runtime error
Update app.py
app.py
CHANGED
Old version (lines removed by this commit are prefixed with "-"):

@@ -1,9 +1,31 @@
  import spaces
  import gradio as gr
- from transformers import AutoTokenizer, AutoModelForCausalLM
  import torch
  from datetime import datetime
  import os

  # --- Configuration ---
  # Updated model ID
@@ -13,20 +35,20 @@ model_link = f"https://huggingface.co/{model_id}"
  website_link = "https://tesslate.com"
  discord_link = "https://discord.gg/DkzMzwBTaw"

- # --- Text Content ---
  Title = f"""
  <div style="text-align: center; margin-bottom: 20px;">
      <img src="https://huggingface.co/Tesslate/Tessa-T1-14B/resolve/main/tesslate_logo_color.png?download=true" alt="Tesslate Logo" style="height: 80px; margin-bottom: 10px;">
      <h1 style="margin-bottom: 5px;">π Welcome to the Tessa-T1-14B Demo π</h1>
      <p style="font-size: 1.1em;">Experience the power of specialized React reasoning!</p>
-     <p>Model by <a href="{creator_link}" target="_blank">TesslateAI</a> | <a href="{model_link}" target="_blank">View on Hugging Face</a
  </div>
  """

  description = f"""
  Interact with **[{model_id}]({model_link})**, an innovative 14B parameter transformer model fine-tuned from Qwen2.5-Coder-14B-Instruct.
  Tessa-T1 specializes in **React frontend development**, leveraging advanced reasoning to autonomously generate well-structured, semantic React components.
-
  """

  about_tesslate = f"""
@@ -72,16 +94,16 @@ join_us = f"""
  # --- Model and Tokenizer Loading ---
  device = "cuda" if torch.cuda.is_available() else "cpu"
  print(f"Using device: {device}")

  # Get the token from environment variables
  hf_token = os.getenv('HF_TOKEN') # Standard env var name for HF token
  if not hf_token:
-     # Try to load from Hugging Face login if available, otherwise raise error
      try:
-
-         hf_token = HfFolder.get_token() # Use HfFolder to get token saved by login
          if not hf_token:
-             # If still not found, try HfApi (less common for user login token)
              hf_token = HfApi().token
          if not hf_token:
              raise ValueError("HF token not found. Please set HF_TOKEN env var or login via `huggingface-cli login`.")
@@ -92,28 +114,65 @@ if not hf_token:
          raise ValueError(f"HF token acquisition failed. Please set the HF_TOKEN environment variable or login via `huggingface-cli login`. Error: {e}")

  print(f"Loading Tokenizer: {model_id}")
- # Initialize tokenizer and model with token authentication
- # trust_remote_code=True is necessary for models with custom code (like Qwen2)
  tokenizer = AutoTokenizer.from_pretrained(
      model_id,
      token=hf_token,
      trust_remote_code=True
  )

- print(f"Loading Model: {model_id}")
- #
  [... the remaining removed lines of the original model-loading call were not captured by the diff view ...]
- )
- print("Model loaded successfully.")

-

  try:
      config_json = model.config.to_dict()
      model_config_info = f"""
  **Model Type:** {config_json.get('model_type', 'N/A')}
  **Architecture:** {config_json.get('architectures', ['N/A'])[0]}
@@ -122,13 +181,16 @@ try:
  **Num Hidden Layers:** {config_json.get('num_hidden_layers', 'N/A')}
  **Num Attention Heads:** {config_json.get('num_attention_heads', 'N/A')}
  **Max Position Embeddings:** {config_json.get('max_position_embeddings', 'N/A')}
- **
  """
  except Exception as e:
-     print(f"Could not retrieve model config: {e}")
-     model_config_info = f"**Error:** Could not load config for {model_id}.

  # --- Helper Function for Tokenizer Info ---
  def format_tokenizer_info(tokenizer_instance):
      try:
          info = [
@@ -152,76 +214,69 @@ def format_tokenizer_info(tokenizer_instance):

  tokenizer_info = format_tokenizer_info(tokenizer)

  # --- Generation Function ---
- @spaces.GPU(duration=180) #
  def generate_response(system_prompt, user_prompt, temperature, max_new_tokens, top_p, repetition_penalty, top_k, min_p):
-     #
-     #
-     #

-     # Use the tokenizer's chat template (Recommended for Qwen2 based models)
      messages = []
      if system_prompt and system_prompt.strip():
-         # Qwen2 template might prefer system prompt directly or integrated differently.
-         # Using the standard 'system' role here, assuming tokenizer handles it.
          messages.append({"role": "system", "content": system_prompt})
      messages.append({"role": "user", "content": user_prompt})

      try:
-         # Let the tokenizer handle the template - crucial for models like Qwen2
          full_prompt = tokenizer.apply_chat_template(
              messages,
              tokenize=False,
-             add_generation_prompt=True
          )
-         print("Applied tokenizer's chat template.")
      except Exception as e:
-         # Fallback only if template application fails catastrophically
          print(f"Warning: Could not use apply_chat_template (Error: {e}). Falling back to basic format. This might degrade performance.")
          prompt_parts = []
          if system_prompt and system_prompt.strip():
              prompt_parts.append(f"System: {system_prompt}")
          prompt_parts.append(f"\nUser: {user_prompt}")
-         prompt_parts.append("\nAssistant:")
          full_prompt = "\n".join(prompt_parts)

-     print(f"\n--- Generating ---")
-     # print(f"Prompt:\n{full_prompt}")
-     print(f"Params: Temp={temperature}, TopK={top_k}, TopP={top_p}, RepPen={repetition_penalty}, MaxNew={max_new_tokens}, MinP={min_p} (MinP ignored
-     print("-" * 20)

-     inputs

-     # Generation arguments
      generation_kwargs = dict(
          **inputs,
          max_new_tokens=int(max_new_tokens),
-         temperature=float(temperature) if float(temperature) > 0 else None,
          top_p=float(top_p),
          top_k=int(top_k),
          repetition_penalty=float(repetition_penalty),
          do_sample=True if float(temperature) > 0 else False,
-         pad_token_id=tokenizer.eos_token_id,
          eos_token_id=tokenizer.eos_token_id
-         # min_p cannot be directly passed here.
      )

-     if temperature == 0:
          generation_kwargs.pop('top_p', None)
          generation_kwargs.pop('top_k', None)
          generation_kwargs['do_sample'] = False

-
-     # Generate response
      with torch.inference_mode():
          outputs = model.generate(**generation_kwargs)

-     # Decode response, skipping special tokens and the input prompt part
      input_length = inputs['input_ids'].shape[1]
      generated_tokens = outputs[0][input_length:]
      response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

-     print(f"--- Response ---\n{response}\n---------------\n")
      return response.strip()

  # --- Gradio Interface ---
@@ -231,7 +286,6 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs

      with gr.Row():
          with gr.Column(scale=3):
-             # Main Interaction Area
              with gr.Group():
                  system_prompt = gr.Textbox(
                      label="System Prompt (Persona & Instructions)",
@@ -247,6 +301,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs

              with gr.Accordion("π οΈ Generation Parameters", open=True):
                  with gr.Row():
                      temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="π‘οΈ Temperature", info="Controls randomness. 0 = deterministic, >0 = random.")
                      max_new_tokens = gr.Slider(minimum=64, maximum=4096, value=1024, step=32, label="π Max New Tokens", info="Max length of the generated response.")
                  with gr.Row():
@@ -254,29 +309,27 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
                      top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="π Top-p (nucleus)", info="Sample from tokens with cumulative probability >= top_p.")
                  with gr.Row():
                      repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.01, label="π¦ Repetition Penalty", info="Penalizes repeating tokens ( > 1).")
-                     # Add min_p slider, but note it's not used in backend currently
                      min_p = gr.Slider(minimum=0.0, maximum=0.5, value=0.05, step=0.01, label="π Min-p (Not Active)", info="Filters tokens below this probability threshold (Requires custom logic - currently ignored).")

              generate_btn = gr.Button("π Generate Response", variant="primary", size="lg")

          with gr.Column(scale=2):
-
              output = gr.Code(
-                 label=f"π Tessa-T1-14B Output",
-                 language="markdown",
                  lines=25,
-                 show_copy_button=True,
              )

-             # Model & Tokenizer Info in an Accordion
              with gr.Accordion("βοΈ Model & Tokenizer Details", open=False):
                  gr.Markdown("### Model Configuration")
-                 gr.Markdown(model_config_info)
                  gr.Markdown("---")
                  gr.Markdown("### Tokenizer Configuration")
                  gr.Markdown(tokenizer_info)

-
      # About Tesslate Section
      with gr.Row():
          with gr.Accordion("π‘ About Tesslate & Our Mission", open=False):
@@ -285,25 +338,19 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
      # Links Section
      gr.Markdown(join_us)

-     # Examples (
      gr.Examples(
          examples=[
-             # [system_prompt, user_prompt, temperature, max_tokens, top_p, rep_penalty, top_k, min_p]
              [
                  "You are Tessa, an expert AI assistant specialized in React development.",
                  "Create a simple React functional component for a button that alerts 'Hello!' when clicked.",
-                 0.
              ],
              [
                  "You are Tessa, an expert AI assistant specialized in React development.",
                  "Explain the difference between `useState` and `useEffect` hooks in React with simple examples.",
                  0.7, 1024, 0.95, 1.1, 40, 0.05
              ],
-             [
-                 "You are a helpful AI assistant.",
-                 "Write a short explanation of how React's reconciliation algorithm works.",
-                 0.6, 768, 0.9, 1.15, 50, 0.05
-             ],
              [
                  "You are Tessa, an expert AI assistant specialized in React development. Use Tailwind CSS for styling.",
                  "Generate a React component for a responsive card with an image, title, and description, using Tailwind CSS classes.",
@@ -312,7 +359,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
              [
                  "You are a helpful AI assistant.",
                  "What are the pros and cons of using Next.js compared to Create React App?",
-                 0.8, 1024, 0.98, 1.05, 60, 0.05
              ]
          ],
          inputs=[
@@ -323,7 +370,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
              top_p,
              repetition_penalty,
              top_k,
-             min_p
          ],
          outputs=output,
          label="β¨ Example Prompts (Click to Load)"
@@ -339,6 +386,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs

  # Launch the demo
  if __name__ == "__main__":
-     #
-     #
      demo.queue().launch(debug=True, share=False) # Set share=True if deploying on HF Spaces
|
| 1 |
import spaces
|
| 2 |
import gradio as gr
|
|
|
|
| 3 |
import torch
|
| 4 |
from datetime import datetime
|
| 5 |
import os
|
| 6 |
+
import subprocess # For Flash Attention install
|
| 7 |
+
|
| 8 |
+
# --- Install Flash Attention (specific method for compatibility) ---
|
| 9 |
+
# This method attempts to install flash-attn without building CUDA extensions locally,
|
| 10 |
+
# which can be helpful in restricted environments like ZeroGPU or when build tools are missing.
|
| 11 |
+
print("Attempting to install Flash Attention 2...")
|
| 12 |
+
try:
|
| 13 |
+
subprocess.run(
|
| 14 |
+
'pip install flash-attn --no-build-isolation',
|
| 15 |
+
env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
|
| 16 |
+
shell=True,
|
| 17 |
+
check=True # Raise an error if the command fails
|
| 18 |
+
)
|
| 19 |
+
print("Flash Attention installed successfully using subprocess method.")
|
| 20 |
+
_flash_attn_2_available = True
|
| 21 |
+
except Exception as e:
|
| 22 |
+
print(f"Could not install Flash Attention 2 using subprocess: {e}")
|
| 23 |
+
print("Proceeding without Flash Attention 2. Performance may be impacted.")
|
| 24 |
+
_flash_attn_2_available = False
|
| 25 |
+
|
| 26 |
+
# --- Import Transformers AFTER potential install ---
|
| 27 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
| 28 |
+
from huggingface_hub import HfApi, HfFolder
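Note on the install step above: passing a bare `env` dict to `subprocess.run` replaces the child's entire environment rather than extending it, and pip's exit code alone does not prove the wheel is importable on the running hardware. A minimal defensive variant is sketched below; the helper name is hypothetical and this is not part of the commit.

```python
# Sketch only (hypothetical helper, not in the committed app.py).
import importlib.util
import os
import subprocess

def try_install_flash_attn() -> bool:
    """Install flash-attn while preserving the parent environment, then verify the import."""
    try:
        subprocess.run(
            "pip install flash-attn --no-build-isolation",
            env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},  # keep PATH etc.
            shell=True,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        print(f"flash-attn install failed: {exc}")
        return False
    # Report success only if the package can actually be found by the import machinery.
    return importlib.util.find_spec("flash_attn") is not None
```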

  # --- Configuration ---
  # Updated model ID
  ...
  website_link = "https://tesslate.com"
  discord_link = "https://discord.gg/DkzMzwBTaw"

+ # --- Text Content (Keep the cool UI elements) ---
  Title = f"""
  <div style="text-align: center; margin-bottom: 20px;">
      <img src="https://huggingface.co/Tesslate/Tessa-T1-14B/resolve/main/tesslate_logo_color.png?download=true" alt="Tesslate Logo" style="height: 80px; margin-bottom: 10px;">
      <h1 style="margin-bottom: 5px;">π Welcome to the Tessa-T1-14B Demo π</h1>
      <p style="font-size: 1.1em;">Experience the power of specialized React reasoning!</p>
+     <p>Model by <a href="{creator_link}" target="_blank">TesslateAI</a> | <a href="{model_link}" target="_blank">View on Hugging Face</a> | Running with 8-bit Quantization</p>
  </div>
  """

  description = f"""
  Interact with **[{model_id}]({model_link})**, an innovative 14B parameter transformer model fine-tuned from Qwen2.5-Coder-14B-Instruct.
  Tessa-T1 specializes in **React frontend development**, leveraging advanced reasoning to autonomously generate well-structured, semantic React components.
+ This demo uses **8-bit quantization** via `bitsandbytes` for reduced memory footprint. **Flash Attention 2** is enabled if available for potentially faster inference.
  """

  about_tesslate = f"""
  ...

  # --- Model and Tokenizer Loading ---
  device = "cuda" if torch.cuda.is_available() else "cpu"
  print(f"Using device: {device}")
+ if device == "cpu":
+     print("Warning: Running on CPU. Quantization and Flash Attention require CUDA.")
+     _flash_attn_2_available = False # Cannot use flash attn on CPU

  # Get the token from environment variables
  hf_token = os.getenv('HF_TOKEN') # Standard env var name for HF token
  if not hf_token:
      try:
+         hf_token = HfFolder.get_token()
          if not hf_token:
              hf_token = HfApi().token
          if not hf_token:
              raise ValueError("HF token not found. Please set HF_TOKEN env var or login via `huggingface-cli login`.")
  ...
          raise ValueError(f"HF token acquisition failed. Please set the HF_TOKEN environment variable or login via `huggingface-cli login`. Error: {e}")

  print(f"Loading Tokenizer: {model_id}")
  tokenizer = AutoTokenizer.from_pretrained(
      model_id,
      token=hf_token,
      trust_remote_code=True
  )

+ print(f"Loading Model: {model_id} with 8-bit quantization")
+ # Define quantization configuration
+ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+
+ # Determine attn_implementation based on install success and device
+ attn_implementation = "flash_attention_2" if _flash_attn_2_available and device == "cuda" else "sdpa" # sdpa is a fallback
+ print(f"Using attention implementation: {attn_implementation}")
+ # Note: You might see a warning from bitsandbytes about library paths on ZeroGPU, this is often normal.

+ try:
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         token=hf_token,
+         device_map="auto", # Automatically distributes layers, crucial for large quantized models
+         quantization_config=quantization_config,
+         attn_implementation=attn_implementation, # Enable Flash Attention 2 if available
+         trust_remote_code=True
+     )
+     print("Model loaded successfully with 8-bit quantization.")
+ except ImportError as e:
+     print(f"ImportError during model loading: {e}")
+     print("Ensure 'bitsandbytes' and 'accelerate' are installed.")
+     # Optionally fall back to no quantization if bitsandbytes is missing,
+     # but for this request, we assume it's intended.
+     raise e
+ except Exception as e:
+     print(f"Error loading model: {e}")
+     # If Flash Attention was requested but is incompatible, Transformers might raise an error.
+     # Let's try falling back to SDPA (Scaled Dot Product Attention) if FA2 fails at load time.
+     if attn_implementation == "flash_attention_2":
+         print("Flash Attention 2 failed at load time. Trying fallback 'sdpa' attention...")
+         try:
+             attn_implementation = "sdpa"
+             model = AutoModelForCausalLM.from_pretrained(
+                 model_id,
+                 token=hf_token,
+                 device_map="auto",
+                 quantization_config=quantization_config,
+                 attn_implementation=attn_implementation,
+                 trust_remote_code=True
+             )
+             print("Model loaded successfully with 8-bit quantization and SDPA attention.")
+         except Exception as e2:
+             print(f"Fallback to SDPA attention also failed: {e2}")
+             raise e2 # Re-raise the error if fallback fails too
+     else:
+         raise e # Re-raise original error if it wasn't FA2 related
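The fallback path above repeats the full `from_pretrained` call. A small helper keeps the two attempts identical; the sketch below reuses the same arguments the commit passes and is not itself part of the commit. The 4-bit configuration in the trailing comment is an optional alternative, not something this Space enables.

```python
# Sketch: factor the duplicated from_pretrained call into one hypothetical helper.
def load_quantized_model(attn_impl: str):
    return AutoModelForCausalLM.from_pretrained(
        model_id,
        token=hf_token,
        device_map="auto",
        quantization_config=quantization_config,  # BitsAndBytesConfig(load_in_8bit=True)
        attn_implementation=attn_impl,
        trust_remote_code=True,
    )

try:
    model = load_quantized_model(attn_implementation)
except Exception:
    # Fall back to PyTorch's built-in scaled-dot-product attention.
    attn_implementation = "sdpa"
    model = load_quantized_model(attn_implementation)

# If memory is still tight, bitsandbytes also supports 4-bit NF4, e.g.
# BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
#                    bnb_4bit_compute_dtype=torch.bfloat16)
```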
+
+ # Get config info (might need adjustment based on quantized model structure)
  try:
      config_json = model.config.to_dict()
+     # Add quantization info
+     quant_info = model.config.quantization_config.to_dict() if hasattr(model.config, 'quantization_config') else {}
      model_config_info = f"""
  **Model Type:** {config_json.get('model_type', 'N/A')}
  **Architecture:** {config_json.get('architectures', ['N/A'])[0]}
  ...
  **Num Hidden Layers:** {config_json.get('num_hidden_layers', 'N/A')}
  **Num Attention Heads:** {config_json.get('num_attention_heads', 'N/A')}
  **Max Position Embeddings:** {config_json.get('max_position_embeddings', 'N/A')}
+ **Attention Implementation:** `{attn_implementation}`
+ **Quantization:** 8-bit (`load_in_8bit={quant_info.get('load_in_8bit', 'N/A')}`)
  """
  except Exception as e:
+     print(f"Could not retrieve full model config: {e}")
+     model_config_info = f"**Error:** Could not load full config details for {model_id}."
+

  # --- Helper Function for Tokenizer Info ---
+ # (Keep the existing format_tokenizer_info function - no changes needed)
  def format_tokenizer_info(tokenizer_instance):
      try:
          info = [
  ...

  tokenizer_info = format_tokenizer_info(tokenizer)
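The body of `format_tokenizer_info` is unchanged by this commit, so the diff view elides it. Purely for orientation, a helper of this shape could look like the sketch below; the fields and wording are illustrative assumptions, not the Space's actual code.

```python
# Illustrative sketch only; the real function body is not shown in this diff.
def format_tokenizer_info_sketch(tokenizer_instance):
    try:
        info = [
            f"**Tokenizer Class:** {tokenizer_instance.__class__.__name__}",
            f"**Vocab Size:** {tokenizer_instance.vocab_size}",
            f"**Model Max Length:** {tokenizer_instance.model_max_length}",
            f"**EOS Token:** `{tokenizer_instance.eos_token}`",
            f"**Padding Side:** {tokenizer_instance.padding_side}",
        ]
        return "\n".join(info)
    except Exception as e:
        return f"**Error:** Could not read tokenizer details ({e})."
```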

+
  # --- Generation Function ---
+ @spaces.GPU(duration=180) # Keep duration, can be adjusted if needed
  def generate_response(system_prompt, user_prompt, temperature, max_new_tokens, top_p, repetition_penalty, top_k, min_p):
+     # (Keep the existing generate_response function structure)
+     # It correctly uses apply_chat_template and handles generation parameters.
+     # min_p is still noted as ignored by the standard HF generate function.

      messages = []
      if system_prompt and system_prompt.strip():
          messages.append({"role": "system", "content": system_prompt})
      messages.append({"role": "user", "content": user_prompt})

      try:
          full_prompt = tokenizer.apply_chat_template(
              messages,
              tokenize=False,
+             add_generation_prompt=True
          )
+         # print("Applied tokenizer's chat template.") # Less verbose logging
      except Exception as e:
          print(f"Warning: Could not use apply_chat_template (Error: {e}). Falling back to basic format. This might degrade performance.")
          prompt_parts = []
          if system_prompt and system_prompt.strip():
              prompt_parts.append(f"System: {system_prompt}")
          prompt_parts.append(f"\nUser: {user_prompt}")
+         prompt_parts.append("\nAssistant:")
          full_prompt = "\n".join(prompt_parts)

+     # print(f"\n--- Generating ---")
+     # print(f"Prompt:\n{full_prompt}")
+     # print(f"Params: Temp={temperature}, TopK={top_k}, TopP={top_p}, RepPen={repetition_penalty}, MaxNew={max_new_tokens}, MinP={min_p} (MinP ignored)")
+     # print("-" * 20)

+     # Ensure inputs are on the correct device (handled by device_map="auto")
+     # Added truncation safeguard during tokenization
+     inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=4096).to(model.device)

      generation_kwargs = dict(
          **inputs,
          max_new_tokens=int(max_new_tokens),
+         temperature=float(temperature) if float(temperature) > 0 else None,
          top_p=float(top_p),
          top_k=int(top_k),
          repetition_penalty=float(repetition_penalty),
          do_sample=True if float(temperature) > 0 else False,
+         pad_token_id=tokenizer.eos_token_id,
          eos_token_id=tokenizer.eos_token_id
      )

+     if temperature == 0:
          generation_kwargs.pop('top_p', None)
          generation_kwargs.pop('top_k', None)
          generation_kwargs['do_sample'] = False

      with torch.inference_mode():
          outputs = model.generate(**generation_kwargs)

      input_length = inputs['input_ids'].shape[1]
      generated_tokens = outputs[0][input_length:]
      response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

+     # print(f"--- Response ---\n{response}\n---------------\n")
      return response.strip()
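`generate_response` returns the whole completion at once and still drops the `min_p` slider value. Two optional refinements, sketched here as assumptions rather than taken from the commit: `transformers.TextIteratorStreamer` can stream partial text into the `gr.Code` output (Gradio treats generator functions as streaming handlers), and sufficiently recent `transformers` releases accept `min_p` directly in `generate()`.

```python
# Sketch only: a streaming variant of the generation call (not part of this commit).
from threading import Thread
from transformers import TextIteratorStreamer

def generate_response_streaming(generation_kwargs, min_p=None):
    """Yield partial text as it is produced; assumes `model` and `tokenizer` are already loaded."""
    if min_p is not None and generation_kwargs.get("do_sample"):
        # Assumption: the installed transformers version supports the min_p sampling option.
        generation_kwargs["min_p"] = float(min_p)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs["streamer"] = streamer

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial  # Gradio updates the output component on every yield
    thread.join()
```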

  # --- Gradio Interface ---
  ...

      with gr.Row():
          with gr.Column(scale=3):
              with gr.Group():
                  system_prompt = gr.Textbox(
                      label="System Prompt (Persona & Instructions)",
  ...

              with gr.Accordion("π οΈ Generation Parameters", open=True):
                  with gr.Row():
+                     # --- Set Default Params ---
                      temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="π‘οΈ Temperature", info="Controls randomness. 0 = deterministic, >0 = random.")
                      max_new_tokens = gr.Slider(minimum=64, maximum=4096, value=1024, step=32, label="π Max New Tokens", info="Max length of the generated response.")
                  with gr.Row():
  ...
                      top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="π Top-p (nucleus)", info="Sample from tokens with cumulative probability >= top_p.")
                  with gr.Row():
                      repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.01, label="π¦ Repetition Penalty", info="Penalizes repeating tokens ( > 1).")
                      min_p = gr.Slider(minimum=0.0, maximum=0.5, value=0.05, step=0.01, label="π Min-p (Not Active)", info="Filters tokens below this probability threshold (Requires custom logic - currently ignored).")

              generate_btn = gr.Button("π Generate Response", variant="primary", size="lg")

          with gr.Column(scale=2):
+             # --- Fix: Remove show_copy_button=True ---
+             # gr.Code inherently has a copy button in modern Gradio versions
              output = gr.Code(
+                 label=f"π Tessa-T1-14B (8-bit) Output",
+                 language="markdown",
                  lines=25,
+                 # show_copy_button=True, # REMOVED - This caused the TypeError
              )
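The comment above records that passing `show_copy_button=True` to `gr.Code` raised a `TypeError` on this Space's Gradio version. If the argument should still be used wherever it is supported, one version-tolerant option is to inspect the constructor signature first; this is a sketch, not part of the commit.

```python
# Sketch: pass show_copy_button only when the installed Gradio version accepts it.
import inspect

code_kwargs = dict(label="Tessa-T1-14B (8-bit) Output", language="markdown", lines=25)
if "show_copy_button" in inspect.signature(gr.Code.__init__).parameters:
    code_kwargs["show_copy_button"] = True  # silently skipped on Gradio versions without it
output = gr.Code(**code_kwargs)
```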

              with gr.Accordion("βοΈ Model & Tokenizer Details", open=False):
                  gr.Markdown("### Model Configuration")
+                 gr.Markdown(model_config_info) # Display updated info including quantization/attn
                  gr.Markdown("---")
                  gr.Markdown("### Tokenizer Configuration")
                  gr.Markdown(tokenizer_info)

      # About Tesslate Section
      with gr.Row():
          with gr.Accordion("π‘ About Tesslate & Our Mission", open=False):
  ...
      # Links Section
      gr.Markdown(join_us)

+     # Examples (Keep the relevant examples)
      gr.Examples(
          examples=[
              [
                  "You are Tessa, an expert AI assistant specialized in React development.",
                  "Create a simple React functional component for a button that alerts 'Hello!' when clicked.",
+                 0.7, 512, 0.95, 1.1, 40, 0.05 # Default params match the sliders now
              ],
              [
                  "You are Tessa, an expert AI assistant specialized in React development.",
                  "Explain the difference between `useState` and `useEffect` hooks in React with simple examples.",
                  0.7, 1024, 0.95, 1.1, 40, 0.05
              ],
              [
                  "You are Tessa, an expert AI assistant specialized in React development. Use Tailwind CSS for styling.",
                  "Generate a React component for a responsive card with an image, title, and description, using Tailwind CSS classes.",
  ...
              [
                  "You are a helpful AI assistant.",
                  "What are the pros and cons of using Next.js compared to Create React App?",
+                 0.8, 1024, 0.98, 1.05, 60, 0.05 # Example with slightly different params
              ]
          ],
          inputs=[
  ...
              top_p,
              repetition_penalty,
              top_k,
+             min_p
          ],
          outputs=output,
          label="β¨ Example Prompts (Click to Load)"
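The lines that wire `generate_btn` to `generate_response` are unchanged by this commit and therefore elided from the diff. For orientation only, the wiring presumably looks something like the following; this is an assumption, not the Space's actual code.

```python
# Assumed wiring (the real click handler is not shown in this diff).
generate_btn.click(
    fn=generate_response,
    inputs=[system_prompt, user_prompt, temperature, max_new_tokens,
            top_p, repetition_penalty, top_k, min_p],
    outputs=output,
)
```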
  ...

  # Launch the demo
  if __name__ == "__main__":
+     # The progress bar noise during shard loading is normal output from the `transformers` library
+     # during the download/loading phase before the Gradio app starts serving.
+     # It cannot be suppressed from within this script.
      demo.queue().launch(debug=True, share=False) # Set share=True if deploying on HF Spaces
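One hedged footnote on the new comment about progress bars: in many setups the shard-download and loading bars can in fact be quieted from the script, via environment variables or the helpers below, although behaviour varies by library version.

```python
# Sketch (optional, not part of the commit): reduce download/loading progress noise.
import os

os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"  # ideally set before huggingface_hub is imported

from huggingface_hub.utils import disable_progress_bars
from transformers.utils import logging as hf_logging

disable_progress_bars()            # Hugging Face Hub download bars
hf_logging.disable_progress_bar()  # transformers shard-loading bars
```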