UIGEN-T3-4B-Demo

Runtime error

App Files Files Community

smirki committed on Apr 8

Commit

b10e890

verified ·

1 Parent(s): beed4b3

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -112

app.py CHANGED Viewed

@@ -4,17 +4,16 @@ import torch
 from datetime import datetime
 import os
 import subprocess # For Flash Attention install
 # --- Install Flash Attention (specific method for compatibility) ---
-# This method attempts to install flash-attn without building CUDA extensions locally,
-# which can be helpful in restricted environments like ZeroGPU or when build tools are missing.
 print("Attempting to install Flash Attention 2...")
 try:
     subprocess.run(
         'pip install flash-attn --no-build-isolation',
         env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
         shell=True,
-        check=True # Raise an error if the command fails
     )
     print("Flash Attention installed successfully using subprocess method.")
     _flash_attn_2_available = True
@@ -24,11 +23,10 @@ except Exception as e:
     _flash_attn_2_available = False
 # --- Import Transformers AFTER potential install ---
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from huggingface_hub import HfApi, HfFolder
 # --- Configuration ---
-# Updated model ID
 model_id = "Tesslate/Tessa-T1-14B"
 creator_link = "https://huggingface.co/TesslateAI"
 model_link = f"https://huggingface.co/{model_id}"
@@ -41,16 +39,17 @@ Title = f"""
     <img src="https://huggingface.co/Tesslate/Tessa-T1-14B/resolve/main/tesslate_logo_color.png?download=true" alt="Tesslate Logo" style="height: 80px; margin-bottom: 10px;">
     <h1 style="margin-bottom: 5px;">🚀 Welcome to the Tessa-T1-14B Demo 🚀</h1>
     <p style="font-size: 1.1em;">Experience the power of specialized React reasoning!</p>
-    <p>Model by <a href="{creator_link}" target="_blank">TesslateAI</a> | <a href="{model_link}" target="_blank">View on Hugging Face</a> | Running with 8-bit Quantization</p>
 </div>
 """
 description = f"""
 Interact with **[{model_id}]({model_link})**, an innovative 14B parameter transformer model fine-tuned from Qwen2.5-Coder-14B-Instruct.
 Tessa-T1 specializes in **React frontend development**, leveraging advanced reasoning to autonomously generate well-structured, semantic React components.
-This demo uses **8-bit quantization** via `bitsandbytes` for reduced memory footprint. **Flash Attention 2** is enabled if available for potentially faster inference.
 """
 about_tesslate = f"""
 ## About Tesslate & Our Vision
 <img src="https://huggingface.co/Tesslate/Tessa-T1-14B/resolve/main/tesslate_logo_notext.png?download=true" alt="Tesslate Icon" style="height: 40px; float: left; margin-right: 10px;">
@@ -90,88 +89,59 @@ join_us = f"""
     </a>
 </div>
 """
 # --- Model and Tokenizer Loading ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 if device == "cpu":
     print("Warning: Running on CPU. Quantization and Flash Attention require CUDA.")
-    _flash_attn_2_available = False # Cannot use flash attn on CPU
-# Get the token from environment variables
-hf_token = os.getenv('HF_TOKEN') # Standard env var name for HF token
 if not hf_token:
     try:
         hf_token = HfFolder.get_token()
-        if not hf_token:
-             hf_token = HfApi().token
-        if not hf_token:
-            raise ValueError("HF token not found. Please set HF_TOKEN env var or login via `huggingface-cli login`.")
         print("Using token from Hugging Face login.")
-    except ImportError:
-         raise ValueError("huggingface_hub not installed. Please set the HF_TOKEN environment variable or install huggingface_hub.")
     except Exception as e:
-         raise ValueError(f"HF token acquisition failed. Please set the HF_TOKEN environment variable or login via `huggingface-cli login`. Error: {e}")
 print(f"Loading Tokenizer: {model_id}")
-tokenizer = AutoTokenizer.from_pretrained(
-    model_id,
-    token=hf_token,
-    trust_remote_code=True
-)
 print(f"Loading Model: {model_id} with 8-bit quantization")
-# Define quantization configuration
 quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-# Determine attn_implementation based on install success and device
-attn_implementation = "flash_attention_2" if _flash_attn_2_available and device == "cuda" else "sdpa" # sdpa is a fallback
 print(f"Using attention implementation: {attn_implementation}")
-# Note: You might see a warning from bitsandbytes about library paths on ZeroGPU, this is often normal.
 try:
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         token=hf_token,
-        device_map="auto", # Automatically distributes layers, crucial for large quantized models
         quantization_config=quantization_config,
-        attn_implementation=attn_implementation, # Enable Flash Attention 2 if available
         trust_remote_code=True
     )
     print("Model loaded successfully with 8-bit quantization.")
-except ImportError as e:
-     print(f"ImportError during model loading: {e}")
-     print("Ensure 'bitsandbytes' and 'accelerate' are installed.")
-     # Optionally fall back to no quantization if bitsandbytes is missing,
-     # but for this request, we assume it's intended.
-     raise e
 except Exception as e:
     print(f"Error loading model: {e}")
-    # If Flash Attention was requested but is incompatible, Transformers might raise an error.
-    # Let's try falling back to SDPA (Scaled Dot Product Attention) if FA2 fails at load time.
     if attn_implementation == "flash_attention_2":
         print("Flash Attention 2 failed at load time. Trying fallback 'sdpa' attention...")
         try:
             attn_implementation = "sdpa"
             model = AutoModelForCausalLM.from_pretrained(
-                model_id,
-                token=hf_token,
-                device_map="auto",
-                quantization_config=quantization_config,
-                attn_implementation=attn_implementation,
-                trust_remote_code=True
             )
             print("Model loaded successfully with 8-bit quantization and SDPA attention.")
         except Exception as e2:
-             print(f"Fallback to SDPA attention also failed: {e2}")
-             raise e2 # Re-raise the error if fallback fails too
-    else:
-        raise e # Re-raise original error if it wasn't FA2 related
-# Get config info (might need adjustment based on quantized model structure)
 try:
     config_json = model.config.to_dict()
-    # Add quantization info
     quant_info = model.config.quantization_config.to_dict() if hasattr(model.config, 'quantization_config') else {}
     model_config_info = f"""
 **Model Type:** {config_json.get('model_type', 'N/A')}
@@ -188,9 +158,6 @@ except Exception as e:
     print(f"Could not retrieve full model config: {e}")
     model_config_info = f"**Error:** Could not load full config details for {model_id}."
-# --- Helper Function for Tokenizer Info ---
-# (Keep the existing format_tokenizer_info function - no changes needed)
 def format_tokenizer_info(tokenizer_instance):
     try:
         info = [
@@ -215,45 +182,38 @@ def format_tokenizer_info(tokenizer_instance):
 tokenizer_info = format_tokenizer_info(tokenizer)
-# --- Generation Function ---
-@spaces.GPU(duration=180) # Keep duration, can be adjusted if needed
 def generate_response(system_prompt, user_prompt, temperature, max_new_tokens, top_p, repetition_penalty, top_k, min_p):
-    # (Keep the existing generate_response function structure)
-    # It correctly uses apply_chat_template and handles generation parameters.
-    # min_p is still noted as ignored by the standard HF generate function.
     messages = []
     if system_prompt and system_prompt.strip():
         messages.append({"role": "system", "content": system_prompt})
     messages.append({"role": "user", "content": user_prompt})
     try:
-        full_prompt = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-        # print("Applied tokenizer's chat template.") # Less verbose logging
     except Exception as e:
-        print(f"Warning: Could not use apply_chat_template (Error: {e}). Falling back to basic format. This might degrade performance.")
         prompt_parts = []
-        if system_prompt and system_prompt.strip():
-             prompt_parts.append(f"System: {system_prompt}")
-        prompt_parts.append(f"\nUser: {user_prompt}")
-        prompt_parts.append("\nAssistant:")
         full_prompt = "\n".join(prompt_parts)
-    # print(f"\n--- Generating ---")
-    # print(f"Prompt:\n{full_prompt}")
-    # print(f"Params: Temp={temperature}, TopK={top_k}, TopP={top_p}, RepPen={repetition_penalty}, MaxNew={max_new_tokens}, MinP={min_p} (MinP ignored)")
-    # print("-" * 20)
-    # Ensure inputs are on the correct device (handled by device_map="auto")
-    # Added truncation safeguard during tokenization
-    inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=4096).to(model.device)
     generation_kwargs = dict(
-        **inputs,
         max_new_tokens=int(max_new_tokens),
         temperature=float(temperature) if float(temperature) > 0 else None,
         top_p=float(top_p),
@@ -269,17 +229,19 @@ def generate_response(system_prompt, user_prompt, temperature, max_new_tokens, t
         generation_kwargs.pop('top_k', None)
         generation_kwargs['do_sample'] = False
-    with torch.inference_mode():
-        outputs = model.generate(**generation_kwargs)
-    input_length = inputs['input_ids'].shape[1]
-    generated_tokens = outputs[0][input_length:]
-    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-    # print(f"--- Response ---\n{response}\n---------------\n")
-    return response.strip()
-# --- Gradio Interface ---
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=".gradio-container { max-width: 90% !important; }") as demo:
     gr.Markdown(Title)
     gr.Markdown(description)
@@ -295,56 +257,51 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
                 )
                 user_prompt = gr.Textbox(
                     label="💬 Your Request",
-                    placeholder="e.g., 'Create a React functional component for a simple counter with increment and decrement buttons using useState.' or 'Explain the concept of virtual DOM.'",
                     lines=6
                 )
             with gr.Accordion("🛠️ Generation Parameters", open=True):
                  with gr.Row():
-                     # --- Set Default Params ---
-                     temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="🌡️ Temperature", info="Controls randomness. 0 = deterministic, >0 = random.")
-                     max_new_tokens = gr.Slider(minimum=64, maximum=4096, value=1024, step=32, label="📊 Max New Tokens", info="Max length of the generated response.")
                  with gr.Row():
-                     top_k = gr.Slider(minimum=1, maximum=200, value=40, step=1, label="🏆 Top-k", info="Sample from top k likely tokens.")
-                     top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="🏅 Top-p (nucleus)", info="Sample from tokens with cumulative probability >= top_p.")
                  with gr.Row():
-                     repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.01, label="🦜 Repetition Penalty", info="Penalizes repeating tokens ( > 1).")
-                     min_p = gr.Slider(minimum=0.0, maximum=0.5, value=0.05, step=0.01, label="📉 Min-p (Not Active)", info="Filters tokens below this probability threshold (Requires custom logic - currently ignored).")
-            generate_btn = gr.Button("🚀 Generate Response", variant="primary", size="lg")
         with gr.Column(scale=2):
-            # --- Fix: Remove show_copy_button=True ---
-            # gr.Code inherently has a copy button in modern Gradio versions
             output = gr.Code(
                 label=f"🌠 Tessa-T1-14B (8-bit) Output",
                 language="markdown",
                 lines=25,
-                # show_copy_button=True, # REMOVED - This caused the TypeError
             )
             with gr.Accordion("⚙️ Model & Tokenizer Details", open=False):
                  gr.Markdown("### Model Configuration")
-                 gr.Markdown(model_config_info) # Display updated info including quantization/attn
                  gr.Markdown("---")
                  gr.Markdown("### Tokenizer Configuration")
                  gr.Markdown(tokenizer_info)
-    # About Tesslate Section
     with gr.Row():
         with gr.Accordion("💡 About Tesslate & Our Mission", open=False):
             gr.Markdown(about_tesslate)
-    # Links Section
     gr.Markdown(join_us)
-    # Examples (Keep the relevant examples)
     gr.Examples(
         examples=[
             [
                 "You are Tessa, an expert AI assistant specialized in React development.",
                 "Create a simple React functional component for a button that alerts 'Hello!' when clicked.",
-                0.7, 512, 0.95, 1.1, 40, 0.05 # Default params match the sliders now
             ],
             [
                 "You are Tessa, an expert AI assistant specialized in React development.",
@@ -359,7 +316,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
              [
                 "You are a helpful AI assistant.",
                 "What are the pros and cons of using Next.js compared to Create React App?",
-                0.8, 1024, 0.98, 1.05, 60, 0.05 # Example with slightly different params
             ]
         ],
         inputs=[
@@ -376,17 +333,14 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
         label="✨ Example Prompts (Click to Load)"
     )
-    # Connect button click to function
     generate_btn.click(
         fn=generate_response,
         inputs=[system_prompt, user_prompt, temperature, max_new_tokens, top_p, repetition_penalty, top_k, min_p],
         outputs=output,
-        api_name="generate"
     )
-# Launch the demo
 if __name__ == "__main__":
-    # The progress bar noise during shard loading is normal output from the `transformers` library
-    # during the download/loading phase before the Gradio app starts serving.
-    # It cannot be suppressed from within this script.
-    demo.queue().launch(debug=True, share=False) # Set share=True if deploying on HF Spaces

 from datetime import datetime
 import os
 import subprocess # For Flash Attention install
+from threading import Thread # For streaming
 # --- Install Flash Attention (specific method for compatibility) ---
 print("Attempting to install Flash Attention 2...")
 try:
     subprocess.run(
         'pip install flash-attn --no-build-isolation',
         env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
         shell=True,
+        check=True
     )
     print("Flash Attention installed successfully using subprocess method.")
     _flash_attn_2_available = True
     _flash_attn_2_available = False
 # --- Import Transformers AFTER potential install ---
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer # Added TextIteratorStreamer
 from huggingface_hub import HfApi, HfFolder
 # --- Configuration ---
 model_id = "Tesslate/Tessa-T1-14B"
 creator_link = "https://huggingface.co/TesslateAI"
 model_link = f"https://huggingface.co/{model_id}"
     <img src="https://huggingface.co/Tesslate/Tessa-T1-14B/resolve/main/tesslate_logo_color.png?download=true" alt="Tesslate Logo" style="height: 80px; margin-bottom: 10px;">
     <h1 style="margin-bottom: 5px;">🚀 Welcome to the Tessa-T1-14B Demo 🚀</h1>
     <p style="font-size: 1.1em;">Experience the power of specialized React reasoning!</p>
+    <p>Model by <a href="{creator_link}" target="_blank">TesslateAI</a> | <a href="{model_link}" target="_blank">View on Hugging Face</a> | Running with 8-bit Quantization | Streaming Output</p>
 </div>
 """
 description = f"""
 Interact with **[{model_id}]({model_link})**, an innovative 14B parameter transformer model fine-tuned from Qwen2.5-Coder-14B-Instruct.
 Tessa-T1 specializes in **React frontend development**, leveraging advanced reasoning to autonomously generate well-structured, semantic React components.
+This demo uses **8-bit quantization** via `bitsandbytes` for reduced memory footprint. **Flash Attention 2** is enabled if available. Output is **streamed** token-by-token.
 """
+# --- (Keep about_tesslate and join_us sections as before) ---
 about_tesslate = f"""
 ## About Tesslate & Our Vision
 <img src="https://huggingface.co/Tesslate/Tessa-T1-14B/resolve/main/tesslate_logo_notext.png?download=true" alt="Tesslate Icon" style="height: 40px; float: left; margin-right: 10px;">
     </a>
 </div>
 """
 # --- Model and Tokenizer Loading ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 if device == "cpu":
     print("Warning: Running on CPU. Quantization and Flash Attention require CUDA.")
+    _flash_attn_2_available = False
+hf_token = os.getenv('HF_TOKEN')
 if not hf_token:
     try:
         hf_token = HfFolder.get_token()
+        if not hf_token: hf_token = HfApi().token
+        if not hf_token: raise ValueError("HF token not found.")
         print("Using token from Hugging Face login.")
     except Exception as e:
+         raise ValueError(f"HF token acquisition failed: {e}. Please set HF_TOKEN or login.")
 print(f"Loading Tokenizer: {model_id}")
+tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token, trust_remote_code=True)
 print(f"Loading Model: {model_id} with 8-bit quantization")
 quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+attn_implementation = "flash_attention_2" if _flash_attn_2_available and device == "cuda" else "sdpa"
 print(f"Using attention implementation: {attn_implementation}")
 try:
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         token=hf_token,
+        device_map="auto",
         quantization_config=quantization_config,
+        attn_implementation=attn_implementation,
         trust_remote_code=True
     )
     print("Model loaded successfully with 8-bit quantization.")
 except Exception as e:
     print(f"Error loading model: {e}")
     if attn_implementation == "flash_attention_2":
         print("Flash Attention 2 failed at load time. Trying fallback 'sdpa' attention...")
         try:
             attn_implementation = "sdpa"
             model = AutoModelForCausalLM.from_pretrained(
+                model_id, token=hf_token, device_map="auto", quantization_config=quantization_config,
+                attn_implementation=attn_implementation, trust_remote_code=True
             )
             print("Model loaded successfully with 8-bit quantization and SDPA attention.")
         except Exception as e2:
+             print(f"Fallback to SDPA attention also failed: {e2}"); raise e2
+    else: raise e
+# --- (Keep config info gathering and tokenizer info formatting as before) ---
 try:
     config_json = model.config.to_dict()
     quant_info = model.config.quantization_config.to_dict() if hasattr(model.config, 'quantization_config') else {}
     model_config_info = f"""
 **Model Type:** {config_json.get('model_type', 'N/A')}
     print(f"Could not retrieve full model config: {e}")
     model_config_info = f"**Error:** Could not load full config details for {model_id}."
 def format_tokenizer_info(tokenizer_instance):
     try:
         info = [
 tokenizer_info = format_tokenizer_info(tokenizer)
+# --- Generation Function (Modified for Streaming) ---
+@spaces.GPU(duration=180)
 def generate_response(system_prompt, user_prompt, temperature, max_new_tokens, top_p, repetition_penalty, top_k, min_p):
     messages = []
     if system_prompt and system_prompt.strip():
         messages.append({"role": "system", "content": system_prompt})
     messages.append({"role": "user", "content": user_prompt})
     try:
+        full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     except Exception as e:
+        print(f"Warning: Using fallback prompt format due to error: {e}")
         prompt_parts = []
+        if system_prompt and system_prompt.strip(): prompt_parts.append(f"System: {system_prompt}")
+        prompt_parts.append(f"\nUser: {user_prompt}\nAssistant:")
         full_prompt = "\n".join(prompt_parts)
+    # Use TextIteratorStreamer for streaming output
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        timeout=10.0, # Timeout for waiting for new tokens
+        skip_prompt=True, # Don't yield the prompt
+        skip_special_tokens=True
+    )
+    # Ensure inputs are correctly placed (device_map handles this)
+    inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=4096).to(model.device) # Use model's device
+    # Generation kwargs, pass streamer
     generation_kwargs = dict(
+        inputs, # Pass tokenized inputs directly
+        streamer=streamer, # Pass the streamer
         max_new_tokens=int(max_new_tokens),
         temperature=float(temperature) if float(temperature) > 0 else None,
         top_p=float(top_p),
         generation_kwargs.pop('top_k', None)
         generation_kwargs['do_sample'] = False
+    # Run generation in a separate thread
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    # Yield generated text as it becomes available
+    generated_text = ""
+    # Yield an empty string immediately to clear previous output
+    yield ""
+    for new_text in streamer:
+        generated_text += new_text
+        yield generated_text
+# --- Gradio Interface (No changes needed here for streaming itself) ---
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=".gradio-container { max-width: 90% !important; }") as demo:
     gr.Markdown(Title)
     gr.Markdown(description)
                 )
                 user_prompt = gr.Textbox(
                     label="💬 Your Request",
+                    placeholder="e.g., 'Create a React functional component for a simple counter...' or 'Explain virtual DOM.'",
                     lines=6
                 )
             with gr.Accordion("🛠️ Generation Parameters", open=True):
                  with gr.Row():
+                     temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="🌡️ Temperature")
+                     max_new_tokens = gr.Slider(minimum=64, maximum=10000, value=10000, step=32, label="📊 Max New Tokens")
                  with gr.Row():
+                     top_k = gr.Slider(minimum=1, maximum=200, value=40, step=1, label="🏆 Top-k")
+                     top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="🏅 Top-p (nucleus)")
                  with gr.Row():
+                     repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.01, label="🦜 Repetition Penalty")
+                     min_p = gr.Slider(minimum=0.0, maximum=0.5, value=0.05, step=0.01, label="📉 Min-p (Not Active)")
+            generate_btn = gr.Button("🚀 Generate Response (Streaming)", variant="primary", size="lg") # Updated button text slightly
         with gr.Column(scale=2):
             output = gr.Code(
                 label=f"🌠 Tessa-T1-14B (8-bit) Output",
                 language="markdown",
                 lines=25,
+                # interactive=False # Usually keep interactive=False for Code output
             )
             with gr.Accordion("⚙️ Model & Tokenizer Details", open=False):
                  gr.Markdown("### Model Configuration")
+                 gr.Markdown(model_config_info)
                  gr.Markdown("---")
                  gr.Markdown("### Tokenizer Configuration")
                  gr.Markdown(tokenizer_info)
+    # --- (Keep About Tesslate, Links, and Examples sections as before) ---
     with gr.Row():
         with gr.Accordion("💡 About Tesslate & Our Mission", open=False):
             gr.Markdown(about_tesslate)
     gr.Markdown(join_us)
     gr.Examples(
         examples=[
             [
                 "You are Tessa, an expert AI assistant specialized in React development.",
                 "Create a simple React functional component for a button that alerts 'Hello!' when clicked.",
+                0.7, 512, 0.95, 1.1, 40, 0.05
             ],
             [
                 "You are Tessa, an expert AI assistant specialized in React development.",
              [
                 "You are a helpful AI assistant.",
                 "What are the pros and cons of using Next.js compared to Create React App?",
+                0.8, 1024, 0.98, 1.05, 60, 0.05
             ]
         ],
         inputs=[
         label="✨ Example Prompts (Click to Load)"
     )
+    # --- Connect button click to the GENERATOR function ---
     generate_btn.click(
         fn=generate_response,
         inputs=[system_prompt, user_prompt, temperature, max_new_tokens, top_p, repetition_penalty, top_k, min_p],
         outputs=output,
+        api_name="generate_stream" # Changed API name for clarity
     )
+# --- Launch the demo ---
 if __name__ == "__main__":
+    demo.queue().launch(debug=True, share=False)