import spaces
import gradio as gr
import torch
from datetime import datetime
import os
import subprocess  # For Flash Attention install

# --- Install Flash Attention (specific method for compatibility) ---
# This method attempts to install flash-attn without building CUDA extensions locally,
# which can be helpful in restricted environments like ZeroGPU or when build tools are missing.
print("Attempting to install Flash Attention 2...")
try:
    subprocess.run(
        'pip install flash-attn --no-build-isolation',
        # Merge with the current environment so PATH and CUDA variables are preserved.
        env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
        shell=True,
        check=True  # Raise an error if the command fails
    )
    print("Flash Attention installed successfully using subprocess method.")
    _flash_attn_2_available = True
except Exception as e:
    print(f"Could not install Flash Attention 2 using subprocess: {e}")
    print("Proceeding without Flash Attention 2. Performance may be impacted.")
    _flash_attn_2_available = False
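# Optional sanity check (an addition, not part of the original flow): pip can report success while
# the wheel is still unusable in this environment, so verify the module actually imports.
if _flash_attn_2_available:
    try:
        import flash_attn  # noqa: F401
    except ImportError:
        print("flash-attn installed but cannot be imported; falling back to default attention.")
        _flash_attn_2_available = False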
# --- Import Transformers AFTER potential install ---
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import HfApi, HfFolder

# --- Configuration ---
model_id = "Tesslate/Tessa-T1-14B"
creator_link = "https://huggingface.co/TesslateAI"
model_link = f"https://huggingface.co/{model_id}"
website_link = "https://tesslate.com"
discord_link = "https://discord.gg/DkzMzwBTaw"
# --- Text Content (Keep the cool UI elements) ---
Title = f"""
<div style="text-align: center; margin-bottom: 20px;">
    <img src="https://huggingface.co/Tesslate/Tessa-T1-14B/resolve/main/tesslate_logo_color.png?download=true" alt="Tesslate Logo" style="height: 80px; margin-bottom: 10px;">
    <h1 style="margin-bottom: 5px;">🚀 Welcome to the Tessa-T1-14B Demo 🚀</h1>
    <p style="font-size: 1.1em;">Experience the power of specialized React reasoning!</p>
    <p>Model by <a href="{creator_link}" target="_blank">TesslateAI</a> | <a href="{model_link}" target="_blank">View on Hugging Face</a> | Running with 8-bit Quantization</p>
</div>
"""

description = f"""
Interact with **[{model_id}]({model_link})**, an innovative 14B parameter transformer model fine-tuned from Qwen2.5-Coder-14B-Instruct.
Tessa-T1 specializes in **React frontend development**, leveraging advanced reasoning to autonomously generate well-structured, semantic React components.

This demo uses **8-bit quantization** via `bitsandbytes` for a reduced memory footprint. **Flash Attention 2** is enabled if available for potentially faster inference.
"""
about_tesslate = f"""
## About Tesslate & Our Vision
<img src="https://huggingface.co/Tesslate/Tessa-T1-14B/resolve/main/tesslate_logo_notext.png?download=true" alt="Tesslate Icon" style="height: 40px; float: left; margin-right: 10px;">

Hi everyone, I’m Manav, founder of Tesslate, and we’re on a mission to revolutionize AI by putting powerful reasoning models into your hands.

Today, the AI landscape is dominated by massive frontier models—large, costly, and slow. At Tesslate, we see things differently. The next wave of AI disruption won’t come from sheer size; it'll be driven by **speed, specialization, and precision reasoning**. Smaller, specialized models aren’t just faster—they’re smarter and more efficient.

Our story began when we released a UI-generation model on Hugging Face that didn't just replicate patterns—it could reason through entire component hierarchies. It resonated instantly, hitting over 10,000 downloads in weeks. That early success validated our vision, and we doubled down.

At Tesslate, we build lean, intelligent models that:

* 🧠 **Think** like human agents
* 💡 **Reason** through complex, real-world workflows
* 💻 **Execute** like elite developers, designers, and analysts

We've already delivered:

* **UIGEN-T1.5:** Creating stunning, editable interfaces (React, Tailwind, Three.js)
* **Tessa-T1:** A specialized reasoning engine optimized for React development and AI agents (You are here!)
* **Synthia S1:** Our flagship general-reasoning model, proving powerful reasoning capabilities beyond STEM into creativity and storytelling.

Our vision is bigger. We aim to be the **#1 trusted brand in fast, specialized AI**, covering training, inference, real-time agent actions, infrastructure, research, and innovative products. We’re already piloting with industry-leading clients tackling everything from sophisticated design systems to real-time analytics.

**Join us!** We're seeking strategic advice, introductions, compute resources, and capital.

👉 Visit **[tesslate.com]({website_link})** to learn more and connect.
"""
join_us = f"""
<div style="text-align: center;">
    <h3 style="margin-bottom: 10px;">Connect with Tesslate</h3>
    <a href="{discord_link}" target="_blank" style="text-decoration: none; margin: 0 10px;">
        <img src="https://img.shields.io/discord/1225631184402124842?label=Discord&logo=discord&style=for-the-badge&color=5865F2" alt="Join us on Discord">
    </a>
    <a href="{website_link}" target="_blank" style="text-decoration: none; margin: 0 10px;">
        <img src="https://img.shields.io/badge/Website-tesslate.com-blue?style=for-the-badge&logo=googlechrome&logoColor=white" alt="Visit tesslate.com">
    </a>
    <a href="{model_link}" target="_blank" style="text-decoration: none; margin: 0 10px;">
        <img src="https://img.shields.io/badge/🤗%20Model-Tessa--T1--14B-yellow?style=for-the-badge&logo=huggingface" alt="Tessa-T1-14B on Hugging Face">
    </a>
</div>
"""
# --- Model and Tokenizer Loading ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cpu":
    print("Warning: Running on CPU. Quantization and Flash Attention require CUDA.")
    _flash_attn_2_available = False  # Cannot use flash attn on CPU
# Get the token from the environment, falling back to the local Hugging Face login.
hf_token = os.getenv('HF_TOKEN')  # Standard env var name for HF token
if not hf_token:
    try:
        hf_token = HfFolder.get_token()
        if not hf_token:
            hf_token = HfApi().token
        if not hf_token:
            raise ValueError("HF token not found. Please set HF_TOKEN env var or login via `huggingface-cli login`.")
        print("Using token from Hugging Face login.")
    except Exception as e:
        # huggingface_hub is imported unconditionally above, so any failure here means no usable token.
        raise ValueError(f"HF token acquisition failed. Please set the HF_TOKEN environment variable or login via `huggingface-cli login`. Error: {e}")
| print(f"Loading Tokenizer: {model_id}") | |
| tokenizer = AutoTokenizer.from_pretrained( | |
| model_id, | |
| token=hf_token, | |
| trust_remote_code=True | |
| ) | |
| print(f"Loading Model: {model_id} with 8-bit quantization") | |
| # Define quantization configuration | |
| quantization_config = BitsAndBytesConfig(load_in_8bit=True) | |
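# Rough sizing note: 8-bit weights take about 1 byte per parameter, so a 14B model needs roughly
# 14 GB for weights versus ~28 GB in fp16/bf16 (approximate figures; activations and KV cache come on top).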
# Determine attn_implementation based on install success and device
attn_implementation = "flash_attention_2" if _flash_attn_2_available and device == "cuda" else "sdpa"  # sdpa is a fallback
| print(f"Using attention implementation: {attn_implementation}") | |
| # Note: You might see a warning from bitsandbytes about library paths on ZeroGPU, this is often normal. | |
| try: | |
| model = AutoModelForCausalLM.from_pretrained( | |
| model_id, | |
| token=hf_token, | |
| device_map="auto", # Automatically distributes layers, crucial for large quantized models | |
| quantization_config=quantization_config, | |
| attn_implementation=attn_implementation, # Enable Flash Attention 2 if available | |
| trust_remote_code=True | |
| ) | |
| print("Model loaded successfully with 8-bit quantization.") | |
| except ImportError as e: | |
| print(f"ImportError during model loading: {e}") | |
| print("Ensure 'bitsandbytes' and 'accelerate' are installed.") | |
| # Optionally fall back to no quantization if bitsandbytes is missing, | |
| # but for this request, we assume it's intended. | |
| raise e | |
| except Exception as e: | |
| print(f"Error loading model: {e}") | |
| # If Flash Attention was requested but is incompatible, Transformers might raise an error. | |
| # Let's try falling back to SDPA (Scaled Dot Product Attention) if FA2 fails at load time. | |
| if attn_implementation == "flash_attention_2": | |
| print("Flash Attention 2 failed at load time. Trying fallback 'sdpa' attention...") | |
| try: | |
| attn_implementation = "sdpa" | |
| model = AutoModelForCausalLM.from_pretrained( | |
| model_id, | |
| token=hf_token, | |
| device_map="auto", | |
| quantization_config=quantization_config, | |
| attn_implementation=attn_implementation, | |
| trust_remote_code=True | |
| ) | |
| print("Model loaded successfully with 8-bit quantization and SDPA attention.") | |
| except Exception as e2: | |
| print(f"Fallback to SDPA attention also failed: {e2}") | |
| raise e2 # Re-raise the error if fallback fails too | |
| else: | |
| raise e # Re-raise original error if it wasn't FA2 related | |
# Get config info (might need adjustment based on quantized model structure)
try:
    config_json = model.config.to_dict()
    # Add quantization info
    quant_info = model.config.quantization_config.to_dict() if hasattr(model.config, 'quantization_config') else {}
    model_config_info = f"""
**Model Type:** {config_json.get('model_type', 'N/A')}
**Architecture:** {config_json.get('architectures', ['N/A'])[0]}
**Vocab Size:** {config_json.get('vocab_size', 'N/A')}
**Hidden Size:** {config_json.get('hidden_size', 'N/A')}
**Num Hidden Layers:** {config_json.get('num_hidden_layers', 'N/A')}
**Num Attention Heads:** {config_json.get('num_attention_heads', 'N/A')}
**Max Position Embeddings:** {config_json.get('max_position_embeddings', 'N/A')}
**Attention Implementation:** `{attn_implementation}`
**Quantization:** 8-bit (`load_in_8bit={quant_info.get('load_in_8bit', 'N/A')}`)
"""
except Exception as e:
    print(f"Could not retrieve full model config: {e}")
    model_config_info = f"**Error:** Could not load full config details for {model_id}."
# --- Helper Function for Tokenizer Info ---
def format_tokenizer_info(tokenizer_instance):
    try:
        info = [
            f"**Tokenizer Class:** `{tokenizer_instance.__class__.__name__}`",
            f"**Vocabulary Size:** {tokenizer_instance.vocab_size}",
            f"**Model Max Length:** {tokenizer_instance.model_max_length}",
            f"**EOS Token:** `{tokenizer_instance.eos_token}` (ID: {tokenizer_instance.eos_token_id})",
            "**Special Tokens:** Check the model card for the specific template/tokens.",  # Qwen2 has specific tokens
        ]
        # Add PAD/BOS/UNK if they are commonly used and different from EOS
        if hasattr(tokenizer_instance, 'pad_token') and tokenizer_instance.pad_token and tokenizer_instance.pad_token_id is not None:
            info.append(f"**Padding Token:** `{tokenizer_instance.pad_token}` (ID: {tokenizer_instance.pad_token_id})")
        if hasattr(tokenizer_instance, 'bos_token') and tokenizer_instance.bos_token and tokenizer_instance.bos_token_id is not None:
            info.append(f"**BOS Token:** `{tokenizer_instance.bos_token}` (ID: {tokenizer_instance.bos_token_id})")
        if hasattr(tokenizer_instance, 'unk_token') and tokenizer_instance.unk_token and tokenizer_instance.unk_token_id is not None:
            info.append(f"**UNK Token:** `{tokenizer_instance.unk_token}` (ID: {tokenizer_instance.unk_token_id})")
        return "\n".join(info)
    except Exception as e:
        print(f"Error getting tokenizer info: {e}")
        return f"Could not retrieve full tokenizer details. Vocab size: {getattr(tokenizer_instance, 'vocab_size', 'N/A')}"


tokenizer_info = format_tokenizer_info(tokenizer)
# --- Generation Function ---
# ZeroGPU allocation window; the decorator and the 120s value are assumptions and can be adjusted if needed.
@spaces.GPU(duration=120)
def generate_response(system_prompt, user_prompt, temperature, max_new_tokens, top_p, repetition_penalty, top_k, min_p):
    # Builds the prompt with apply_chat_template and handles the generation parameters.
    # min_p is accepted by the UI but not passed to model.generate() here.
    messages = []
    if system_prompt and system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_prompt})

    try:
        full_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        # print("Applied tokenizer's chat template.")  # Less verbose logging
    except Exception as e:
        print(f"Warning: Could not use apply_chat_template (Error: {e}). Falling back to a basic format. This might degrade performance.")
        prompt_parts = []
        if system_prompt and system_prompt.strip():
            prompt_parts.append(f"System: {system_prompt}")
        prompt_parts.append(f"\nUser: {user_prompt}")
        prompt_parts.append("\nAssistant:")
        full_prompt = "\n".join(prompt_parts)

    # print(f"\n--- Generating ---")
    # print(f"Prompt:\n{full_prompt}")
    # print(f"Params: Temp={temperature}, TopK={top_k}, TopP={top_p}, RepPen={repetition_penalty}, MaxNew={max_new_tokens}, MinP={min_p} (MinP ignored)")
    # print("-" * 20)

    # Inputs are moved to the model's device (placement handled by device_map="auto").
    # Truncation is a safeguard against overly long prompts.
    inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=4096).to(model.device)

    do_sample = float(temperature) > 0
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature) if do_sample else None,
        top_p=float(top_p),
        top_k=int(top_k),
        repetition_penalty=float(repetition_penalty),
        do_sample=do_sample,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    if not do_sample:
        # Greedy decoding: sampling-only parameters are irrelevant and would only trigger warnings.
        generation_kwargs.pop('top_p', None)
        generation_kwargs.pop('top_k', None)

    with torch.inference_mode():
        outputs = model.generate(**generation_kwargs)

    input_length = inputs['input_ids'].shape[1]
    generated_tokens = outputs[0][input_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    # print(f"--- Response ---\n{response}\n---------------\n")
    return response.strip()
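# Illustrative direct call with hypothetical values, bypassing the Gradio UI; the keyword names
# mirror the function signature above:
# generate_response(
#     "You are Tessa, an expert AI assistant specialized in React development.",
#     "Create a React functional component for a simple counter.",
#     temperature=0.7, max_new_tokens=512, top_p=0.95, repetition_penalty=1.1, top_k=40, min_p=0.05,
# )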
# --- Gradio Interface ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=".gradio-container { max-width: 90% !important; }") as demo:
    gr.Markdown(Title)
    gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=3):
            with gr.Group():
                system_prompt = gr.Textbox(
                    label="System Prompt (Persona & Instructions)",
                    value="You are Tessa, an expert AI assistant specialized in React development. Generate clean, semantic React code based on user requests. If the request is not about React, answer as a general helpful assistant.",
                    lines=3,
                    info="Guide the model's overall behavior and expertise."
                )
                user_prompt = gr.Textbox(
                    label="💬 Your Request",
                    placeholder="e.g., 'Create a React functional component for a simple counter with increment and decrement buttons using useState.' or 'Explain the concept of virtual DOM.'",
                    lines=6
                )
            with gr.Accordion("🛠️ Generation Parameters", open=True):
                with gr.Row():
                    # --- Set Default Params ---
                    temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="🌡️ Temperature", info="Controls randomness. 0 = deterministic, >0 = random.")
                    max_new_tokens = gr.Slider(minimum=64, maximum=4096, value=1024, step=32, label="📊 Max New Tokens", info="Max length of the generated response.")
                with gr.Row():
                    top_k = gr.Slider(minimum=1, maximum=200, value=40, step=1, label="🏆 Top-k", info="Sample from top k likely tokens.")
                    top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="🏅 Top-p (nucleus)", info="Sample from tokens with cumulative probability >= top_p.")
                with gr.Row():
                    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.01, label="🦜 Repetition Penalty", info="Penalizes repeating tokens ( > 1).")
                    min_p = gr.Slider(minimum=0.0, maximum=0.5, value=0.05, step=0.01, label="📉 Min-p (Not Active)", info="Filters tokens below this probability threshold (Requires custom logic - currently ignored).")
            generate_btn = gr.Button("🚀 Generate Response", variant="primary", size="lg")

        with gr.Column(scale=2):
            # --- Fix: Remove show_copy_button=True ---
            # gr.Code inherently has a copy button in modern Gradio versions
            output = gr.Code(
                label="🌠 Tessa-T1-14B (8-bit) Output",
                language="markdown",
                lines=25,
                # show_copy_button=True,  # REMOVED - This caused the TypeError
            )
            with gr.Accordion("⚙️ Model & Tokenizer Details", open=False):
                gr.Markdown("### Model Configuration")
                gr.Markdown(model_config_info)  # Display updated info including quantization/attn
                gr.Markdown("---")
                gr.Markdown("### Tokenizer Configuration")
                gr.Markdown(tokenizer_info)

    # About Tesslate Section
    with gr.Row():
        with gr.Accordion("💡 About Tesslate & Our Mission", open=False):
            gr.Markdown(about_tesslate)

    # Links Section
    gr.Markdown(join_us)
    # Examples (values follow the `inputs` order below: temperature, max_new_tokens, top_p, repetition_penalty, top_k, min_p)
    gr.Examples(
        examples=[
            [
                "You are Tessa, an expert AI assistant specialized in React development.",
                "Create a simple React functional component for a button that alerts 'Hello!' when clicked.",
                0.7, 512, 0.95, 1.1, 40, 0.05
            ],
            [
                "You are Tessa, an expert AI assistant specialized in React development.",
                "Explain the difference between `useState` and `useEffect` hooks in React with simple examples.",
                0.7, 1024, 0.95, 1.1, 40, 0.05
            ],
            [
                "You are Tessa, an expert AI assistant specialized in React development. Use Tailwind CSS for styling.",
                "Generate a React component for a responsive card with an image, title, and description, using Tailwind CSS classes.",
                0.7, 1536, 0.95, 1.1, 40, 0.05
            ],
            [
                "You are a helpful AI assistant.",
                "What are the pros and cons of using Next.js compared to Create React App?",
                0.8, 1024, 0.98, 1.05, 60, 0.05  # Example with slightly different params
            ]
        ],
        inputs=[
            system_prompt,
            user_prompt,
            temperature,
            max_new_tokens,
            top_p,
            repetition_penalty,
            top_k,
            min_p
        ],
        outputs=output,
        label="✨ Example Prompts (Click to Load)"
    )
    # Connect button click to function
    generate_btn.click(
        fn=generate_response,
        inputs=[system_prompt, user_prompt, temperature, max_new_tokens, top_p, repetition_penalty, top_k, min_p],
        outputs=output,
        api_name="generate"
    )
# Launch the demo
if __name__ == "__main__":
    # The progress bar noise during shard loading is normal output from the `transformers` library
    # during the download/loading phase before the Gradio app starts serving.
    # It cannot be suppressed from within this script.
    demo.queue().launch(debug=True, share=False)  # share=True is only needed when tunneling from a local machine; HF Spaces serves the app directly
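# Note (assumption): when run as a Hugging Face Space, this script expects a requirements.txt providing
# at least torch, transformers, accelerate, and bitsandbytes, alongside gradio and the `spaces` package;
# exact versions are not pinned here.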