smirki committed
Commit b10e890 · verified · 1 Parent(s): beed4b3

Update app.py

Files changed (1): app.py (+66, -112)
app.py CHANGED
@@ -4,17 +4,16 @@ import torch
from datetime import datetime
import os
import subprocess # For Flash Attention install
+from threading import Thread # For streaming

# --- Install Flash Attention (specific method for compatibility) ---
-# This method attempts to install flash-attn without building CUDA extensions locally,
-# which can be helpful in restricted environments like ZeroGPU or when build tools are missing.
print("Attempting to install Flash Attention 2...")
try:
    subprocess.run(
        'pip install flash-attn --no-build-isolation',
        env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
        shell=True,
-        check=True # Raise an error if the command fails
+        check=True
    )
    print("Flash Attention installed successfully using subprocess method.")
    _flash_attn_2_available = True
@@ -24,11 +23,10 @@ except Exception as e:
    _flash_attn_2_available = False

# --- Import Transformers AFTER potential install ---
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer # Added TextIteratorStreamer
from huggingface_hub import HfApi, HfFolder

# --- Configuration ---
-# Updated model ID
model_id = "Tesslate/Tessa-T1-14B"
creator_link = "https://huggingface.co/TesslateAI"
model_link = f"https://huggingface.co/{model_id}"
@@ -41,16 +39,17 @@ Title = f"""
<img src="https://huggingface.co/Tesslate/Tessa-T1-14B/resolve/main/tesslate_logo_color.png?download=true" alt="Tesslate Logo" style="height: 80px; margin-bottom: 10px;">
<h1 style="margin-bottom: 5px;">🚀 Welcome to the Tessa-T1-14B Demo 🚀</h1>
<p style="font-size: 1.1em;">Experience the power of specialized React reasoning!</p>
-<p>Model by <a href="{creator_link}" target="_blank">TesslateAI</a> | <a href="{model_link}" target="_blank">View on Hugging Face</a> | Running with 8-bit Quantization</p>
+<p>Model by <a href="{creator_link}" target="_blank">TesslateAI</a> | <a href="{model_link}" target="_blank">View on Hugging Face</a> | Running with 8-bit Quantization | Streaming Output</p>
</div>
"""

description = f"""
Interact with **[{model_id}]({model_link})**, an innovative 14B parameter transformer model fine-tuned from Qwen2.5-Coder-14B-Instruct.
Tessa-T1 specializes in **React frontend development**, leveraging advanced reasoning to autonomously generate well-structured, semantic React components.
-This demo uses **8-bit quantization** via `bitsandbytes` for reduced memory footprint. **Flash Attention 2** is enabled if available for potentially faster inference.
+This demo uses **8-bit quantization** via `bitsandbytes` for reduced memory footprint. **Flash Attention 2** is enabled if available. Output is **streamed** token-by-token.
"""

+# --- (Keep about_tesslate and join_us sections as before) ---
about_tesslate = f"""
## About Tesslate & Our Vision
<img src="https://huggingface.co/Tesslate/Tessa-T1-14B/resolve/main/tesslate_logo_notext.png?download=true" alt="Tesslate Icon" style="height: 40px; float: left; margin-right: 10px;">
@@ -90,88 +89,59 @@ join_us = f"""
</a>
</div>
"""
-
# --- Model and Tokenizer Loading ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cpu":
    print("Warning: Running on CPU. Quantization and Flash Attention require CUDA.")
-    _flash_attn_2_available = False # Cannot use flash attn on CPU
+    _flash_attn_2_available = False

-# Get the token from environment variables
-hf_token = os.getenv('HF_TOKEN') # Standard env var name for HF token
+hf_token = os.getenv('HF_TOKEN')
if not hf_token:
    try:
        hf_token = HfFolder.get_token()
-        if not hf_token:
-            hf_token = HfApi().token
-        if not hf_token:
-            raise ValueError("HF token not found. Please set HF_TOKEN env var or login via `huggingface-cli login`.")
+        if not hf_token: hf_token = HfApi().token
+        if not hf_token: raise ValueError("HF token not found.")
        print("Using token from Hugging Face login.")
-    except ImportError:
-        raise ValueError("huggingface_hub not installed. Please set the HF_TOKEN environment variable or install huggingface_hub.")
    except Exception as e:
-        raise ValueError(f"HF token acquisition failed. Please set the HF_TOKEN environment variable or login via `huggingface-cli login`. Error: {e}")
+        raise ValueError(f"HF token acquisition failed: {e}. Please set HF_TOKEN or login.")

print(f"Loading Tokenizer: {model_id}")
-tokenizer = AutoTokenizer.from_pretrained(
-    model_id,
-    token=hf_token,
-    trust_remote_code=True
-)
+tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token, trust_remote_code=True)

print(f"Loading Model: {model_id} with 8-bit quantization")
-# Define quantization configuration
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-
-# Determine attn_implementation based on install success and device
-attn_implementation = "flash_attention_2" if _flash_attn_2_available and device == "cuda" else "sdpa" # sdpa is a fallback
+attn_implementation = "flash_attention_2" if _flash_attn_2_available and device == "cuda" else "sdpa"
print(f"Using attention implementation: {attn_implementation}")
-# Note: You might see a warning from bitsandbytes about library paths on ZeroGPU, this is often normal.

try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=hf_token,
-        device_map="auto", # Automatically distributes layers, crucial for large quantized models
+        device_map="auto",
        quantization_config=quantization_config,
-        attn_implementation=attn_implementation, # Enable Flash Attention 2 if available
+        attn_implementation=attn_implementation,
        trust_remote_code=True
    )
    print("Model loaded successfully with 8-bit quantization.")
-except ImportError as e:
-    print(f"ImportError during model loading: {e}")
-    print("Ensure 'bitsandbytes' and 'accelerate' are installed.")
-    # Optionally fall back to no quantization if bitsandbytes is missing,
-    # but for this request, we assume it's intended.
-    raise e
except Exception as e:
    print(f"Error loading model: {e}")
-    # If Flash Attention was requested but is incompatible, Transformers might raise an error.
-    # Let's try falling back to SDPA (Scaled Dot Product Attention) if FA2 fails at load time.
    if attn_implementation == "flash_attention_2":
        print("Flash Attention 2 failed at load time. Trying fallback 'sdpa' attention...")
        try:
            attn_implementation = "sdpa"
            model = AutoModelForCausalLM.from_pretrained(
-                model_id,
-                token=hf_token,
-                device_map="auto",
-                quantization_config=quantization_config,
-                attn_implementation=attn_implementation,
-                trust_remote_code=True
+                model_id, token=hf_token, device_map="auto", quantization_config=quantization_config,
+                attn_implementation=attn_implementation, trust_remote_code=True
            )
            print("Model loaded successfully with 8-bit quantization and SDPA attention.")
        except Exception as e2:
-            print(f"Fallback to SDPA attention also failed: {e2}")
-            raise e2 # Re-raise the error if fallback fails too
-    else:
-        raise e # Re-raise original error if it wasn't FA2 related
+            print(f"Fallback to SDPA attention also failed: {e2}"); raise e2
+    else: raise e

-# Get config info (might need adjustment based on quantized model structure)
+# --- (Keep config info gathering and tokenizer info formatting as before) ---
try:
    config_json = model.config.to_dict()
-    # Add quantization info
    quant_info = model.config.quantization_config.to_dict() if hasattr(model.config, 'quantization_config') else {}
    model_config_info = f"""
**Model Type:** {config_json.get('model_type', 'N/A')}
@@ -188,9 +158,6 @@ except Exception as e:
    print(f"Could not retrieve full model config: {e}")
    model_config_info = f"**Error:** Could not load full config details for {model_id}."

-
-# --- Helper Function for Tokenizer Info ---
-# (Keep the existing format_tokenizer_info function - no changes needed)
def format_tokenizer_info(tokenizer_instance):
    try:
        info = [
@@ -215,45 +182,38 @@ def format_tokenizer_info(tokenizer_instance):
tokenizer_info = format_tokenizer_info(tokenizer)


-# --- Generation Function ---
-@spaces.GPU(duration=180) # Keep duration, can be adjusted if needed
+# --- Generation Function (Modified for Streaming) ---
+@spaces.GPU(duration=180)
def generate_response(system_prompt, user_prompt, temperature, max_new_tokens, top_p, repetition_penalty, top_k, min_p):
-    # (Keep the existing generate_response function structure)
-    # It correctly uses apply_chat_template and handles generation parameters.
-    # min_p is still noted as ignored by the standard HF generate function.
-
    messages = []
    if system_prompt and system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_prompt})

    try:
-        full_prompt = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-        # print("Applied tokenizer's chat template.") # Less verbose logging
+        full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception as e:
-        print(f"Warning: Could not use apply_chat_template (Error: {e}). Falling back to basic format. This might degrade performance.")
+        print(f"Warning: Using fallback prompt format due to error: {e}")
        prompt_parts = []
-        if system_prompt and system_prompt.strip():
-            prompt_parts.append(f"System: {system_prompt}")
-        prompt_parts.append(f"\nUser: {user_prompt}")
-        prompt_parts.append("\nAssistant:")
+        if system_prompt and system_prompt.strip(): prompt_parts.append(f"System: {system_prompt}")
+        prompt_parts.append(f"\nUser: {user_prompt}\nAssistant:")
        full_prompt = "\n".join(prompt_parts)

-    # print(f"\n--- Generating ---")
-    # print(f"Prompt:\n{full_prompt}")
-    # print(f"Params: Temp={temperature}, TopK={top_k}, TopP={top_p}, RepPen={repetition_penalty}, MaxNew={max_new_tokens}, MinP={min_p} (MinP ignored)")
-    # print("-" * 20)
+    # Use TextIteratorStreamer for streaming output
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        timeout=10.0, # Timeout for waiting for new tokens
+        skip_prompt=True, # Don't yield the prompt
+        skip_special_tokens=True
+    )

-    # Ensure inputs are on the correct device (handled by device_map="auto")
-    # Added truncation safeguard during tokenization
-    inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=4096).to(model.device)
+    # Ensure inputs are correctly placed (device_map handles this)
+    inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=4096).to(model.device) # Use model's device

+    # Generation kwargs, pass streamer
    generation_kwargs = dict(
-        **inputs,
+        inputs, # Pass tokenized inputs directly
+        streamer=streamer, # Pass the streamer
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature) if float(temperature) > 0 else None,
        top_p=float(top_p),
@@ -269,17 +229,19 @@ def generate_response(system_prompt, user_prompt, temperature, max_new_tokens, top_p, repetition_penalty, top_k, min_p):
        generation_kwargs.pop('top_k', None)
        generation_kwargs['do_sample'] = False

-    with torch.inference_mode():
-        outputs = model.generate(**generation_kwargs)
-
-    input_length = inputs['input_ids'].shape[1]
-    generated_tokens = outputs[0][input_length:]
-    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+    # Run generation in a separate thread
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()

-    # print(f"--- Response ---\n{response}\n---------------\n")
-    return response.strip()
+    # Yield generated text as it becomes available
+    generated_text = ""
+    # Yield an empty string immediately to clear previous output
+    yield ""
+    for new_text in streamer:
+        generated_text += new_text
+        yield generated_text

-# --- Gradio Interface ---
+# --- Gradio Interface (No changes needed here for streaming itself) ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=".gradio-container { max-width: 90% !important; }") as demo:
    gr.Markdown(Title)
    gr.Markdown(description)
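
For reference, the streaming pattern the two hunks above introduce boils down to three pieces: a TextIteratorStreamer, a background thread running model.generate, and a generator that yields the accumulated text. A minimal, self-contained sketch of that pattern (the tiny checkpoint name is only an illustrative stand-in, not the model used by this Space):

# Illustrative sketch of TextIteratorStreamer-based streaming (not this Space's code).
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

checkpoint = "sshleifer/tiny-gpt2"  # placeholder model, assumed only for the example
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

def stream_reply(prompt, max_new_tokens=64):
    """Yield the growing response text as new tokens are decoded."""
    inputs = tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # generate() blocks, so it runs in a worker thread while the caller drains the streamer.
    thread = Thread(target=model.generate,
                    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens))
    thread.start()
    text = ""
    for chunk in streamer:
        text += chunk
        yield text
    thread.join()

for partial in stream_reply("Hello"):
    print(partial)
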
@@ -295,56 +257,51 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=".gradio-container { max-width: 90% !important; }") as demo:
            )
            user_prompt = gr.Textbox(
                label="💬 Your Request",
-                placeholder="e.g., 'Create a React functional component for a simple counter with increment and decrement buttons using useState.' or 'Explain the concept of virtual DOM.'",
+                placeholder="e.g., 'Create a React functional component for a simple counter...' or 'Explain virtual DOM.'",
                lines=6
            )

            with gr.Accordion("🛠️ Generation Parameters", open=True):
                with gr.Row():
-                    # --- Set Default Params ---
-                    temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="🌑️ Temperature", info="Controls randomness. 0 = deterministic, >0 = random.")
-                    max_new_tokens = gr.Slider(minimum=64, maximum=4096, value=1024, step=32, label="📊 Max New Tokens", info="Max length of the generated response.")
+                    temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="🌑️ Temperature")
+                    max_new_tokens = gr.Slider(minimum=64, maximum=10000, value=10000, step=32, label="📊 Max New Tokens")
                with gr.Row():
-                    top_k = gr.Slider(minimum=1, maximum=200, value=40, step=1, label="🏆 Top-k", info="Sample from top k likely tokens.")
-                    top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="🏅 Top-p (nucleus)", info="Sample from tokens with cumulative probability >= top_p.")
+                    top_k = gr.Slider(minimum=1, maximum=200, value=40, step=1, label="🏆 Top-k")
+                    top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="🏅 Top-p (nucleus)")
                with gr.Row():
-                    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.01, label="🦜 Repetition Penalty", info="Penalizes repeating tokens ( > 1).")
-                    min_p = gr.Slider(minimum=0.0, maximum=0.5, value=0.05, step=0.01, label="📉 Min-p (Not Active)", info="Filters tokens below this probability threshold (Requires custom logic - currently ignored).")
+                    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.01, label="🦜 Repetition Penalty")
+                    min_p = gr.Slider(minimum=0.0, maximum=0.5, value=0.05, step=0.01, label="📉 Min-p (Not Active)")

-            generate_btn = gr.Button("🚀 Generate Response", variant="primary", size="lg")
+            generate_btn = gr.Button("🚀 Generate Response (Streaming)", variant="primary", size="lg") # Updated button text slightly

        with gr.Column(scale=2):
-            # --- Fix: Remove show_copy_button=True ---
-            # gr.Code inherently has a copy button in modern Gradio versions
            output = gr.Code(
                label=f"🌠 Tessa-T1-14B (8-bit) Output",
                language="markdown",
                lines=25,
-                # show_copy_button=True, # REMOVED - This caused the TypeError
+                # interactive=False # Usually keep interactive=False for Code output
            )

    with gr.Accordion("⚙️ Model & Tokenizer Details", open=False):
        gr.Markdown("### Model Configuration")
-        gr.Markdown(model_config_info) # Display updated info including quantization/attn
+        gr.Markdown(model_config_info)
        gr.Markdown("---")
        gr.Markdown("### Tokenizer Configuration")
        gr.Markdown(tokenizer_info)

-    # About Tesslate Section
+    # --- (Keep About Tesslate, Links, and Examples sections as before) ---
    with gr.Row():
        with gr.Accordion("💡 About Tesslate & Our Mission", open=False):
            gr.Markdown(about_tesslate)

-    # Links Section
    gr.Markdown(join_us)

-    # Examples (Keep the relevant examples)
    gr.Examples(
        examples=[
            [
                "You are Tessa, an expert AI assistant specialized in React development.",
                "Create a simple React functional component for a button that alerts 'Hello!' when clicked.",
-                0.7, 512, 0.95, 1.1, 40, 0.05 # Default params match the sliders now
+                0.7, 512, 0.95, 1.1, 40, 0.05
            ],
            [
                "You are Tessa, an expert AI assistant specialized in React development.",
@@ -359,7 +316,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=".gradio-container { max-width: 90% !important; }") as demo:
            [
                "You are a helpful AI assistant.",
                "What are the pros and cons of using Next.js compared to Create React App?",
-                0.8, 1024, 0.98, 1.05, 60, 0.05 # Example with slightly different params
+                0.8, 1024, 0.98, 1.05, 60, 0.05
            ]
        ],
        inputs=[
@@ -376,17 +333,14 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=".gradio-container { max-width: 90% !important; }") as demo:
        label="✨ Example Prompts (Click to Load)"
    )

-    # Connect button click to function
+    # --- Connect button click to the GENERATOR function ---
    generate_btn.click(
        fn=generate_response,
        inputs=[system_prompt, user_prompt, temperature, max_new_tokens, top_p, repetition_penalty, top_k, min_p],
        outputs=output,
-        api_name="generate"
+        api_name="generate_stream" # Changed API name for clarity
    )

-# Launch the demo
+# --- Launch the demo ---
if __name__ == "__main__":
-    # The progress bar noise during shard loading is normal output from the `transformers` library
-    # during the download/loading phase before the Gradio app starts serving.
-    # It cannot be suppressed from within this script.
-    demo.queue().launch(debug=True, share=False) # Set share=True if deploying on HF Spaces
+    demo.queue().launch(debug=True, share=False)
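
On the Gradio side, no extra wiring is needed for streaming: because generate_response is now a generator, every value it yields is pushed to the connected output component as long as the app runs with a queue. A stripped-down, hypothetical sketch of that behaviour (no model involved, names invented for illustration):

# Hypothetical demo showing why a generator click handler streams in Gradio.
import time
import gradio as gr

def fake_stream(prompt):
    text = ""
    for word in f"Echoing: {prompt}".split():
        text += word + " "
        time.sleep(0.2)   # stand-in for model latency
        yield text        # each yield replaces the output component's value

with gr.Blocks() as demo:
    box = gr.Textbox(label="Prompt")
    out = gr.Code(label="Streamed output", language="markdown")
    btn = gr.Button("Generate")
    btn.click(fn=fake_stream, inputs=box, outputs=out)

if __name__ == "__main__":
    demo.queue().launch()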
 