Den Pavloff committed
Commit 52c0d1f · 1 Parent(s): e9bcb5a

time report

Files changed (2):
  1. app.py +9 -11
  2. util.py +13 -1
app.py CHANGED

@@ -103,16 +103,16 @@ def generate_speech_gpu(text, model_choice):
 
         # Generate audio
         print(f"Generating speech with {model_choice}...")
-        audio, _ = selected_model.run_model(text)
+        audio, _, time_report = selected_model.run_model(text)
 
         sample_rate = 22050
         print("Speech generation completed!")
 
-        return (sample_rate, audio) #, f"✅ Audio generated successfully using {model_choice} on {device}"
+        return (sample_rate, audio), time_report #, f"✅ Audio generated successfully using {model_choice} on {device}"
 
     except Exception as e:
         print(f"Error during generation: {str(e)}")
-        return None #, f"❌ Error during generation: {str(e)}"
+        return None, f"❌ Error during generation: {str(e)}"
 
     # def validate_input(text, model_choice):
     #     """Quick validation without GPU"""

@@ -154,21 +154,19 @@ with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as d
             type="numpy"
         )
 
-        # status_text = gr.Textbox(
-        #     label="Status",
-        #     interactive=False,
-        #     value="Ready to generate speech"
-        # )
+        time_report_output = gr.Textbox(
+            label="Time Report",
+            interactive=False,
+            value="Ready to generate speech"
+        )
 
     # GPU generation event
     generate_btn.click(
         fn=generate_speech_gpu,
         inputs=[text_input, model_dropdown],
-        outputs=[audio_output]
+        outputs=[audio_output, time_report_output]
     )
 
-
-
     # Demo Examples
     gr.Markdown("## 🎯 Demo Examples")
 
util.py CHANGED

@@ -1,6 +1,7 @@
 import torch
 import librosa
 import requests
+import time
 from nemo.collections.tts.models import AudioCodecModel
 from dataclasses import dataclass
 from transformers import AutoTokenizer, AutoModelForCausalLM

@@ -192,17 +193,28 @@ class KaniModel:
         )
         return generated_ids.to('cpu')
 
+    def time_report(self, point_1, point_2, point_3):
+        model_request = point_2 - point_1
+        player_time = point_3 - point_2
+        total_time = point_3 - point_1
+        report = f"MODEL GENERATION: {model_request:.2f}\nNANO CODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
+        return report
+
     def run_model(self, text: str):
         """Complete pipeline: text -> tokens -> generation -> audio"""
         # Prepare input
         input_ids, attention_mask = self.get_input_ids(text)
 
         # Generate tokens
+        point_1 = time.time()
         model_output = self.model_request(input_ids, attention_mask)
 
         # Convert to audio
+        point_2 = time.time()
         audio, _ = self.player.get_waveform(model_output)
+
+        point_3 = time.time()
+        return audio, text, self.time_report(point_1, point_2, point_3)
 
 
 class Demo:
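One caveat worth noting: time.time() measures wall-clock seconds (the report values carry no unit label). That is workable here because model_request ends with generated_ids.to('cpu'), and the device-to-host transfer forces the GPU to finish before the next timestamp is taken. A more defensive variant, sketched below under the same three measurement points (illustrative only, not part of this commit), would use time.perf_counter() and an explicit CUDA synchronize:

```python
# Illustrative variant, not part of the commit: monotonic clock plus an
# explicit GPU sync so each timestamp marks truly completed work.
import time
import torch

def _now() -> float:
    if torch.cuda.is_available():
        torch.cuda.synchronize()   # wait for queued CUDA kernels to finish
    return time.perf_counter()     # monotonic; immune to system clock changes

point_1 = _now()
# model_output = self.model_request(input_ids, attention_mask)
point_2 = _now()
# audio, _ = self.player.get_waveform(model_output)
point_3 = _now()
print(f"MODEL GENERATION: {point_2 - point_1:.2f}s "
      f"| NANO CODEC: {point_3 - point_2:.2f}s "
      f"| TOTAL: {point_3 - point_1:.2f}s")
```

Appending an "s" unit to the committed report strings, as above, would also make the Time Report textbox self-explanatory to end users.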