Spaces: Running on Zero

Den Pavloff committed · 52c0d1f
1 Parent(s): e9bcb5a

time report
app.py CHANGED

@@ -103,16 +103,16 @@ def generate_speech_gpu(text, model_choice):
 
         # Generate audio
         print(f"Generating speech with {model_choice}...")
-        audio, _ = selected_model.run_model(text)
+        audio, _, time_report = selected_model.run_model(text)
 
         sample_rate = 22050
         print("Speech generation completed!")
 
-        return (sample_rate, audio) #, f"✅ Audio generated successfully using {model_choice} on {device}"
+        return (sample_rate, audio), time_report #, f"✅ Audio generated successfully using {model_choice} on {device}"
 
     except Exception as e:
         print(f"Error during generation: {str(e)}")
-        return None
+        return None, f"❌ Error during generation: {str(e)}"
 
 # def validate_input(text, model_choice):
 #     """Quick validation without GPU"""

@@ -154,21 +154,19 @@ with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as d
                 type="numpy"
             )
 
-
-
-
-
-
+            time_report_output = gr.Textbox(
+                label="Time Report",
+                interactive=False,
+                value="Ready to generate speech"
+            )
 
     # GPU generation event
     generate_btn.click(
         fn=generate_speech_gpu,
         inputs=[text_input, model_dropdown],
-        outputs=[audio_output]
+        outputs=[audio_output, time_report_output]
     )
 
-
-
     # Demo Examples
    gr.Markdown("## 🎯 Demo Examples")
 
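For context on the app.py wiring: generate_btn.click now lists two output components, so Gradio assigns the handler's first return value to the audio player and the second to the new "Time Report" textbox, in order. Below is a minimal, runnable sketch of that pattern; the stub handler and its canned values are illustrative stand-ins for generate_speech_gpu, while the component names mirror the diff.

import numpy as np
import gradio as gr

def fake_generate(text, model_choice):
    # Stand-in for generate_speech_gpu: return ((sample_rate, audio), report)
    audio = np.zeros(22050, dtype=np.float32)  # one second of silence at 22,050 Hz
    report = "MODEL GENERATION: 0.00\nNANO CODEC: 0.00\nTOTAL: 0.00"
    return (22050, audio), report

with gr.Blocks() as demo:
    text_input = gr.Textbox(label="Text to synthesize")
    model_dropdown = gr.Dropdown(choices=["stub-model"], value="stub-model", label="Model")
    generate_btn = gr.Button("Generate")
    audio_output = gr.Audio(type="numpy")
    time_report_output = gr.Textbox(label="Time Report", interactive=False,
                                    value="Ready to generate speech")
    # Two outputs: first return value -> audio_output, second -> time_report_output.
    generate_btn.click(fn=fake_generate,
                       inputs=[text_input, model_dropdown],
                       outputs=[audio_output, time_report_output])

if __name__ == "__main__":
    demo.launch()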
util.py CHANGED

@@ -1,6 +1,7 @@
 import torch
 import librosa
 import requests
+import time
 from nemo.collections.tts.models import AudioCodecModel
 from dataclasses import dataclass
 from transformers import AutoTokenizer, AutoModelForCausalLM

@@ -192,17 +193,28 @@ class KaniModel:
         )
         return generated_ids.to('cpu')
 
+    def time_report(self, point_1, point_2, point_3):
+        model_request = point_2 - point_1
+        player_time = point_3 - point_2
+        total_time = point_3 - point_1
+        report = f"MODEL GENERATION: {model_request:.2f}\nNANO CODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
+        return report
+
     def run_model(self, text: str):
         """Complete pipeline: text -> tokens -> generation -> audio"""
         # Prepare input
         input_ids, attention_mask = self.get_input_ids(text)
 
         # Generate tokens
+        point_1 = time.time()
         model_output = self.model_request(input_ids, attention_mask)
 
         # Convert to audio
+        point_2 = time.time()
         audio, _ = self.player.get_waveform(model_output)
-        return audio, text
+
+        point_3 = time.time()
+        return audio, text, self.time_report(point_1, point_2, point_3)
 
 
 class Demo:
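The util.py change is a plain three-checkpoint timing pattern: time.time() is recorded before token generation (point_1), after it (point_2), and after the codec produces the waveform (point_3), and time_report formats the two stage durations plus the total. Here is a self-contained sketch of the same pattern; the sleep calls are placeholders for the real generation and decoding steps, not the actual model code.

import time

def time_report(point_1, point_2, point_3):
    # Stage durations in seconds, matching the fields in the diff's report string
    model_request = point_2 - point_1
    player_time = point_3 - point_2
    total_time = point_3 - point_1
    return (f"MODEL GENERATION: {model_request:.2f}\n"
            f"NANO CODEC: {player_time:.2f}\n"
            f"TOTAL: {total_time:.2f}")

def run_pipeline():
    point_1 = time.time()
    time.sleep(0.3)   # placeholder for the token-generation step (model_request)
    point_2 = time.time()
    time.sleep(0.1)   # placeholder for the codec waveform decode (get_waveform)
    point_3 = time.time()
    return time_report(point_1, point_2, point_3)

print(run_pipeline())
# Prints roughly:
# MODEL GENERATION: 0.30
# NANO CODEC: 0.10
# TOTAL: 0.40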