Update app.py
Browse files
app.py
CHANGED
|
@@ -102,6 +102,7 @@ def split_into_sentences(text):
|
|
| 102 |
return [s.strip() for s in sentences if s.strip()]
|
| 103 |
|
| 104 |
# --- Main TTS function with pauses between sentences ---
|
|
|
|
| 105 |
def text_to_speech(text, voice_choice):
|
| 106 |
if not text or not voice_choice:
|
| 107 |
gr.Warning("Fadlan geli qoraal oo dooro cod.")
|
|
@@ -112,38 +113,32 @@ def text_to_speech(text, voice_choice):
|
|
| 112 |
paragraphs = text.strip().split("\n")
|
| 113 |
audio_chunks = []
|
| 114 |
|
| 115 |
-
for para in paragraphs:
|
| 116 |
para = para.strip()
|
| 117 |
if not para:
|
| 118 |
continue
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
pause = np.zeros(int(16000 * 0.5))
|
| 142 |
-
audio_chunks.append(pause)
|
| 143 |
-
|
| 144 |
-
# Pause 0.8 sec between paragraphs (optional)
|
| 145 |
-
pause_para = np.zeros(int(16000 * 0.8))
|
| 146 |
-
audio_chunks.append(pause_para)
|
| 147 |
|
| 148 |
final_audio = np.concatenate(audio_chunks)
|
| 149 |
return (16000, final_audio)
|
|
|
|
| 102 |
return [s.strip() for s in sentences if s.strip()]
|
| 103 |
|
| 104 |
# --- Main TTS function with pauses between sentences ---
|
| 105 |
+
# --- Main TTS function: a 0.8 s pause is inserted only between input lines (text split on "\n") ---
|
| 106 |
def text_to_speech(text, voice_choice):
|
| 107 |
if not text or not voice_choice:
|
| 108 |
gr.Warning("Fadlan geli qoraal oo dooro cod.")
|
|
|
|
| 113 |
paragraphs = text.strip().split("\n")
|
| 114 |
audio_chunks = []
|
| 115 |
|
| 116 |
+
for idx, para in enumerate(paragraphs):
|
| 117 |
para = para.strip()
|
| 118 |
if not para:
|
| 119 |
continue
|
| 120 |
+
|
| 121 |
+
norm_para = normalize_text(para)
|
| 122 |
+
inputs = processor(text=norm_para, return_tensors="pt").to(device)
|
| 123 |
+
|
| 124 |
+
with torch.no_grad():
|
| 125 |
+
speech = model.generate(
|
| 126 |
+
input_ids=inputs["input_ids"],
|
| 127 |
+
speaker_embeddings=speaker_embedding.unsqueeze(0),
|
| 128 |
+
do_sample=True,
|
| 129 |
+
top_k=50,
|
| 130 |
+
temperature=0.75,
|
| 131 |
+
repetition_penalty=1.2,
|
| 132 |
+
max_new_tokens=512
|
| 133 |
+
)
|
| 134 |
+
audio = vocoder(speech).cpu().squeeze().numpy()
|
| 135 |
+
|
| 136 |
+
audio_chunks.append(audio)
|
| 137 |
+
|
| 138 |
+
# Pause between paragraphs (one paragraph per input line); skipped for the last entry
|
| 139 |
+
if idx < len(paragraphs) - 1:
|
| 140 |
+
pause = np.zeros(int(16000 * 0.8)) # 0.8s pause
|
| 141 |
+
audio_chunks.append(pause)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
final_audio = np.concatenate(audio_chunks)
|
| 144 |
return (16000, final_audio)
|