Update app.py
Browse files
app.py
CHANGED
|
@@ -12,7 +12,7 @@ from speechbrain.pretrained import EncoderClassifier
|
|
| 12 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 13 |
print(f"Using device: {device}")
|
| 14 |
|
| 15 |
-
VOICE_SAMPLE_FILES = ["1.wav"]
|
| 16 |
EMBEDDING_DIR = "speaker_embeddings"
|
| 17 |
os.makedirs(EMBEDDING_DIR, exist_ok=True)
|
| 18 |
|
|
@@ -95,31 +95,33 @@ def normalize_text(text):
|
|
| 95 |
text = re.sub(r'[^\w\s\']', '', text)
|
| 96 |
return text
|
| 97 |
|
| 98 |
-
# ---
|
| 99 |
-
def
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
-
# --- Main TTS function
|
| 105 |
-
# --- Main TTS function with pause after each new line only ---
|
| 106 |
def text_to_speech(text, voice_choice):
|
| 107 |
if not text or not voice_choice:
|
| 108 |
gr.Warning("Fadlan geli qoraal oo dooro cod.")
|
| 109 |
return None
|
| 110 |
|
| 111 |
speaker_embedding = get_speaker_embedding(voice_choice)
|
|
|
|
| 112 |
|
| 113 |
-
paragraphs = text.strip().split("\n")
|
| 114 |
audio_chunks = []
|
| 115 |
|
| 116 |
-
for idx,
|
| 117 |
-
|
| 118 |
-
if not
|
| 119 |
continue
|
| 120 |
|
| 121 |
-
|
| 122 |
-
inputs = processor(text=
|
| 123 |
|
| 124 |
with torch.no_grad():
|
| 125 |
speech = model.generate(
|
|
@@ -135,8 +137,8 @@ def text_to_speech(text, voice_choice):
|
|
| 135 |
|
| 136 |
audio_chunks.append(audio)
|
| 137 |
|
| 138 |
-
# Pause after each
|
| 139 |
-
if idx < len(
|
| 140 |
pause = np.zeros(int(16000 * 0.8)) # 0.8s pause
|
| 141 |
audio_chunks.append(pause)
|
| 142 |
|
|
|
|
| 12 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 13 |
print(f"Using device: {device}")
|
| 14 |
|
| 15 |
+
VOICE_SAMPLE_FILES = ["1.wav"]
|
| 16 |
EMBEDDING_DIR = "speaker_embeddings"
|
| 17 |
os.makedirs(EMBEDDING_DIR, exist_ok=True)
|
| 18 |
|
|
|
|
| 95 |
text = re.sub(r'[^\w\s\']', '', text)
|
| 96 |
return text
|
| 97 |
|
| 98 |
+
# --- Split long text into chunks by word count ---
|
| 99 |
+
def split_long_text_into_chunks(text, max_words=18):
    """Break *text* into chunks of at most *max_words* words each.

    The text is tokenized on whitespace (any run of spaces/newlines/tabs),
    then regrouped into space-joined strings of up to ``max_words`` tokens.
    An empty or whitespace-only *text* yields an empty list.

    Args:
        text: The input string to split.
        max_words: Maximum number of words per chunk (default 18).

    Returns:
        list[str]: Consecutive word-count-bounded chunks, in original order.
    """
    tokens = text.split()
    # Step through the token list in strides of max_words; the final
    # chunk may be shorter than max_words.
    return [
        ' '.join(tokens[start:start + max_words])
        for start in range(0, len(tokens), max_words)
    ]
| 106 |
|
| 107 |
+
# --- Main TTS function ---
|
|
|
|
| 108 |
def text_to_speech(text, voice_choice):
|
| 109 |
if not text or not voice_choice:
|
| 110 |
gr.Warning("Fadlan geli qoraal oo dooro cod.")
|
| 111 |
return None
|
| 112 |
|
| 113 |
speaker_embedding = get_speaker_embedding(voice_choice)
|
| 114 |
+
text_chunks = split_long_text_into_chunks(text)
|
| 115 |
|
|
|
|
| 116 |
audio_chunks = []
|
| 117 |
|
| 118 |
+
for idx, chunk in enumerate(text_chunks):
|
| 119 |
+
chunk = chunk.strip()
|
| 120 |
+
if not chunk:
|
| 121 |
continue
|
| 122 |
|
| 123 |
+
norm_chunk = normalize_text(chunk)
|
| 124 |
+
inputs = processor(text=norm_chunk, return_tensors="pt").to(device)
|
| 125 |
|
| 126 |
with torch.no_grad():
|
| 127 |
speech = model.generate(
|
|
|
|
| 137 |
|
| 138 |
audio_chunks.append(audio)
|
| 139 |
|
| 140 |
+
# Pause after each chunk
|
| 141 |
+
if idx < len(text_chunks) - 1:
|
| 142 |
pause = np.zeros(int(16000 * 0.8)) # 0.8s pause
|
| 143 |
audio_chunks.append(pause)
|
| 144 |
|