Simplified the UI and the generation code
Added more examples and support for a URL-based voice cloning sample.
app.py CHANGED
@@ -1,16 +1,12 @@
 import spaces
-import tempfile
-import wave
 import gradio as gr
+import io
 import os
-import re
+import re
 import torch
-import numpy as np
-
-import torch.nn.functional as F
+import torchaudio
+from pathlib import Path
 from whisperspeech.pipeline import Pipeline
-from whisperspeech.languages import LANGUAGES
-from whisperspeech.utils import resampler
 
 title = """# 🙋🏻♂️ Welcome to🌟Collabora🌬️💬📝WhisperSpeech
 
@@ -26,9 +22,13 @@ This space runs on ZeroGPU, so **you need to be patient** while you acquire the
 
 
 text_examples = [
-    ["
-    ["
-    ["<
+    ["This is the first demo of Whisper Speech, a fully open source text-to-speech model trained by Collabora and Lion on the Juwels supercomputer.", None],
+    ["World War II or the Second World War was a global conflict that lasted from 1939 to 1945. The vast majority of the world's countries, including all the great powers, fought as part of two opposing military alliances: the Allies and the Axis.", "https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg"],
+    ["<pl>To jest pierwszy test wielojęzycznego <en>Whisper Speech <pl>, modelu zamieniającego tekst na mowę, który Collabora i Laion nauczyli na superkomputerze <en>Jewels.", None],
+    ["<en> WhisperSpeech is an Open Source library that helps you convert text to speech. <pl>Teraz także po Polsku! <en>I think I just tried saying \"now also in Polish\", don't judge me...", None],
+    # ["<de> WhisperSpeech is multi-lingual <es> y puede cambiar de idioma <hi> मध्य वाक्य में"],
+    ["<pl>To jest pierwszy test naszego modelu. Pozdrawiamy serdecznie.", None],
+    # ["<en> The big difference between Europe <fr> et les Etats Unis <pl> jest to, że mamy tak wiele języków <uk> тут, в Європі"]
 ]
 
 def parse_multilingual_text(input_text):
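`parse_multilingual_text` itself is unchanged by this commit, so only its context lines appear in the diff. To make the `<lang>` marker format used in the examples concrete, here is a hypothetical minimal parser (not the app's actual implementation) producing the `(lang, text)` pairs that the new `generate_audio` below consumes:

```python
import re

def parse_multilingual_text_sketch(input_text, default_lang='en'):
    # Split on <xx> markers while keeping them, then pair each language
    # code with the text that follows it up to the next marker.
    segments = []
    lang = default_lang
    for chunk in re.split(r'(<[a-z]{2}>)', input_text):
        marker = re.fullmatch(r'<([a-z]{2})>', chunk)
        if marker:
            lang = marker.group(1)
        elif chunk.strip():
            segments.append((lang, chunk.strip()))
    return segments

# parse_multilingual_text_sketch("<en>Hello <pl>Cześć") -> [('en', 'Hello'), ('pl', 'Cześć')]
```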
@@ -42,74 +42,63 @@ def parse_multilingual_text(input_text):
     return segments
 
 @spaces.GPU(enable_queue=True)
-def
-    if
-
-
-
-
-
-
-
-    return
-
-def
-
-
+def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
+    if isinstance(speaker, (str, Path)): speaker = pipe.extract_spk_emb(speaker)
+    elif speaker_url: speaker = pipe.extract_spk_emb(speaker_url)
+    else: speaker = pipe.default_speaker
+    langs, texts = [list(x) for x in zip(*segments)]
+    print(texts, langs)
+    stoks = pipe.t2s.generate(texts, cps=cps, lang=langs)[0]
+    atoks = pipe.s2a.generate(stoks, speaker.unsqueeze(0))
+    audio = pipe.vocoder.decode(atoks)
+    return audio.cpu()
+
+def whisper_speech_demo(multilingual_text, speaker_audio, speaker_url, cps):
+    if len(multilingual_text) == 0:
+        raise gr.Error("Please enter some text for me to speak!")
 
-
-@spaces.GPU(enable_queue=True)
-def whisper_speech_demo(multilingual_text, speaker_audio):
     segments = parse_multilingual_text(multilingual_text)
-    if not segments:
-        return None, "No valid language segments found. Please use the format: <lang> text"
-
-    pipe = Pipeline()
-    if not hasattr(pipe, 's2a'):
-        return None, "Pipeline initialization failed. s2a model not loaded."
-
-    speaker_url = speaker_audio if speaker_audio is not None else None
-    audio_segments = []
-
-    for lang, text in segments:
-        text_str = text if isinstance(text, str) else str(text)
-        audio_np = generate_segment_audio(text_str, lang, speaker_url, pipe)
-        # Debug statement print("Audio segment shape:", audio_np.shape)
-        audio_segments.append(audio_np)
 
-
-    # Debug statement print("Final concatenated audio shape:", concatenated_audio.shape)
-    concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
+    audio = generate_audio(pipe, segments, speaker_audio, speaker_url, cps)
 
-    return (24000,
+    return (24000, audio.T.numpy())
 
+    # Did not work for me in Safari:
+    # mp3 = io.BytesIO()
+    # torchaudio.save(mp3, audio, 24000, format='mp3')
+    # return mp3.getvalue()
 
 with gr.Blocks() as demo:
     gr.Markdown(title)
-
-
-
-
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=2):
+            text_input = gr.Textbox(label="Enter multilingual text💬📝",
+                                    value=text_examples[0][0],
+                                    info="You can use `<en>` for English and `<pl>` for Polish, see examples below.")
+            cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
+                            label="Tempo (in characters per second)")
             speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
-                                     sources=["upload", "microphone"]
-
-
-
-
-
-
-
+                                     sources=["upload", "microphone"],
+                                     type='filepath')
+            gr.Markdown(" \n ") # fixes the bottom overflow from Audio
+            url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
+            generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
+        with gr.Column(scale=1):
+            output_audio = gr.Audio(label="WhisperSpeech says…")
+
     with gr.Row():
-
-
-
-
-
-
-
-
-
-
-
-
-
+        gr.Examples(
+            examples=text_examples,
+            inputs=[text_input, url_input],
+            outputs=[output_audio],
+            fn=whisper_speech_demo,
+            cache_examples=False,
+            label="Try these to get started !🌟🌬️"
+        )
+
+    generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input, url_input, cps], outputs=output_audio)
+
+pipe = Pipeline()#torch_compile=True)
+pipe.generate("WhisperSpeech warmup")
+
+demo.launch(server_port=3000)#, share=True)
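For reference, a hypothetical way to exercise the new generation path outside the Gradio UI, reusing the `generate_audio` defined above (both speaker arguments left unset, so it falls back to `pipe.default_speaker`; the WAV export is an assumption standing in for the MP3 route commented out above):

```python
import torchaudio
from whisperspeech.pipeline import Pipeline

pipe = Pipeline()
pipe.generate("WhisperSpeech warmup")  # same warmup the app performs at startup

segments = [('en', 'Hello from WhisperSpeech.'), ('pl', 'Teraz także po Polsku!')]
audio = generate_audio(pipe, segments, None, None, cps=14)
torchaudio.save('output.wav', audio, 24000)  # 24 kHz, matching the app's output rate
```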