Up to 5 generations
app.py CHANGED
@@ -35,11 +35,29 @@ else:
 tts = TTS(model_name, gpu=torch.cuda.is_available())
 tts.to(device_type)

-def predict(prompt, language, gender, audio_file_pth, mic_file_path, use_mic, randomize_seed, seed):
-
+def update_output(output_number):
+    return [
+        gr.update(visible = (2 <= output_number)),
+        gr.update(visible = (3 <= output_number)),
+        gr.update(visible = (4 <= output_number)),
+        gr.update(visible = (5 <= output_number))
+    ]

-
-
+def predict(
+    prompt,
+    language,
+    gender,
+    audio_file_pth,
+    mic_file_path,
+    use_mic,
+    generation_number,
+    temperature,
+    is_randomize_seed,
+    seed,
+    progress = gr.Progress()
+):
+    start = time.time()
+    progress(0, desc = "Preparing data...")

     if len(prompt) < 2:
         gr.Warning("Please give a longer prompt text")
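Note on the hunk above: `update_output` returns one `gr.update(visible=...)` patch per component bound at the same position in the event's `outputs` list; that is the whole mechanism behind showing only as many audio slots as the user asked for. A minimal, self-contained sketch of the same pattern (the slider and textboxes here are illustrative, not components from app.py):

import gradio as gr

def update_output(output_number):
    # One patch per bound component, in outputs order:
    # slot k is visible only while k <= output_number.
    return [gr.update(visible = (k <= output_number)) for k in range(2, 6)]

with gr.Blocks() as demo:
    count = gr.Slider(1, 5, step = 1, value = 1, label = "Generation number")
    slots = [gr.Textbox(label = f"Output #{k}", visible = (k == 1)) for k in range(1, 6)]
    # Slot 1 is always visible, so only slots 2-5 are bound and toggled.
    count.change(update_output, inputs = [count], outputs = slots[1:])

demo.launch()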
@@ -75,7 +93,7 @@ def predict(prompt, language, gender, audio_file_pth, mic_file_path, use_mic, ra
     else:
         speaker_wav = "./examples/female.wav"

-    output_filename =
+    output_filename = []

     try:
         if language == "fr":
@@ -83,7 +101,13 @@ def predict(prompt, language, gender, audio_file_pth, mic_file_path, use_mic, ra
                 language = "fr-fr"
         if m.find("/fr/") != -1:
             language = None
-
+
+        for i in range(5):
+            if i < generation_number:
+                output_filename.append(f"{i}_{re.sub('[^a-zA-Z0-9]', '_', language)}_{re.sub('[^a-zA-Z0-9]', '_', prompt)}"[:250] + ".wav")
+                predict_on_gpu(i, prompt, speaker_wav, language, output_filename[i], temperature, is_randomize_seed, seed, progress)
+            else:
+                output_filename.append(None)
     except RuntimeError as e :
         if "device-assert" in str(e):
             # cannot do anything on cuda device side error, need to restart
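Note on the naming scheme above: each of the up-to-five output files gets a name built from the generation index, the language code, and the prompt itself, with everything outside [a-zA-Z0-9] collapsed to underscores and the stem capped at 250 characters before ".wav" is appended. A small sketch of that scheme (safe_filename is a name introduced here, not in app.py; note that language can still be None at this point via the /fr/ branch, so the sketch defaults it, whereas the f-string in the hunk assumes a plain string):

import re

def safe_filename(i, language, prompt, max_len = 250):
    # Mirror of the scheme in predict(): index + language + prompt,
    # non-alphanumerics replaced by underscores, stem truncated.
    lang = re.sub('[^a-zA-Z0-9]', '_', language or "none")
    text = re.sub('[^a-zA-Z0-9]', '_', prompt)
    return f"{i}_{lang}_{text}"[:max_len] + ".wav"

print(safe_filename(0, "fr-fr", "Hello, World!"))  # 0_fr_fr_Hello__World_.wav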
@@ -99,17 +123,33 @@ def predict(prompt, language, gender, audio_file_pth, mic_file_path, use_mic, ra
     secondes = secondes - (minutes * 60)
     hours = math.floor(minutes / 60)
     minutes = minutes - (hours * 60)
-    is_randomize_seed = False
     information = ("Start again to get a different result. " if is_randomize_seed else "") + "The sound has been generated in " + ((str(hours) + " h, ") if hours != 0 else "") + ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + str(secondes) + " sec."

     return (
-        output_filename,
-        output_filename,
+        output_filename[0],
+        output_filename[1],
+        output_filename[2],
+        output_filename[3],
+        output_filename[4],
         information,
     )

 @spaces.GPU(duration=60)
-def predict_on_gpu(prompt, speaker_wav, language, output_filename, seed):
+def predict_on_gpu(
+    i,
+    prompt,
+    speaker_wav,
+    language,
+    output_filename,
+    temperature,
+    is_randomize_seed,
+    seed,
+    progress
+):
+    progress((i + 1) / 5, desc = "Generating the audio #" + str(i + 1) + "...")
+    if is_randomize_seed:
+        seed = random.randint(0, max_64_bit_int)
+
     random.seed(seed)
     torch.manual_seed(seed)

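Note on the hunk above: each file is synthesised by its own predict_on_gpu call under @spaces.GPU(duration=60), and each call reseeds both Python's random module and torch before sampling, so a fixed seed reproduces a generation while "Randomize seed" draws a fresh 64-bit seed per file. A sketch of that reseeding, runnable on its own (the value of max_64_bit_int is an assumption here; app.py defines it outside this diff):

import random
import torch

max_64_bit_int = 2 ** 63 - 1  # assumed; defined elsewhere in app.py

def reseed(seed, is_randomize_seed):
    # Same order as predict_on_gpu(): optionally draw a fresh seed,
    # then seed both RNGs so the sampling is reproducible.
    if is_randomize_seed:
        seed = random.randint(0, max_64_bit_int)
    random.seed(seed)
    torch.manual_seed(seed)
    return seed

used = reseed(42, False)
a = torch.rand(3)
reseed(used, False)
b = torch.rand(3)
assert torch.equal(a, b)  # same seed, same draws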
@@ -117,13 +157,16 @@ def predict_on_gpu(prompt, speaker_wav, language, output_filename, seed):
         text = prompt,
         file_path = output_filename,
         speaker_wav = speaker_wav,
-        language = language
+        language = language,
+        temperature = temperature
     )

 with gr.Blocks() as interface:
-    gr.HTML("Multi-language Text-to-Speech")
     gr.HTML(
         """
+<h1><center>XTTS</center></h1>
+<big><center>Generate long vocal from text in several languages following voice freely, without account, without watermark and download it</center></big>
+<br/>
 <a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 3-second audio clip.
 <br/>
 XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
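Note on the tts_to_file call this hunk completes: Coqui's TTS.api forwards extra keyword arguments to the underlying model, which is how temperature reaches XTTS and controls sampling variety. A hedged sketch of the full call (the model id and file paths below are placeholders, not values from this diff):

from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu = False)  # assumed model id
tts.tts_to_file(
    text = "Hello, World!",
    file_path = "out.wav",
    speaker_wav = "./examples/female.wav",  # the reference voice to clone
    language = "en",
    temperature = 0.75,  # forwarded to the XTTS sampler
)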
@@ -134,20 +177,21 @@ Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">TTS</a>, wh
 <br/>
 <p>For faster inference without waiting in the queue, you should duplicate this space and upgrade to GPU via the settings.
 <br/>
-<a href="https://huggingface.co/spaces/
+<a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/Multi-language_Text-to-Speech?duplicate=true">
 <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
 </p>
 """
     )
     with gr.Column():
         prompt = gr.Textbox(
-            label="Text Prompt",
-            info="One or two sentences at a time is better",
-            value="Hello, World! Here is an example of light voice cloning. Try to upload your best audio samples quality",
+            label = "Text Prompt",
+            info = "One or two sentences at a time is better",
+            value = "Hello, World! Here is an example of light voice cloning. Try to upload your best audio samples quality",
+            elem_id = "prompt-id",
         )
         with gr.Group():
             language = gr.Dropdown(
-
+                label="Language",
                 info="Select an output language for the synthesised speech",
                 choices=[
                     ["Arabic", "ar"],
@@ -166,46 +210,134 @@ Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">TTS</a>, wh
                 ],
                 max_choices=1,
                 value="en",
+                elem_id = "language-id",
             )
             gr.HTML("More languages <a href='https://huggingface.co/spaces/Brasd99/TTS-Voice-Cloner'>here</a>")
-            gender = gr.Radio(
+            gender = gr.Radio(
+                ["female", "male"],
+                label="Gender",
+                info="Gender of the voice",
+                elem_id = "gender-id",
+            )
             audio_file_pth = gr.Audio(
                 label="Reference Audio",
                 #info="Click on the ✎ button to upload your own target speaker audio",
                 type="filepath",
                 value=None,
+                elem_id = "audio-file-pth-id",
             )
-            mic_file_path = gr.Audio(
-
-
-
-
-
-
+            mic_file_path = gr.Audio(
+                sources=["microphone"],
+                type="filepath",
+                #info="Use your microphone to record audio",
+                label="Use Microphone for Reference",
+                elem_id = "mic-file-path-id",
+            )
+            use_mic = gr.Checkbox(
+                label = "Check to use Microphone as Reference",
+                value = False,
+                info = "Notice: Microphone input may not work properly under traffic",
+                elem_id = "use-mic-id",
+            )
+            generation_number = gr.Slider(
+                minimum = 1,
+                maximum = 5,
+                step = 1,
+                value = 1,
+                label = "Generation number",
+                info = "How many audios to generate",
+                elem_id = "generation-number-id"
+            )
             with gr.Accordion("Advanced options", open = False):
-
-
-
+                temperature = gr.Slider(
+                    minimum = 0,
+                    maximum = 10,
+                    step = .1,
+                    value = .75,
+                    label = "Temperature",
+                    elem_id = "temperature-id"
+                )
+                randomize_seed = gr.Checkbox(
+                    label = "\U0001F3B2 Randomize seed",
+                    value = True,
+                    info = "If checked, result is always different",
+                    elem_id = "randomize-seed-id"
+                )
+                seed = gr.Slider(
+                    minimum = 0,
+                    maximum = max_64_bit_int,
+                    step = 1,
+                    randomize = True,
+                    label = "Seed",
+                    elem_id = "seed-id"
+                )
+
+            submit = gr.Button(
+                "🚀 Speak",
+                variant = "primary",
+                elem_id = "submit-id"
+            )
+
+            synthesised_audio_1 = gr.Audio(
+                label="Synthesised Audio #1",
+                autoplay = False,
+                elem_id = "synthesised-audio-1-id"
+            )
+
+            synthesised_audio_2 = gr.Audio(
+                label="Synthesised Audio #2",
+                autoplay = False,
+                elem_id = "synthesised-audio-2-id",
+                visible = False
+            )

-
+            synthesised_audio_3 = gr.Audio(
+                label="Synthesised Audio #3",
+                autoplay = False,
+                elem_id = "synthesised-audio-3-id",
+                visible = False
+            )
+
+            synthesised_audio_4 = gr.Audio(
+                label="Synthesised Audio #4",
+                autoplay = False,
+                elem_id = "synthesised-audio-4-id",
+                visible = False
+            )

-
-
+            synthesised_audio_5 = gr.Audio(
+                label="Synthesised Audio #5",
+                autoplay = False,
+                elem_id = "synthesised-audio-5-id",
+                visible = False
+            )
             information = gr.HTML()

-    submit.click(
+    submit.click(fn = update_output, inputs = [
+        generation_number
+    ], outputs = [
+        synthesised_audio_2,
+        synthesised_audio_3,
+        synthesised_audio_4,
+        synthesised_audio_5
+    ], queue = False, show_progress = False).success(predict, inputs = [
         prompt,
         language,
         gender,
         audio_file_pth,
         mic_file_path,
         use_mic,
+        generation_number,
+        temperature,
         randomize_seed,
         seed
     ], outputs = [
-
-
+        synthesised_audio_1,
+        synthesised_audio_2,
+        synthesised_audio_3,
+        synthesised_audio_4,
+        synthesised_audio_5,
         information
     ], scroll_to_output = True)

-interface.queue().launch(debug=True)
+interface.queue(max_size = 5).launch(debug=True)
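Note on the event wiring: the submit handler is a two-step chain. The first .click() runs with queue = False, so the visibility patches from update_output are applied immediately, and .success() only fires the heavy predict once that first step finished without an error; interface.queue(max_size = 5) then bounds how many requests may wait for the GPU. A self-contained sketch of the same chaining, with hypothetical toy functions:

import time
import gradio as gr

def show_slots(n):
    return [gr.update(visible = (k <= n)) for k in range(2, 4)]

def heavy_work(n, progress = gr.Progress()):
    for step in range(3):
        progress((step + 1) / 3, desc = f"Working ({step + 1}/3)...")
        time.sleep(1)
    return [f"result {k}" if k <= n else None for k in range(1, 4)]

with gr.Blocks() as demo:
    n = gr.Slider(1, 3, step = 1, value = 1, label = "Outputs")
    go = gr.Button("Run")
    outs = [gr.Textbox(label = f"#{k}", visible = (k == 1)) for k in range(1, 4)]
    # Step 1 bypasses the queue so the UI updates instantly;
    # step 2 fires only if step 1 raised no error.
    go.click(show_slots, inputs = [n], outputs = outs[1:],
             queue = False, show_progress = False
    ).success(heavy_work, inputs = [n], outputs = outs)

demo.queue(max_size = 5).launch()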