Spaces:

AlexK-PL
/

vits-v2-8khz-inference

Runtime error

App Files Files Community

AlexK-PL committed on Jan 16, 2024

Commit

9a5d905

verified ·

1 Parent(s): 18a1e06

Create app.py

Browse files

Files changed (1) hide show

app.py +241 -0

app.py ADDED Viewed

	@@ -0,0 +1,241 @@

+import tempfile
+import subprocess
+import time
+from typing import Optional
+from AinaTheme import AinaGradioTheme
+import gradio as gr
+import numpy as np
+import torch
+import os
+from TTS.utils.synthesizer import Synthesizer
+from dotenv import load_dotenv
+torch.manual_seed(0)
+np.random.seed(0)
+# CleanUnet Dependencies
+import json
+from copy import deepcopy
+import numpy as np
+import torch
+# from util import print_size, sampling
+import torchaudio
+import torchaudio.transforms as T
+import random
+random.seed(0)
+torch.manual_seed(0)
+np.random.seed(0)
+SAMPLE_RATE = 8000
+CONFIG = "configs/DNS-large-full.json"
+# CHECKPOINT = "./exp/DNS-large-full/checkpoint/pretrained.pkl"
+# Parse the JSON config file and expose its sections as module-level names.
+# NOTE: the `global` statements below are no-ops at module scope; every name
+# assigned inside this `with` block is already a module global.
+with open(CONFIG) as f:
+    data = f.read()
+    config = json.loads(data)
+    gen_config = config["gen_config"]
+    global network_config
+    network_config = config["network_config"]  # to define wavenet
+    global train_config
+    train_config = config["train_config"]  # training section of the config
+    global trainset_config
+    trainset_config = config["trainset_config"]  # to read trainset configurations
+# global use_denoise
+# use_denoise = False
+# Local experiment path, taken from the training section of the config.
+exp_path = train_config["exp_path"]
+print('exp_path:', exp_path)
+# Independent copy of the trainset section with cropping set to 0, so the
+# original `trainset_config` dict is left unmodified.
+loader_config = deepcopy(trainset_config)
+loader_config["crop_length_sec"] = 0
+#############################################################################################################
+# Load variables from a .env file (if any) before reading os.environ below.
+load_dotenv()
+# Upper bound on input text length; overridable through the environment.
+MAX_INPUT_TEXT_LEN = int(os.environ.get("MAX_INPUT_TEXT_LEN", default=500))
+# Dynamically read model files from the working directory, exclude 'speakers.pth',
+# and order them newest first by modification time.
+model_files = [f for f in os.listdir(os.getcwd()) if f.endswith('.pth') and f != 'speakers.pth']
+model_files.sort(key=lambda x: os.path.getmtime(os.path.join(os.getcwd(), x)), reverse=True)
+speakers_path = "speakers.pth"
+# NOTE(review): torch.load unpickles the file; only load a trusted speakers.pth.
+speakers_list = torch.load(speakers_path)
+# Only the keys (speaker names) of the loaded mapping are kept.
+speakers_list = list(speakers_list.keys())
+speakers_list = [speaker for speaker in speakers_list]
+default_speaker_list = speakers_list  # alias of the same list object, not a copy
+# Filtered lists based on dataset, distinguished purely by speaker-name length.
+# NOTE: a name of exactly 20 characters matches none of the three filters.
+festcat_speakers = [s for s in speakers_list if len(s) == 3]  # 3-character names
+google_speakers = [s for s in speakers_list if 3 < len(s) < 20]  # 4 to 19 characters
+commonvoice_speakers = [s for s in speakers_list if len(s) > 20]  # more than 20 characters
+DEFAULT_SPEAKER_ID = os.environ.get("DEFAULT_SPEAKER_ID", default="pau")
+# Use the most recently modified checkpoint; raises IndexError when the working
+# directory contains no eligible .pth file.
+model_file = model_files[0]  # change this!!
+model_path = os.path.join(os.getcwd(), model_file)
+config_path = "config.json"
+# No separate vocoder is configured.
+vocoder_path = None
+vocoder_config_path = None
+# Arguments are positional; the fourth one is left as None.
+# NOTE(review): presumably the languages file of the TTS Synthesizer - confirm
+# against the installed TTS version.
+synthesizer = Synthesizer(
+    model_path, config_path, speakers_path, None, vocoder_path, vocoder_config_path,
+)
+def get_phonetic_transcription(text: str):
+    try:
+        result = subprocess.run(
+            ['espeak-ng', '--ipa', '-v', 'ca', text],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            check=True
+        )
+        return result.stdout.strip()
+    except subprocess.CalledProcessError as e:
+        print(f"An error occurred: {e}")
+        return None
+def tts_inference(text: str, speaker_idx: str = None, use_denoise: int = 0):
+    # synthesize
+    if synthesizer is None:
+        raise NameError("model not found")
+    t1 = time.time()
+    wavs = synthesizer.tts(text, speaker_idx)
+    print(type(wavs))
+    if use_denoise == 0:
+        wavs_den = torch.Tensor(wavs).unsqueeze(0)  # one sample
+        # wavs_den = denoise(wavs_den).tolist()
+    else:
+        wavs_den = wavs
+    # return output
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        # wavs must be a list of integers
+        synthesizer.save_wav(wavs, fp)
+        t2 = time.time() - t1
+        print(round(t2, 2))
+        output_audio = fp.name
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        # wavs must be a list of integers
+        synthesizer.save_wav(wavs_den, fp)
+        output_audio_den = fp.name
+    return output_audio, output_audio_den
+# Page heading and usage instructions; both are rendered through gr.Markdown
+# at the top of the interface.
+title = "🗣️ Catalan Multispeaker TTS Tester 🗣️"
+description = """
+1️⃣ Enter the text to synthesize.
+2️⃣ Select a voice from the dropdown menu.
+3️⃣ Enjoy!
+"""
+def submit_input(input_, speaker_id, use_dn):
+    output_audio = None
+    output_phonetic = None
+    if input_ is not None and len(input_) < MAX_INPUT_TEXT_LEN:
+        output_audio, output_audio_den = tts_inference(input_, speaker_id, use_dn)
+        output_phonetic = get_phonetic_transcription(input_)
+    else:
+        gr.Warning(f"Your text exceeds the {MAX_INPUT_TEXT_LEN}-character limit.")
+    return output_audio, output_audio_den, output_phonetic
+def change_interactive(text):
+    input_state = text
+    if input_state.strip() != "":
+        return gr.update(interactive=True)
+    else:
+        return gr.update(interactive=False)
+def clean():
+    return (
+        None,
+        None,
+    )
+with gr.Blocks(**AinaGradioTheme().get_kwargs()) as app:
+    gr.Markdown(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
+    gr.Markdown(description)
+    with gr.Row(equal_height=False):
+        with gr.Column(variant='panel'):
+            input_ = gr.Textbox(
+                label="Text",
+                value="Introdueix el text a sintetitzar.",
+                lines=4
+            )
+            dataset = gr.Radio(["All", "Festcat", "Google TTS", "CommonVoice"], label="Speakers Dataset",
+                               value="All")
+            def update_speaker_list(dataset):
+                print("Updating speaker list based on dataset:", dataset)
+                if dataset == "Festcat":
+                    current_speakers = festcat_speakers
+                elif dataset == "Google TTS":
+                    current_speakers = google_speakers
+                elif dataset == "CommonVoice":
+                    current_speakers = commonvoice_speakers
+                else:
+                    current_speakers = speakers_list
+                return gr.update(choices=current_speakers, value=current_speakers[0])
+            speaker_id = gr.Dropdown(label="Select a voice", choices=speakers_list, value=DEFAULT_SPEAKER_ID,
+                                     interactive=True)
+            dataset.change(fn=update_speaker_list, inputs=dataset, outputs=speaker_id)
+            # model = gr.Dropdown(label="Select a model", choices=model_files, value=DEFAULT_MODEL_FILE_NAME)
+            with gr.Row():
+                clear_btn = gr.ClearButton(value='Clean', components=[input_])
+                # clear_btn = gr.Button(
+                #     "Clean",
+                # )
+                submit_btn = gr.Button(
+                    "Submit",
+                    variant="primary",
+                )
+                use_denoise = gr.Radio(choices=[("Yes", 0), ("No", 1)], value=0)
+        with gr.Column(variant='panel'):
+            output_audio = gr.Audio(label="Output", type="filepath", autoplay=True, show_share_button=False)
+            output_audio_den = gr.Audio(label="Output denoised", type="filepath", autoplay=False,
+                                        show_share_button=False)
+            output_phonetic = gr.Textbox(label="Phonetic Transcription", readonly=True)
+    for button in [submit_btn]:  # clear_btn
+        input_.change(fn=change_interactive, inputs=[input_], outputs=button)
+    # clear_btn.click(fn=clean, inputs=[], outputs=[input_, output_audio, output_phonetic], queue=False)
+    submit_btn.click(fn=submit_input, inputs=[input_, speaker_id, use_denoise], outputs=[output_audio,
+                                                                                         output_audio_den,
+                                                                                         output_phonetic])
+app.queue(concurrency_count=1, api_open=False)
+app.launch(show_api=False, server_name="0.0.0.0", server_port=7860)