import gradio as gr
from hyper_parameters import tacotron_params as hparams
from training import load_model

from audio_processing import griffin_lim
from nn_layers import TacotronSTFT
from text import text_to_sequence
from hifigan.env import AttrDict
from examples_taco2 import *
from hifigan.models import Generator

import torch
import numpy as np
import json
import os

from matplotlib import pyplot as plt
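# The non-standard imports above (hyper_parameters, training, audio_processing,
# nn_layers, text, examples_taco2 and hifigan.*) are local modules bundled with
# this Space, taken from the Tacotron2 and HiFi-GAN repositories, not pip packages.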
# Adjust vertical spacing between subplots
plt.subplots_adjust(hspace=0.15)  # You can adjust the value as needed

# Adjust the white space (margins) around the plot
plt.tight_layout(pad=0.5)  # You can adjust the pad value as needed

torch.manual_seed(1234)

MAX_WAV_VALUE = 32768.0
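# MAX_WAV_VALUE is the full scale of 16-bit PCM audio: HiFi-GAN produces float
# samples in [-1, 1], which are multiplied by this factor before casting to int16.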
def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict
def plot_spec_align_sep(mel, align):
    plt.figure(figsize=(4, 3))

    fig_mel = plt.figure()
    ax_mel = fig_mel.add_subplot(111)
    fig_mel.tight_layout()
    ax_mel.imshow(mel)
    # fig_mel.set_title('Mel-Scale Spectrogram', fontsize=12)

    fig_align = plt.figure()
    ax_align = fig_align.add_subplot(111)  # fig_align
    fig_align.tight_layout()
    ax_align.imshow(align)
    # fig_align.set_title('Alignment', fontsize=12)

    return fig_mel, fig_align
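# Note: this plotting helper is kept for reference; the Gradio callback below
# returns the spectrogram and alignment as normalised numpy arrays rather than
# matplotlib figures.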
# load the trained Tacotron2 + GST model:
model = load_model(hparams)
checkpoint_path = "models/checkpoint_78000.model"
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
# model.to('cuda')
_ = model.eval()
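# The commented-out model.to('cuda') call can be re-enabled when a GPU is
# available; everything below assumes CPU-only inference.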
# load the pre-trained HiFi-GAN model for mel-to-audio synthesis:
hifigan_checkpoint_path = "models/generator_v1"
config_file = os.path.join(os.path.split(hifigan_checkpoint_path)[0], 'config.json')
with open(config_file) as f:
    data = f.read()

json_config = json.loads(data)
h = AttrDict(json_config)
device = torch.device("cpu")

generator = Generator(h).to(device)
state_dict_g = load_checkpoint(hifigan_checkpoint_path, device)
generator.load_state_dict(state_dict_g['generator'])
generator.eval()
generator.remove_weight_norm()
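# Both networks are used for inference only: eval() disables dropout and batch-norm
# updates, and remove_weight_norm() folds HiFi-GAN's weight normalisation into the
# convolution weights for faster synthesis.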
def synthesize(text, gst_1, gst_2, gst_3, voc):
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)

    # gst_head_scores = np.array([0.5, 0.15, 0.35])
    gst_head_scores = np.array([gst_1, gst_2, gst_3])
    gst_scores = torch.from_numpy(gst_head_scores).float()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)

    if voc == 0:
        # mel2wav inference with HiFi-GAN:
        with torch.no_grad():
            y_g_hat = generator(mel_outputs_postnet)
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            audio_numpy = audio.cpu().numpy().astype('int16')
        # audio = vocoder_model.inference(mel_outputs_postnet)
        # audio_numpy = audio.data.cpu().detach().numpy()
    else:
        # Griffin-Lim vocoder synthesis:
        griffin_iters = 60
        taco_stft = TacotronSTFT(hparams['filter_length'], hparams['hop_length'], hparams['win_length'],
                                 sampling_rate=hparams['sampling_rate'])

        mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
        mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
        spec_from_mel_scaling = 60
        spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
        spec_from_mel = spec_from_mel * spec_from_mel_scaling

        audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]), taco_stft.stft_fn, griffin_iters)
        audio = audio.squeeze()
        audio_numpy = audio.cpu().numpy()

    # prepare the plots for the output:
    mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
    mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
    alignments = alignments.squeeze().T.detach().numpy()

    # normalize the numpy arrays to [-1, 1]:
    min_val = np.min(mel_outputs_postnet)
    max_val = np.max(mel_outputs_postnet)
    scaled_mel = (mel_outputs_postnet - min_val) / (max_val - min_val)
    normalized_mel = 2 * scaled_mel - 1
    min_val = np.min(alignments)
    max_val = np.max(alignments)
    scaled_align = (alignments - min_val) / (max_val - min_val)
    normalized_align = 2 * scaled_align - 1

    aw = gr.make_waveform((22050, audio_numpy), bg_image='background_images/wallpaper_test_1_crop_3.jpg',
                          bars_color=('#f3df4b', '#63edb7'), bar_count=100, bar_width=0.7, animate=True)

    return aw, normalized_mel, normalized_align  # (22050, audio_numpy), fig_mel, fig_align
with gr.Blocks() as demo:
    gr.Markdown("<center><h1>English Neural Text-to-Speech</h1> "
                "<h2>Speech Synthesis with Partial Style Control</h2></center><br>")
    # gr.Markdown("## <center>Unsupervised Style Tokens using Single-Head Attention Parallel Encoder "
    #             "with Tacotron2</center>")
    with gr.Row():
        with gr.Column(scale=1):
            # , value="Speech synthesis has evolved dramatically since the development of neural architectures capable of generating high quality samples."
            inp = gr.Textbox(label="Input Text")
            clear_btn = gr.ClearButton(value='Clear Text', size='sm', components=[inp])
            # gr.Markdown("Next, we calibrate the weights of the style tokens:")
            with gr.Row():
                with gr.Column(scale=2):
                    with gr.Tab("Global Style Tokens"):
                        gst_1 = gr.Slider(0.2, 0.45, label="GST 1", value=0.4)
                        gst_2 = gr.Slider(0.2, 0.45, label="GST 2", value=0.26)
                        gst_3 = gr.Slider(0.2, 0.45, label="GST 3", value=0.33)
                with gr.Column(scale=0):
                    with gr.Tab("Vocoder"):
                        vocoder = gr.Radio([("HiFi-GAN", 0), ("Griffin-Lim", 1)],
                                           container=False, value=0, min_width=300)  # label="Vocoder")
            greet_btn = gr.Button("Synthesize!", scale=1)
        with gr.Column():
            with gr.Tab("Spectrogram"):
                spec_plot = gr.Image(container=False)
            with gr.Tab("Alignment"):
                align_plot = gr.Image(container=False)
            wave_video = gr.Video(label="Waveform", height=150, width=800, container=False)

            def display_video():
                return wave_video

    greet_btn.click(fn=synthesize, inputs=[inp, gst_1, gst_2, gst_3, vocoder],
                    outputs=[wave_video, spec_plot, align_plot],
                    api_name="synthesize")

    with gr.Row():
        with gr.Column():
            gr.Examples(examples=infer_from_text_examples,
                        inputs=[inp, gst_1, gst_2, gst_3, vocoder],
                        outputs=[wave_video, spec_plot, align_plot],
                        fn=synthesize,
                        cache_examples=False, )
| gr.Markdown(""" | |
| ### Details and Indications | |
| This is a Text-to-Speech (TTS) system that consists of two modules: 1) a replicated Tacotron2 model, which generates | |
| the spectrogram of the speech corresponding to the input text. And 2) a pre-trained HiFiGAN vocoder that maps | |
| spectrograms to a digital waveforms. Global Style Tokens (GST) have been implemented to catch style information from | |
| the female speaker with which the model has been trained (see the links below for more information). | |
| Please, feel free to play with the GST scores and observe how the synthetic voice spells the input text. | |
| Keep in mind that GSTs have been trained in an unsupervised way, so there is no specific control of | |
| style attributes. Moreover, try to balance the GST scores by making them add up to a value close to 1. Below or | |
| higher than 1 may cause low energy, mispronunciations or distortion. | |
| You can choose between the HiFiGAN trained vocoder and the iterative algorithm Griffin-Lim, which does not need | |
| to be trained but produces a "robotic" effect. | |
| ### More Information | |
| Spectrogram generator has been adapted and trained from the | |
| [NVIDIA's](https://github.com/NVIDIA/tacotron2) Tacotron2 replica published in | |
| <a href="https://arxiv.org/abs/1712.05884" style="display: inline-block;margin-top: .5em;margin-right: .25em;" | |
| target="_blank"> <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" | |
| src="https://img.shields.io/badge/ArXiv-Tacotron2-b31b1b" alt="Tacotron2"></a> | |
| <br> | |
| The neural vocoder is a pre-trained model replicated from <a href="https://arxiv.org/abs/2010.05646" | |
| style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom: | |
| 0em;display: inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-HiFi%20GAN-b31b1b" | |
| alt="HiFiGAN"></a> | |
| <br> | |
| Unsupervised style control has been implemented based on <a href="https://arxiv.org/abs/1803.09017" style="display: | |
| inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom: 0em;display: | |
| inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-Global%20Style%20Tokens-b31b1b" | |
| alt="Global Style Tokens"></a> | |
| <br> | |
| """) | |
demo.launch()
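# A minimal offline usage sketch (assuming the checkpoints above are in place),
# kept as a comment so that it does not run when the Space starts:
#
#     waveform_video, mel_img, align_img = synthesize("Speech synthesis is fun!",
#                                                      gst_1=0.4, gst_2=0.26, gst_3=0.33, voc=0)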