# Hugging Face Spaces page header (extraction residue), kept as comments:
# Spaces: Running
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree
import os
import subprocess

# Build the monotonic_align Cython extension in place before importing
# `inference`, which depends on it.
#
# The original used shell=True with a chained command string
# ("cd ...;mkdir -p ...;python setup.py ...;cd /home/user/app"); the trailing
# `cd` is a no-op (a subshell's cwd never affects this process), and an
# argument list with `cwd=` avoids shell parsing entirely.
_MONO_ALIGN_DIR = "./modules/monotonic_align"
os.makedirs(os.path.join(_MONO_ALIGN_DIR, "monotonic_align"), exist_ok=True)
subprocess.check_output(
    ["python", "setup.py", "build_ext", "--inplace"],
    cwd=_MONO_ALIGN_DIR,
    text=True,
)

import gradio as gr
import inference
# Mapping from human-readable speaker name (shown in the UI) to the
# HiFi-TTS speaker id string expected by the inference backend.
SUPPORTED_SPEAKERS = dict(
    [
        ("Cori Samuel", "hifitts_92"),
        ("Phil Benson", "hifitts_6097"),
        ("Mike Pelton", "hifitts_6670"),
        ("Tony Oliva", "hifitts_6671"),
        ("Maria Kasper", "hifitts_8051"),
        ("John Van Stan", "hifitts_9017"),
        ("Helen Taylor", "hifitts_9136"),
        ("Sylviamb", "hifitts_11614"),
        ("Celine Major", "hifitts_11697"),
        ("LikeManyWaters", "hifitts_12787"),
    ]
)
def tts_inference(
    input_text,
    target_speaker,
    duration
):
    """Run single-speaker TTS and return the path of the generated wav.

    Args:
        input_text: Text to synthesize.
        target_speaker: Display name; must be a key of SUPPORTED_SPEAKERS.
        duration: Duration-control factor (larger -> slower speech).

    Returns:
        Path to the synthesized audio file produced by `inference.main`.
    """
    # Resolve the UI display name to the backend speaker id.
    speaker_id = SUPPORTED_SPEAKERS[target_speaker]

    # CLI-style argument list consumed by the Amphion inference entry point.
    # NOTE(review): `--speaker_name_2` is deliberately passed as None here,
    # mirroring the single-speaker mode — presumably inference.main tolerates
    # it; confirm against its parser.
    cli_args = [
        "--config", "./egs/tts/vits_hifitts/exp_config.json",
        "--checkpoint_path", "./expdir/checkpoint/latest-checkpoint",
        "--speaker_name_1", speaker_id,
        "--speaker_name_2", None,
        "--text", input_text,
        "--mode", "single",
        "--duration_control", str(float(duration)),
        "--output_dir", "result",
        "--log_level", "debug",
    ]

    os.environ["WORK_DIR"] = "./"
    inference.main(cli_args)

    # Fixed output location written by inference.main in "single" mode.
    return "result/single/test_pred.wav"
def tc_inference(
    input_text,
    target_speaker_1,
    target_speaker_2,
    confusion_degree,
    duration
):
    """Run two-speaker timbre-fusion TTS and return the three result wavs.

    Args:
        input_text: Text to synthesize.
        target_speaker_1: Display name of the first speaker (SUPPORTED_SPEAKERS key).
        target_speaker_2: Display name of the second speaker, or None.
        confusion_degree: Fusion alpha in [0, 1]; higher -> closer to speaker 2.
        duration: Duration-control factor (larger -> slower speech).

    Returns:
        Tuple of paths: (speaker-1 reference wav, speaker-2 reference wav,
        interpolated result wav).
    """
    # Map UI display names to backend speaker ids; speaker 2 is optional.
    speaker_1 = SUPPORTED_SPEAKERS[target_speaker_1]
    speaker_2 = (
        SUPPORTED_SPEAKERS[target_speaker_2]
        if target_speaker_2 is not None
        else None
    )

    # CLI-style argument list consumed by the Amphion inference entry point.
    cli_args = [
        "--config", "./egs/tts/vits_hifitts/exp_config.json",
        "--checkpoint_path", "./expdir/checkpoint/latest-checkpoint",
        "--speaker_name_1", speaker_1,
        "--speaker_name_2", speaker_2,
        "--alpha", str(float(confusion_degree)),
        "--text", input_text,
        "--mode", "single",
        "--duration_control", str(float(duration)),
        "--output_dir", "result",
        "--log_level", "debug",
    ]

    os.environ["WORK_DIR"] = "./"
    inference.main(cli_args)

    # Fixed output locations written by inference.main in "single" mode.
    return (
        "result/single/s1.wav",
        "result/single/s2.wav",
        "result/single/test_pred.wav",
    )
# Section 1: TTS — widgets for the single-speaker demo.
_tts_text_input = gr.Textbox(
    label="Input Text",
    type="text",
    placeholder="Type something here..",
)
_tts_speaker_choice = gr.Radio(
    choices=list(SUPPORTED_SPEAKERS.keys()),
    label="Target Speaker",
    value="Cori Samuel",
)
_tts_rate_slider = gr.Slider(
    1,
    5,
    value=1,
    step=0.25,
    label="Speaking Rate",
    info="As the step number increases, the speaking rate will be slower.",
)

tts_demo_inputs = [_tts_text_input, _tts_speaker_choice, _tts_rate_slider]
tts_demo_output = gr.Audio(label="Generated Speech")
# Section 2: Timbre confusion — widgets for the two-speaker fusion demo.
_tc_text_input = gr.Textbox(
    label="Input Text",
    type="text",
    placeholder="Type something here..",
)
_tc_speaker_1_choice = gr.Radio(
    choices=list(SUPPORTED_SPEAKERS.keys()),
    label="Target Speaker 1",
    value="Cori Samuel",
)
_tc_speaker_2_choice = gr.Radio(
    choices=list(SUPPORTED_SPEAKERS.keys()),
    label="Target Speaker 2",
    value="Phil Benson",
)
_tc_fusion_slider = gr.Slider(
    0,
    1,
    value=0.5,
    step=0.1,
    label="Fusion Degree",
    info="As the step number increases, the generated voice will be more similar to speaker 2.",
)
_tc_rate_slider = gr.Slider(
    1,
    5,
    value=1,
    step=0.25,
    label="Speaking Rate",
    info="As the step number increases, the speaking rate will be slower.",
)

tc_demo_inputs = [
    _tc_text_input,
    _tc_speaker_1_choice,
    _tc_speaker_2_choice,
    _tc_fusion_slider,
    _tc_rate_slider,
]
tc_demo_outputs = [
    gr.Audio(label="Target Speaker 1"),
    gr.Audio(label="Target Speaker 2"),
    gr.Audio(label="Interpolated Speech"),
]
# Assemble both demo sections into one Blocks app, then queue and launch it
# (queueing serializes requests so concurrent users don't collide on the
# shared "result" output directory).
with gr.Blocks() as demo:
    gr.Interface(
        fn=tts_inference,
        inputs=tts_demo_inputs,
        outputs=tts_demo_output,
        title="Amphion Text-to-Speech",
        description="This demo offers an Amphion TTS pretrained model (VITS) for you to explore.",
    )
    gr.Interface(
        fn=tc_inference,
        inputs=tc_demo_inputs,
        outputs=tc_demo_outputs,
        title="Voice Fusion",
        description="In this section, you can choose two speakers to create a voice mix. Adjust the ‘Fusion Degree’ slider to customize your desired mix ratio between the two speakers.",
    )

demo.queue()
demo.launch()