Spaces:
Running
Running
| import torch | |
| import torchaudio | |
| import spaces | |
| from typing import List | |
| import soundfile as sf | |
| import gradio as gr | |
| import tempfile | |
| import subprocess | |
def convert_to_16kHz_mono(input_file, output_file):
    """
    Resample an audio file to 16 kHz, single channel, by invoking ffmpeg.

    Parameters:
    input_file (str): Path to the audio file to read.
    output_file (str): Path the converted audio is written to. An existing
        file at this path is overwritten (ffmpeg is run with ``-y``).

    Returns:
    ``output_file`` when ffmpeg exits successfully; ``None`` when ffmpeg
    exits with a non-zero status (the error is printed, not raised).
    """
    ffmpeg_cmd = [
        'ffmpeg', '-y',
        '-i', input_file,
        '-ar', '16000',   # output sample rate
        '-ac', '1',       # output channel count
        output_file,
    ]
    try:
        subprocess.run(ffmpeg_cmd, check=True)
    except subprocess.CalledProcessError as err:
        print(f"An error occurred during conversion: {err}")
        return None
    print(f"Conversion complete: {output_file}")
    return output_file
def create_temp_wav_file() -> str:
    """
    Create an empty temporary ``.wav`` file and return its path.

    The file is created with ``delete=False``, so it stays on disk after this
    function returns; the caller is responsible for removing it.

    Returns:
    str: Path of the newly created temporary file.
    """
    # Close the handle before returning: only the path is needed, and leaving
    # the handle open would leak one file descriptor per call.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        return temp_file.name
# Run on the GPU when torch reports one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the kNN-VC model from the bshall/knn-vc torch.hub repository onto
# `device`. This happens once, at import time.
knn_vc = torch.hub.load(
    'bshall/knn-vc',
    'knn_vc',
    prematched=True,
    trust_repo=True,
    pretrained=True,
    device=device,
)
def convert_voice(src_wav_path: str, ref_wav_paths, top_k: int):
    """
    Convert the source utterance towards the reference voice with kNN-VC.

    Parameters:
    src_wav_path (str): Path to the source audio file.
    ref_wav_paths (str): Path to ONE reference audio file. Despite the plural
        name, a single path is expected; it is wrapped in a one-element list
        for the matching set.
    top_k (int): Passed to ``knn_vc.match`` as ``topk`` after an ``int`` cast.

    Returns:
    str: Path to a newly created WAV file (written at 16000 Hz, subtype
    PCM_24) containing the converted audio. The file is not deleted here so
    that the caller can read it.

    Raises:
    gr.Error: If either input is missing, or if ffmpeg could not convert one
        of the inputs.
    """
    if not src_wav_path or not ref_wav_paths:
        raise gr.Error("Please provide both a source and a reference audio clip.")

    # The 16 kHz mono intermediates are only needed while features are being
    # extracted, so keep them in a directory that is removed automatically,
    # even when an error is raised.
    with tempfile.TemporaryDirectory() as work_dir:
        src_16k = convert_to_16kHz_mono(src_wav_path, f"{work_dir}/source.wav")
        ref_16k = convert_to_16kHz_mono(ref_wav_paths, f"{work_dir}/reference.wav")
        # convert_to_16kHz_mono returns None when ffmpeg fails; stop here with
        # a readable message instead of handing None to the model.
        if src_16k is None or ref_16k is None:
            raise gr.Error("Could not decode the uploaded audio; please try a different file.")

        query_seq = knn_vc.get_features(src_16k)
        matching_set = knn_vc.get_matching_set([ref_16k])

    out_wav = knn_vc.match(query_seq, matching_set, topk=int(top_k))

    # Reserve an output path, then write after the handle is closed so the
    # file is not opened twice at the same time.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as converted_file:
        converted_path = converted_file.name
    sf.write(converted_path, out_wav, 16000, "PCM_24")
    return converted_path
# HTML heading for the demo page; passed to gr.Markdown below.
title = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
> <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
KNN Voice Conversion
</h1> </div>
</div>
"""
# Short explanation of the method, shown under the heading.
description = """
Voice Conversion With Just k-Nearest Neighbors. The source and reference utterance(s) are encoded into self-supervised features using WavLM.
Each source feature is assigned to the mean of the k closest features from the reference.
The resulting feature sequence is then vocoded with HiFi-GAN to arrive at the converted waveform output.
"""
# Citation and credits, shown below the conversion interface.
article = """
If the model contributes to your research please cite the following work:
Baas, M., van Niekerk, B., & Kamper, H. (2023). Voice conversion with just nearest neighbors. arXiv preprint arXiv:2305.18975.
demo contributed by [@wetdog](https://github.com/wetdog)
"""
# Page layout: heading, description, the conversion interface, then credits.
demo = gr.Blocks()
with demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Interface(
        fn=convert_voice,
        # Inputs are passed positionally to convert_voice:
        # source path, reference path, top-k.
        inputs=[
            gr.Audio(type='filepath', label="Source audio"),
            # NOTE: only a single reference clip is supported; convert_voice
            # builds its matching set from this one file.
            gr.Audio(type='filepath', label="Reference audio"),
            gr.Slider(
                3,
                10,
                value=4,
                step=1,
                label="Top-k",
                info="These default settings provide pretty good results, but feel free to modify the kNN topk",
            ),
        ],
        outputs=[gr.Audio(type='filepath')],
        # Gradio expects one of "never" / "auto" / "manual" here, not a bool.
        allow_flagging="never",
    )
    gr.Markdown(article)

# Allow at most 10 requests to wait in the queue.
demo.queue(max_size=10)
# Listen on all interfaces on port 7860, with the API page hidden.
demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)