Spaces:

wetdog
/

knn-voice-conversion

Running

App Files Files Community

knn-voice-conversion / app.py

LuisVasquezBSC

Add conversion to 16k mono

a611372 verified over 1 year ago

raw

history blame

3.57 kB

	import torch
	import torchaudio
	import spaces
	from typing import List
	import soundfile as sf
	import gradio as gr
	import tempfile
	import subprocess

	def convert_to_16kHz_mono(input_file, output_file):
	    """
	    Convert an audio file to 16 kHz sample rate and a single channel (mono) using ffmpeg.

	    Parameters:
	    input_file (str): Path to the input audio file.
	    output_file (str): Path to the output WAV file (ffmpeg is run with ``-y``,
	        so an existing file at this path is overwritten).

	    Returns:
	    str | None: ``output_file`` on success; ``None`` when ffmpeg could not be
	        started or exited with a non-zero status (the error is printed).
	    """
	    command = ['ffmpeg', '-y', '-i', input_file, '-ar', '16000', '-ac', '1', output_file]
	    try:
	        # check=True turns a non-zero ffmpeg exit status into CalledProcessError
	        subprocess.run(command, check=True)
	    except (subprocess.CalledProcessError, OSError) as e:
	        # OSError covers a missing or non-executable ffmpeg binary, so that case
	        # follows the same "report and return None" path as a failed conversion.
	        print(f"An error occurred during conversion: {e}")
	        return None
	    print(f"Conversion complete: {output_file}")
	    return output_file

	def create_temp_wav_file():
	    """
	    Create an empty temporary ``.wav`` file and return its path.

	    The file is created with ``delete=False``, so it remains on disk after this
	    function returns; removing it is left to the caller.

	    Returns:
	    str: Path of the newly created temporary file.
	    """
	    # Only the path is needed, so close the handle immediately instead of
	    # leaving an open file descriptor behind until garbage collection.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
	        return temp_file.name

	# Use the GPU when torch reports CUDA as available, otherwise run on the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
	# Load the kNN-VC model from the 'bshall/knn-vc' hub repository onto that device.
	knn_vc = torch.hub.load(
	    'bshall/knn-vc',
	    'knn_vc',
	    prematched=True,
	    trust_repo=True,
	    pretrained=True,
	    device=device,
	)


	def convert_voice(src_wav_path:str, ref_wav_paths, top_k:int):
	    """
	    Convert the source utterance towards the reference utterance with kNN-VC.

	    Both inputs are first converted to 16 kHz mono WAV files with ffmpeg, then
	    the source features are matched against the reference matching set and the
	    result is written to a new temporary WAV file.

	    Parameters:
	    src_wav_path (str): Path to the source audio file.
	    ref_wav_paths (str): Path to the reference audio file (a single path,
	        despite the plural name).
	    top_k (int): Number of nearest neighbours, passed to ``knn_vc.match`` as ``topk``.

	    Returns:
	    str: Path to a temporary WAV file (16 kHz, PCM_24) with the converted audio.

	    Raises:
	    gr.Error: If an input is missing or ffmpeg could not convert it.
	    """
	    import os

	    if not src_wav_path or not ref_wav_paths:
	        raise gr.Error("Please provide both a source and a reference audio.")

	    tmp_src_wav_path = create_temp_wav_file()
	    tmp_ref_wav_path = create_temp_wav_file()
	    try:
	        # convert_to_16kHz_mono returns None on failure; stop here with a
	        # readable message instead of handing None to the model.
	        if convert_to_16kHz_mono(src_wav_path, tmp_src_wav_path) is None:
	            raise gr.Error("Could not convert the source audio to 16 kHz mono.")
	        if convert_to_16kHz_mono(ref_wav_paths, tmp_ref_wav_path) is None:
	            raise gr.Error("Could not convert the reference audio to 16 kHz mono.")

	        query_seq = knn_vc.get_features(tmp_src_wav_path)
	        matching_set = knn_vc.get_matching_set([tmp_ref_wav_path])
	        out_wav = knn_vc.match(query_seq, matching_set, topk=int(top_k))
	    finally:
	        # The resampled inputs are only needed while extracting features;
	        # remove them so they do not pile up across requests.
	        for tmp_path in (tmp_src_wav_path, tmp_ref_wav_path):
	            try:
	                os.remove(tmp_path)
	            except OSError:
	                pass

	    # The output file must outlive this call because its path is returned,
	    # so it is intentionally not deleted here.
	    converted_path = create_temp_wav_file()
	    sf.write(converted_path, out_wav, 16000, "PCM_24")

	    return converted_path


	# HTML heading for the page; rendered through gr.Markdown in the Blocks layout.
	title = """
	<div style="text-align: center; max-width: 700px; margin: 0 auto;">
	<div
	style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
	> <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
	KNN Voice Conversion
	</h1> </div>
	</div>
	"""

	# Short explanation of the method, rendered below the heading.
	description = """
	Voice Conversion With Just k-Nearest Neighbors. The source and reference utterance(s) are encoded into self-supervised features using WavLM.
	Each source feature is assigned to the mean of the k closest features from the reference.
	The resulting feature sequence is then vocoded with HiFi-GAN to arrive at the converted waveform output.
	"""

	# Citation and credits, rendered after the conversion interface.
	article = """
	If the model contributes to your research please cite the following work:

	Baas, M., van Niekerk, B., & Kamper, H. (2023). Voice conversion with just nearest neighbors. arXiv preprint arXiv:2305.18975.

	demo contributed by [@wetdog](https://github.com/wetdog)
	"""
	# Page layout: heading, description, the conversion interface, then the citation.
	demo = gr.Blocks()
	with demo:
	    gr.Markdown(title)
	    gr.Markdown(description)
	    gr.Interface(
	        fn=convert_voice,
	        inputs=[
	            # Labels tell the two otherwise identical audio inputs apart.
	            gr.Audio(type='filepath', label="Source Audio"),
	            gr.Audio(type='filepath', label="Reference Audio"),
	            # TODO: accept several reference files, e.g.
	            # gr.File(file_count="multiple", type="filepath", label="Reference Audio Files"),
	            gr.Slider(
	                3,
	                10,
	                value=4,
	                step=1,
	                label="Top-k",
	                info="These default settings provide pretty good results, but feel free to modify the kNN topk",
	            ),
	        ],
	        outputs=[gr.Audio(type='filepath')],
	        # Flagging stays disabled; "never" is the string form that replaces the
	        # deprecated boolean value.
	        allow_flagging="never",
	    )
	    gr.Markdown(article)

	# Limit the request queue to 10 entries, then serve on all interfaces at port 7860.
	demo.queue(max_size=10)
	demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)