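# Gradio demo for STAR (speech-to-audio generation): loads the STAR model and
# a HuBERT feature extractor from the Hugging Face Hub and exposes an online
# inference UI.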
import gradio as gr
from pathlib import Path
import soundfile as sf
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Patch torch.load so every checkpoint loaded downstream is mapped onto the
# detected device (CPU when no GPU is available) instead of failing on
# checkpoints that were saved from CUDA.
_old_load = torch.load

def safe_torch_load(*args, **kwargs):
    args = list(args)
    if len(args) >= 2:
        # torch.load(f, map_location, ...): second positional arg is map_location
        args[1] = device
    else:
        kwargs['map_location'] = device
    return _old_load(*args, **kwargs)

torch.load = safe_torch_load

import torchaudio
import hydra
from omegaconf import OmegaConf
import diffusers.schedulers as noise_schedulers
from utils.config import register_omegaconf_resolvers
from models.common import LoadPretrainedBase
from huggingface_hub import hf_hub_download
import fairseq

register_omegaconf_resolvers()
config = OmegaConf.load("configs/infer.yaml")
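
# Download the STAR checkpoint from the Hub and instantiate the model from the
# Hydra experiment config.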
ckpt_path = hf_hub_download(
    repo_id="assasinatee/STAR",
    filename="model.safetensors",
    repo_type="model",
    force_download=False,
)
exp_config = OmegaConf.load("configs/config.yaml")
if "pretrained_ckpt" in exp_config["model"]:
    exp_config["model"]["pretrained_ckpt"] = ckpt_path
model: LoadPretrainedBase = hydra.utils.instantiate(exp_config["model"])
model = model.to(device)
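
# Fetch the pretrained HuBERT checkpoint and load it with fairseq; it is used
# to extract content features from the input speech.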
ckpt_path = hf_hub_download(
    repo_id="assasinatee/STAR",
    filename="hubert_large_ll60k.pt",
    repo_type="model",
    force_download=False,
)
hubert_models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
hubert_model = hubert_models[0].eval().to(device)
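
# Build the diffusion noise scheduler class named in the infer config and load
# its settings from the pretrained pipeline's "scheduler" subfolder.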
scheduler = getattr(
    noise_schedulers,
    config["noise_scheduler"]["type"],
).from_pretrained(
    config["noise_scheduler"]["name"],
    subfolder="scheduler",
)


def infer(audio_path: str) -> str:
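    """Generate an audio scene from a spoken description.

    The input is resampled to 16 kHz mono, encoded with HuBERT, and passed
    through the diffusion sampler; returns the path of the generated wav file.
    """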
    waveform_tts, sample_rate = torchaudio.load(audio_path)
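    # HuBERT expects 16 kHz mono input: resample and downmix if necessary.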
    if sample_rate != 16000:
        waveform_tts = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform_tts)
    if waveform_tts.shape[0] > 1:
        waveform_tts = torch.mean(waveform_tts, dim=0, keepdim=True)
    with torch.no_grad():
        features, _ = hubert_model.extract_features(waveform_tts.to(device))
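    # Assemble inference arguments from the config; the HuBERT features act as
    # the content conditioning for the speech_to_audio task.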
    kwargs = OmegaConf.to_container(config["infer_args"].copy(), resolve=True)
    kwargs['content'] = [features]
    kwargs['condition'] = None
    kwargs['task'] = ["speech_to_audio"]
    model.eval()
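    # Run the diffusion sampler to synthesize the waveform.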
    waveform = model.inference(
        scheduler=scheduler,
        **kwargs,
    )
    output_file = "output_audio.wav"
    sf.write(output_file, waveform.squeeze().cpu().numpy(), samplerate=exp_config["sample_rate"])
    return output_file
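

# Gradio UI: model description, input/output examples, and the inference widget.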
with gr.Blocks(title="STAR Online Inference", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# STAR: Speech-to-Audio Generation via Representation Learning")
| gr.Markdown(""" | |
| <div style="text-align: left; padding: 10px;"> | |
| ## 📚️ Introduction | |
| STAR is the first end-to-end speech-to-audio generation framework, designed to enhance efficiency and address error propagation inherent in cascaded systems. | |
| Within this space, you have the opportunity to directly control our model through voice input, thereby generating the corresponding audio output. | |
| ## 🗣️ Input | |
| A brief input speech utterance for the overall audio scene. | |
| > Example:A cat meowing and young female speaking | |
| ### 🎙️ Input Speech Example | |
| """) | |
    speech = gr.Audio(value="wav/speech.wav", label="Input Speech Example", type="filepath")
| gr.Markdown(""" | |
| <div style="text-align: left; padding: 10px;"> | |
| ## 🎧️ Output | |
| Capture both auditory events and scene cues and generate corresponding audio | |
| ### 🔊 Output Audio Example | |
| """) | |
| audio = gr.Audio(value="wav/audio.wav", label="Generated Audio Example", type="filepath") | |
| gr.Markdown(""" | |
| <div style="text-align: left; padding: 10px;"> | |
| </div> | |
| --- | |
| </div> | |
| ## 🛠️ Online Inference | |
| You can upload your own samples, or try the quick examples provided below. | |
| """) | |
    with gr.Column():
        input_audio = gr.Audio(label="🗣️ Speech Input", type="filepath")
        btn = gr.Button("🎵 Generate Audio!", variant="primary")
        output_audio = gr.Audio(label="🎧️ Generated Audio", type="filepath")
    btn.click(fn=infer, inputs=input_audio, outputs=output_audio)
| gr.Markdown(""" | |
| <div style="text-align: left; padding: 10px;"> | |
| ## 🎯 Quick Examples | |
| """) | |
| display_caption = gr.Textbox(label="📝 Caption" ,visible=False) | |
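    # display_caption stays hidden; it receives the reference caption (the
    # second column of each example list below) when an example is clicked.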
    with gr.Tabs():
        with gr.Tab("VITS Generated Speech"):
            gr.Examples(
                examples=[
                    ["wav/vits/1.wav", "A cat meowing and young female speaking"],
                    ["wav/vits/2.wav", "Sustained industrial engine noise"],
                    ["wav/vits/3.wav", "A woman talks and a baby whispers"],
                    ["wav/vits/4.wav", "A man speaks followed by a toilet flush"],
                    ["wav/vits/5.wav", "It is raining and thundering, and then a man speaks"],
                    ["wav/vits/6.wav", "A man speaking as birds are chirping"],
                    ["wav/vits/7.wav", "A muffled man talking as a goat baas before and after two goats baaing in the distance while wind blows into a microphone"],
                    ["wav/vits/8.wav", "Birds chirping and a horse neighing"],
                    ["wav/vits/9.wav", "Several church bells ringing"],
                    ["wav/vits/10.wav", "A telephone rings with bell sounds"],
                ],
                inputs=[input_audio, display_caption],
                label="Click examples below to try!",
                cache_examples=False,
                examples_per_page=10,
            )
| with gr.Tab("Real Human Speech"): | |
| gr.Examples( | |
| examples=[ | |
| ["wav/human/1.wav", "A cat meowing and young female speaking"], | |
| ["wav/human/2.wav", "Sustained industrial engine noise"], | |
| ["wav/human/3.wav", "A woman talks and a baby whispers"], | |
| ["wav/human/4.wav", "A man speaks followed by a toilet flush"], | |
| ["wav/human/5.wav", "It is raining and thundering, and then a man speaks"], | |
| ["wav/human/6.wav", "A man speaking as birds are chirping"], | |
| ["wav/human/7.wav", "A muffled man talking as a goat baas before and after two goats baaing in the distance while wind blows into a microphone"], | |
| ["wav/human/8.wav", "Birds chirping and a horse neighing"], | |
| ["wav/human/9.wav", "Several church bells ringing"], | |
| ["wav/human/10.wav", "A telephone rings with bell sounds"] | |
| ], | |
| inputs=[input_audio, display_caption], | |
| label="Click examples below to try!", | |
| cache_examples = False, | |
| examples_per_page = 10, | |
| ) | |

demo.launch()