import os, sys
import gradio as gr
from difpoint.inference import Inferencer
from TTS.api import TTS
import torch
import time
from flask import send_from_directory
from huggingface_hub import snapshot_download
import spaces
import tensorrt
import multiprocessing as mp
import pickle

# Use the 'spawn' start method so CUDA can be initialised safely in subprocesses.
mp.set_start_method('spawn', force=True)
# Fetch the KDTalker checkpoints and assets from the Hugging Face Hub.
repo_id = "ChaolongYang/KDTalker"
local_dir = "./downloaded_repo"
snapshot_download(repo_id=repo_id, local_dir=local_dir)

print("\nFiles downloaded:")
for root, dirs, files in os.walk(local_dir):
    for file in files:
        file_path = os.path.join(root, file)
        print(file_path)
result_dir = "results"

# Callbacks that record which audio source is currently selected;
# their return values are written into hidden Textbox components below.
def set_upload():
    return "upload"

def set_microphone():
    return "microphone"

def set_tts():
    return "tts"

def create_kd_talker():
    return Inferencer()
example_folder = "example"
example_choices = ["Example 1", "Example 2", "Example 3"]
example_mapping = {
    "Example 1": {"audio": os.path.join(example_folder, "example1.wav"), "image": os.path.join(example_folder, "example1.png")},
    "Example 2": {"audio": os.path.join(example_folder, "example2.wav"), "image": os.path.join(example_folder, "example2.png")},
    "Example 3": {"audio": os.path.join(example_folder, "example3.wav"), "image": os.path.join(example_folder, "example3.png")},
}
def predict(prompt, upload_reference_audio, microphone_reference_audio, reference_audio_type):
    """Synthesise speech from text, cloning the voice of the selected reference audio."""
    global result_dir
    output_file_path = os.path.join('./downloaded_repo/', 'output.wav')
    if reference_audio_type == 'upload':
        audio_file_pth = upload_reference_audio
    elif reference_audio_type == 'microphone':
        audio_file_pth = microphone_reference_audio
    tts = TTS('tts_models/multilingual/multi-dataset/your_tts')
    tts.tts_to_file(
        text=prompt,
        file_path=output_file_path,
        speaker_wav=audio_file_pth,
        language="en",
    )
    return gr.Audio(value=output_file_path, type='filepath')
def generate(upload_driven_audio, tts_driven_audio, driven_audio_type, source_image, smoothed_pitch, smoothed_yaw, smoothed_roll, smoothed_t):
    """Run KDTalker: animate the source portrait with the selected driving audio."""
    return Inferencer().generate_with_audio_img(upload_driven_audio, tts_driven_audio, driven_audio_type, source_image,
                                                smoothed_pitch, smoothed_yaw, smoothed_roll, smoothed_t)
def main():
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    with gr.Blocks(analytics_enabled=False) as interface:
        with gr.Row():
            gr.HTML(
                """
                <div align='center'>
                    <h2> Unlock Pose Diversity: Accurate and Efficient Implicit Keypoint-based Spatiotemporal Diffusion for Audio-driven Talking Portrait </h2>
                    <div style="display: flex; justify-content: center; align-items: center; gap: 20px;">
                        <img src='https://newstatic.dukekunshan.edu.cn/mainsite/2021/08/07161629/large_dku-Logo-e1649298929570.png' alt='Logo' width='150'/>
                        <img src='https://www.xjtlu.edu.cn/wp-content/uploads/2023/12/7c52fd62e9cf26cb493faa7f91c2782.png' width='250'/>
                    </div>
                </div>
                """
            )
        # Hidden state tracking which audio source drives generation / voice cloning.
        driven_audio_type = gr.Textbox(value="upload", visible=False)
        reference_audio_type = gr.Textbox(value="upload", visible=False)
        with gr.Row():
            with gr.Column(variant="panel"):
                with gr.Tabs(elem_id="kdtalker_source_image"):
                    with gr.TabItem("Upload image"):
                        source_image = gr.Image(label="Source image", sources="upload", type="filepath", scale=256)
                with gr.Tabs(elem_id="kdtalker_driven_audio"):
                    with gr.TabItem("Upload"):
                        upload_driven_audio = gr.Audio(label="Upload audio", sources="upload", type="filepath")
                        upload_driven_audio.change(set_upload, outputs=driven_audio_type)
                    with gr.TabItem("TTS"):
                        upload_reference_audio = gr.Audio(label="Upload Reference Audio", sources="upload", type="filepath")
                        upload_reference_audio.change(set_upload, outputs=reference_audio_type)
                        microphone_reference_audio = gr.Audio(label="Recorded Reference Audio", sources="microphone", type="filepath")
                        microphone_reference_audio.change(set_microphone, outputs=reference_audio_type)
                        input_text = gr.Textbox(
                            label="Generating audio from text",
                            lines=5,
                            placeholder="Please enter some text here; the audio is generated from it using @Coqui.ai TTS."
                        )
                        tts_button = gr.Button("Generate audio", elem_id="kdtalker_audio_generate", variant="primary")
                        tts_driven_audio = gr.Audio(label="Synthesised Audio", type="filepath")
                        tts_button.click(fn=predict, inputs=[input_text, upload_reference_audio, microphone_reference_audio, reference_audio_type], outputs=[tts_driven_audio])
                        tts_button.click(set_tts, outputs=driven_audio_type)
            with gr.Column(variant="panel"):
                gen_video = gr.Video(label="Generated video", format="mp4", width=256)
                with gr.Tabs(elem_id="talker_checkbox"):
                    with gr.TabItem("KDTalker"):
                        smoothed_pitch = gr.Slider(minimum=0, maximum=1, step=0.1, label="Pitch", value=0.8)
                        smoothed_yaw = gr.Slider(minimum=0, maximum=1, step=0.1, label="Yaw", value=0.8)
                        smoothed_roll = gr.Slider(minimum=0, maximum=1, step=0.1, label="Roll", value=0.8)
                        smoothed_t = gr.Slider(minimum=0, maximum=1, step=0.1, label="T", value=0.8)
                        kd_submit = gr.Button("Generate", elem_id="kdtalker_generate", variant="primary")
                        kd_submit.click(
                            fn=generate,
                            inputs=[
                                upload_driven_audio, tts_driven_audio, driven_audio_type, source_image,
                                smoothed_pitch, smoothed_yaw, smoothed_roll, smoothed_t
                            ],
                            outputs=[gen_video]
                        )
                    with gr.TabItem("Example"):
                        example_choice = gr.Dropdown(choices=example_choices, label="Choose an example")

                        def load_example(choice):
                            example = example_mapping.get(choice, {})
                            audio_path = example.get("audio", "")
                            image_path = example.get("image", "")
                            return [audio_path, image_path]

                        example_choice.change(
                            fn=load_example,
                            inputs=[example_choice],
                            outputs=[upload_driven_audio, source_image]
                        )
                        example_choice.change(set_upload, outputs=driven_audio_type)
    return interface


demo = main()
demo.queue().launch(share=True)
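
The same pipeline can also be exercised without the web UI, which is handy for smoke-testing the downloaded weights. The sketch below reuses generate() and the bundled example assets defined above; it assumes those example files exist and that generate() returns something the gr.Video component can display (e.g. a path to an mp4), and it would be run in place of demo.launch(), not after it.

# Headless smoke test (a sketch, not part of the Space app).
example = example_mapping["Example 1"]
video = generate(
    example["audio"],    # uploaded-style driving audio
    None,                # no TTS-generated audio in this path
    "upload",            # driven_audio_type flag
    example["image"],    # source portrait
    0.8, 0.8, 0.8, 0.8,  # pitch / yaw / roll / t smoothing factors
)
print("Generated:", video)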