import spaces
import gradio as gr
import sys
import os
import subprocess
import uuid
import shutil
from huggingface_hub import snapshot_download, list_repo_files, hf_hub_download
import importlib, site

# Re-discover all .pth/.egg-link files
for sitedir in site.getsitepackages():
    site.addsitedir(sitedir)

# Clear caches so importlib will pick up new modules
importlib.invalidate_caches()

def sh(cmd):
    subprocess.check_call(cmd, shell=True)
flash_attention_installed = False

try:
    print("Attempting to download and install FlashAttention wheel...")
    flash_attention_wheel = hf_hub_download(
        repo_id="alexnasa/flash-attn-3",
        repo_type="model",
        filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
    )
    sh(f"pip install {flash_attention_wheel}")
    # sh("pip install flash-attn")

    # Tell Python to re-scan site-packages now that the wheel is installed.
    import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches()
    flash_attention_installed = True
except Exception as e:
    print(f"⚠️ Could not install FlashAttention: {e}")
    print("Continuing without FlashAttention...")
try:
    print("Attempting to download and install Transformer Engine wheel...")
    te_wheel = hf_hub_download(
        repo_id="alexnasa/transformer_engine_wheels",
        repo_type="model",
        filename="transformer_engine-2.5.0+f05f12c9-cp310-cp310-linux_x86_64.whl",
    )
    sh(f"pip install {te_wheel}")

    # Tell Python to re-scan site-packages now that the wheel is installed.
    import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches()
except Exception as e:
    print(f"⚠️ Could not install Transformer Engine: {e}")
    print("Continuing without Transformer Engine...")
import torch

print(f"Torch version: {torch.__version__}")
print(f"FlashAttention available: {flash_attention_installed}")

import tempfile
from pathlib import Path
from torch._inductor.runtime.runtime_utils import cache_dir as _inductor_cache_dir
from huggingface_hub import HfApi

snapshot_download(repo_id="bytedance-research/HuMo", local_dir="./weights/HuMo")
snapshot_download(repo_id="Wan-AI/Wan2.1-T2V-1.3B", local_dir="./weights/Wan2.1-T2V-1.3B")
snapshot_download(repo_id="openai/whisper-large-v3", local_dir="./weights/whisper-large-v3")
| os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/proprocess_results" | |
| path_to_insert = "humo" | |
| if path_to_insert not in sys.path: | |
| sys.path.insert(0, path_to_insert) | |
| from common.config import load_config, create_object | |
| config = load_config( | |
| "./humo/configs/inference/generate.yaml", | |
| [ | |
| "dit.sp_size=1", | |
| "generation.frames=97", | |
| "generation.scale_t=5.5", | |
| "generation.scale_a=5.0", | |
| "generation.mode=TIA", | |
| "generation.height=480", | |
| "generation.width=832", | |
| ], | |
| ) | |
| runner = create_object(config) | |
os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", f"{os.getcwd()}/torchinductor_space")  # or another writable path

def restore_inductor_cache_from_hub(repo_id: str, filename: str = "torch_compile_cache.zip",
                                    path_in_repo: str = "inductor_cache", repo_type: str = "model",
                                    hf_token: str | None = None):
    cache_root = Path(_inductor_cache_dir()).resolve()
    cache_root.mkdir(parents=True, exist_ok=True)
    zip_path = hf_hub_download(repo_id=repo_id, filename=f"{path_in_repo}/{filename}",
                               repo_type=repo_type, token=hf_token)
    shutil.unpack_archive(zip_path, extract_dir=str(cache_root))
    print(f"✓ Restored cache into {cache_root}")

# restore_inductor_cache_from_hub("alexnasa/humo-compiled")
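# Restoring a previously uploaded cache lets torch.compile reuse compiled kernels
# across ZeroGPU cold starts instead of recompiling from scratch; it pairs with
# upload_inductor_cache_to_hub() further down, which pushes the same zip archive.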
def get_duration(prompt_text, steps, image_paths, audio_file_path, max_duration, session_id):
    return calculate_required_time(steps, max_duration)

def calculate_required_time(steps, max_duration):
    warmup_s = 60
    # Approximate seconds per diffusion step for each "max duration" setting.
    step_seconds_by_duration = {
        1: 8,
        2: 8,
        3: 12,
        4: 20,
    }
    each_step_s = step_seconds_by_duration[max_duration]
    duration_s = (each_step_s * steps) + warmup_s
    print(f"estimated duration: {duration_s}s")
    return int(duration_s)
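# Worked example: with the UI defaults below (steps=10, max_duration=3) the estimate
# is 12 s/step * 10 steps + 60 s warmup = 180 s, i.e. "~180s (3.0 mins)" in the UI.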
def get_required_time_string(steps, max_duration):
    duration_s = calculate_required_time(steps, max_duration)
    duration_m = duration_s / 60
    return f"<center>⌚ Zero GPU Required: ~{duration_s}s ({duration_m:.1f} mins)</center>"

def update_required_time(steps, max_duration):
    return get_required_time_string(steps, max_duration)
# ZeroGPU calls get_duration() with the same arguments to size the GPU reservation.
@spaces.GPU(duration=get_duration)
def generate_scene(prompt_text, steps, image_paths, audio_file_path, max_duration=3, session_id=None):
    prompt_text_check = (prompt_text or "").strip()
    if not prompt_text_check:
        raise gr.Error("Please enter a prompt.")

    if not audio_file_path and not image_paths:
        raise gr.Error("Please provide a reference image or a lipsync audio.")

    return run_pipeline(prompt_text, steps, image_paths, audio_file_path, max_duration, session_id)
def upload_inductor_cache_to_hub(
    repo_id: str,
    path_in_repo: str = "inductor_cache",
    repo_type: str = "model",  # or "dataset" if you prefer
    hf_token: str | None = None,
):
    """
    Zips the current TorchInductor cache and uploads it to the given repo path.
    Assumes the model was already run once with torch.compile() so the cache exists.
    """
    cache_dir = Path(_inductor_cache_dir()).resolve()
    if not cache_dir.exists():
        raise FileNotFoundError(f"TorchInductor cache not found at {cache_dir}. "
                                "Run a compiled model once to populate it.")

    # Create a zip archive of the entire cache directory.
    with tempfile.TemporaryDirectory() as tmpdir:
        archive_base = Path(tmpdir) / "torch_compile_cache"
        archive_path = shutil.make_archive(str(archive_base), "zip", root_dir=str(cache_dir))
        archive_path = Path(archive_path)

        # Upload to the Hub.
        api = HfApi(token=hf_token)
        api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)

        # Put the zip under path_in_repo.
        dest_path = f"{path_in_repo}/{archive_path.name}"
        api.upload_file(
            path_or_fileobj=str(archive_path),
            path_in_repo=dest_path,
            repo_id=repo_id,
            repo_type=repo_type,
        )

        # Upload a small metadata stamp for traceability (optional but handy).
        meta_txt = (
            f"pytorch={torch.__version__}\n"
            f"inductor_cache_dir={cache_dir}\n"
            f"cuda_available={torch.cuda.is_available()}\n"
            f"cuda_device={torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'}\n"
        )
        api.upload_file(
            path_or_fileobj=meta_txt.encode(),
            path_in_repo=f"{path_in_repo}/INDUCTOR_CACHE_METADATA.txt",
            repo_id=repo_id,
            repo_type=repo_type,
        )

    print("✔ Uploaded TorchInductor cache to the Hub.")
def run_pipeline(prompt_text, steps, image_paths, audio_file_path, max_duration=3, session_id=None):
    if session_id is None:
        session_id = uuid.uuid4().hex

    inference_mode = "TIA"

    # Validate inputs
    prompt_text = (prompt_text or "").strip()
    if not prompt_text:
        raise gr.Error("Please enter a prompt.")

    if not audio_file_path and not image_paths:
        raise gr.Error("Please provide a reference image or a lipsync audio.")

    if not audio_file_path:
        inference_mode = "TI"
        audio_path = None
    else:
        audio_path = audio_file_path if isinstance(audio_file_path, str) else getattr(audio_file_path, "name", str(audio_file_path))

    if not image_paths:
        inference_mode = "TA"
        img_paths = None
    else:
        # Gallery items arrive as (path, caption) tuples; keep only the paths.
        img_paths = [image_data[0] for image_data in image_paths]

    # Prepare output
    output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
    os.makedirs(output_dir, exist_ok=True)

    # Random filename
    filename = f"gen_{uuid.uuid4().hex[:10]}"

    width, height = 832, 480
    # Frame counts for each "max duration" setting.
    duration_frame_mapping = {
        1: 25,
        2: 45,
        3: 70,
        4: 97,
    }
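    # The 4 -> 97-frame mapping implies roughly 24-25 fps output (97 frames / 4 s),
    # so the slider value approximates clip length in seconds; this is inferred from
    # the mapping above rather than from a documented model constant.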
    # Run inference
    runner.inference_loop(
        prompt_text,
        img_paths,
        audio_path,
        output_dir,
        filename,
        inference_mode,
        width,
        height,
        steps,
        frames=int(duration_frame_mapping[max_duration]),
        tea_cache_l1_thresh=0.0,
    )

    # Return resulting video path
    video_path = os.path.join(output_dir, f"{filename}.mp4")
    if os.path.exists(video_path):
        # upload_inductor_cache_to_hub("alexnasa/humo-compiled")
        return video_path
    else:
        candidates = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.endswith(".mp4")]
        if candidates:
            return max(candidates, key=lambda p: os.path.getmtime(p))
        return None
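# Example invocation outside the UI (paths are illustrative placeholders; image_paths
# uses Gallery-style (path, caption) tuples, matching the [0] indexing above):
#   run_pipeline("A person speaking to camera", steps=10,
#                image_paths=[("./examples/naomi.png", None)],
#                audio_file_path="./examples/science.wav", max_duration=3)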
css = """
#col-container {
    margin: 0 auto;
    width: 100%;
    max-width: 720px;
}
"""

def cleanup(request: gr.Request):
    sid = request.session_hash
    if sid:
        d1 = os.path.join(os.environ["PROCESSED_RESULTS"], sid)
        shutil.rmtree(d1, ignore_errors=True)

def start_session(request: gr.Request):
    return request.session_hash
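# Session lifecycle: demo.load feeds each tab's session_hash into session_state at
# page load, and demo.unload (registered under __main__ below) removes that
# session's scratch directory when the tab closes.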
with gr.Blocks(css=css) as demo:
    session_state = gr.State()
    demo.load(start_session, outputs=[session_state])

    with gr.Sidebar(width=400):
        gr.HTML(
            """
            <div style="text-align: center;">
                <p style="font-size:16px; display: inline; margin: 0;">
                    <strong>HuMo</strong> – Human-Centric Video Generation via Collaborative Multi-Modal Conditioning
                </p>
                <a href="https://github.com/Phantom-video/HuMo" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
                    [Github]
                </a>
            </div>
            """
        )
        gr.Markdown("**REFERENCE IMAGES**")
        img_input = gr.Gallery(
            show_label=False,
            interactive=True,
            rows=1, columns=3, object_fit="contain", height=280,
            file_types=['image'],
        )
        gr.Markdown("**LIPSYNC AUDIO**")
        audio_input = gr.Audio(
            sources=["upload"],
            show_label=False,
            type="filepath",
        )
        gr.Markdown("**SETTINGS**")
        default_steps = 10
        default_max_duration = 3
        max_duration = gr.Slider(minimum=2, maximum=4, value=default_max_duration, step=1, label="Max Duration (s)")
        steps_input = gr.Slider(minimum=10, maximum=50, value=default_steps, step=5, label="Diffusion Steps")
    with gr.Column(elem_id="col-container"):
        gr.HTML(
            """
            <div style="text-align: center;">
                <strong>HF Space by:</strong>
                <a href="https://twitter.com/alexandernasa/" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
                    <img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow Me" alt="Follow on Twitter">
                </a>
            </div>
            """
        )
        video_output = gr.Video(show_label=False)
        gr.Markdown("<center><h2>PROMPT</h2></center>")
        prompt_tb = gr.Textbox(
            show_label=False,
            lines=5,
            placeholder="Describe the scene and the person talking...",
        )
        gr.Markdown("")
        time_required = gr.Markdown(get_required_time_string(default_steps, default_max_duration))
        run_btn = gr.Button("🎬 Action", variant="primary")
    gr.Examples(
        examples=[
            [
                "A handheld tracking shot follows a female through a science lab. Her determined eyes are locked straight ahead. The clip is black-and-white and patchy as she explains something to someone standing opposite her.",
                10,
                ["./examples/naomi.png"],
                "./examples/science.wav",
                3,
            ],
            [
                "A reddish-brown haired woman sits pensively against swirling blue-and-white brushstrokes, dressed in a blue coat and dark waistcoat. The artistic backdrop and her thoughtful pose evoke a Post-Impressionist style in a studio-like setting.",
                10,
                ["./examples/art.png"],
                "./examples/art.wav",
                2,
            ],
            [
                "A handheld tracking shot follows a female warrior walking through a cave. Her determined eyes are locked straight ahead as she grips a blazing torch tightly in her hand. She speaks with intensity.",
                10,
                ["./examples/naomi.png"],
                "./examples/dream.mp3",
                4,
            ],
            [
                "A woman with long, wavy dark hair looks at a person sitting opposite her while holding a book, wearing a leather jacket in the semi-purple color seen in the photo. Warm, window-like light bathes her figure, highlighting the outfit's elegant design and her graceful movements.",
                40,
                ["./examples/amber.png", "./examples/jacket.png"],
                "./examples/fictional.wav",
                4,
            ],
        ],
        inputs=[prompt_tb, steps_input, img_input, audio_input, max_duration],
        outputs=[video_output],
        fn=run_pipeline,
        cache_examples=True,
    )
    max_duration.change(update_required_time, [steps_input, max_duration], time_required)
    steps_input.change(update_required_time, [steps_input, max_duration], time_required)

    run_btn.click(
        fn=generate_scene,
        inputs=[prompt_tb, steps_input, img_input, audio_input, max_duration, session_state],
        outputs=[video_output],
    )

if __name__ == "__main__":
    demo.unload(cleanup)
    demo.queue()
    demo.launch(ssr_mode=False)