Spaces:

ali-vilab
/

modelscope-text-to-video-synthesis

Runtime error

App Files Files Community

151

modelscope-text-to-video-synthesis / app.py

kafarasi

Update app.py

68866fa verified 7 months ago

raw

history blame

5.38 kB

	#!/usr/bin/env python

	from __future__ import annotations

	import os
	import random
	import tempfile
	import sys

	# Check critical dependencies before proceeding.
	# The third-party imports are wrapped in a try block so that a missing
	# package produces a short, readable hint and a non-zero exit status
	# rather than an unhandled ImportError.
	try:
	    import numpy as np
	    import torch
	    import gradio as gr
	    import imageio
	    from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
	except ImportError as e:
	    # Report the import that failed, then stop the process.
	    print(f"Error: Missing required dependency - {e}")
	    print("Please ensure requirements.txt includes: numpy, torch, diffusers, gradio, imageio")
	    sys.exit(1)

	# Markdown/HTML header rendered at the top of the page.
	DESCRIPTION = '''# [ModelScope Text to Video Synthesis](https://modelscope.cn/models/damo/text-to-video-synthesis/summary)
	<p>For Colab usage, you can view <a href="https://colab.research.google.com/drive/1uW1ZqswkQ9Z9bp5Nbo5z59cAn7I0hE6R?usp=sharing" style="text-decoration: underline;" target="_blank">this webpage</a>.</p>
	<p>This model can only be used for non-commercial purposes. See the <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.</p>'''

	# When the SPACE_ID environment variable is set, append a "duplicate this
	# space" badge that links back to that space.
	if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
	    DESCRIPTION += f'''\n<p>For faster inference, you may duplicate this space and upgrade to GPU.
	<a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true">
	<img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a></p>'''


	def _read_positive_int_env(name: str, default: int) -> int:
	    """Return environment variable *name* as a positive int.

	    Falls back to *default* (with a printed notice) when the variable is
	    set but is not an integer or is smaller than 1, so a typo in the
	    environment does not crash the app at import time or yield a
	    nonsensical frame limit. Returns *default* silently when unset.
	    """
	    raw = os.getenv(name)
	    if raw is None:
	        return default
	    try:
	        value = int(raw)
	    except ValueError:
	        value = 0  # Treated as invalid by the range check below.
	    if value < 1:
	        print(f"Ignoring invalid {name}={raw!r}; using default {default}")
	        return default
	    return value


	# Upper bound for the frame-count control; overridable via the environment.
	MAX_NUM_FRAMES = _read_positive_int_env('MAX_NUM_FRAMES', 64)  # Reduced from 200 for stability
	# Initial frame count: 16, or the maximum when the maximum is lower.
	DEFAULT_NUM_FRAMES = min(MAX_NUM_FRAMES, 16)

	def _load_pipeline():
	    """Build and configure the text-to-video diffusion pipeline.

	    Loads the 'damo-vilab/text-to-video-ms-1.7b' checkpoint (fp16 variant,
	    float16 dtype), replaces its scheduler with a DPMSolverMultistepScheduler
	    created from the existing scheduler's config, then turns on model CPU
	    offload and VAE slicing. Any exception propagates to the caller.
	    """
	    model = DiffusionPipeline.from_pretrained(
	        'damo-vilab/text-to-video-ms-1.7b',
	        torch_dtype=torch.float16,
	        variant='fp16',
	    )
	    model.scheduler = DPMSolverMultistepScheduler.from_config(model.scheduler.config)
	    model.enable_model_cpu_offload()
	    model.enable_vae_slicing()
	    return model


	# The pipeline is created once at import time; if that fails the app
	# cannot serve requests, so print a hint and exit with a non-zero status.
	try:
	    pipe = _load_pipeline()
	except Exception as e:
	    print(f"Failed to initialize pipeline: {e}")
	    print("This model requires significant GPU memory. Try a smaller model like 'cerspense/zeroscope_v2_576w' if needed.")
	    sys.exit(1)

	def to_video(frames: list[np.ndarray], fps: int) -> str:
	    """Encode *frames* into a temporary .mp4 file and return its path.

	    Args:
	        frames: Image arrays appended to the video in iteration order
	            (presumably HxWx3 frames from the pipeline - TODO confirm).
	        fps: Frame rate passed to the imageio FFMPEG writer.

	    Returns:
	        Path of the created .mp4 file. The file is not deleted
	        automatically; the caller owns it.

	    Raises:
	        Exception: Whatever the temp-file creation or the writer raises;
	            the error is printed, the partial file is removed, and the
	            original exception is re-raised.
	    """
	    out_path = None
	    try:
	        # Only the path is needed: close our own handle right away so the
	        # descriptor is not leaked and the writer can reopen the file.
	        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as out_file:
	            out_path = out_file.name

	        writer = imageio.get_writer(out_path, format='FFMPEG', fps=fps)
	        try:
	            for frame in frames:
	                writer.append_data(frame)
	        finally:
	            # Always release the encoder, even if a frame is rejected.
	            writer.close()
	        return out_path
	    except Exception as e:
	        print(f"Video creation failed: {e}")
	        # Do not leave a truncated video behind on failure.
	        if out_path is not None:
	            try:
	                os.remove(out_path)
	            except OSError:
	                pass
	        raise

	def generate(prompt: str, seed: int, num_frames: int, num_inference_steps: int) -> str:
	    """Generate a video from a text prompt and return the video file path.

	    Args:
	        prompt: Text description of the video; must contain non-whitespace.
	        seed: Seed for the torch generator; -1 picks a random seed in
	            [0, 1000000].
	        num_frames: Number of frames requested from the pipeline.
	        num_inference_steps: Number of denoising steps for the pipeline.

	    Returns:
	        Path of the encoded .mp4 file (8 frames per second).

	    Raises:
	        gr.Error: If the prompt is empty, the GPU runs out of memory, or
	            generation/encoding fails for any other reason.
	    """
	    # Guard against a missing value as well as an all-whitespace string.
	    if not prompt or not prompt.strip():
	        raise gr.Error("Please enter a valid prompt")

	    # UI controls may hand over float values (TODO confirm per Gradio
	    # version); manual_seed and the pipeline arguments need plain ints.
	    seed = int(seed)
	    if seed == -1:
	        seed = random.randint(0, 1000000)
	    generator = torch.Generator().manual_seed(seed)

	    try:
	        frames = pipe(
	            prompt,
	            num_inference_steps=int(num_inference_steps),
	            num_frames=int(num_frames),
	            generator=generator,
	        ).frames
	        # NOTE(review): newer diffusers releases return a batch-first array
	        # (batch, frames, H, W, C) while older ones return a flat list of
	        # frames; unwrap the single batch entry when that extra axis exists.
	        if getattr(frames, 'ndim', None) == 5:
	            frames = frames[0]
	        return to_video(frames, 8)
	    except torch.cuda.OutOfMemoryError as e:
	        raise gr.Error("Out of GPU memory - Try reducing frame count or use a smaller model") from e
	    except Exception as e:
	        # Chain explicitly so the root cause stays visible in server logs.
	        raise gr.Error(f"Generation failed: {str(e)}") from e

	# Sample inputs for the examples widget. Every row shares the same seed (0),
	# frame count (16) and step count (25); only the prompt differs.
	examples = [
	    [text, 0, 16, 25]
	    for text in (
	        'An astronaut riding a horse.',
	        'A panda eating bamboo on a rock.',
	        'Spiderman is surfing.',
	    )
	]

	# Build the web UI: prompt row, result video, advanced sliders, examples.
	with gr.Blocks(css='style.css') as demo:
	    gr.Markdown(DESCRIPTION)

	    with gr.Group():
	        # gr.Box and the .style() method were removed in Gradio 4; the row
	        # now sits directly in the group and takes equal_height as a
	        # constructor argument.
	        with gr.Row(elem_id='prompt-container', equal_height=True):
	            prompt = gr.Text(
	                label='Prompt',
	                show_label=False,
	                max_lines=1,
	                placeholder='Enter your prompt',
	                elem_id='prompt-text-input',
	            )
	            run_button = gr.Button('Generate video')

	        result = gr.Video(label='Result', show_label=False)

	        with gr.Accordion('Advanced options', open=False):
	            seed = gr.Slider(
	                label='Seed',
	                minimum=-1,
	                maximum=1000000,
	                step=1,
	                value=-1,
	                info='-1 = random seed each time',
	            )
	            num_frames = gr.Slider(
	                label='Number of frames',
	                # Keep minimum <= maximum when MAX_NUM_FRAMES is below 16.
	                minimum=min(16, MAX_NUM_FRAMES),
	                maximum=MAX_NUM_FRAMES,
	                step=1,
	                value=DEFAULT_NUM_FRAMES,
	                info='Higher values require more GPU memory',
	            )
	            num_inference_steps = gr.Slider(
	                label='Inference steps',
	                minimum=10,
	                maximum=50,
	                step=1,
	                value=25,
	            )

	    # Order must match the positional parameters of generate().
	    inputs = [prompt, seed, num_frames, num_inference_steps]

	    gr.Examples(
	        examples=examples,
	        inputs=inputs,
	        outputs=result,
	        fn=generate,
	        # Example outputs are cached only when SYSTEM is set to 'spaces'.
	        cache_examples=os.getenv('SYSTEM') == 'spaces',
	    )

	    # Pressing Enter in the prompt box and clicking the button both
	    # trigger generation.
	    prompt.submit(fn=generate, inputs=inputs, outputs=result)
	    run_button.click(fn=generate, inputs=inputs, outputs=result)

	# Queue requests (at most 10 waiting) and start the server.
	demo.queue(max_size=10).launch()