Spaces: Running on Zero
some vertical video here #9
by hannibaking - opened

app.py CHANGED
@@ -18,6 +18,10 @@ MODEL_REPO = "rain1011/pyramid-flow-sd3"
 MODEL_VARIANT = "diffusion_transformer_768p"
 MODEL_DTYPE = "bf16"
 
+# Define resolution presets
+LANDSCAPE_RESOLUTION = {"width": 1280, "height": 768}
+PORTRAIT_RESOLUTION = {"width": 768, "height": 1280}
+
 def center_crop(image, target_width, target_height):
     width, height = image.size
     aspect_ratio_target = target_width / target_height
@@ -62,13 +66,24 @@ model = load_model()
 
 # Text-to-video generation function
 @spaces.GPU(duration=140)
-def generate_video(prompt, image=None, duration=3, guidance_scale=9, video_guidance_scale=5, frames_per_second=8, progress=gr.Progress(track_tqdm=True)):
+def generate_video(prompt, image=None, orientation="landscape", duration=3, guidance_scale=9, video_guidance_scale=5, frames_per_second=8, progress=gr.Progress(track_tqdm=True)):
+    # Set width and height based on orientation
+    if orientation == "landscape":
+        width = LANDSCAPE_RESOLUTION["width"]
+        height = LANDSCAPE_RESOLUTION["height"]
+    else:  # portrait
+        width = PORTRAIT_RESOLUTION["width"]
+        height = PORTRAIT_RESOLUTION["height"]
+
     multiplier = 1.2 if is_canonical else 3.0
     temp = int(duration * multiplier) + 1
     torch_dtype = torch.bfloat16 if MODEL_DTYPE == "bf16" else torch.float32
-
-
-
+
+    if image:
+        # Process the input image according to the selected orientation
+        cropped_image = center_crop(image, width, height)
+        resized_image = cropped_image.resize((width, height))
+
     with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch_dtype):
         frames = model.generate_i2v(
             prompt=prompt,
@@ -86,14 +101,15 @@ def generate_video(prompt, image=None, duration=3, guidance_scale=9, video_guida
             prompt=prompt,
             num_inference_steps=[20, 20, 20],
             video_num_inference_steps=[10, 10, 10],
-            height=
-            width=
+            height=height,
+            width=width,
             temp=temp,
             guidance_scale=guidance_scale,
             video_guidance_scale=video_guidance_scale,
             output_type="pil",
             save_memory=True,
         )
+
     output_path = f"{str(uuid.uuid4())}_output_video.mp4"
     export_to_video(frames, output_path, fps=frames_per_second)
     return output_path
@@ -110,6 +126,11 @@ with gr.Blocks() as demo:
             i2v_image = gr.Image(type="pil", label="Input Image")
             t2v_prompt = gr.Textbox(label="Prompt")
             with gr.Accordion("Advanced settings", open=False):
+                t2v_orientation = gr.Radio(
+                    choices=["landscape", "portrait"],
+                    value="landscape",
+                    label="Video Orientation"
+                )
                 t2v_duration = gr.Slider(minimum=1, maximum=3 if is_canonical else 10, value=3 if is_canonical else 5, step=1, label="Duration (seconds)", visible=not is_canonical)
                 t2v_fps = gr.Slider(minimum=8, maximum=24, step=16, value=8 if is_canonical else 24, label="Frames per second", visible=is_canonical)
                 t2v_guidance_scale = gr.Slider(minimum=1, maximum=15, value=9, step=0.1, label="Guidance Scale")
@@ -140,7 +161,7 @@ with gr.Blocks() as demo:
     )
     t2v_generate_btn.click(
         generate_video,
-        inputs=[t2v_prompt, i2v_image, t2v_duration, t2v_guidance_scale, t2v_video_guidance_scale, t2v_fps],
+        inputs=[t2v_prompt, i2v_image, t2v_orientation, t2v_duration, t2v_guidance_scale, t2v_video_guidance_scale, t2v_fps],
         outputs=t2v_output
     )
 
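A note on `center_crop`: the diff calls it before resizing, but only its first two lines appear in the hunk context above. For readers following along, here is a minimal sketch of what a PIL center crop along these lines typically looks like; the body below is an assumption for illustration, not the Space's actual implementation.

from PIL import Image

def center_crop(image, target_width, target_height):
    # Assumed reconstruction: crop the largest centered region that
    # matches the target aspect ratio; the caller does the resize.
    width, height = image.size
    aspect_ratio_target = target_width / target_height
    aspect_ratio_image = width / height

    if aspect_ratio_image > aspect_ratio_target:
        # Image is relatively too wide: trim the left and right edges.
        new_width = int(height * aspect_ratio_target)
        left = (width - new_width) // 2
        box = (left, 0, left + new_width, height)
    else:
        # Image is relatively too tall: trim the top and bottom edges.
        new_height = int(width / aspect_ratio_target)
        top = (height - new_height) // 2
        box = (0, top, width, top + new_height)

    return image.crop(box)

# Quick check on a synthetic image (hypothetical usage): a 16:9 frame
# cropped for the 768x1280 portrait preset keeps full height.
img = Image.new("RGB", (1920, 1080))
assert center_crop(img, 768, 1280).size == (648, 1080)

Cropping to the target aspect ratio before the `resize((width, height))` call in `generate_video` is what keeps the input image from being distorted when the user switches orientation.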
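On the new orientation logic itself: the if/else at the top of `generate_video` is equivalent to a lookup keyed by the Radio's string value. A small self-contained sketch of the mapping, where `resolve_dimensions` is a hypothetical helper and not part of this PR:

LANDSCAPE_RESOLUTION = {"width": 1280, "height": 768}
PORTRAIT_RESOLUTION = {"width": 768, "height": 1280}

def resolve_dimensions(orientation):
    # Mirrors the PR's branch: anything other than "landscape"
    # falls through to the portrait preset.
    preset = LANDSCAPE_RESOLUTION if orientation == "landscape" else PORTRAIT_RESOLUTION
    return preset["width"], preset["height"]

assert resolve_dimensions("landscape") == (1280, 768)
assert resolve_dimensions("portrait") == (768, 1280)

Either form works; the dict-backed version just makes it one line to add a third preset later without touching `generate_video`.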