Spaces:

TIGER-Lab
/

AnyV2V

Running on Zero

App Files Files Community

vinesmsuic committed on Apr 3, 2024

Commit

26378e3

1 Parent(s): 15186bb

update

Browse files

Files changed (2) hide show

app.py +182 -191
gradio_demo.py +182 -191

app.py CHANGED Viewed

@@ -30,7 +30,7 @@ import imageio
 DEBUG_MODE = False
 demo_examples = [
-                    ["./demo/Man Walking.mp4", "./demo/Man Walking/edited_first_frame/turn the man into darth vader.png", "darth vader walking", 0.1, 0.1, 1.0],
                     ["./demo/A kitten turning its head on a wooden floor.mp4", "./demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png", "A dog turning its head on a wooden floor", 0.2, 0.2, 0.5],
                     ["./demo/An Old Man Doing Exercises For The Body And Mind.mp4", "./demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png", "a man doing exercises for the body and mind", 0.8, 0.8, 1.0],
                     ["./demo/Ballet.mp4", "./demo/Ballet/edited_first_frame/van gogh style.png", "girl dancing ballet, in the style of van gogh", 1.0, 1.0, 1.0],
@@ -39,198 +39,189 @@ demo_examples = [
 TEMP_DIR = "_demo_temp"
-class ImageEditor:
-    def __init__(self) -> None:
-        self.image_edit_model = InstructPix2Pix()
-    @torch.no_grad()
-    @spaces.GPU(duration=30)
-    def perform_edit(self, video_path, prompt, force_512=False, seed=42, negative_prompt=""):
-        edited_image_path = infer_video(self.image_edit_model,
                     video_path,
-                    output_dir=TEMP_DIR,
-                    prompt=prompt,
-                    prompt_type="instruct",
-                    force_512=force_512,
-                    seed=seed,
-                    negative_prompt=negative_prompt,
-                    overwrite=True)
-        return edited_image_path
-class AnyV2V_I2VGenXL:
-    def __init__(self) -> None:
-        # Set up default inversion config file
-        config = {
-            # DDIM inversion
-            "inverse_config": {
-                "image_size": [512, 512],
-                "n_frames": 16,
-                "cfg": 1.0,
-                "target_fps": 8,
-                "ddim_inv_prompt": "",
-                "prompt": "",
-                "negative_prompt": "",
-            },
-            "pnp_config": {
-                "random_ratio": 0.0,
-                "target_fps": 8,
-            },
-        }
-        self.config = OmegaConf.create(config)
-    @torch.no_grad()
-    @spaces.GPU(duration=150)
-    def perform_anyv2v(self,
-                       video_path,
-                       video_prompt,
-                       video_negative_prompt,
-                       edited_first_frame_path,
-                       conv_inj,
-                       spatial_inj,
-                       temp_inj,
-                       num_inference_steps,
-                       guidance_scale,
-                       ddim_init_latents_t_idx,
-                       ddim_inversion_steps,
-                       seed,
-                       ):
-        # Initialize the I2VGenXL pipeline
-        self.pipe = I2VGenXLPipeline.from_pretrained(
-            "ali-vilab/i2vgen-xl",
-            torch_dtype=torch.float16,
-            variant="fp16",
-        ).to("cuda:0")
-        # Initialize the DDIM inverse scheduler
-        self.inverse_scheduler = DDIMInverseScheduler.from_pretrained(
-                "ali-vilab/i2vgen-xl",
-                subfolder="scheduler",
-        )
-        # Initialize the DDIM scheduler
-        self.ddim_scheduler = DDIMScheduler.from_pretrained(
-                "ali-vilab/i2vgen-xl",
-                subfolder="scheduler",
-        )
-        tmp_dir = os.path.join(TEMP_DIR, "AnyV2V")
-        if os.path.exists(tmp_dir):
-            shutil.rmtree(tmp_dir)
-        os.makedirs(tmp_dir)
-        ddim_latents_path = os.path.join(tmp_dir, "ddim_latents")
-        def read_frames(video_path):
-            frames = []
-            with imageio.get_reader(video_path) as reader:
-                for i, frame in enumerate(reader):
-                    pil_image = Image.fromarray(frame)
-                    frames.append(pil_image)
-            return frames
-        frame_list = read_frames(str(video_path))
-        self.config.inverse_config.image_size = list(frame_list[0].size)
-        self.config.inverse_config.n_steps = ddim_inversion_steps
-        self.config.inverse_config.n_frames = len(frame_list)
-        self.config.inverse_config.output_dir = ddim_latents_path
-        ddim_init_latents_t_idx = min(ddim_init_latents_t_idx, num_inference_steps - 1)
-        # Step 1. DDIM Inversion
-        first_frame = frame_list[0]
-        generator = torch.Generator(device="cuda:0")
-        generator = generator.manual_seed(seed)
-        _ddim_latents = ddim_inversion(
-            self.config.inverse_config,
-            first_frame,
-            frame_list,
-            self.pipe,
-            self.inverse_scheduler,
-            generator,
-        )
-        # Step 2. DDIM Sampling + PnP feature and attention injection
-        # Load the edited first frame
-        edited_1st_frame = load_image(edited_first_frame_path).resize(
-            self.config.inverse_config.image_size, resample=Image.Resampling.LANCZOS
-        )
-        # Load the initial latents at t
-        self.ddim_scheduler.set_timesteps(num_inference_steps)
-        print(f"ddim_scheduler.timesteps: {self.ddim_scheduler.timesteps}")
-        ddim_latents_at_t = load_ddim_latents_at_t(
-            self.ddim_scheduler.timesteps[ddim_init_latents_t_idx],
-            ddim_latents_path=ddim_latents_path,
-        )
-        print(
-            f"ddim_scheduler.timesteps[t_idx]: {self.ddim_scheduler.timesteps[ddim_init_latents_t_idx]}"
-        )
-        print(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}")
-        # Blend the latents
-        random_latents = torch.randn_like(ddim_latents_at_t)
-        print(
-            f"Blending random_ratio (1 means random latent): {self.config.pnp_config.random_ratio}"
-        )
-        mixed_latents = (
-            random_latents * self.config.pnp_config.random_ratio
-            + ddim_latents_at_t * (1 - self.config.pnp_config.random_ratio)
-        )
-        # Init Pnp
-        self.config.pnp_config.n_steps = num_inference_steps
-        self.config.pnp_config.pnp_f_t = conv_inj
-        self.config.pnp_config.pnp_spatial_attn_t = spatial_inj
-        self.config.pnp_config.pnp_temp_attn_t = temp_inj
-        self.config.pnp_config.ddim_init_latents_t_idx = ddim_init_latents_t_idx
-        init_pnp(self.pipe, self.ddim_scheduler, self.config.pnp_config)
-        # Edit video
-        self.pipe.register_modules(scheduler=self.ddim_scheduler)
-        edited_video = self.pipe.sample_with_pnp(
-            prompt=video_prompt,
-            image=edited_1st_frame,
-            height=self.config.inverse_config.image_size[1],
-            width=self.config.inverse_config.image_size[0],
-            num_frames=self.config.inverse_config.n_frames,
-            num_inference_steps=self.config.pnp_config.n_steps,
-            guidance_scale=guidance_scale,
-            negative_prompt=video_negative_prompt,
-            target_fps=self.config.pnp_config.target_fps,
-            latents=mixed_latents,
-            generator=generator,
-            return_dict=True,
-            ddim_init_latents_t_idx=ddim_init_latents_t_idx,
-            ddim_inv_latents_path=ddim_latents_path,
-            ddim_inv_prompt=self.config.inverse_config.ddim_inv_prompt,
-            ddim_inv_1st_frame=first_frame,
-        ).frames[0]
-        edited_video = [
-            frame.resize(self.config.inverse_config.image_size, resample=Image.LANCZOS)
-            for frame in edited_video
-        ]
-        def images_to_video(images, output_path, fps=24):
-            writer = imageio.get_writer(output_path, fps=fps)
-            for img in images:
-                img_np = np.array(img)
-                writer.append_data(img_np)
-            writer.close()
-        output_path = os.path.join(tmp_dir, "edited_video.mp4")
-        images_to_video(
-            edited_video, output_path, fps=self.config.pnp_config.target_fps
-        )
-        return output_path
-# Init the class
-#=====================================
-if not DEBUG_MODE:
-    Image_Editor = ImageEditor()
-    AnyV2V_Editor = AnyV2V_I2VGenXL()
-#=====================================
 def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, center_crop, x_offset, y_offset, longest_to_width):
     def check_video(video_path):

 DEBUG_MODE = False
 demo_examples = [
+                    ["./demo/Man Walking.mp4", "./demo/Man Walking/edited_first_frame/turn the man into darth vader.png", "man walking", 0.1, 0.1, 1.0],
                     ["./demo/A kitten turning its head on a wooden floor.mp4", "./demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png", "A dog turning its head on a wooden floor", 0.2, 0.2, 0.5],
                     ["./demo/An Old Man Doing Exercises For The Body And Mind.mp4", "./demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png", "a man doing exercises for the body and mind", 0.8, 0.8, 1.0],
                     ["./demo/Ballet.mp4", "./demo/Ballet/edited_first_frame/van gogh style.png", "girl dancing ballet, in the style of van gogh", 1.0, 1.0, 1.0],
 TEMP_DIR = "_demo_temp"
+image_edit_model = InstructPix2Pix()
+@torch.no_grad()
+@spaces.GPU(duration=30)
+def perform_edit(video_path, prompt, force_512=False, seed=42, negative_prompt=""):
+    edited_image_path = infer_video(image_edit_model,
+                video_path,
+                output_dir=TEMP_DIR,
+                prompt=prompt,
+                prompt_type="instruct",
+                force_512=force_512,
+                seed=seed,
+                negative_prompt=negative_prompt,
+                overwrite=True)
+    return edited_image_path
+# Set up default inversion config file
+config = {
+    # DDIM inversion
+    "inverse_config": {
+        "image_size": [512, 512],
+        "n_frames": 16,
+        "cfg": 1.0,
+        "target_fps": 8,
+        "ddim_inv_prompt": "",
+        "prompt": "",
+        "negative_prompt": "",
+    },
+    "pnp_config": {
+        "random_ratio": 0.0,
+        "target_fps": 8,
+    },
+}
+config = OmegaConf.create(config)
+# Initialize the I2VGenXL pipeline
+pipe = I2VGenXLPipeline.from_pretrained(
+    "ali-vilab/i2vgen-xl",
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to("cuda:0")
+# Initialize the DDIM inverse scheduler
+inverse_scheduler = DDIMInverseScheduler.from_pretrained(
+        "ali-vilab/i2vgen-xl",
+        subfolder="scheduler",
+)
+# Initialize the DDIM scheduler
+ddim_scheduler = DDIMScheduler.from_pretrained(
+        "ali-vilab/i2vgen-xl",
+        subfolder="scheduler",
+)
+@torch.no_grad()
+@spaces.GPU(duration=150)
+def perform_anyv2v(
                     video_path,
+                    video_prompt,
+                    video_negative_prompt,
+                    edited_first_frame_path,
+                    conv_inj,
+                    spatial_inj,
+                    temp_inj,
+                    num_inference_steps,
+                    guidance_scale,
+                    ddim_init_latents_t_idx,
+                    ddim_inversion_steps,
+                    seed,
+                    ):
+    tmp_dir = os.path.join(TEMP_DIR, "AnyV2V")
+    if os.path.exists(tmp_dir):
+        shutil.rmtree(tmp_dir)
+    os.makedirs(tmp_dir)
+    ddim_latents_path = os.path.join(tmp_dir, "ddim_latents")
+    def read_frames(video_path):
+        frames = []
+        with imageio.get_reader(video_path) as reader:
+            for i, frame in enumerate(reader):
+                pil_image = Image.fromarray(frame)
+                frames.append(pil_image)
+        return frames
+    frame_list = read_frames(str(video_path))
+    config.inverse_config.image_size = list(frame_list[0].size)
+    config.inverse_config.n_steps = ddim_inversion_steps
+    config.inverse_config.n_frames = len(frame_list)
+    config.inverse_config.output_dir = ddim_latents_path
+    ddim_init_latents_t_idx = min(ddim_init_latents_t_idx, num_inference_steps - 1)
+    # Step 1. DDIM Inversion
+    first_frame = frame_list[0]
+    generator = torch.Generator(device="cuda:0")
+    generator = generator.manual_seed(seed)
+    _ddim_latents = ddim_inversion(
+        config.inverse_config,
+        first_frame,
+        frame_list,
+        pipe,
+        inverse_scheduler,
+        generator,
+    )
+    # Step 2. DDIM Sampling + PnP feature and attention injection
+    # Load the edited first frame
+    edited_1st_frame = load_image(edited_first_frame_path).resize(
+        config.inverse_config.image_size, resample=Image.Resampling.LANCZOS
+    )
+    # Load the initial latents at t
+    ddim_scheduler.set_timesteps(num_inference_steps)
+    print(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}")
+    ddim_latents_at_t = load_ddim_latents_at_t(
+        ddim_scheduler.timesteps[ddim_init_latents_t_idx],
+        ddim_latents_path=ddim_latents_path,
+    )
+    print(
+        f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}"
+    )
+    print(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}")
+    # Blend the latents
+    random_latents = torch.randn_like(ddim_latents_at_t)
+    print(
+        f"Blending random_ratio (1 means random latent): {config.pnp_config.random_ratio}"
+    )
+    mixed_latents = (
+        random_latents * config.pnp_config.random_ratio
+        + ddim_latents_at_t * (1 - config.pnp_config.random_ratio)
+    )
+    # Init Pnp
+    config.pnp_config.n_steps = num_inference_steps
+    config.pnp_config.pnp_f_t = conv_inj
+    config.pnp_config.pnp_spatial_attn_t = spatial_inj
+    config.pnp_config.pnp_temp_attn_t = temp_inj
+    config.pnp_config.ddim_init_latents_t_idx = ddim_init_latents_t_idx
+    init_pnp(pipe, ddim_scheduler, config.pnp_config)
+    # Edit video
+    pipe.register_modules(scheduler=ddim_scheduler)
+    edited_video = pipe.sample_with_pnp(
+        prompt=video_prompt,
+        image=edited_1st_frame,
+        height=config.inverse_config.image_size[1],
+        width=config.inverse_config.image_size[0],
+        num_frames=config.inverse_config.n_frames,
+        num_inference_steps=config.pnp_config.n_steps,
+        guidance_scale=guidance_scale,
+        negative_prompt=video_negative_prompt,
+        target_fps=config.pnp_config.target_fps,
+        latents=mixed_latents,
+        generator=generator,
+        return_dict=True,
+        ddim_init_latents_t_idx=ddim_init_latents_t_idx,
+        ddim_inv_latents_path=ddim_latents_path,
+        ddim_inv_prompt=config.inverse_config.ddim_inv_prompt,
+        ddim_inv_1st_frame=first_frame,
+    ).frames[0]
+    edited_video = [
+        frame.resize(config.inverse_config.image_size, resample=Image.LANCZOS)
+        for frame in edited_video
+    ]
+    def images_to_video(images, output_path, fps=24):
+        writer = imageio.get_writer(output_path, fps=fps)
+        for img in images:
+            img_np = np.array(img)
+            writer.append_data(img_np)
+        writer.close()
+    output_path = os.path.join(tmp_dir, "edited_video.mp4")
+    images_to_video(
+        edited_video, output_path, fps=config.pnp_config.target_fps
+    )
+    return output_path
 def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, center_crop, x_offset, y_offset, longest_to_width):
     def check_video(video_path):

gradio_demo.py CHANGED Viewed

@@ -30,7 +30,7 @@ import imageio
 DEBUG_MODE = False
 demo_examples = [
-                    ["./demo/Man Walking.mp4", "./demo/Man Walking/edited_first_frame/turn the man into darth vader.png", "darth vader walking", 0.1, 0.1, 1.0],
                     ["./demo/A kitten turning its head on a wooden floor.mp4", "./demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png", "A dog turning its head on a wooden floor", 0.2, 0.2, 0.5],
                     ["./demo/An Old Man Doing Exercises For The Body And Mind.mp4", "./demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png", "a man doing exercises for the body and mind", 0.8, 0.8, 1.0],
                     ["./demo/Ballet.mp4", "./demo/Ballet/edited_first_frame/van gogh style.png", "girl dancing ballet, in the style of van gogh", 1.0, 1.0, 1.0],
@@ -39,198 +39,189 @@ demo_examples = [
 TEMP_DIR = "_demo_temp"
-class ImageEditor:
-    def __init__(self) -> None:
-        self.image_edit_model = InstructPix2Pix()
-    @torch.no_grad()
-    @spaces.GPU(duration=30)
-    def perform_edit(self, video_path, prompt, force_512=False, seed=42, negative_prompt=""):
-        edited_image_path = infer_video(self.image_edit_model,
                     video_path,
-                    output_dir=TEMP_DIR,
-                    prompt=prompt,
-                    prompt_type="instruct",
-                    force_512=force_512,
-                    seed=seed,
-                    negative_prompt=negative_prompt,
-                    overwrite=True)
-        return edited_image_path
-class AnyV2V_I2VGenXL:
-    def __init__(self) -> None:
-        # Set up default inversion config file
-        config = {
-            # DDIM inversion
-            "inverse_config": {
-                "image_size": [512, 512],
-                "n_frames": 16,
-                "cfg": 1.0,
-                "target_fps": 8,
-                "ddim_inv_prompt": "",
-                "prompt": "",
-                "negative_prompt": "",
-            },
-            "pnp_config": {
-                "random_ratio": 0.0,
-                "target_fps": 8,
-            },
-        }
-        self.config = OmegaConf.create(config)
-    @torch.no_grad()
-    @spaces.GPU(duration=150)
-    def perform_anyv2v(self,
-                       video_path,
-                       video_prompt,
-                       video_negative_prompt,
-                       edited_first_frame_path,
-                       conv_inj,
-                       spatial_inj,
-                       temp_inj,
-                       num_inference_steps,
-                       guidance_scale,
-                       ddim_init_latents_t_idx,
-                       ddim_inversion_steps,
-                       seed,
-                       ):
-        # Initialize the I2VGenXL pipeline
-        self.pipe = I2VGenXLPipeline.from_pretrained(
-            "ali-vilab/i2vgen-xl",
-            torch_dtype=torch.float16,
-            variant="fp16",
-        ).to("cuda:0")
-        # Initialize the DDIM inverse scheduler
-        self.inverse_scheduler = DDIMInverseScheduler.from_pretrained(
-                "ali-vilab/i2vgen-xl",
-                subfolder="scheduler",
-        )
-        # Initialize the DDIM scheduler
-        self.ddim_scheduler = DDIMScheduler.from_pretrained(
-                "ali-vilab/i2vgen-xl",
-                subfolder="scheduler",
-        )
-        tmp_dir = os.path.join(TEMP_DIR, "AnyV2V")
-        if os.path.exists(tmp_dir):
-            shutil.rmtree(tmp_dir)
-        os.makedirs(tmp_dir)
-        ddim_latents_path = os.path.join(tmp_dir, "ddim_latents")
-        def read_frames(video_path):
-            frames = []
-            with imageio.get_reader(video_path) as reader:
-                for i, frame in enumerate(reader):
-                    pil_image = Image.fromarray(frame)
-                    frames.append(pil_image)
-            return frames
-        frame_list = read_frames(str(video_path))
-        self.config.inverse_config.image_size = list(frame_list[0].size)
-        self.config.inverse_config.n_steps = ddim_inversion_steps
-        self.config.inverse_config.n_frames = len(frame_list)
-        self.config.inverse_config.output_dir = ddim_latents_path
-        ddim_init_latents_t_idx = min(ddim_init_latents_t_idx, num_inference_steps - 1)
-        # Step 1. DDIM Inversion
-        first_frame = frame_list[0]
-        generator = torch.Generator(device="cuda:0")
-        generator = generator.manual_seed(seed)
-        _ddim_latents = ddim_inversion(
-            self.config.inverse_config,
-            first_frame,
-            frame_list,
-            self.pipe,
-            self.inverse_scheduler,
-            generator,
-        )
-        # Step 2. DDIM Sampling + PnP feature and attention injection
-        # Load the edited first frame
-        edited_1st_frame = load_image(edited_first_frame_path).resize(
-            self.config.inverse_config.image_size, resample=Image.Resampling.LANCZOS
-        )
-        # Load the initial latents at t
-        self.ddim_scheduler.set_timesteps(num_inference_steps)
-        print(f"ddim_scheduler.timesteps: {self.ddim_scheduler.timesteps}")
-        ddim_latents_at_t = load_ddim_latents_at_t(
-            self.ddim_scheduler.timesteps[ddim_init_latents_t_idx],
-            ddim_latents_path=ddim_latents_path,
-        )
-        print(
-            f"ddim_scheduler.timesteps[t_idx]: {self.ddim_scheduler.timesteps[ddim_init_latents_t_idx]}"
-        )
-        print(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}")
-        # Blend the latents
-        random_latents = torch.randn_like(ddim_latents_at_t)
-        print(
-            f"Blending random_ratio (1 means random latent): {self.config.pnp_config.random_ratio}"
-        )
-        mixed_latents = (
-            random_latents * self.config.pnp_config.random_ratio
-            + ddim_latents_at_t * (1 - self.config.pnp_config.random_ratio)
-        )
-        # Init Pnp
-        self.config.pnp_config.n_steps = num_inference_steps
-        self.config.pnp_config.pnp_f_t = conv_inj
-        self.config.pnp_config.pnp_spatial_attn_t = spatial_inj
-        self.config.pnp_config.pnp_temp_attn_t = temp_inj
-        self.config.pnp_config.ddim_init_latents_t_idx = ddim_init_latents_t_idx
-        init_pnp(self.pipe, self.ddim_scheduler, self.config.pnp_config)
-        # Edit video
-        self.pipe.register_modules(scheduler=self.ddim_scheduler)
-        edited_video = self.pipe.sample_with_pnp(
-            prompt=video_prompt,
-            image=edited_1st_frame,
-            height=self.config.inverse_config.image_size[1],
-            width=self.config.inverse_config.image_size[0],
-            num_frames=self.config.inverse_config.n_frames,
-            num_inference_steps=self.config.pnp_config.n_steps,
-            guidance_scale=guidance_scale,
-            negative_prompt=video_negative_prompt,
-            target_fps=self.config.pnp_config.target_fps,
-            latents=mixed_latents,
-            generator=generator,
-            return_dict=True,
-            ddim_init_latents_t_idx=ddim_init_latents_t_idx,
-            ddim_inv_latents_path=ddim_latents_path,
-            ddim_inv_prompt=self.config.inverse_config.ddim_inv_prompt,
-            ddim_inv_1st_frame=first_frame,
-        ).frames[0]
-        edited_video = [
-            frame.resize(self.config.inverse_config.image_size, resample=Image.LANCZOS)
-            for frame in edited_video
-        ]
-        def images_to_video(images, output_path, fps=24):
-            writer = imageio.get_writer(output_path, fps=fps)
-            for img in images:
-                img_np = np.array(img)
-                writer.append_data(img_np)
-            writer.close()
-        output_path = os.path.join(tmp_dir, "edited_video.mp4")
-        images_to_video(
-            edited_video, output_path, fps=self.config.pnp_config.target_fps
-        )
-        return output_path
-# Init the class
-#=====================================
-if not DEBUG_MODE:
-    Image_Editor = ImageEditor()
-    AnyV2V_Editor = AnyV2V_I2VGenXL()
-#=====================================
 def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, center_crop, x_offset, y_offset, longest_to_width):
     def check_video(video_path):

 DEBUG_MODE = False
 demo_examples = [
+                    ["./demo/Man Walking.mp4", "./demo/Man Walking/edited_first_frame/turn the man into darth vader.png", "man walking", 0.1, 0.1, 1.0],
                     ["./demo/A kitten turning its head on a wooden floor.mp4", "./demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png", "A dog turning its head on a wooden floor", 0.2, 0.2, 0.5],
                     ["./demo/An Old Man Doing Exercises For The Body And Mind.mp4", "./demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png", "a man doing exercises for the body and mind", 0.8, 0.8, 1.0],
                     ["./demo/Ballet.mp4", "./demo/Ballet/edited_first_frame/van gogh style.png", "girl dancing ballet, in the style of van gogh", 1.0, 1.0, 1.0],
 TEMP_DIR = "_demo_temp"
+image_edit_model = InstructPix2Pix()
+@torch.no_grad()
+@spaces.GPU(duration=30)
+def perform_edit(video_path, prompt, force_512=False, seed=42, negative_prompt=""):
+    edited_image_path = infer_video(image_edit_model,
+                video_path,
+                output_dir=TEMP_DIR,
+                prompt=prompt,
+                prompt_type="instruct",
+                force_512=force_512,
+                seed=seed,
+                negative_prompt=negative_prompt,
+                overwrite=True)
+    return edited_image_path
+# Set up default inversion config file
+config = {
+    # DDIM inversion
+    "inverse_config": {
+        "image_size": [512, 512],
+        "n_frames": 16,
+        "cfg": 1.0,
+        "target_fps": 8,
+        "ddim_inv_prompt": "",
+        "prompt": "",
+        "negative_prompt": "",
+    },
+    "pnp_config": {
+        "random_ratio": 0.0,
+        "target_fps": 8,
+    },
+}
+config = OmegaConf.create(config)
+# Initialize the I2VGenXL pipeline
+pipe = I2VGenXLPipeline.from_pretrained(
+    "ali-vilab/i2vgen-xl",
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to("cuda:0")
+# Initialize the DDIM inverse scheduler
+inverse_scheduler = DDIMInverseScheduler.from_pretrained(
+        "ali-vilab/i2vgen-xl",
+        subfolder="scheduler",
+)
+# Initialize the DDIM scheduler
+ddim_scheduler = DDIMScheduler.from_pretrained(
+        "ali-vilab/i2vgen-xl",
+        subfolder="scheduler",
+)
+@torch.no_grad()
+@spaces.GPU(duration=150)
+def perform_anyv2v(
                     video_path,
+                    video_prompt,
+                    video_negative_prompt,
+                    edited_first_frame_path,
+                    conv_inj,
+                    spatial_inj,
+                    temp_inj,
+                    num_inference_steps,
+                    guidance_scale,
+                    ddim_init_latents_t_idx,
+                    ddim_inversion_steps,
+                    seed,
+                    ):
+    tmp_dir = os.path.join(TEMP_DIR, "AnyV2V")
+    if os.path.exists(tmp_dir):
+        shutil.rmtree(tmp_dir)
+    os.makedirs(tmp_dir)
+    ddim_latents_path = os.path.join(tmp_dir, "ddim_latents")
+    def read_frames(video_path):
+        frames = []
+        with imageio.get_reader(video_path) as reader:
+            for i, frame in enumerate(reader):
+                pil_image = Image.fromarray(frame)
+                frames.append(pil_image)
+        return frames
+    frame_list = read_frames(str(video_path))
+    config.inverse_config.image_size = list(frame_list[0].size)
+    config.inverse_config.n_steps = ddim_inversion_steps
+    config.inverse_config.n_frames = len(frame_list)
+    config.inverse_config.output_dir = ddim_latents_path
+    ddim_init_latents_t_idx = min(ddim_init_latents_t_idx, num_inference_steps - 1)
+    # Step 1. DDIM Inversion
+    first_frame = frame_list[0]
+    generator = torch.Generator(device="cuda:0")
+    generator = generator.manual_seed(seed)
+    _ddim_latents = ddim_inversion(
+        config.inverse_config,
+        first_frame,
+        frame_list,
+        pipe,
+        inverse_scheduler,
+        generator,
+    )
+    # Step 2. DDIM Sampling + PnP feature and attention injection
+    # Load the edited first frame
+    edited_1st_frame = load_image(edited_first_frame_path).resize(
+        config.inverse_config.image_size, resample=Image.Resampling.LANCZOS
+    )
+    # Load the initial latents at t
+    ddim_scheduler.set_timesteps(num_inference_steps)
+    print(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}")
+    ddim_latents_at_t = load_ddim_latents_at_t(
+        ddim_scheduler.timesteps[ddim_init_latents_t_idx],
+        ddim_latents_path=ddim_latents_path,
+    )
+    print(
+        f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}"
+    )
+    print(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}")
+    # Blend the latents
+    random_latents = torch.randn_like(ddim_latents_at_t)
+    print(
+        f"Blending random_ratio (1 means random latent): {config.pnp_config.random_ratio}"
+    )
+    mixed_latents = (
+        random_latents * config.pnp_config.random_ratio
+        + ddim_latents_at_t * (1 - config.pnp_config.random_ratio)
+    )
+    # Init Pnp
+    config.pnp_config.n_steps = num_inference_steps
+    config.pnp_config.pnp_f_t = conv_inj
+    config.pnp_config.pnp_spatial_attn_t = spatial_inj
+    config.pnp_config.pnp_temp_attn_t = temp_inj
+    config.pnp_config.ddim_init_latents_t_idx = ddim_init_latents_t_idx
+    init_pnp(pipe, ddim_scheduler, config.pnp_config)
+    # Edit video
+    pipe.register_modules(scheduler=ddim_scheduler)
+    edited_video = pipe.sample_with_pnp(
+        prompt=video_prompt,
+        image=edited_1st_frame,
+        height=config.inverse_config.image_size[1],
+        width=config.inverse_config.image_size[0],
+        num_frames=config.inverse_config.n_frames,
+        num_inference_steps=config.pnp_config.n_steps,
+        guidance_scale=guidance_scale,
+        negative_prompt=video_negative_prompt,
+        target_fps=config.pnp_config.target_fps,
+        latents=mixed_latents,
+        generator=generator,
+        return_dict=True,
+        ddim_init_latents_t_idx=ddim_init_latents_t_idx,
+        ddim_inv_latents_path=ddim_latents_path,
+        ddim_inv_prompt=config.inverse_config.ddim_inv_prompt,
+        ddim_inv_1st_frame=first_frame,
+    ).frames[0]
+    edited_video = [
+        frame.resize(config.inverse_config.image_size, resample=Image.LANCZOS)
+        for frame in edited_video
+    ]
+    def images_to_video(images, output_path, fps=24):
+        writer = imageio.get_writer(output_path, fps=fps)
+        for img in images:
+            img_np = np.array(img)
+            writer.append_data(img_np)
+        writer.close()
+    output_path = os.path.join(tmp_dir, "edited_video.mp4")
+    images_to_video(
+        edited_video, output_path, fps=config.pnp_config.target_fps
+    )
+    return output_path
 def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, center_crop, x_offset, y_offset, longest_to_width):
     def check_video(video_path):