EuuIia committed
Commit ac23084 · verified · 1 Parent(s): f54c95b

Upload 6 files

Files changed (6)
  1. README.md +9 -6
  2. app.py +202 -0
  3. inference.py +774 -0
  4. requirements.txt +15 -0
  5. setup.py +63 -0
  6. video_service.py +295 -0
README.md CHANGED
@@ -1,10 +1,13 @@
  ---
- title: Test
- emoji: 📊
- colorFrom: pink
- colorTo: red
- sdk: docker
+ title: LTX Video Fast
+ emoji: 🎥
+ colorFrom: yellow
+ colorTo: pink
+ sdk: gradio
+ sdk_version: 5.42.0
+ app_file: app.py
  pinned: false
+ short_description: ultra-fast video model, LTX 0.9.8 13B distilled
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,202 @@
# app.py (corrected version)

import gradio as gr
from PIL import Image
import os
import imageio
from video_service import video_generation_service

# --- UI HELPER FUNCTIONS ---
# ... (calculate_new_dimensions and handle_media_upload_for_dims are unchanged) ...
TARGET_FIXED_SIDE = 768
MIN_DIM_SLIDER = 256
MAX_IMAGE_SIZE = 1280

def calculate_new_dimensions(orig_w, orig_h):
    if orig_w == 0 or orig_h == 0:
        return int(TARGET_FIXED_SIDE), int(TARGET_FIXED_SIDE)
    if orig_w >= orig_h:
        new_h, aspect_ratio = TARGET_FIXED_SIDE, orig_w / orig_h
        new_w = round((new_h * aspect_ratio) / 32) * 32
        new_w = max(MIN_DIM_SLIDER, min(new_w, MAX_IMAGE_SIZE))
        new_h = max(MIN_DIM_SLIDER, min(new_h, MAX_IMAGE_SIZE))
    else:
        new_w, aspect_ratio = TARGET_FIXED_SIDE, orig_h / orig_w
        new_h = round((new_w * aspect_ratio) / 32) * 32
        new_h = max(MIN_DIM_SLIDER, min(new_h, MAX_IMAGE_SIZE))
        new_w = max(MIN_DIM_SLIDER, min(new_w, MAX_IMAGE_SIZE))
    return int(new_h), int(new_w)

def handle_media_upload_for_dims(filepath, current_h, current_w):
    if not filepath or not os.path.exists(str(filepath)):
        return gr.update(value=current_h), gr.update(value=current_w)
    try:
        if str(filepath).lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
            with Image.open(filepath) as img:
                orig_w, orig_h = img.size
        else:  # Assume it is a video
            with imageio.get_reader(filepath) as reader:
                meta = reader.get_meta_data()
                orig_w, orig_h = meta.get('size', (current_w, current_h))
        new_h, new_w = calculate_new_dimensions(orig_w, orig_h)
        return gr.update(value=new_h), gr.update(value=new_w)
    except Exception as e:
        print(f"Error while reading media dimensions: {e}")
        return gr.update(value=current_h), gr.update(value=current_w)

def update_frame_slider(duration):
    """Updates the maximum of the middle-frame slider based on the duration."""
    fps = 24.0
    max_frames = int(duration * fps)
    # Make sure the default value is never larger than the new maximum
    new_value = 48 if max_frames >= 48 else max_frames // 2
    return gr.update(maximum=max_frames, value=new_value)


# --- WRAPPER FUNCTION THAT CALLS THE SERVICE ---
def gradio_generate_wrapper(
    prompt, negative_prompt, mode,
    # Keyframe inputs
    start_image,
    middle_image, middle_frame, middle_weight,
    end_image, end_weight,
    # Other inputs
    input_video, height, width, duration,
    frames_to_use, seed, randomize_seed,
    guidance_scale, improve_texture,
    progress=gr.Progress(track_tqdm=True)
):
    try:
        def progress_handler(step, total_steps):
            progress(step / total_steps, desc="Saving video...")

        output_path, used_seed = video_generation_service.generate(
            prompt=prompt, negative_prompt=negative_prompt, mode=mode,
            start_image_filepath=start_image,
            middle_image_filepath=middle_image,
            middle_frame_number=middle_frame,
            middle_image_weight=middle_weight,
            end_image_filepath=end_image,
            end_image_weight=end_weight,
            input_video_filepath=input_video,
            height=int(height), width=int(width), duration=float(duration),
            frames_to_use=int(frames_to_use), seed=int(seed),
            randomize_seed=bool(randomize_seed), guidance_scale=float(guidance_scale),
            improve_texture=bool(improve_texture), progress_callback=progress_handler
        )
        return output_path, used_seed
    except ValueError as e:
        raise gr.Error(str(e))
    except Exception as e:
        print(f"Unexpected error during generation: {e}")
        raise gr.Error("An unexpected error occurred. Check the logs.")

# --- GRADIO INTERFACE DEFINITION ---
css = "#col-container { margin: 0 auto; max-width: 900px; }"
with gr.Blocks(css=css) as demo:
    gr.Markdown("# LTX Video with Keyframes")
    gr.Markdown("Guide the video generation with start, middle and end images.")

    with gr.Row():
        with gr.Column():
            with gr.Tab("image-to-video (Keyframes)") as image_tab:
                i2v_prompt = gr.Textbox(label="Prompt", value="A beautiful transition between the images", lines=2)

                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("#### Start (required)")
                        start_image_i2v = gr.Image(label="Start Image", type="filepath", sources=["upload", "clipboard"])
                    with gr.Column(scale=1):
                        gr.Markdown("#### Middle (optional)")
                        middle_image_i2v = gr.Image(label="Middle Image", type="filepath", sources=["upload", "clipboard"])
                        middle_frame_i2v = gr.Slider(label="Target Frame", minimum=0, maximum=200, step=1, value=48)
                        middle_weight_i2v = gr.Slider(label="Weight/Strength", minimum=0.0, maximum=1.0, step=0.05, value=1.0)
                    with gr.Column(scale=1):
                        gr.Markdown("#### End (optional)")
                        end_image_i2v = gr.Image(label="End Image", type="filepath", sources=["upload", "clipboard"])
                        end_weight_i2v = gr.Slider(label="Weight/Strength", minimum=0.0, maximum=1.0, step=0.05, value=1.0)

                i2v_button = gr.Button("Generate Image-to-Video", variant="primary")

            with gr.Tab("text-to-video") as text_tab:
                t2v_prompt = gr.Textbox(label="Prompt", value="A majestic dragon flying over a medieval castle", lines=3)
                t2v_button = gr.Button("Generate Text-to-Video", variant="primary")

            with gr.Tab("video-to-video") as video_tab:
                video_v2v = gr.Video(label="Input Video", sources=["upload", "webcam"])
                frames_to_use = gr.Slider(label="Frames to use from input video", minimum=9, maximum=257, value=9, step=8, info="Must be N*8+1.")
                v2v_prompt = gr.Textbox(label="Prompt", value="Change the style to cinematic anime", lines=3)
                v2v_button = gr.Button("Generate Video-to-Video", variant="primary")

            duration_input = gr.Slider(label="Video Duration (seconds)", minimum=0.3, maximum=8.5, value=4, step=0.1)
            improve_texture = gr.Checkbox(label="Improve Texture (multi-scale)", value=True, visible=True)

        with gr.Column():
            output_video = gr.Video(label="Generated Video", interactive=False)

    with gr.Accordion("Advanced settings", open=False):
        mode = gr.Dropdown(["text-to-video", "image-to-video", "video-to-video"], label="task", value="image-to-video", visible=False)
        negative_prompt_input = gr.Textbox(label="Negative Prompt", value="worst quality, blurry, jittery", lines=2)
        with gr.Row():
            seed_input = gr.Number(label="Seed", value=42, precision=0)
            randomize_seed_input = gr.Checkbox(label="Randomize Seed", value=True)
        guidance_scale_input = gr.Slider(label="Guidance Scale (CFG)", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
        with gr.Row():
            height_input = gr.Slider(label="Height", value=512, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE)
            width_input = gr.Slider(label="Width", value=704, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE)

    # --- UI EVENT WIRING ---

    start_image_i2v.upload(fn=handle_media_upload_for_dims, inputs=[start_image_i2v, height_input, width_input], outputs=[height_input, width_input])
    video_v2v.upload(fn=handle_media_upload_for_dims, inputs=[video_v2v, height_input, width_input], outputs=[height_input, width_input])
    duration_input.change(fn=update_frame_slider, inputs=duration_input, outputs=middle_frame_i2v)

    image_tab.select(fn=lambda: "image-to-video", outputs=[mode])
    text_tab.select(fn=lambda: "text-to-video", outputs=[mode])
    video_tab.select(fn=lambda: "video-to-video", outputs=[mode])

    # --- <START OF FIX> ---
    # The input lists are spelled out explicitly to avoid wiring mistakes.

    # Placeholders for buttons that do not use certain inputs
    none_image = gr.Textbox(visible=False, value=None)
    none_video = gr.Textbox(visible=False, value=None)

    # Parameters shared by all tasks
    shared_params = [
        height_input, width_input, duration_input, frames_to_use,
        seed_input, randomize_seed_input, guidance_scale_input, improve_texture
    ]

    i2v_inputs = [
        i2v_prompt, negative_prompt_input, mode,
        start_image_i2v, middle_image_i2v, middle_frame_i2v, middle_weight_i2v,
        end_image_i2v, end_weight_i2v,
        none_video,  # placeholder for input_video
        *shared_params
    ]

    t2v_inputs = [
        t2v_prompt, negative_prompt_input, mode,
        none_image, none_image, gr.Number(value=-1, visible=False), gr.Slider(value=0, visible=False),  # keyframe placeholders
        none_image, gr.Slider(value=0, visible=False),
        none_video,  # placeholder for input_video
        *shared_params
    ]

    v2v_inputs = [
        v2v_prompt, negative_prompt_input, mode,
        none_image, none_image, gr.Number(value=-1, visible=False), gr.Slider(value=0, visible=False),  # keyframe placeholders
        none_image, gr.Slider(value=0, visible=False),
        video_v2v,  # the real video input
        *shared_params
    ]

    common_outputs = [output_video, seed_input]

    i2v_button.click(fn=gradio_generate_wrapper, inputs=i2v_inputs, outputs=common_outputs, api_name="image_to_video_keyframes")
    t2v_button.click(fn=gradio_generate_wrapper, inputs=t2v_inputs, outputs=common_outputs, api_name="text_to_video")
    v2v_button.click(fn=gradio_generate_wrapper, inputs=v2v_inputs, outputs=common_outputs, api_name="video_to_video")
    # --- <END OF FIX> ---


if __name__ == "__main__":
    demo.queue().launch(debug=True, share=False)
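Because the three `.click()` handlers register named API endpoints (`image_to_video_keyframes`, `text_to_video`, `video_to_video`), the app can also be driven programmatically. Below is a minimal sketch using `gradio_client`, assuming the app is running locally on the default Gradio port; the positional arguments mirror the `t2v_inputs` list above, and all concrete values are illustrative, not mandated by the app:

# Hypothetical client-side call against the text_to_video endpoint defined above.
# Argument order mirrors t2v_inputs: prompt, negative prompt, mode, keyframe
# placeholders, input-video placeholder, then the shared parameters.
from gradio_client import Client

client = Client("http://127.0.0.1:7860")  # assumption: local default port
video_path, used_seed = client.predict(
    "A majestic dragon flying over a medieval castle",  # prompt
    "worst quality, blurry, jittery",                    # negative prompt
    "text-to-video",                                     # mode
    None, None, -1, 0,                                   # keyframe placeholders
    None, 0,                                             # end-image placeholders
    None,                                                # input-video placeholder
    512, 704, 4.0, 9,                                    # height, width, duration, frames_to_use
    42, True, 3.0, True,                                 # seed, randomize, CFG, improve_texture
    api_name="/text_to_video",
)
print(video_path, used_seed)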
inference.py ADDED
@@ -0,0 +1,774 @@
import argparse
import os
import random
from datetime import datetime
from pathlib import Path
from diffusers.utils import logging
from typing import Optional, List, Union
import yaml

import imageio
import json
import numpy as np
import torch
import cv2
from safetensors import safe_open
from PIL import Image
from transformers import (
    T5EncoderModel,
    T5Tokenizer,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
)
from huggingface_hub import hf_hub_download

from ltx_video.models.autoencoders.causal_video_autoencoder import (
    CausalVideoAutoencoder,
)
from ltx_video.models.transformers.symmetric_patchifier import SymmetricPatchifier
from ltx_video.models.transformers.transformer3d import Transformer3DModel
from ltx_video.pipelines.pipeline_ltx_video import (
    ConditioningItem,
    LTXVideoPipeline,
    LTXMultiScalePipeline,
)
from ltx_video.schedulers.rf import RectifiedFlowScheduler
from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler
import ltx_video.pipelines.crf_compressor as crf_compressor

MAX_HEIGHT = 720
MAX_WIDTH = 1280
MAX_NUM_FRAMES = 257

logger = logging.get_logger("LTX-Video")


def get_total_gpu_memory():
    if torch.cuda.is_available():
        total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        return total_memory
    return 0


def get_device():
    if torch.cuda.is_available():
        return "cuda"
    elif torch.backends.mps.is_available():
        return "mps"
    return "cpu"


def load_image_to_tensor_with_resize_and_crop(
    image_input: Union[str, Image.Image],
    target_height: int = 512,
    target_width: int = 768,
    just_crop: bool = False,
) -> torch.Tensor:
    """Load and process an image into a tensor.

    Args:
        image_input: Either a file path (str) or a PIL Image object
        target_height: Desired height of output tensor
        target_width: Desired width of output tensor
        just_crop: If True, only crop the image to the target size without resizing
    """
    if isinstance(image_input, str):
        image = Image.open(image_input).convert("RGB")
    elif isinstance(image_input, Image.Image):
        image = image_input
    else:
        raise ValueError("image_input must be either a file path or a PIL Image object")

    input_width, input_height = image.size
    aspect_ratio_target = target_width / target_height
    aspect_ratio_frame = input_width / input_height
    if aspect_ratio_frame > aspect_ratio_target:
        new_width = int(input_height * aspect_ratio_target)
        new_height = input_height
        x_start = (input_width - new_width) // 2
        y_start = 0
    else:
        new_width = input_width
        new_height = int(input_width / aspect_ratio_target)
        x_start = 0
        y_start = (input_height - new_height) // 2

    image = image.crop((x_start, y_start, x_start + new_width, y_start + new_height))
    if not just_crop:
        image = image.resize((target_width, target_height))

    image = np.array(image)
    image = cv2.GaussianBlur(image, (3, 3), 0)
    frame_tensor = torch.from_numpy(image).float()
    frame_tensor = crf_compressor.compress(frame_tensor / 255.0) * 255.0
    frame_tensor = frame_tensor.permute(2, 0, 1)
    frame_tensor = (frame_tensor / 127.5) - 1.0
    # Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
    return frame_tensor.unsqueeze(0).unsqueeze(2)


def calculate_padding(
    source_height: int, source_width: int, target_height: int, target_width: int
) -> tuple[int, int, int, int]:

    # Calculate total padding needed
    pad_height = target_height - source_height
    pad_width = target_width - source_width

    # Calculate padding for each side
    pad_top = pad_height // 2
    pad_bottom = pad_height - pad_top  # Handles odd padding
    pad_left = pad_width // 2
    pad_right = pad_width - pad_left  # Handles odd padding

    # Return padding values in torch.nn.functional.pad order: (left, right, top, bottom)
    padding = (pad_left, pad_right, pad_top, pad_bottom)
    return padding


def convert_prompt_to_filename(text: str, max_len: int = 20) -> str:
    # Remove non-letters and convert to lowercase
    clean_text = "".join(
        char.lower() for char in text if char.isalpha() or char.isspace()
    )

    # Split into words
    words = clean_text.split()

    # Build result string keeping track of length
    result = []
    current_length = 0

    for word in words:
        # Stop once adding the next word would exceed max_len
        new_length = current_length + len(word)

        if new_length <= max_len:
            result.append(word)
            current_length += len(word)
        else:
            break

    return "-".join(result)


# Generate output video name
def get_unique_filename(
    base: str,
    ext: str,
    prompt: str,
    seed: int,
    resolution: tuple[int, int, int],
    dir: Path,
    endswith=None,
    index_range=1000,
) -> Path:
    base_filename = f"{base}_{convert_prompt_to_filename(prompt, max_len=30)}_{seed}_{resolution[0]}x{resolution[1]}x{resolution[2]}"
    for i in range(index_range):
        filename = dir / f"{base_filename}_{i}{endswith if endswith else ''}{ext}"
        if not os.path.exists(filename):
            return filename
    raise FileExistsError(
        f"Could not find a unique filename after {index_range} attempts."
    )


def seed_everething(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)


def main():
    parser = argparse.ArgumentParser(
        description="Load models from separate directories and run the pipeline."
    )

    # Directories
    parser.add_argument(
        "--output_path",
        type=str,
        default=None,
        help="Path to the folder to save output video, if None will save in outputs/ directory.",
    )
    parser.add_argument("--seed", type=int, default=171198)

    # Pipeline parameters
    parser.add_argument(
        "--num_images_per_prompt",
        type=int,
        default=1,
        help="Number of images per prompt",
    )
    parser.add_argument(
        "--image_cond_noise_scale",
        type=float,
        default=0.15,
        help="Amount of noise to add to the conditioned image",
    )
    parser.add_argument(
        "--height",
        type=int,
        default=704,
        help="Height of the output video frames. Optional if an input image provided.",
    )
    parser.add_argument(
        "--width",
        type=int,
        default=1216,
        help="Width of the output video frames. If None will infer from input image.",
    )
    parser.add_argument(
        "--num_frames",
        type=int,
        default=121,
        help="Number of frames to generate in the output video",
    )
    parser.add_argument(
        "--frame_rate", type=int, default=30, help="Frame rate for the output video"
    )
    parser.add_argument(
        "--device",
        default=None,
        help="Device to run inference on. If not specified, will automatically detect and use CUDA or MPS if available, else CPU.",
    )
    parser.add_argument(
        "--pipeline_config",
        type=str,
        default="configs/ltxv-13b-0.9.7-dev.yaml",
        help="The path to the config file for the pipeline, which contains the parameters for the pipeline",
    )

    # Prompts
    parser.add_argument(
        "--prompt",
        type=str,
        help="Text prompt to guide generation",
    )
    parser.add_argument(
        "--negative_prompt",
        type=str,
        default="worst quality, inconsistent motion, blurry, jittery, distorted",
        help="Negative prompt for undesired features",
    )

    parser.add_argument(
        "--offload_to_cpu",
        action="store_true",
        help="Offloading unnecessary computations to CPU.",
    )

    # video-to-video arguments:
    parser.add_argument(
        "--input_media_path",
        type=str,
        default=None,
        help="Path to the input video (or image) to be modified using the video-to-video pipeline",
    )

    # Conditioning arguments
    parser.add_argument(
        "--conditioning_media_paths",
        type=str,
        nargs="*",
        help="List of paths to conditioning media (images or videos). Each path will be used as a conditioning item.",
    )
    parser.add_argument(
        "--conditioning_strengths",
        type=float,
        nargs="*",
        help="List of conditioning strengths (between 0 and 1) for each conditioning item. Must match the number of conditioning items.",
    )
    parser.add_argument(
        "--conditioning_start_frames",
        type=int,
        nargs="*",
        help="List of frame indices where each conditioning item should be applied. Must match the number of conditioning items.",
    )

    args = parser.parse_args()
    logger.warning(f"Running generation with arguments: {args}")
    infer(**vars(args))


def create_ltx_video_pipeline(
    ckpt_path: str,
    precision: str,
    text_encoder_model_name_or_path: str,
    sampler: Optional[str] = None,
    device: Optional[str] = None,
    enhance_prompt: bool = False,
    prompt_enhancer_image_caption_model_name_or_path: Optional[str] = None,
    prompt_enhancer_llm_model_name_or_path: Optional[str] = None,
) -> LTXVideoPipeline:
    ckpt_path = Path(ckpt_path)
    assert os.path.exists(
        ckpt_path
    ), f"Ckpt path provided (--ckpt_path) {ckpt_path} does not exist"

    with safe_open(ckpt_path, framework="pt") as f:
        metadata = f.metadata()
        config_str = metadata.get("config")
        configs = json.loads(config_str)
        allowed_inference_steps = configs.get("allowed_inference_steps", None)

    vae = CausalVideoAutoencoder.from_pretrained(ckpt_path)
    transformer = Transformer3DModel.from_pretrained(ckpt_path)

    # Use constructor if sampler is specified, otherwise use from_pretrained
    if sampler == "from_checkpoint" or not sampler:
        scheduler = RectifiedFlowScheduler.from_pretrained(ckpt_path)
    else:
        scheduler = RectifiedFlowScheduler(
            sampler=("Uniform" if sampler.lower() == "uniform" else "LinearQuadratic")
        )

    text_encoder = T5EncoderModel.from_pretrained(
        text_encoder_model_name_or_path, subfolder="text_encoder"
    )
    patchifier = SymmetricPatchifier(patch_size=1)
    tokenizer = T5Tokenizer.from_pretrained(
        text_encoder_model_name_or_path, subfolder="tokenizer"
    )

    transformer = transformer.to(device)
    vae = vae.to(device)
    text_encoder = text_encoder.to(device)

    if enhance_prompt:
        prompt_enhancer_image_caption_model = AutoModelForCausalLM.from_pretrained(
            prompt_enhancer_image_caption_model_name_or_path, trust_remote_code=True
        )
        prompt_enhancer_image_caption_processor = AutoProcessor.from_pretrained(
            prompt_enhancer_image_caption_model_name_or_path, trust_remote_code=True
        )
        prompt_enhancer_llm_model = AutoModelForCausalLM.from_pretrained(
            prompt_enhancer_llm_model_name_or_path,
            torch_dtype="bfloat16",
        )
        prompt_enhancer_llm_tokenizer = AutoTokenizer.from_pretrained(
            prompt_enhancer_llm_model_name_or_path,
        )
    else:
        prompt_enhancer_image_caption_model = None
        prompt_enhancer_image_caption_processor = None
        prompt_enhancer_llm_model = None
        prompt_enhancer_llm_tokenizer = None

    vae = vae.to(torch.bfloat16)
    if precision == "bfloat16" and transformer.dtype != torch.bfloat16:
        transformer = transformer.to(torch.bfloat16)
    text_encoder = text_encoder.to(torch.bfloat16)

    # Use submodels for the pipeline
    submodel_dict = {
        "transformer": transformer,
        "patchifier": patchifier,
        "text_encoder": text_encoder,
        "tokenizer": tokenizer,
        "scheduler": scheduler,
        "vae": vae,
        "prompt_enhancer_image_caption_model": prompt_enhancer_image_caption_model,
        "prompt_enhancer_image_caption_processor": prompt_enhancer_image_caption_processor,
        "prompt_enhancer_llm_model": prompt_enhancer_llm_model,
        "prompt_enhancer_llm_tokenizer": prompt_enhancer_llm_tokenizer,
        "allowed_inference_steps": allowed_inference_steps,
    }

    pipeline = LTXVideoPipeline(**submodel_dict)
    pipeline = pipeline.to(device)
    return pipeline


def create_latent_upsampler(latent_upsampler_model_path: str, device: str):
    latent_upsampler = LatentUpsampler.from_pretrained(latent_upsampler_model_path)
    latent_upsampler.to(device)
    latent_upsampler.eval()
    return latent_upsampler


def infer(
    output_path: Optional[str],
    seed: int,
    pipeline_config: str,
    image_cond_noise_scale: float,
    height: Optional[int],
    width: Optional[int],
    num_frames: int,
    frame_rate: int,
    prompt: str,
    negative_prompt: str,
    offload_to_cpu: bool,
    input_media_path: Optional[str] = None,
    conditioning_media_paths: Optional[List[str]] = None,
    conditioning_strengths: Optional[List[float]] = None,
    conditioning_start_frames: Optional[List[int]] = None,
    device: Optional[str] = None,
    **kwargs,
):
    # check if pipeline_config is a file
    if not os.path.isfile(pipeline_config):
        raise ValueError(f"Pipeline config file {pipeline_config} does not exist")
    with open(pipeline_config, "r") as f:
        pipeline_config = yaml.safe_load(f)

    models_dir = "MODEL_DIR"

    ltxv_model_name_or_path = pipeline_config["checkpoint_path"]
    if not os.path.isfile(ltxv_model_name_or_path):
        ltxv_model_path = hf_hub_download(
            repo_id="Lightricks/LTX-Video",
            filename=ltxv_model_name_or_path,
            local_dir=models_dir,
            repo_type="model",
        )
    else:
        ltxv_model_path = ltxv_model_name_or_path

    spatial_upscaler_model_name_or_path = pipeline_config.get(
        "spatial_upscaler_model_path"
    )
    if spatial_upscaler_model_name_or_path and not os.path.isfile(
        spatial_upscaler_model_name_or_path
    ):
        spatial_upscaler_model_path = hf_hub_download(
            repo_id="Lightricks/LTX-Video",
            filename=spatial_upscaler_model_name_or_path,
            local_dir=models_dir,
            repo_type="model",
        )
    else:
        spatial_upscaler_model_path = spatial_upscaler_model_name_or_path

    if kwargs.get("input_image_path", None):
        logger.warning(
            "Please use conditioning_media_paths instead of input_image_path."
        )
        assert not conditioning_media_paths and not conditioning_start_frames
        conditioning_media_paths = [kwargs["input_image_path"]]
        conditioning_start_frames = [0]

    # Validate conditioning arguments
    if conditioning_media_paths:
        # Use default strengths of 1.0
        if not conditioning_strengths:
            conditioning_strengths = [1.0] * len(conditioning_media_paths)
        if not conditioning_start_frames:
            raise ValueError(
                "If `conditioning_media_paths` is provided, "
                "`conditioning_start_frames` must also be provided"
            )
        if len(conditioning_media_paths) != len(conditioning_strengths) or len(
            conditioning_media_paths
        ) != len(conditioning_start_frames):
            raise ValueError(
                "`conditioning_media_paths`, `conditioning_strengths`, "
                "and `conditioning_start_frames` must have the same length"
            )
        if any(s < 0 or s > 1 for s in conditioning_strengths):
            raise ValueError("All conditioning strengths must be between 0 and 1")
        if any(f < 0 or f >= num_frames for f in conditioning_start_frames):
            raise ValueError(
                f"All conditioning start frames must be between 0 and {num_frames-1}"
            )

    seed_everething(seed)
    if offload_to_cpu and not torch.cuda.is_available():
        logger.warning(
            "offload_to_cpu is set to True, but offloading will not occur since the model is already running on CPU."
        )
        offload_to_cpu = False
    else:
        offload_to_cpu = offload_to_cpu and get_total_gpu_memory() < 30

    output_dir = (
        Path(output_path)
        if output_path
        else Path(f"outputs/{datetime.today().strftime('%Y-%m-%d')}")
    )
    output_dir.mkdir(parents=True, exist_ok=True)

    # Adjust dimensions to be divisible by 32 and num_frames to be (N * 8 + 1)
    height_padded = ((height - 1) // 32 + 1) * 32
    width_padded = ((width - 1) // 32 + 1) * 32
    num_frames_padded = ((num_frames - 2) // 8 + 1) * 8 + 1

    padding = calculate_padding(height, width, height_padded, width_padded)

    logger.warning(
        f"Padded dimensions: {height_padded}x{width_padded}x{num_frames_padded}"
    )

    prompt_enhancement_words_threshold = pipeline_config[
        "prompt_enhancement_words_threshold"
    ]

    prompt_word_count = len(prompt.split())
    enhance_prompt = (
        prompt_enhancement_words_threshold > 0
        and prompt_word_count < prompt_enhancement_words_threshold
    )

    if prompt_enhancement_words_threshold > 0 and not enhance_prompt:
        logger.info(
            f"Prompt has {prompt_word_count} words, which exceeds the threshold of {prompt_enhancement_words_threshold}. Prompt enhancement disabled."
        )

    precision = pipeline_config["precision"]
    text_encoder_model_name_or_path = pipeline_config["text_encoder_model_name_or_path"]
    sampler = pipeline_config["sampler"]
    prompt_enhancer_image_caption_model_name_or_path = pipeline_config[
        "prompt_enhancer_image_caption_model_name_or_path"
    ]
    prompt_enhancer_llm_model_name_or_path = pipeline_config[
        "prompt_enhancer_llm_model_name_or_path"
    ]

    pipeline = create_ltx_video_pipeline(
        ckpt_path=ltxv_model_path,
        precision=precision,
        text_encoder_model_name_or_path=text_encoder_model_name_or_path,
        sampler=sampler,
        device=kwargs.get("device", get_device()),
        enhance_prompt=enhance_prompt,
        prompt_enhancer_image_caption_model_name_or_path=prompt_enhancer_image_caption_model_name_or_path,
        prompt_enhancer_llm_model_name_or_path=prompt_enhancer_llm_model_name_or_path,
    )

    if pipeline_config.get("pipeline_type", None) == "multi-scale":
        if not spatial_upscaler_model_path:
            raise ValueError(
                "spatial upscaler model path is missing from pipeline config file and is required for multi-scale rendering"
            )
        latent_upsampler = create_latent_upsampler(
            spatial_upscaler_model_path, pipeline.device
        )
        pipeline = LTXMultiScalePipeline(pipeline, latent_upsampler=latent_upsampler)

    media_item = None
    if input_media_path:
        media_item = load_media_file(
            media_path=input_media_path,
            height=height,
            width=width,
            max_frames=num_frames_padded,
            padding=padding,
        )

    conditioning_items = (
        prepare_conditioning(
            conditioning_media_paths=conditioning_media_paths,
            conditioning_strengths=conditioning_strengths,
            conditioning_start_frames=conditioning_start_frames,
            height=height,
            width=width,
            num_frames=num_frames,
            padding=padding,
            pipeline=pipeline,
        )
        if conditioning_media_paths
        else None
    )

    # Pop stg_mode so it is not passed on to the pipeline call below
    stg_mode = pipeline_config.pop("stg_mode", "attention_values")
    if stg_mode.lower() == "stg_av" or stg_mode.lower() == "attention_values":
        skip_layer_strategy = SkipLayerStrategy.AttentionValues
    elif stg_mode.lower() == "stg_as" or stg_mode.lower() == "attention_skip":
        skip_layer_strategy = SkipLayerStrategy.AttentionSkip
    elif stg_mode.lower() == "stg_r" or stg_mode.lower() == "residual":
        skip_layer_strategy = SkipLayerStrategy.Residual
    elif stg_mode.lower() == "stg_t" or stg_mode.lower() == "transformer_block":
        skip_layer_strategy = SkipLayerStrategy.TransformerBlock
    else:
        raise ValueError(f"Invalid spatiotemporal guidance mode: {stg_mode}")

    # Prepare input for the pipeline
    sample = {
        "prompt": prompt,
        "prompt_attention_mask": None,
        "negative_prompt": negative_prompt,
        "negative_prompt_attention_mask": None,
    }

    device = device or get_device()
    generator = torch.Generator(device=device).manual_seed(seed)

    images = pipeline(
        **pipeline_config,
        skip_layer_strategy=skip_layer_strategy,
        generator=generator,
        output_type="pt",
        callback_on_step_end=None,
        height=height_padded,
        width=width_padded,
        num_frames=num_frames_padded,
        frame_rate=frame_rate,
        **sample,
        media_items=media_item,
        conditioning_items=conditioning_items,
        is_video=True,
        vae_per_channel_normalize=True,
        image_cond_noise_scale=image_cond_noise_scale,
        mixed_precision=(precision == "mixed_precision"),
        offload_to_cpu=offload_to_cpu,
        device=device,
        enhance_prompt=enhance_prompt,
    ).images

    # Crop the padded images to the desired resolution and number of frames
    (pad_left, pad_right, pad_top, pad_bottom) = padding
    pad_bottom = -pad_bottom
    pad_right = -pad_right
    if pad_bottom == 0:
        pad_bottom = images.shape[3]
    if pad_right == 0:
        pad_right = images.shape[4]
    images = images[:, :, :num_frames, pad_top:pad_bottom, pad_left:pad_right]

    for i in range(images.shape[0]):
        # Gathering from B, C, F, H, W to C, F, H, W and then permuting to F, H, W, C
        video_np = images[i].permute(1, 2, 3, 0).cpu().float().numpy()
        # Unnormalizing images to [0, 255] range
        video_np = (video_np * 255).astype(np.uint8)
        fps = frame_rate
        height, width = video_np.shape[1:3]
        # In case a single image is generated
        if video_np.shape[0] == 1:
            output_filename = get_unique_filename(
                f"image_output_{i}",
                ".png",
                prompt=prompt,
                seed=seed,
                resolution=(height, width, num_frames),
                dir=output_dir,
            )
            imageio.imwrite(output_filename, video_np[0])
        else:
            output_filename = get_unique_filename(
                f"video_output_{i}",
                ".mp4",
                prompt=prompt,
                seed=seed,
                resolution=(height, width, num_frames),
                dir=output_dir,
            )

            # Write video
            with imageio.get_writer(output_filename, fps=fps) as video:
                for frame in video_np:
                    video.append_data(frame)

        logger.warning(f"Output saved to {output_filename}")


def prepare_conditioning(
    conditioning_media_paths: List[str],
    conditioning_strengths: List[float],
    conditioning_start_frames: List[int],
    height: int,
    width: int,
    num_frames: int,
    padding: tuple[int, int, int, int],
    pipeline: LTXVideoPipeline,
) -> Optional[List[ConditioningItem]]:
    """Prepare conditioning items based on input media paths and their parameters.

    Args:
        conditioning_media_paths: List of paths to conditioning media (images or videos)
        conditioning_strengths: List of conditioning strengths for each media item
        conditioning_start_frames: List of frame indices where each item should be applied
        height: Height of the output frames
        width: Width of the output frames
        num_frames: Number of frames in the output video
        padding: Padding to apply to the frames
        pipeline: LTXVideoPipeline object used for condition video trimming

    Returns:
        A list of ConditioningItem objects.
    """
    conditioning_items = []
    for path, strength, start_frame in zip(
        conditioning_media_paths, conditioning_strengths, conditioning_start_frames
    ):
        num_input_frames = orig_num_input_frames = get_media_num_frames(path)
        if hasattr(pipeline, "trim_conditioning_sequence") and callable(
            getattr(pipeline, "trim_conditioning_sequence")
        ):
            num_input_frames = pipeline.trim_conditioning_sequence(
                start_frame, orig_num_input_frames, num_frames
            )
        if num_input_frames < orig_num_input_frames:
            logger.warning(
                f"Trimming conditioning video {path} from {orig_num_input_frames} to {num_input_frames} frames."
            )

        media_tensor = load_media_file(
            media_path=path,
            height=height,
            width=width,
            max_frames=num_input_frames,
            padding=padding,
            just_crop=True,
        )
        conditioning_items.append(ConditioningItem(media_tensor, start_frame, strength))
    return conditioning_items


def get_media_num_frames(media_path: str) -> int:
    is_video = any(
        media_path.lower().endswith(ext) for ext in [".mp4", ".avi", ".mov", ".mkv"]
    )
    num_frames = 1
    if is_video:
        reader = imageio.get_reader(media_path)
        num_frames = reader.count_frames()
        reader.close()
    return num_frames


def load_media_file(
    media_path: str,
    height: int,
    width: int,
    max_frames: int,
    padding: tuple[int, int, int, int],
    just_crop: bool = False,
) -> torch.Tensor:
    is_video = any(
        media_path.lower().endswith(ext) for ext in [".mp4", ".avi", ".mov", ".mkv"]
    )
    if is_video:
        reader = imageio.get_reader(media_path)
        num_input_frames = min(reader.count_frames(), max_frames)

        # Read and preprocess the relevant frames from the video file.
        frames = []
        for i in range(num_input_frames):
            frame = Image.fromarray(reader.get_data(i))
            frame_tensor = load_image_to_tensor_with_resize_and_crop(
                frame, height, width, just_crop=just_crop
            )
            frame_tensor = torch.nn.functional.pad(frame_tensor, padding)
            frames.append(frame_tensor)
        reader.close()

        # Stack frames along the temporal dimension
        media_tensor = torch.cat(frames, dim=2)
    else:  # Input image
        media_tensor = load_image_to_tensor_with_resize_and_crop(
            media_path, height, width, just_crop=just_crop
        )
        media_tensor = torch.nn.functional.pad(media_tensor, padding)
    return media_tensor


if __name__ == "__main__":
    main()
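Besides the CLI entry point in main(), infer() can be called directly from Python. A minimal sketch under stated assumptions: the YAML config path and "start.png" are placeholders supplied by the caller, and the LTX-Video package must be importable for the module-level imports above to succeed.

# Hypothetical direct call into infer(), bypassing argparse; values mirror the CLI defaults.
from inference import infer

infer(
    output_path=None,                               # default: outputs/<date>/
    seed=171198,
    pipeline_config="configs/ltxv-13b-0.9.7-dev.yaml",  # assumption: config available locally
    image_cond_noise_scale=0.15,
    height=704,
    width=1216,
    num_frames=121,
    frame_rate=30,
    prompt="A majestic dragon flying over a medieval castle",
    negative_prompt="worst quality, inconsistent motion, blurry, jittery, distorted",
    offload_to_cpu=False,
    conditioning_media_paths=["start.png"],         # placeholder conditioning image
    conditioning_start_frames=[0],
)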
requirements.txt ADDED
@@ -0,0 +1,15 @@
accelerate
transformers
sentencepiece
pillow
numpy
torchvision
huggingface_hub
spaces
opencv-python
imageio
imageio-ffmpeg
einops
timm
av
git+https://github.com/huggingface/diffusers.git@main
setup.py ADDED
@@ -0,0 +1,63 @@
# setup.py
#
# Copyright (C) August 4, 2025 Carlos Rodrigues dos Santos
#
# Version 2.0.0 (anonymous, robust cloning)
# - Uses explicit, anonymous HTTPS URLs so that Git does not try to
#   use cached credentials unnecessarily for public repositories.

import os
import subprocess
import sys
from pathlib import Path

# --- Configuration ---
DEPS_DIR = Path("./deps")

# Explicit, anonymous URLs for the public repositories
REPOS_TO_CLONE = {
    "LTX-Video": "https://huggingface.co/spaces/Lightricks/ltx-video-distilled",
    "SeedVR_Space": "https://huggingface.co/spaces/ByteDance-Seed/SeedVR2-3B",
    "MMAudio": "https://github.com/hkchengrex/MMAudio.git"
}

def run_command(command, cwd=None):
    """Runs a shell command and handles errors."""
    print(f"Running: {' '.join(command)}")
    try:
        # Redirect stdin to DEVNULL so that git never prompts for a password
        subprocess.run(
            command,
            check=True,
            cwd=cwd,
            stdin=subprocess.DEVNULL,
        )
    except subprocess.CalledProcessError as e:
        print(f"ERROR: the command failed with exit code {e.returncode}")
        # e.stderr is only populated when output is captured, so it may be None here
        if e.stderr:
            print(f"Stderr: {e.stderr}")
        sys.exit(1)
    except FileNotFoundError:
        print(f"ERROR: the command '{command[0]}' was not found. Make sure git is installed and on your PATH.")
        sys.exit(1)

def main():
    print("--- Starting ADUC-SDR environment setup ---")

    DEPS_DIR.mkdir(exist_ok=True)

    for repo_name, repo_url in REPOS_TO_CLONE.items():
        repo_path = DEPS_DIR / repo_name
        if repo_path.exists():
            print(f"Repository '{repo_name}' already exists. Skipping clone.")
        else:
            print(f"Cloning '{repo_name}' from {repo_url}...")
            run_command(["git", "clone", "--depth", "1", repo_url, str(repo_path)])
            print(f"'{repo_name}' cloned successfully.")

    print("\n--- Environment setup finished successfully! ---")
    print("You can now start the main application (e.g. python app.py).")

if __name__ == "__main__":
    main()
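A quick way to sanity-check this setup step is to run it once and confirm the expected clone targets exist under deps/. The sketch below is a hypothetical smoke test; the directory names come from REPOS_TO_CLONE above and nothing else is assumed.

# Hypothetical smoke test: run setup.py and verify the cloned repositories exist.
import subprocess
import sys
from pathlib import Path

subprocess.run([sys.executable, "setup.py"], check=True)
for name in ("LTX-Video", "SeedVR_Space", "MMAudio"):
    assert (Path("deps") / name).is_dir(), f"missing clone: {name}"
print("all dependency repos present")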
video_service.py ADDED
@@ -0,0 +1,295 @@
# video_service.py

# --- 1. IMPORTS ---
import torch
import numpy as np
import random
import os
import yaml
from pathlib import Path
import imageio
import tempfile
from huggingface_hub import hf_hub_download
import sys
import subprocess
from PIL import Image

# --- 2. DEPENDENCY MANAGEMENT AND SETUP ---

def run_setup():
    """Runs the setup.py script to clone the required dependencies."""
    setup_script_path = "setup.py"
    if not os.path.exists(setup_script_path):
        print("WARNING: 'setup.py' script not found. Skipping dependency cloning.")
        return
    try:
        print("--- Running setup.py to make sure the dependencies are present ---")
        subprocess.run([sys.executable, setup_script_path], check=True)
        print("--- Setup finished successfully ---")
    except subprocess.CalledProcessError as e:
        print(f"CRITICAL ERROR DURING SETUP: 'setup.py' failed with exit code {e.returncode}.")
        sys.exit(1)

DEPS_DIR = Path("./deps")
LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
if not LTX_VIDEO_REPO_DIR.exists():
    run_setup()

def add_deps_to_path():
    """Adds the cloned repository to sys.path so that its libraries can be imported."""
    if not LTX_VIDEO_REPO_DIR.exists():
        raise FileNotFoundError(f"LTX-Video repository not found at '{LTX_VIDEO_REPO_DIR}'. Run the setup.")
    if str(LTX_VIDEO_REPO_DIR.resolve()) not in sys.path:
        sys.path.insert(0, str(LTX_VIDEO_REPO_DIR.resolve()))

add_deps_to_path()

# --- 3. MODEL-SPECIFIC IMPORTS ---
from inference import (
    create_ltx_video_pipeline, create_latent_upsampler,
    load_image_to_tensor_with_resize_and_crop, seed_everething,
    calculate_padding, load_media_file
)
from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline
from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy

# --- 4. LOGGING HELPERS ---
def log_tensor_info(tensor, name="Tensor"):
    if not isinstance(tensor, torch.Tensor):
        print(f"\n[INFO] Item '{name}' is not a tensor, nothing to log.")
        return
    print(f"\n--- Tensor info: {name} ---")
    print(f"  - Shape: {tensor.shape}")
    print(f"  - Dtype: {tensor.dtype}")
    print(f"  - Device: {tensor.device}")
    if tensor.numel() > 0:
        print(f"  - Min value: {tensor.min().item():.4f}")
        print(f"  - Max value: {tensor.max().item():.4f}")
        print(f"  - Mean: {tensor.mean().item():.4f}")
    else:
        print("  - Tensor is empty, no statistics.")
    print("------------------------------------------\n")

# --- 5. MAIN SERVICE CLASS ---
class VideoService:
    def __init__(self):
        print("Initializing VideoService...")
        self.config = self._load_config()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.last_memory_reserved_mb = 0
        self.pipeline, self.latent_upsampler = self._load_models()
        print(f"Moving models to the inference device: {self.device}")
        self.pipeline.to(self.device)
        if self.latent_upsampler:
            self.latent_upsampler.to(self.device)
        if self.device == "cuda":
            torch.cuda.empty_cache()
        self._log_gpu_memory("After loading models")
        print("VideoService is ready.")

    def _log_gpu_memory(self, stage_name: str):
        if self.device != "cuda": return
        current_reserved_b = torch.cuda.memory_reserved()
        current_reserved_mb = current_reserved_b / (1024 ** 2)
        total_memory_b = torch.cuda.get_device_properties(0).total_memory
        total_memory_mb = total_memory_b / (1024 ** 2)
        peak_reserved_mb = torch.cuda.max_memory_reserved() / (1024 ** 2)
        delta_mb = current_reserved_mb - self.last_memory_reserved_mb
        print(f"\n--- [GPU MEMORY LOG] - {stage_name} ---")
        print(f"  - Current usage (reserved): {current_reserved_mb:.2f} MB / {total_memory_mb:.2f} MB")
        print(f"  - Change since last log: {delta_mb:+.2f} MB")
        if peak_reserved_mb > self.last_memory_reserved_mb:
            print(f"  - Peak usage (this operation): {peak_reserved_mb:.2f} MB")
        print("--------------------------------------------------\n")
        self.last_memory_reserved_mb = current_reserved_mb

    def _load_config(self):
        config_file_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled.yaml"
        with open(config_file_path, "r") as file:
            return yaml.safe_load(file)

    def _load_models(self):
        models_dir = "downloaded_models_gradio"
        Path(models_dir).mkdir(parents=True, exist_ok=True)
        LTX_REPO = "Lightricks/LTX-Video"
        distilled_model_path = hf_hub_download(repo_id=LTX_REPO, filename=self.config["checkpoint_path"], local_dir=models_dir, local_dir_use_symlinks=False)
        self.config["checkpoint_path"] = distilled_model_path
        spatial_upscaler_path = hf_hub_download(repo_id=LTX_REPO, filename=self.config["spatial_upscaler_model_path"], local_dir=models_dir, local_dir_use_symlinks=False)
        self.config["spatial_upscaler_model_path"] = spatial_upscaler_path
        pipeline = create_ltx_video_pipeline(ckpt_path=self.config["checkpoint_path"], precision=self.config["precision"], text_encoder_model_name_or_path=self.config["text_encoder_model_name_or_path"], sampler=self.config["sampler"], device="cpu", enhance_prompt=False, prompt_enhancer_image_caption_model_name_or_path=self.config["prompt_enhancer_image_caption_model_name_or_path"], prompt_enhancer_llm_model_name_or_path=self.config["prompt_enhancer_llm_model_name_or_path"])
        latent_upsampler = None
        if self.config.get("spatial_upscaler_model_path"):
            latent_upsampler = create_latent_upsampler(self.config["spatial_upscaler_model_path"], device="cpu")
        return pipeline, latent_upsampler

    def _prepare_conditioning_tensor_from_file(self, filepath, height, width, padding_values):
        """Prepares a conditioning tensor from an image file."""
        tensor = load_image_to_tensor_with_resize_and_crop(filepath, height, width)
        tensor = torch.nn.functional.pad(tensor, padding_values)
        return tensor.to(self.device)

    def _extract_frames_from_video(self, video_path: str, frame_indices: list) -> list:
        print(f"[INFO] Extracting frames at indices {frame_indices} from video '{video_path}'")
        extracted_frames = []
        indices_to_get = set(frame_indices)
        try:
            with imageio.get_reader(video_path) as reader:
                for i, frame in enumerate(reader):
                    if i in indices_to_get:
                        extracted_frames.append(frame)
                    if len(extracted_frames) == len(indices_to_get):
                        break
            if len(extracted_frames) != len(frame_indices):
                print(f"[WARNING] Expected to extract {len(frame_indices)} frames, but the video only had {len(extracted_frames)} matching frames.")
        except Exception as e:
            print(f"[ERROR] Failed to extract frames from the video: {e}")
        return extracted_frames

    def _get_video_dimensions(self, video_path: str) -> tuple[int, int]:
        """Reads a video file and returns its width and height."""
        try:
            with imageio.get_reader(video_path) as reader:
                meta = reader.get_meta_data()
                size = meta.get('size')
                if size:
                    return size
            return (None, None)
        except Exception as e:
            print(f"[ERROR] Could not read the video dimensions: {e}")
            return (None, None)

    def generate(self, prompt, negative_prompt, mode="text-to-video",
                 start_image_filepath=None,
                 middle_image_filepath=None, middle_frame_number=None, middle_image_weight=1.0,
                 end_image_filepath=None, end_image_weight=1.0,
                 input_video_filepath=None, height=512, width=704, duration=2.0,
                 frames_to_use=9, seed=42, randomize_seed=True, guidance_scale=3.0,
                 improve_texture=True, progress_callback=None):
        if self.device == "cuda":
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
        self._log_gpu_memory("Start of generation")

        if mode == "image-to-video" and not start_image_filepath:
            raise ValueError("A start image is required for image-to-video mode")
        if mode == "video-to-video" and not input_video_filepath:
            raise ValueError("An input video is required for video-to-video mode")

        used_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
        seed_everething(used_seed)

        if mode == "video-to-video":
            orig_w, orig_h = self._get_video_dimensions(input_video_filepath)
            if orig_w and orig_h:
                width = round(orig_w / 32) * 32
                height = round(orig_h / 32) * 32
                print(f"[INFO] video-to-video mode: dimensions recalculated to {width}x{height}")

        FPS = 24.0
        MAX_NUM_FRAMES = 257
        target_frames_rounded = round(duration * FPS)
        n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
        actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))

        height_padded = ((height - 1) // 32 + 1) * 32
        width_padded = ((width - 1) // 32 + 1) * 32
        padding_values = calculate_padding(height, width, height_padded, width_padded)

        generator = torch.Generator(device=self.device).manual_seed(used_seed)

        conditioning_items = []

        if mode == "image-to-video":
            start_tensor = self._prepare_conditioning_tensor_from_file(start_image_filepath, height, width, padding_values)
            conditioning_items.append(ConditioningItem(start_tensor, 0, 1.0))
            if middle_image_filepath and middle_frame_number is not None:
                middle_tensor = self._prepare_conditioning_tensor_from_file(middle_image_filepath, height, width, padding_values)
                safe_middle_frame = max(0, min(int(middle_frame_number), actual_num_frames - 1))
                conditioning_items.append(ConditioningItem(middle_tensor, safe_middle_frame, float(middle_image_weight)))
            if end_image_filepath:
                end_tensor = self._prepare_conditioning_tensor_from_file(end_image_filepath, height, width, padding_values)
                last_frame_index = actual_num_frames - 1
                conditioning_items.append(ConditioningItem(end_tensor, last_frame_index, float(end_image_weight)))

        # --- <CORRECTED AND SIMPLIFIED LOGIC> ---
        elif mode == "video-to-video":
            indices_to_extract = list(range(0, int(frames_to_use), 8))
            extracted_frames_np = self._extract_frames_from_video(input_video_filepath, indices_to_extract)
            with tempfile.TemporaryDirectory() as temp_dir:
                for i, frame_np in enumerate(extracted_frames_np):
                    frame_index = indices_to_extract[i]
                    temp_frame_path = os.path.join(temp_dir, f"frame_{frame_index}.png")
                    imageio.imwrite(temp_frame_path, frame_np)

                    # Reuse the image-processing helper, as suggested
                    frame_tensor = self._prepare_conditioning_tensor_from_file(
                        temp_frame_path, height, width, padding_values
                    )
                    # Same target-frame offset as the original counter-based code: frame_index + 7
                    conditioning_items.append(ConditioningItem(frame_tensor, frame_index + 7, 0.5))
            print(f"[INFO] {len(conditioning_items)} frames from the video were processed as conditioning keyframes.")

        call_kwargs = {
            "prompt": prompt, "negative_prompt": negative_prompt, "height": height_padded, "width": width_padded,
            "num_frames": actual_num_frames, "frame_rate": int(FPS), "generator": generator, "output_type": "pt",
            "conditioning_items": conditioning_items if conditioning_items else None,
            "media_items": None,
            "decode_timestep": self.config["decode_timestep"], "decode_noise_scale": self.config["decode_noise_scale"],
            "stochastic_sampling": True,  # self.config["stochastic_sampling"], "image_cond_noise_scale": 0.15,
            "is_video": False, "vae_per_channel_normalize": True,
            "mixed_precision": True,  # (self.config["precision"] == "mixed_precision"),
            "offload_to_cpu": False, "enhance_prompt": False,
            "skip_layer_strategy": None,  # SkipLayerStrategy.AttentionValues
        }

        result_tensor = None
        if improve_texture:
            if not self.latent_upsampler:
                raise ValueError("The spatial upscaler is not loaded.")
            multi_scale_pipeline = LTXMultiScalePipeline(self.pipeline, self.latent_upsampler)
            first_pass_args = self.config.get("first_pass", {}).copy()
            first_pass_args["guidance_scale"] = float(guidance_scale)
            second_pass_args = self.config.get("second_pass", {}).copy()
            second_pass_args["guidance_scale"] = float(guidance_scale)
            multi_scale_call_kwargs = call_kwargs.copy()
            multi_scale_call_kwargs.update({"downscale_factor": self.config["downscale_factor"], "first_pass": first_pass_args, "second_pass": second_pass_args})
            result_tensor = multi_scale_pipeline(**multi_scale_call_kwargs).images
            log_tensor_info(result_tensor, "Result of stage 2 (multi-scale pipeline output)")
        else:
            single_pass_kwargs = call_kwargs.copy()
            first_pass_config = self.config.get("first_pass", {})
            single_pass_kwargs.update({
                "guidance_scale": float(guidance_scale),
                "stg_scale": first_pass_config.get("stg_scale"),
                "rescaling_scale": first_pass_config.get("rescaling_scale"),
                "skip_block_list": first_pass_config.get("skip_block_list"),
                "timesteps": first_pass_config.get("timesteps"),
            })

            print("\n[INFO] Running the single-pass pipeline...")
            result_tensor = self.pipeline(**single_pass_kwargs).images

        pad_left, pad_right, pad_top, pad_bottom = padding_values
        slice_h_end = -pad_bottom if pad_bottom > 0 else None
        slice_w_end = -pad_right if pad_right > 0 else None

        result_tensor = result_tensor[:, :, :actual_num_frames, pad_top:slice_h_end, pad_left:slice_w_end]
        log_tensor_info(result_tensor, "Final tensor (after post-processing, before saving)")

        video_np = (result_tensor[0].permute(1, 2, 3, 0).cpu().float().numpy() * 255).astype(np.uint8)
        temp_dir = tempfile.mkdtemp()
        output_video_path = os.path.join(temp_dir, f"output_{used_seed}.mp4")

        with imageio.get_writer(output_video_path, fps=call_kwargs["frame_rate"], codec='libx264', quality=8) as writer:
            total_frames = len(video_np)
            for i, frame in enumerate(video_np):
                writer.append_data(frame)
                if progress_callback:
                    progress_callback(i + 1, total_frames)

        self._log_gpu_memory("End of generation")
        return output_video_path, used_seed

print("Creating the VideoService instance. Model loading will start now...")
video_generation_service = VideoService()
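Since the module instantiates `video_generation_service` at import time (downloading the checkpoints and building the pipeline), callers only need to import it and call `generate()`. A minimal usage sketch, assuming the models are already downloadable and that "start.png" is a placeholder path supplied by the caller:

# Hypothetical caller of the service defined above; importing video_service
# triggers setup, model download and pipeline construction.
from video_service import video_generation_service

video_path, seed = video_generation_service.generate(
    prompt="A gentle camera push-in on a misty forest at dawn",
    negative_prompt="worst quality, blurry, jittery",
    mode="image-to-video",
    start_image_filepath="start.png",  # placeholder start keyframe
    height=512,
    width=704,
    duration=4.0,
)
print(f"video written to {video_path} (seed {seed})")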