EuuIia committed
Commit ac23084 · verified · 1 Parent(s): f54c95b

Upload 6 files

Files changed (6)
  1. README.md +9 -6
  2. app.py +202 -0
  3. inference.py +774 -0
  4. requirements.txt +15 -0
  5. setup.py +63 -0
  6. video_service.py +295 -0
README.md CHANGED
@@ -1,10 +1,13 @@
  ---
- title: Test
- emoji: 📊
- colorFrom: pink
- colorTo: red
- sdk: docker
+ title: LTX Video Fast
+ emoji: 🎥
+ colorFrom: yellow
+ colorTo: pink
+ sdk: gradio
+ sdk_version: 5.42.0
+ app_file: app.py
  pinned: false
+ short_description: ultra-fast video model, LTX 0.9.8 13B distilled
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,202 @@
# app.py (corrected version)

import gradio as gr
from PIL import Image
import os
import imageio
from video_service import video_generation_service

# --- UI HELPER FUNCTIONS ---
# ... (calculate_new_dimensions and handle_media_upload_for_dims are unchanged) ...
TARGET_FIXED_SIDE = 768
MIN_DIM_SLIDER = 256
MAX_IMAGE_SIZE = 1280

def calculate_new_dimensions(orig_w, orig_h):
    if orig_w == 0 or orig_h == 0:
        return int(TARGET_FIXED_SIDE), int(TARGET_FIXED_SIDE)
    if orig_w >= orig_h:
        new_h, aspect_ratio = TARGET_FIXED_SIDE, orig_w / orig_h
        new_w = round((new_h * aspect_ratio) / 32) * 32
        new_w = max(MIN_DIM_SLIDER, min(new_w, MAX_IMAGE_SIZE))
        new_h = max(MIN_DIM_SLIDER, min(new_h, MAX_IMAGE_SIZE))
    else:
        new_w, aspect_ratio = TARGET_FIXED_SIDE, orig_h / orig_w
        new_h = round((new_w * aspect_ratio) / 32) * 32
        new_h = max(MIN_DIM_SLIDER, min(new_h, MAX_IMAGE_SIZE))
        new_w = max(MIN_DIM_SLIDER, min(new_w, MAX_IMAGE_SIZE))
    return int(new_h), int(new_w)

def handle_media_upload_for_dims(filepath, current_h, current_w):
    if not filepath or not os.path.exists(str(filepath)):
        return gr.update(value=current_h), gr.update(value=current_w)
    try:
        if str(filepath).lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
            with Image.open(filepath) as img:
                orig_w, orig_h = img.size
        else:  # Assume it is a video
            with imageio.get_reader(filepath) as reader:
                meta = reader.get_meta_data()
                orig_w, orig_h = meta.get('size', (current_w, current_h))
        new_h, new_w = calculate_new_dimensions(orig_w, orig_h)
        return gr.update(value=new_h), gr.update(value=new_w)
    except Exception as e:
        print(f"Error while reading media dimensions: {e}")
        return gr.update(value=current_h), gr.update(value=current_w)

def update_frame_slider(duration):
    """Updates the maximum of the middle-frame slider based on the duration."""
    fps = 24.0
    max_frames = int(duration * fps)
    # Make sure the default value is never larger than the new maximum
    new_value = 48 if max_frames >= 48 else max_frames // 2
    return gr.update(maximum=max_frames, value=new_value)


# --- WRAPPER FUNCTION THAT CALLS THE SERVICE ---
def gradio_generate_wrapper(
    prompt, negative_prompt, mode,
    # Keyframe inputs
    start_image,
    middle_image, middle_frame, middle_weight,
    end_image, end_weight,
    # Other inputs
    input_video, height, width, duration,
    frames_to_use, seed, randomize_seed,
    guidance_scale, improve_texture,
    progress=gr.Progress(track_tqdm=True)
):
    try:
        def progress_handler(step, total_steps):
            progress(step / total_steps, desc="Saving video...")

        output_path, used_seed = video_generation_service.generate(
            prompt=prompt, negative_prompt=negative_prompt, mode=mode,
            start_image_filepath=start_image,
            middle_image_filepath=middle_image,
            middle_frame_number=middle_frame,
            middle_image_weight=middle_weight,
            end_image_filepath=end_image,
            end_image_weight=end_weight,
            input_video_filepath=input_video,
            height=int(height), width=int(width), duration=float(duration),
            frames_to_use=int(frames_to_use), seed=int(seed),
            randomize_seed=bool(randomize_seed), guidance_scale=float(guidance_scale),
            improve_texture=bool(improve_texture), progress_callback=progress_handler
        )
        return output_path, used_seed
    except ValueError as e:
        raise gr.Error(str(e))
    except Exception as e:
        print(f"Unexpected error during generation: {e}")
        raise gr.Error("An unexpected error occurred. Check the logs.")

# --- GRADIO INTERFACE DEFINITION ---
css = "#col-container { margin: 0 auto; max-width: 900px; }"
with gr.Blocks(css=css) as demo:
    gr.Markdown("# LTX Video with Keyframes")
    gr.Markdown("Guide the video generation with start, middle and end images.")

    with gr.Row():
        with gr.Column():
            with gr.Tab("image-to-video (Keyframes)") as image_tab:
                i2v_prompt = gr.Textbox(label="Prompt", value="A beautiful transition between the images", lines=2)

                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("#### Start (required)")
                        start_image_i2v = gr.Image(label="Start Image", type="filepath", sources=["upload", "clipboard"])
                    with gr.Column(scale=1):
                        gr.Markdown("#### Middle (optional)")
                        middle_image_i2v = gr.Image(label="Middle Image", type="filepath", sources=["upload", "clipboard"])
                        middle_frame_i2v = gr.Slider(label="Target Frame", minimum=0, maximum=200, step=1, value=48)
                        middle_weight_i2v = gr.Slider(label="Weight/Strength", minimum=0.0, maximum=1.0, step=0.05, value=1.0)
                    with gr.Column(scale=1):
                        gr.Markdown("#### End (optional)")
                        end_image_i2v = gr.Image(label="End Image", type="filepath", sources=["upload", "clipboard"])
                        end_weight_i2v = gr.Slider(label="Weight/Strength", minimum=0.0, maximum=1.0, step=0.05, value=1.0)

                i2v_button = gr.Button("Generate Image-to-Video", variant="primary")

            with gr.Tab("text-to-video") as text_tab:
                t2v_prompt = gr.Textbox(label="Prompt", value="A majestic dragon flying over a medieval castle", lines=3)
                t2v_button = gr.Button("Generate Text-to-Video", variant="primary")

            with gr.Tab("video-to-video") as video_tab:
                video_v2v = gr.Video(label="Input Video", sources=["upload", "webcam"])
                frames_to_use = gr.Slider(label="Frames to use from input video", minimum=9, maximum=257, value=9, step=8, info="Must be N*8+1.")
                v2v_prompt = gr.Textbox(label="Prompt", value="Change the style to cinematic anime", lines=3)
                v2v_button = gr.Button("Generate Video-to-Video", variant="primary")

            duration_input = gr.Slider(label="Video Duration (seconds)", minimum=0.3, maximum=8.5, value=4, step=0.1)
            improve_texture = gr.Checkbox(label="Improve Texture (multi-scale)", value=True, visible=True)

        with gr.Column():
            output_video = gr.Video(label="Generated Video", interactive=False)

    with gr.Accordion("Advanced settings", open=False):
        mode = gr.Dropdown(["text-to-video", "image-to-video", "video-to-video"], label="task", value="image-to-video", visible=False)
        negative_prompt_input = gr.Textbox(label="Negative Prompt", value="worst quality, blurry, jittery", lines=2)
        with gr.Row():
            seed_input = gr.Number(label="Seed", value=42, precision=0)
            randomize_seed_input = gr.Checkbox(label="Randomize Seed", value=True)
        guidance_scale_input = gr.Slider(label="Guidance Scale (CFG)", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
        with gr.Row():
            height_input = gr.Slider(label="Height", value=512, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE)
            width_input = gr.Slider(label="Width", value=704, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE)

    # --- UI EVENT WIRING ---

    start_image_i2v.upload(fn=handle_media_upload_for_dims, inputs=[start_image_i2v, height_input, width_input], outputs=[height_input, width_input])
    video_v2v.upload(fn=handle_media_upload_for_dims, inputs=[video_v2v, height_input, width_input], outputs=[height_input, width_input])
    duration_input.change(fn=update_frame_slider, inputs=duration_input, outputs=middle_frame_i2v)

    image_tab.select(fn=lambda: "image-to-video", outputs=[mode])
    text_tab.select(fn=lambda: "text-to-video", outputs=[mode])
    video_tab.select(fn=lambda: "video-to-video", outputs=[mode])

    # --- <START OF FIX> ---
    # The input lists are spelled out explicitly to avoid wiring mistakes.

    # Placeholders for buttons that do not use certain inputs
    none_image = gr.Textbox(visible=False, value=None)
    none_video = gr.Textbox(visible=False, value=None)

    # Parameters shared by all tasks
    shared_params = [
        height_input, width_input, duration_input, frames_to_use,
        seed_input, randomize_seed_input, guidance_scale_input, improve_texture
    ]

    i2v_inputs = [
        i2v_prompt, negative_prompt_input, mode,
        start_image_i2v, middle_image_i2v, middle_frame_i2v, middle_weight_i2v,
        end_image_i2v, end_weight_i2v,
        none_video,  # placeholder for input_video
        *shared_params
    ]

    t2v_inputs = [
        t2v_prompt, negative_prompt_input, mode,
        none_image, none_image, gr.Number(value=-1, visible=False), gr.Slider(value=0, visible=False),  # keyframe placeholders
        none_image, gr.Slider(value=0, visible=False),
        none_video,  # placeholder for input_video
        *shared_params
    ]

    v2v_inputs = [
        v2v_prompt, negative_prompt_input, mode,
        none_image, none_image, gr.Number(value=-1, visible=False), gr.Slider(value=0, visible=False),  # keyframe placeholders
        none_image, gr.Slider(value=0, visible=False),
        video_v2v,  # the real video input
        *shared_params
    ]

    common_outputs = [output_video, seed_input]

    i2v_button.click(fn=gradio_generate_wrapper, inputs=i2v_inputs, outputs=common_outputs, api_name="image_to_video_keyframes")
    t2v_button.click(fn=gradio_generate_wrapper, inputs=t2v_inputs, outputs=common_outputs, api_name="text_to_video")
    v2v_button.click(fn=gradio_generate_wrapper, inputs=v2v_inputs, outputs=common_outputs, api_name="video_to_video")
    # --- <END OF FIX> ---


if __name__ == "__main__":
    demo.queue().launch(debug=True, share=False)
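Because the three `.click()` handlers register named API endpoints (`image_to_video_keyframes`, `text_to_video`, `video_to_video`), the app can also be driven programmatically. Below is a minimal sketch using `gradio_client`, assuming the app is running locally on the default Gradio port; the positional arguments mirror the `t2v_inputs` list above, and all concrete values are illustrative, not mandated by the app:

# Hypothetical client-side call against the text_to_video endpoint defined above.
# Argument order mirrors t2v_inputs: prompt, negative prompt, mode, keyframe
# placeholders, input-video placeholder, then the shared parameters.
from gradio_client import Client

client = Client("http://127.0.0.1:7860")  # assumption: local default port
video_path, used_seed = client.predict(
    "A majestic dragon flying over a medieval castle",  # prompt
    "worst quality, blurry, jittery",                    # negative prompt
    "text-to-video",                                     # mode
    None, None, -1, 0,                                   # keyframe placeholders
    None, 0,                                             # end-image placeholders
    None,                                                # input-video placeholder
    512, 704, 4.0, 9,                                    # height, width, duration, frames_to_use
    42, True, 3.0, True,                                 # seed, randomize, CFG, improve_texture
    api_name="/text_to_video",
)
print(video_path, used_seed)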
inference.py ADDED
@@ -0,0 +1,774 @@
import argparse
import os
import random
from datetime import datetime
from pathlib import Path
from diffusers.utils import logging
from typing import Optional, List, Union
import yaml

import imageio
import json
import numpy as np
import torch
import cv2
from safetensors import safe_open
from PIL import Image
from transformers import (
    T5EncoderModel,
    T5Tokenizer,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
)
from huggingface_hub import hf_hub_download

from ltx_video.models.autoencoders.causal_video_autoencoder import (
    CausalVideoAutoencoder,
)
from ltx_video.models.transformers.symmetric_patchifier import SymmetricPatchifier
from ltx_video.models.transformers.transformer3d import Transformer3DModel
from ltx_video.pipelines.pipeline_ltx_video import (
    ConditioningItem,
    LTXVideoPipeline,
    LTXMultiScalePipeline,
)
from ltx_video.schedulers.rf import RectifiedFlowScheduler
from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler
import ltx_video.pipelines.crf_compressor as crf_compressor

MAX_HEIGHT = 720
MAX_WIDTH = 1280
MAX_NUM_FRAMES = 257

logger = logging.get_logger("LTX-Video")


def get_total_gpu_memory():
    if torch.cuda.is_available():
        total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        return total_memory
    return 0


def get_device():
    if torch.cuda.is_available():
        return "cuda"
    elif torch.backends.mps.is_available():
        return "mps"
    return "cpu"


def load_image_to_tensor_with_resize_and_crop(
    image_input: Union[str, Image.Image],
    target_height: int = 512,
    target_width: int = 768,
    just_crop: bool = False,
) -> torch.Tensor:
    """Load and process an image into a tensor.

    Args:
        image_input: Either a file path (str) or a PIL Image object
        target_height: Desired height of output tensor
        target_width: Desired width of output tensor
        just_crop: If True, only crop the image to the target size without resizing
    """
    if isinstance(image_input, str):
        image = Image.open(image_input).convert("RGB")
    elif isinstance(image_input, Image.Image):
        image = image_input
    else:
        raise ValueError("image_input must be either a file path or a PIL Image object")

    input_width, input_height = image.size
    aspect_ratio_target = target_width / target_height
    aspect_ratio_frame = input_width / input_height
    if aspect_ratio_frame > aspect_ratio_target:
        new_width = int(input_height * aspect_ratio_target)
        new_height = input_height
        x_start = (input_width - new_width) // 2
        y_start = 0
    else:
        new_width = input_width
        new_height = int(input_width / aspect_ratio_target)
        x_start = 0
        y_start = (input_height - new_height) // 2

    image = image.crop((x_start, y_start, x_start + new_width, y_start + new_height))
    if not just_crop:
        image = image.resize((target_width, target_height))

    image = np.array(image)
    image = cv2.GaussianBlur(image, (3, 3), 0)
    frame_tensor = torch.from_numpy(image).float()
    frame_tensor = crf_compressor.compress(frame_tensor / 255.0) * 255.0
    frame_tensor = frame_tensor.permute(2, 0, 1)
    frame_tensor = (frame_tensor / 127.5) - 1.0
    # Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
    return frame_tensor.unsqueeze(0).unsqueeze(2)


def calculate_padding(
    source_height: int, source_width: int, target_height: int, target_width: int
) -> tuple[int, int, int, int]:

    # Calculate total padding needed
    pad_height = target_height - source_height
    pad_width = target_width - source_width

    # Calculate padding for each side
    pad_top = pad_height // 2
    pad_bottom = pad_height - pad_top  # Handles odd padding
    pad_left = pad_width // 2
    pad_right = pad_width - pad_left  # Handles odd padding

    # Return padding values in torch.nn.functional.pad order: (left, right, top, bottom)
    padding = (pad_left, pad_right, pad_top, pad_bottom)
    return padding


def convert_prompt_to_filename(text: str, max_len: int = 20) -> str:
    # Remove non-letters and convert to lowercase
    clean_text = "".join(
        char.lower() for char in text if char.isalpha() or char.isspace()
    )

    # Split into words
    words = clean_text.split()

    # Build result string keeping track of length
    result = []
    current_length = 0

    for word in words:
        # Stop once adding the next word would exceed max_len
        new_length = current_length + len(word)

        if new_length <= max_len:
            result.append(word)
            current_length += len(word)
        else:
            break

    return "-".join(result)


# Generate output video name
def get_unique_filename(
    base: str,
    ext: str,
    prompt: str,
    seed: int,
    resolution: tuple[int, int, int],
    dir: Path,
    endswith=None,
    index_range=1000,
) -> Path:
    base_filename = f"{base}_{convert_prompt_to_filename(prompt, max_len=30)}_{seed}_{resolution[0]}x{resolution[1]}x{resolution[2]}"
    for i in range(index_range):
        filename = dir / f"{base_filename}_{i}{endswith if endswith else ''}{ext}"
        if not os.path.exists(filename):
            return filename
    raise FileExistsError(
        f"Could not find a unique filename after {index_range} attempts."
    )


def seed_everething(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)


def main():
    parser = argparse.ArgumentParser(
        description="Load models from separate directories and run the pipeline."
    )

    # Directories
    parser.add_argument(
        "--output_path",
        type=str,
        default=None,
        help="Path to the folder to save output video, if None will save in outputs/ directory.",
    )
    parser.add_argument("--seed", type=int, default=171198)

    # Pipeline parameters
    parser.add_argument(
        "--num_images_per_prompt",
        type=int,
        default=1,
        help="Number of images per prompt",
    )
    parser.add_argument(
        "--image_cond_noise_scale",
        type=float,
        default=0.15,
        help="Amount of noise to add to the conditioned image",
    )
    parser.add_argument(
        "--height",
        type=int,
        default=704,
        help="Height of the output video frames. Optional if an input image provided.",
    )
    parser.add_argument(
        "--width",
        type=int,
        default=1216,
        help="Width of the output video frames. If None will infer from input image.",
    )
    parser.add_argument(
        "--num_frames",
        type=int,
        default=121,
        help="Number of frames to generate in the output video",
    )
    parser.add_argument(
        "--frame_rate", type=int, default=30, help="Frame rate for the output video"
    )
    parser.add_argument(
        "--device",
        default=None,
        help="Device to run inference on. If not specified, will automatically detect and use CUDA or MPS if available, else CPU.",
    )
    parser.add_argument(
        "--pipeline_config",
        type=str,
        default="configs/ltxv-13b-0.9.7-dev.yaml",
        help="The path to the config file for the pipeline, which contains the parameters for the pipeline",
    )

    # Prompts
    parser.add_argument(
        "--prompt",
        type=str,
        help="Text prompt to guide generation",
    )
    parser.add_argument(
        "--negative_prompt",
        type=str,
        default="worst quality, inconsistent motion, blurry, jittery, distorted",
        help="Negative prompt for undesired features",
    )

    parser.add_argument(
        "--offload_to_cpu",
        action="store_true",
        help="Offloading unnecessary computations to CPU.",
    )

    # video-to-video arguments:
    parser.add_argument(
        "--input_media_path",
        type=str,
        default=None,
        help="Path to the input video (or image) to be modified using the video-to-video pipeline",
    )

    # Conditioning arguments
    parser.add_argument(
        "--conditioning_media_paths",
        type=str,
        nargs="*",
        help="List of paths to conditioning media (images or videos). Each path will be used as a conditioning item.",
    )
    parser.add_argument(
        "--conditioning_strengths",
        type=float,
        nargs="*",
        help="List of conditioning strengths (between 0 and 1) for each conditioning item. Must match the number of conditioning items.",
    )
    parser.add_argument(
        "--conditioning_start_frames",
        type=int,
        nargs="*",
        help="List of frame indices where each conditioning item should be applied. Must match the number of conditioning items.",
    )

    args = parser.parse_args()
    logger.warning(f"Running generation with arguments: {args}")
    infer(**vars(args))


def create_ltx_video_pipeline(
    ckpt_path: str,
    precision: str,
    text_encoder_model_name_or_path: str,
    sampler: Optional[str] = None,
    device: Optional[str] = None,
    enhance_prompt: bool = False,
    prompt_enhancer_image_caption_model_name_or_path: Optional[str] = None,
    prompt_enhancer_llm_model_name_or_path: Optional[str] = None,
) -> LTXVideoPipeline:
    ckpt_path = Path(ckpt_path)
    assert os.path.exists(
        ckpt_path
    ), f"Ckpt path provided (--ckpt_path) {ckpt_path} does not exist"

    with safe_open(ckpt_path, framework="pt") as f:
        metadata = f.metadata()
        config_str = metadata.get("config")
        configs = json.loads(config_str)
        allowed_inference_steps = configs.get("allowed_inference_steps", None)

    vae = CausalVideoAutoencoder.from_pretrained(ckpt_path)
    transformer = Transformer3DModel.from_pretrained(ckpt_path)

    # Use constructor if sampler is specified, otherwise use from_pretrained
    if sampler == "from_checkpoint" or not sampler:
        scheduler = RectifiedFlowScheduler.from_pretrained(ckpt_path)
    else:
        scheduler = RectifiedFlowScheduler(
            sampler=("Uniform" if sampler.lower() == "uniform" else "LinearQuadratic")
        )

    text_encoder = T5EncoderModel.from_pretrained(
        text_encoder_model_name_or_path, subfolder="text_encoder"
    )
    patchifier = SymmetricPatchifier(patch_size=1)
    tokenizer = T5Tokenizer.from_pretrained(
        text_encoder_model_name_or_path, subfolder="tokenizer"
    )

    transformer = transformer.to(device)
    vae = vae.to(device)
    text_encoder = text_encoder.to(device)

    if enhance_prompt:
        prompt_enhancer_image_caption_model = AutoModelForCausalLM.from_pretrained(
            prompt_enhancer_image_caption_model_name_or_path, trust_remote_code=True
        )
        prompt_enhancer_image_caption_processor = AutoProcessor.from_pretrained(
            prompt_enhancer_image_caption_model_name_or_path, trust_remote_code=True
        )
        prompt_enhancer_llm_model = AutoModelForCausalLM.from_pretrained(
            prompt_enhancer_llm_model_name_or_path,
            torch_dtype="bfloat16",
        )
        prompt_enhancer_llm_tokenizer = AutoTokenizer.from_pretrained(
            prompt_enhancer_llm_model_name_or_path,
        )
    else:
        prompt_enhancer_image_caption_model = None
        prompt_enhancer_image_caption_processor = None
        prompt_enhancer_llm_model = None
        prompt_enhancer_llm_tokenizer = None

    vae = vae.to(torch.bfloat16)
    if precision == "bfloat16" and transformer.dtype != torch.bfloat16:
        transformer = transformer.to(torch.bfloat16)
    text_encoder = text_encoder.to(torch.bfloat16)

    # Use submodels for the pipeline
    submodel_dict = {
        "transformer": transformer,
        "patchifier": patchifier,
        "text_encoder": text_encoder,
        "tokenizer": tokenizer,
        "scheduler": scheduler,
        "vae": vae,
        "prompt_enhancer_image_caption_model": prompt_enhancer_image_caption_model,
        "prompt_enhancer_image_caption_processor": prompt_enhancer_image_caption_processor,
        "prompt_enhancer_llm_model": prompt_enhancer_llm_model,
        "prompt_enhancer_llm_tokenizer": prompt_enhancer_llm_tokenizer,
        "allowed_inference_steps": allowed_inference_steps,
    }

    pipeline = LTXVideoPipeline(**submodel_dict)
    pipeline = pipeline.to(device)
    return pipeline


def create_latent_upsampler(latent_upsampler_model_path: str, device: str):
    latent_upsampler = LatentUpsampler.from_pretrained(latent_upsampler_model_path)
    latent_upsampler.to(device)
    latent_upsampler.eval()
    return latent_upsampler


def infer(
    output_path: Optional[str],
    seed: int,
    pipeline_config: str,
    image_cond_noise_scale: float,
    height: Optional[int],
    width: Optional[int],
    num_frames: int,
    frame_rate: int,
    prompt: str,
    negative_prompt: str,
    offload_to_cpu: bool,
    input_media_path: Optional[str] = None,
    conditioning_media_paths: Optional[List[str]] = None,
    conditioning_strengths: Optional[List[float]] = None,
    conditioning_start_frames: Optional[List[int]] = None,
    device: Optional[str] = None,
    **kwargs,
):
    # check if pipeline_config is a file
    if not os.path.isfile(pipeline_config):
        raise ValueError(f"Pipeline config file {pipeline_config} does not exist")
    with open(pipeline_config, "r") as f:
        pipeline_config = yaml.safe_load(f)

    models_dir = "MODEL_DIR"

    ltxv_model_name_or_path = pipeline_config["checkpoint_path"]
    if not os.path.isfile(ltxv_model_name_or_path):
        ltxv_model_path = hf_hub_download(
            repo_id="Lightricks/LTX-Video",
            filename=ltxv_model_name_or_path,
            local_dir=models_dir,
            repo_type="model",
        )
    else:
        ltxv_model_path = ltxv_model_name_or_path

    spatial_upscaler_model_name_or_path = pipeline_config.get(
        "spatial_upscaler_model_path"
    )
    if spatial_upscaler_model_name_or_path and not os.path.isfile(
        spatial_upscaler_model_name_or_path
    ):
        spatial_upscaler_model_path = hf_hub_download(
            repo_id="Lightricks/LTX-Video",
            filename=spatial_upscaler_model_name_or_path,
            local_dir=models_dir,
            repo_type="model",
        )
    else:
        spatial_upscaler_model_path = spatial_upscaler_model_name_or_path

    if kwargs.get("input_image_path", None):
        logger.warning(
            "Please use conditioning_media_paths instead of input_image_path."
        )
        assert not conditioning_media_paths and not conditioning_start_frames
        conditioning_media_paths = [kwargs["input_image_path"]]
        conditioning_start_frames = [0]

    # Validate conditioning arguments
    if conditioning_media_paths:
        # Use default strengths of 1.0
        if not conditioning_strengths:
            conditioning_strengths = [1.0] * len(conditioning_media_paths)
        if not conditioning_start_frames:
            raise ValueError(
                "If `conditioning_media_paths` is provided, "
                "`conditioning_start_frames` must also be provided"
            )
        if len(conditioning_media_paths) != len(conditioning_strengths) or len(
            conditioning_media_paths
        ) != len(conditioning_start_frames):
            raise ValueError(
                "`conditioning_media_paths`, `conditioning_strengths`, "
                "and `conditioning_start_frames` must have the same length"
            )
        if any(s < 0 or s > 1 for s in conditioning_strengths):
            raise ValueError("All conditioning strengths must be between 0 and 1")
        if any(f < 0 or f >= num_frames for f in conditioning_start_frames):
            raise ValueError(
                f"All conditioning start frames must be between 0 and {num_frames-1}"
            )

    seed_everething(seed)
    if offload_to_cpu and not torch.cuda.is_available():
        logger.warning(
            "offload_to_cpu is set to True, but offloading will not occur since the model is already running on CPU."
        )
        offload_to_cpu = False
    else:
        offload_to_cpu = offload_to_cpu and get_total_gpu_memory() < 30

    output_dir = (
        Path(output_path)
        if output_path
        else Path(f"outputs/{datetime.today().strftime('%Y-%m-%d')}")
    )
    output_dir.mkdir(parents=True, exist_ok=True)

    # Adjust dimensions to be divisible by 32 and num_frames to be (N * 8 + 1)
    height_padded = ((height - 1) // 32 + 1) * 32
    width_padded = ((width - 1) // 32 + 1) * 32
    num_frames_padded = ((num_frames - 2) // 8 + 1) * 8 + 1

    padding = calculate_padding(height, width, height_padded, width_padded)

    logger.warning(
        f"Padded dimensions: {height_padded}x{width_padded}x{num_frames_padded}"
    )

    prompt_enhancement_words_threshold = pipeline_config[
        "prompt_enhancement_words_threshold"
    ]

    prompt_word_count = len(prompt.split())
    enhance_prompt = (
        prompt_enhancement_words_threshold > 0
        and prompt_word_count < prompt_enhancement_words_threshold
    )

    if prompt_enhancement_words_threshold > 0 and not enhance_prompt:
        logger.info(
            f"Prompt has {prompt_word_count} words, which exceeds the threshold of {prompt_enhancement_words_threshold}. Prompt enhancement disabled."
        )

    precision = pipeline_config["precision"]
    text_encoder_model_name_or_path = pipeline_config["text_encoder_model_name_or_path"]
    sampler = pipeline_config["sampler"]
    prompt_enhancer_image_caption_model_name_or_path = pipeline_config[
        "prompt_enhancer_image_caption_model_name_or_path"
    ]
    prompt_enhancer_llm_model_name_or_path = pipeline_config[
        "prompt_enhancer_llm_model_name_or_path"
    ]

    pipeline = create_ltx_video_pipeline(
        ckpt_path=ltxv_model_path,
        precision=precision,
        text_encoder_model_name_or_path=text_encoder_model_name_or_path,
        sampler=sampler,
        device=kwargs.get("device", get_device()),
        enhance_prompt=enhance_prompt,
        prompt_enhancer_image_caption_model_name_or_path=prompt_enhancer_image_caption_model_name_or_path,
        prompt_enhancer_llm_model_name_or_path=prompt_enhancer_llm_model_name_or_path,
    )

    if pipeline_config.get("pipeline_type", None) == "multi-scale":
        if not spatial_upscaler_model_path:
            raise ValueError(
                "spatial upscaler model path is missing from pipeline config file and is required for multi-scale rendering"
            )
        latent_upsampler = create_latent_upsampler(
            spatial_upscaler_model_path, pipeline.device
        )
        pipeline = LTXMultiScalePipeline(pipeline, latent_upsampler=latent_upsampler)

    media_item = None
    if input_media_path:
        media_item = load_media_file(
            media_path=input_media_path,
            height=height,
            width=width,
            max_frames=num_frames_padded,
            padding=padding,
        )

    conditioning_items = (
        prepare_conditioning(
            conditioning_media_paths=conditioning_media_paths,
            conditioning_strengths=conditioning_strengths,
            conditioning_start_frames=conditioning_start_frames,
            height=height,
            width=width,
            num_frames=num_frames,
            padding=padding,
            pipeline=pipeline,
        )
        if conditioning_media_paths
        else None
    )

    # Pop stg_mode so it is not passed on to the pipeline call below
    stg_mode = pipeline_config.pop("stg_mode", "attention_values")
    if stg_mode.lower() == "stg_av" or stg_mode.lower() == "attention_values":
        skip_layer_strategy = SkipLayerStrategy.AttentionValues
    elif stg_mode.lower() == "stg_as" or stg_mode.lower() == "attention_skip":
        skip_layer_strategy = SkipLayerStrategy.AttentionSkip
    elif stg_mode.lower() == "stg_r" or stg_mode.lower() == "residual":
        skip_layer_strategy = SkipLayerStrategy.Residual
    elif stg_mode.lower() == "stg_t" or stg_mode.lower() == "transformer_block":
        skip_layer_strategy = SkipLayerStrategy.TransformerBlock
    else:
        raise ValueError(f"Invalid spatiotemporal guidance mode: {stg_mode}")

    # Prepare input for the pipeline
    sample = {
        "prompt": prompt,
        "prompt_attention_mask": None,
        "negative_prompt": negative_prompt,
        "negative_prompt_attention_mask": None,
    }

    device = device or get_device()
    generator = torch.Generator(device=device).manual_seed(seed)

    images = pipeline(
        **pipeline_config,
        skip_layer_strategy=skip_layer_strategy,
        generator=generator,
        output_type="pt",
        callback_on_step_end=None,
        height=height_padded,
        width=width_padded,
        num_frames=num_frames_padded,
        frame_rate=frame_rate,
        **sample,
        media_items=media_item,
        conditioning_items=conditioning_items,
        is_video=True,
        vae_per_channel_normalize=True,
        image_cond_noise_scale=image_cond_noise_scale,
        mixed_precision=(precision == "mixed_precision"),
        offload_to_cpu=offload_to_cpu,
        device=device,
        enhance_prompt=enhance_prompt,
    ).images

    # Crop the padded images to the desired resolution and number of frames
    (pad_left, pad_right, pad_top, pad_bottom) = padding
    pad_bottom = -pad_bottom
    pad_right = -pad_right
    if pad_bottom == 0:
        pad_bottom = images.shape[3]
    if pad_right == 0:
        pad_right = images.shape[4]
    images = images[:, :, :num_frames, pad_top:pad_bottom, pad_left:pad_right]

    for i in range(images.shape[0]):
        # Gathering from B, C, F, H, W to C, F, H, W and then permuting to F, H, W, C
        video_np = images[i].permute(1, 2, 3, 0).cpu().float().numpy()
        # Unnormalizing images to [0, 255] range
        video_np = (video_np * 255).astype(np.uint8)
        fps = frame_rate
        height, width = video_np.shape[1:3]
        # In case a single image is generated
        if video_np.shape[0] == 1:
            output_filename = get_unique_filename(
                f"image_output_{i}",
                ".png",
                prompt=prompt,
                seed=seed,
                resolution=(height, width, num_frames),
                dir=output_dir,
            )
            imageio.imwrite(output_filename, video_np[0])
        else:
            output_filename = get_unique_filename(
                f"video_output_{i}",
                ".mp4",
                prompt=prompt,
                seed=seed,
                resolution=(height, width, num_frames),
                dir=output_dir,
            )

            # Write video
            with imageio.get_writer(output_filename, fps=fps) as video:
                for frame in video_np:
                    video.append_data(frame)

        logger.warning(f"Output saved to {output_filename}")


def prepare_conditioning(
    conditioning_media_paths: List[str],
    conditioning_strengths: List[float],
    conditioning_start_frames: List[int],
    height: int,
    width: int,
    num_frames: int,
    padding: tuple[int, int, int, int],
    pipeline: LTXVideoPipeline,
) -> Optional[List[ConditioningItem]]:
    """Prepare conditioning items based on input media paths and their parameters.

    Args:
        conditioning_media_paths: List of paths to conditioning media (images or videos)
        conditioning_strengths: List of conditioning strengths for each media item
        conditioning_start_frames: List of frame indices where each item should be applied
        height: Height of the output frames
        width: Width of the output frames
        num_frames: Number of frames in the output video
        padding: Padding to apply to the frames
        pipeline: LTXVideoPipeline object used for condition video trimming

    Returns:
        A list of ConditioningItem objects.
    """
    conditioning_items = []
    for path, strength, start_frame in zip(
        conditioning_media_paths, conditioning_strengths, conditioning_start_frames
    ):
        num_input_frames = orig_num_input_frames = get_media_num_frames(path)
        if hasattr(pipeline, "trim_conditioning_sequence") and callable(
            getattr(pipeline, "trim_conditioning_sequence")
        ):
            num_input_frames = pipeline.trim_conditioning_sequence(
                start_frame, orig_num_input_frames, num_frames
            )
        if num_input_frames < orig_num_input_frames:
            logger.warning(
                f"Trimming conditioning video {path} from {orig_num_input_frames} to {num_input_frames} frames."
            )

        media_tensor = load_media_file(
            media_path=path,
            height=height,
            width=width,
            max_frames=num_input_frames,
            padding=padding,
            just_crop=True,
        )
        conditioning_items.append(ConditioningItem(media_tensor, start_frame, strength))
    return conditioning_items


def get_media_num_frames(media_path: str) -> int:
    is_video = any(
        media_path.lower().endswith(ext) for ext in [".mp4", ".avi", ".mov", ".mkv"]
    )
    num_frames = 1
    if is_video:
        reader = imageio.get_reader(media_path)
        num_frames = reader.count_frames()
        reader.close()
    return num_frames


def load_media_file(
    media_path: str,
    height: int,
    width: int,
    max_frames: int,
    padding: tuple[int, int, int, int],
    just_crop: bool = False,
) -> torch.Tensor:
    is_video = any(
        media_path.lower().endswith(ext) for ext in [".mp4", ".avi", ".mov", ".mkv"]
    )
    if is_video:
        reader = imageio.get_reader(media_path)
        num_input_frames = min(reader.count_frames(), max_frames)

        # Read and preprocess the relevant frames from the video file.
        frames = []
        for i in range(num_input_frames):
            frame = Image.fromarray(reader.get_data(i))
            frame_tensor = load_image_to_tensor_with_resize_and_crop(
                frame, height, width, just_crop=just_crop
            )
            frame_tensor = torch.nn.functional.pad(frame_tensor, padding)
            frames.append(frame_tensor)
        reader.close()

        # Stack frames along the temporal dimension
        media_tensor = torch.cat(frames, dim=2)
    else:  # Input image
        media_tensor = load_image_to_tensor_with_resize_and_crop(
            media_path, height, width, just_crop=just_crop
        )
        media_tensor = torch.nn.functional.pad(media_tensor, padding)
    return media_tensor


if __name__ == "__main__":
    main()
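Besides the CLI entry point in main(), infer() can be called directly from Python. A minimal sketch under stated assumptions: the YAML config path and "start.png" are placeholders supplied by the caller, and the LTX-Video package must be importable for the module-level imports above to succeed.

# Hypothetical direct call into infer(), bypassing argparse; values mirror the CLI defaults.
from inference import infer

infer(
    output_path=None,                               # default: outputs/<date>/
    seed=171198,
    pipeline_config="configs/ltxv-13b-0.9.7-dev.yaml",  # assumption: config available locally
    image_cond_noise_scale=0.15,
    height=704,
    width=1216,
    num_frames=121,
    frame_rate=30,
    prompt="A majestic dragon flying over a medieval castle",
    negative_prompt="worst quality, inconsistent motion, blurry, jittery, distorted",
    offload_to_cpu=False,
    conditioning_media_paths=["start.png"],         # placeholder conditioning image
    conditioning_start_frames=[0],
)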
requirements.txt ADDED
@@ -0,0 +1,15 @@
accelerate
transformers
sentencepiece
pillow
numpy
torchvision
huggingface_hub
spaces
opencv-python
imageio
imageio-ffmpeg
einops
timm
av
git+https://github.com/huggingface/diffusers.git@main
setup.py ADDED
@@ -0,0 +1,63 @@
# setup.py
#
# Copyright (C) August 4, 2025 Carlos Rodrigues dos Santos
#
# Version 2.0.0 (anonymous, robust cloning)
# - Uses explicit, anonymous HTTPS URLs so that Git does not try to
#   use cached credentials unnecessarily for public repositories.

import os
import subprocess
import sys
from pathlib import Path

# --- Configuration ---
DEPS_DIR = Path("./deps")

# Explicit, anonymous URLs for the public repositories
REPOS_TO_CLONE = {
    "LTX-Video": "https://huggingface.co/spaces/Lightricks/ltx-video-distilled",
    "SeedVR_Space": "https://huggingface.co/spaces/ByteDance-Seed/SeedVR2-3B",
    "MMAudio": "https://github.com/hkchengrex/MMAudio.git"
}

def run_command(command, cwd=None):
    """Runs a shell command and handles errors."""
    print(f"Running: {' '.join(command)}")
    try:
        # Redirect stdin to DEVNULL so that git never prompts for a password
        subprocess.run(
            command,
            check=True,
            cwd=cwd,
            stdin=subprocess.DEVNULL,
        )
    except subprocess.CalledProcessError as e:
        print(f"ERROR: the command failed with exit code {e.returncode}")
        # e.stderr is only populated when output is captured, so it may be None here
        if e.stderr:
            print(f"Stderr: {e.stderr}")
        sys.exit(1)
    except FileNotFoundError:
        print(f"ERROR: the command '{command[0]}' was not found. Make sure git is installed and on your PATH.")
        sys.exit(1)

def main():
    print("--- Starting ADUC-SDR environment setup ---")

    DEPS_DIR.mkdir(exist_ok=True)

    for repo_name, repo_url in REPOS_TO_CLONE.items():
        repo_path = DEPS_DIR / repo_name
        if repo_path.exists():
            print(f"Repository '{repo_name}' already exists. Skipping clone.")
        else:
            print(f"Cloning '{repo_name}' from {repo_url}...")
            run_command(["git", "clone", "--depth", "1", repo_url, str(repo_path)])
            print(f"'{repo_name}' cloned successfully.")

    print("\n--- Environment setup finished successfully! ---")
    print("You can now start the main application (e.g. python app.py).")

if __name__ == "__main__":
    main()
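A quick way to sanity-check this setup step is to run it once and confirm the expected clone targets exist under deps/. The sketch below is a hypothetical smoke test; the directory names come from REPOS_TO_CLONE above and nothing else is assumed.

# Hypothetical smoke test: run setup.py and verify the cloned repositories exist.
import subprocess
import sys
from pathlib import Path

subprocess.run([sys.executable, "setup.py"], check=True)
for name in ("LTX-Video", "SeedVR_Space", "MMAudio"):
    assert (Path("deps") / name).is_dir(), f"missing clone: {name}"
print("all dependency repos present")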
video_service.py ADDED
@@ -0,0 +1,295 @@
# video_service.py

# --- 1. IMPORTS ---
import torch
import numpy as np
import random
import os
import yaml
from pathlib import Path
import imageio
import tempfile
from huggingface_hub import hf_hub_download
import sys
import subprocess
from PIL import Image

# --- 2. DEPENDENCY MANAGEMENT AND SETUP ---

def run_setup():
    """Runs the setup.py script to clone the required dependencies."""
    setup_script_path = "setup.py"
    if not os.path.exists(setup_script_path):
        print("WARNING: 'setup.py' script not found. Skipping dependency cloning.")
        return
    try:
        print("--- Running setup.py to make sure the dependencies are present ---")
        subprocess.run([sys.executable, setup_script_path], check=True)
        print("--- Setup finished successfully ---")
    except subprocess.CalledProcessError as e:
        print(f"CRITICAL ERROR DURING SETUP: 'setup.py' failed with exit code {e.returncode}.")
        sys.exit(1)

DEPS_DIR = Path("./deps")
LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
if not LTX_VIDEO_REPO_DIR.exists():
    run_setup()

def add_deps_to_path():
    """Adds the cloned repository to sys.path so that its libraries can be imported."""
    if not LTX_VIDEO_REPO_DIR.exists():
        raise FileNotFoundError(f"LTX-Video repository not found at '{LTX_VIDEO_REPO_DIR}'. Run the setup.")
    if str(LTX_VIDEO_REPO_DIR.resolve()) not in sys.path:
        sys.path.insert(0, str(LTX_VIDEO_REPO_DIR.resolve()))

add_deps_to_path()

# --- 3. MODEL-SPECIFIC IMPORTS ---
from inference import (
    create_ltx_video_pipeline, create_latent_upsampler,
    load_image_to_tensor_with_resize_and_crop, seed_everething,
    calculate_padding, load_media_file
)
from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline
from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy

# --- 4. LOGGING HELPERS ---
def log_tensor_info(tensor, name="Tensor"):
    if not isinstance(tensor, torch.Tensor):
        print(f"\n[INFO] Item '{name}' is not a tensor, nothing to log.")
        return
    print(f"\n--- Tensor info: {name} ---")
    print(f"  - Shape: {tensor.shape}")
    print(f"  - Dtype: {tensor.dtype}")
    print(f"  - Device: {tensor.device}")
    if tensor.numel() > 0:
        print(f"  - Min value: {tensor.min().item():.4f}")
        print(f"  - Max value: {tensor.max().item():.4f}")
        print(f"  - Mean: {tensor.mean().item():.4f}")
    else:
        print("  - Tensor is empty, no statistics.")
    print("------------------------------------------\n")

# --- 5. MAIN SERVICE CLASS ---
class VideoService:
    def __init__(self):
        print("Initializing VideoService...")
        self.config = self._load_config()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.last_memory_reserved_mb = 0
        self.pipeline, self.latent_upsampler = self._load_models()
        print(f"Moving models to the inference device: {self.device}")
        self.pipeline.to(self.device)
        if self.latent_upsampler:
            self.latent_upsampler.to(self.device)
        if self.device == "cuda":
            torch.cuda.empty_cache()
        self._log_gpu_memory("After loading models")
        print("VideoService is ready.")

    def _log_gpu_memory(self, stage_name: str):
        if self.device != "cuda": return
        current_reserved_b = torch.cuda.memory_reserved()
        current_reserved_mb = current_reserved_b / (1024 ** 2)
        total_memory_b = torch.cuda.get_device_properties(0).total_memory
        total_memory_mb = total_memory_b / (1024 ** 2)
        peak_reserved_mb = torch.cuda.max_memory_reserved() / (1024 ** 2)
        delta_mb = current_reserved_mb - self.last_memory_reserved_mb
        print(f"\n--- [GPU MEMORY LOG] - {stage_name} ---")
        print(f"  - Current usage (reserved): {current_reserved_mb:.2f} MB / {total_memory_mb:.2f} MB")
        print(f"  - Change since last log: {delta_mb:+.2f} MB")
        if peak_reserved_mb > self.last_memory_reserved_mb:
            print(f"  - Peak usage (this operation): {peak_reserved_mb:.2f} MB")
        print("--------------------------------------------------\n")
        self.last_memory_reserved_mb = current_reserved_mb

    def _load_config(self):
        config_file_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled.yaml"
        with open(config_file_path, "r") as file:
            return yaml.safe_load(file)

    def _load_models(self):
        models_dir = "downloaded_models_gradio"
        Path(models_dir).mkdir(parents=True, exist_ok=True)
        LTX_REPO = "Lightricks/LTX-Video"
        distilled_model_path = hf_hub_download(repo_id=LTX_REPO, filename=self.config["checkpoint_path"], local_dir=models_dir, local_dir_use_symlinks=False)
        self.config["checkpoint_path"] = distilled_model_path
        spatial_upscaler_path = hf_hub_download(repo_id=LTX_REPO, filename=self.config["spatial_upscaler_model_path"], local_dir=models_dir, local_dir_use_symlinks=False)
        self.config["spatial_upscaler_model_path"] = spatial_upscaler_path
        pipeline = create_ltx_video_pipeline(ckpt_path=self.config["checkpoint_path"], precision=self.config["precision"], text_encoder_model_name_or_path=self.config["text_encoder_model_name_or_path"], sampler=self.config["sampler"], device="cpu", enhance_prompt=False, prompt_enhancer_image_caption_model_name_or_path=self.config["prompt_enhancer_image_caption_model_name_or_path"], prompt_enhancer_llm_model_name_or_path=self.config["prompt_enhancer_llm_model_name_or_path"])
        latent_upsampler = None
        if self.config.get("spatial_upscaler_model_path"):
            latent_upsampler = create_latent_upsampler(self.config["spatial_upscaler_model_path"], device="cpu")
        return pipeline, latent_upsampler

    def _prepare_conditioning_tensor_from_file(self, filepath, height, width, padding_values):
        """Prepares a conditioning tensor from an image file."""
        tensor = load_image_to_tensor_with_resize_and_crop(filepath, height, width)
        tensor = torch.nn.functional.pad(tensor, padding_values)
        return tensor.to(self.device)

    def _extract_frames_from_video(self, video_path: str, frame_indices: list) -> list:
        print(f"[INFO] Extracting frames at indices {frame_indices} from video '{video_path}'")
        extracted_frames = []
        indices_to_get = set(frame_indices)
        try:
            with imageio.get_reader(video_path) as reader:
                for i, frame in enumerate(reader):
                    if i in indices_to_get:
                        extracted_frames.append(frame)
                    if len(extracted_frames) == len(indices_to_get):
                        break
            if len(extracted_frames) != len(frame_indices):
                print(f"[WARNING] Expected to extract {len(frame_indices)} frames, but the video only had {len(extracted_frames)} matching frames.")
        except Exception as e:
            print(f"[ERROR] Failed to extract frames from the video: {e}")
        return extracted_frames

    def _get_video_dimensions(self, video_path: str) -> tuple[int, int]:
        """Reads a video file and returns its width and height."""
        try:
            with imageio.get_reader(video_path) as reader:
                meta = reader.get_meta_data()
                size = meta.get('size')
                if size:
                    return size
            return (None, None)
        except Exception as e:
            print(f"[ERROR] Could not read the video dimensions: {e}")
            return (None, None)

    def generate(self, prompt, negative_prompt, mode="text-to-video",
                 start_image_filepath=None,
                 middle_image_filepath=None, middle_frame_number=None, middle_image_weight=1.0,
                 end_image_filepath=None, end_image_weight=1.0,
                 input_video_filepath=None, height=512, width=704, duration=2.0,
                 frames_to_use=9, seed=42, randomize_seed=True, guidance_scale=3.0,
                 improve_texture=True, progress_callback=None):
        if self.device == "cuda":
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
        self._log_gpu_memory("Start of generation")

        if mode == "image-to-video" and not start_image_filepath:
            raise ValueError("A start image is required for image-to-video mode")
        if mode == "video-to-video" and not input_video_filepath:
            raise ValueError("An input video is required for video-to-video mode")

        used_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
        seed_everething(used_seed)

        if mode == "video-to-video":
            orig_w, orig_h = self._get_video_dimensions(input_video_filepath)
            if orig_w and orig_h:
                width = round(orig_w / 32) * 32
                height = round(orig_h / 32) * 32
                print(f"[INFO] video-to-video mode: dimensions recalculated to {width}x{height}")

        FPS = 24.0
        MAX_NUM_FRAMES = 257
        target_frames_rounded = round(duration * FPS)
        n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
        actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))

        height_padded = ((height - 1) // 32 + 1) * 32
        width_padded = ((width - 1) // 32 + 1) * 32
        padding_values = calculate_padding(height, width, height_padded, width_padded)

        generator = torch.Generator(device=self.device).manual_seed(used_seed)

        conditioning_items = []

        if mode == "image-to-video":
            start_tensor = self._prepare_conditioning_tensor_from_file(start_image_filepath, height, width, padding_values)
            conditioning_items.append(ConditioningItem(start_tensor, 0, 1.0))
            if middle_image_filepath and middle_frame_number is not None:
                middle_tensor = self._prepare_conditioning_tensor_from_file(middle_image_filepath, height, width, padding_values)
                safe_middle_frame = max(0, min(int(middle_frame_number), actual_num_frames - 1))
                conditioning_items.append(ConditioningItem(middle_tensor, safe_middle_frame, float(middle_image_weight)))
            if end_image_filepath:
                end_tensor = self._prepare_conditioning_tensor_from_file(end_image_filepath, height, width, padding_values)
                last_frame_index = actual_num_frames - 1
                conditioning_items.append(ConditioningItem(end_tensor, last_frame_index, float(end_image_weight)))

        # --- <CORRECTED AND SIMPLIFIED LOGIC> ---
        elif mode == "video-to-video":
            indices_to_extract = list(range(0, int(frames_to_use), 8))
            extracted_frames_np = self._extract_frames_from_video(input_video_filepath, indices_to_extract)
            with tempfile.TemporaryDirectory() as temp_dir:
                for i, frame_np in enumerate(extracted_frames_np):
                    frame_index = indices_to_extract[i]
                    temp_frame_path = os.path.join(temp_dir, f"frame_{frame_index}.png")
                    imageio.imwrite(temp_frame_path, frame_np)

                    # Reuse the image-processing helper, as suggested
                    frame_tensor = self._prepare_conditioning_tensor_from_file(
                        temp_frame_path, height, width, padding_values
                    )
                    # Same target-frame offset as the original counter-based code: frame_index + 7
                    conditioning_items.append(ConditioningItem(frame_tensor, frame_index + 7, 0.5))
            print(f"[INFO] {len(conditioning_items)} frames from the video were processed as conditioning keyframes.")

        call_kwargs = {
            "prompt": prompt, "negative_prompt": negative_prompt, "height": height_padded, "width": width_padded,
            "num_frames": actual_num_frames, "frame_rate": int(FPS), "generator": generator, "output_type": "pt",
            "conditioning_items": conditioning_items if conditioning_items else None,
            "media_items": None,
            "decode_timestep": self.config["decode_timestep"], "decode_noise_scale": self.config["decode_noise_scale"],
            "stochastic_sampling": True,  # self.config["stochastic_sampling"], "image_cond_noise_scale": 0.15,
            "is_video": False, "vae_per_channel_normalize": True,
            "mixed_precision": True,  # (self.config["precision"] == "mixed_precision"),
            "offload_to_cpu": False, "enhance_prompt": False,
            "skip_layer_strategy": None,  # SkipLayerStrategy.AttentionValues
        }

        result_tensor = None
        if improve_texture:
            if not self.latent_upsampler:
                raise ValueError("The spatial upscaler is not loaded.")
            multi_scale_pipeline = LTXMultiScalePipeline(self.pipeline, self.latent_upsampler)
            first_pass_args = self.config.get("first_pass", {}).copy()
            first_pass_args["guidance_scale"] = float(guidance_scale)
            second_pass_args = self.config.get("second_pass", {}).copy()
            second_pass_args["guidance_scale"] = float(guidance_scale)
            multi_scale_call_kwargs = call_kwargs.copy()
            multi_scale_call_kwargs.update({"downscale_factor": self.config["downscale_factor"], "first_pass": first_pass_args, "second_pass": second_pass_args})
            result_tensor = multi_scale_pipeline(**multi_scale_call_kwargs).images
            log_tensor_info(result_tensor, "Result of stage 2 (multi-scale pipeline output)")
        else:
            single_pass_kwargs = call_kwargs.copy()
            first_pass_config = self.config.get("first_pass", {})
            single_pass_kwargs.update({
                "guidance_scale": float(guidance_scale),
                "stg_scale": first_pass_config.get("stg_scale"),
                "rescaling_scale": first_pass_config.get("rescaling_scale"),
                "skip_block_list": first_pass_config.get("skip_block_list"),
                "timesteps": first_pass_config.get("timesteps"),
            })

            print("\n[INFO] Running the single-pass pipeline...")
            result_tensor = self.pipeline(**single_pass_kwargs).images

        pad_left, pad_right, pad_top, pad_bottom = padding_values
        slice_h_end = -pad_bottom if pad_bottom > 0 else None
        slice_w_end = -pad_right if pad_right > 0 else None

        result_tensor = result_tensor[:, :, :actual_num_frames, pad_top:slice_h_end, pad_left:slice_w_end]
        log_tensor_info(result_tensor, "Final tensor (after post-processing, before saving)")

        video_np = (result_tensor[0].permute(1, 2, 3, 0).cpu().float().numpy() * 255).astype(np.uint8)
        temp_dir = tempfile.mkdtemp()
        output_video_path = os.path.join(temp_dir, f"output_{used_seed}.mp4")

        with imageio.get_writer(output_video_path, fps=call_kwargs["frame_rate"], codec='libx264', quality=8) as writer:
            total_frames = len(video_np)
            for i, frame in enumerate(video_np):
                writer.append_data(frame)
                if progress_callback:
                    progress_callback(i + 1, total_frames)

        self._log_gpu_memory("End of generation")
        return output_video_path, used_seed

print("Creating the VideoService instance. Model loading will start now...")
video_generation_service = VideoService()
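Since the module instantiates `video_generation_service` at import time (downloading the checkpoints and building the pipeline), callers only need to import it and call `generate()`. A minimal usage sketch, assuming the models are already downloadable and that "start.png" is a placeholder path supplied by the caller:

# Hypothetical caller of the service defined above; importing video_service
# triggers setup, model download and pipeline construction.
from video_service import video_generation_service

video_path, seed = video_generation_service.generate(
    prompt="A gentle camera push-in on a misty forest at dawn",
    negative_prompt="worst quality, blurry, jittery",
    mode="image-to-video",
    start_image_filepath="start.png",  # placeholder start keyframe
    height=512,
    width=704,
    duration=4.0,
)
print(f"video written to {video_path} (seed {seed})")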