Update api/ltx_server.py

api/ltx_server.py (CHANGED, +17 -112)
@@ -19,8 +19,7 @@ logging.set_verbosity_debug()
 LTXV_DEBUG=1
 LTXV_FRAME_LOG_EVERY=8
 
-
-from dataclasses import dataclass
+
 
 # --- 1. IMPORTS ---
 import os, subprocess, shlex, tempfile
@@ -50,10 +49,6 @@ import torch.nn.functional as F
 from managers.vae_manager import vae_manager_singleton
 from tools.video_encode_tool import video_encode_tool_singleton
 
-
-
-
-
 # --- 2. DEPENDENCY MANAGEMENT AND SETUP ---
 def _query_gpu_processes_via_nvml(device_index: int) -> List[Dict]:
     try:
@@ -223,9 +218,7 @@ def add_deps_to_path():
 add_deps_to_path()
 
 # --- 3. MODEL-SPECIFIC IMPORTS ---
-
-from ltx_video.models.autoencoders.vae_encode import vae_encode, latent_to_pixel_coords
-
+
 from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline
 from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
 from ltx_video.models.autoencoders.vae_encode import un_normalize_latents, normalize_latents
@@ -249,70 +242,7 @@ def log_tensor_info(tensor, name="Tensor"):
     print("------------------------------------------\n")
 
 
-
-class LatentConditioningItem:
-    """Data item for conditioning the LTX pipeline."""
-    latent_tensor: torch.Tensor
-    media_frame_number: int
-    conditioning_strength: float
-
-
-
-def _aduc_prepare_conditioning_patch(
-    self: "LTXVideoPipeline",
-    conditioning_items: Optional[List[Union["ConditioningItem", "LatentConditioningItem"]]],
-    init_latents: torch.Tensor,
-    num_frames: int,
-    height: int,
-    width: int,
-    vae_per_channel_normalize: bool = True,
-    generator=None,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
-    if not conditioning_items:
-        init_latents, init_latent_coords = self.patchifier.patchify(latents=init_latents)
-        init_pixel_coords = latent_to_pixel_coords(init_latent_coords, self.vae, causal_fix=self.transformer.config.causal_temporal_positioning)
-        return init_latents, init_pixel_coords, None, 0
-
-    init_conditioning_mask = torch.zeros_like(init_latents[:, 0, ...], dtype=torch.float32, device=init_latents.device)
-    extra_conditioning_latents, extra_conditioning_pixel_coords, extra_conditioning_mask = [], [], []
-    extra_conditioning_num_latents = 0
-
-    for item in conditioning_items:
-        if not isinstance(item, LatentConditioningItem):
-            print("ADUC patch: conditioning item is not a LatentConditioningItem and will be ignored.")
-            continue
-
-        media_item_latents = item.latent_tensor.to(dtype=init_latents.dtype, device=init_latents.device)
-        media_frame_number, strength = item.media_frame_number, item.conditioning_strength
-
-        if media_frame_number == 0:
-            f_l, h_l, w_l = media_item_latents.shape[-3:]
-            init_latents[..., :f_l, :h_l, :w_l] = torch.lerp(init_latents[..., :f_l, :h_l, :w_l], media_item_latents, strength)
-            init_conditioning_mask[..., :f_l, :h_l, :w_l] = strength
-        else:
-            noise = randn_tensor(media_item_latents.shape, generator=generator, device=media_item_latents.device, dtype=media_item_latents.dtype)
-            media_item_latents = torch.lerp(noise, media_item_latents, strength)
-            patched_latents, latent_coords = self.patchifier.patchify(latents=media_item_latents)
-            pixel_coords = latent_to_pixel_coords(latent_coords, self.vae, causal_fix=self.transformer.config.causal_temporal_positioning)
-            pixel_coords[:, 0] += media_frame_number
-            extra_conditioning_num_latents += patched_latents.shape[1]
-            new_mask = torch.full(patched_latents.shape[:2], strength, dtype=torch.float32, device=init_latents.device)
-            extra_conditioning_latents.append(patched_latents)
-            extra_conditioning_pixel_coords.append(pixel_coords)
-            extra_conditioning_mask.append(new_mask)
-
-    init_latents, init_latent_coords = self.patchifier.patchify(latents=init_latents)
-    init_pixel_coords = latent_to_pixel_coords(init_latent_coords, self.vae, causal_fix=self.transformer.config.causal_temporal_positioning)
-    init_conditioning_mask, _ = self.patchifier.patchify(latents=init_conditioning_mask.unsqueeze(1))
-    init_conditioning_mask = init_conditioning_mask.squeeze(-1)
-
-    if extra_conditioning_latents:
-        init_latents = torch.cat([*extra_conditioning_latents, init_latents], dim=1)
-        init_pixel_coords = torch.cat([*extra_conditioning_pixel_coords, init_pixel_coords], dim=2)
-        init_conditioning_mask = torch.cat([*extra_conditioning_mask, init_conditioning_mask], dim=1)
-
-    return init_latents, init_pixel_coords, init_conditioning_mask, extra_conditioning_num_latents
-
+
 
 
 # --- 5. MAIN SERVICE CLASS ---
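
Note on the removed block above: LatentConditioningItem carries only field annotations, which hold per-instance data only when paired with the dataclasses import that this commit also removes in the first hunk. A minimal sketch of how the pair presumably worked together; the @dataclass decorator is an assumption, since it does not appear in the extracted diff:

# Hypothetical reconstruction for illustration only; the decorator is assumed
# because `from dataclasses import dataclass` is removed alongside this class.
from dataclasses import dataclass

import torch

@dataclass
class LatentConditioningItem:
    """Data item for conditioning the LTX pipeline."""
    latent_tensor: torch.Tensor      # pre-encoded VAE latents
    media_frame_number: int          # pixel-frame position of the condition
    conditioning_strength: float     # 0.0 = ignore, 1.0 = hard constraint

item = LatentConditioningItem(torch.zeros(1, 128, 1, 16, 16), 0, 1.0)
print(item.conditioning_strength)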
@@ -329,12 +259,9 @@ class VideoService:
         self.last_memory_reserved_mb = 0.0
         self._tmp_dirs = set(); self._tmp_files = set(); self._last_outputs = []
 
-
         self.pipeline, self.latent_upsampler = self._load_models()
         print(f"[DEBUG] Pipeline and upsampler loaded. Upsampler active? {bool(self.latent_upsampler)}")
 
-        #self._apply_ltx_pipeline_patches()
-
         print(f"[DEBUG] Moving models to {self.device}...")
         self.pipeline.to(self.device)
         if self.latent_upsampler:
@@ -357,26 +284,6 @@ class VideoService:
 
         print(f"[DEBUG] VideoService ready. boot_time={time.perf_counter()-t0:.3f}s")
 
-
-
-
-    # In ltx_server.py
-    def _apply_ltx_pipeline_patches(self):
-        """Applies runtime patches to the LTX pipeline for ADUC-SDR compatibility."""
-        # Late import of the type, in case it is not in the global scope
-        from ltx_video.pipelines.pipeline_ltx_video import LTXVideoPipeline
-
-        print("LTX POOL MANAGER: Applying ADUC-SDR patches to the LTX pipeline...")
-
-        # APPLY THE PATCH DIRECTLY TO THE SINGLE PIPELINE INSTANCE
-        self.pipeline.prepare_conditioning = _aduc_prepare_conditioning_patch.__get__(
-            self.pipeline, LTXVideoPipeline
-        )
-
-        print("LTX POOL MANAGER: The pipeline instance was successfully patched.")
-
-
-
     def _log_gpu_memory(self, stage_name: str):
         if self.device != "cuda":
             return
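
Review note: _apply_ltx_pipeline_patches (removed above, and already commented out in __init__) bound a module-level function to the single pipeline instance through the descriptor protocol. A standalone sketch of that binding technique, with illustrative names rather than the repo's classes:

class Pipeline:
    def prepare_conditioning(self):
        return "original"

def patched_prepare(self):
    # `self` is whichever instance the function gets bound to
    return "patched"

pipe = Pipeline()
# __get__ turns the plain function into a method bound to this one instance;
# every other Pipeline object keeps the class implementation.
pipe.prepare_conditioning = patched_prepare.__get__(pipe, Pipeline)

assert pipe.prepare_conditioning() == "patched"
assert Pipeline().prepare_conditioning() == "original"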
@@ -636,10 +543,10 @@ class VideoService:
             start = (num_latente_por_chunk*i)
             end = (start+num_latente_por_chunk+overlap)
             if i+1 < n_chunks:
-                chunk = latents_brutos[:, :, start:end, :, :]
+                chunk = latents_brutos[:, :, start:end, :, :].clone().detach()
                 print(f"[DEBUG] chunk{i+1}[:, :, {start}:{end}, :, :] = {chunk.shape[2]}")
             else:
-                chunk = latents_brutos[:, :, start:, :, :]
+                chunk = latents_brutos[:, :, start:, :, :].clone().detach()
                 print(f"[DEBUG] chunk{i+1}[:, :, {start}:, :, :] = {chunk.shape[2]}")
             chunks.append(chunk)
             i+=1
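
Why .clone().detach() on the chunk slices: a tensor slice is a view that shares storage with latents_brutos and can stay attached to the autograd graph, so collecting raw slices keeps the entire source tensor (and its graph) alive. A minimal sketch of the difference, with illustrative shapes:

import torch

latents_brutos = torch.randn(1, 128, 24, 16, 16, requires_grad=True)

view = latents_brutos[:, :, 0:8]                    # shares storage, tracked by autograd
chunk = latents_brutos[:, :, 0:8].clone().detach()  # owns its memory, graph-free

assert view.data_ptr() == latents_brutos.data_ptr()
assert chunk.data_ptr() != latents_brutos.data_ptr()
assert view.requires_grad and not chunk.requires_grad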
@@ -671,7 +578,7 @@ class VideoService:
         #if total % 2 == 1:  # ODD
         # E.g.: 11 → primeira 0..5, segunda 5..10
         cut = total // 2
-        primeira = latents_brutos[:, :, :cut, :, :].clone()
+        primeira = latents_brutos[:, :, :cut+1, :, :].clone()
         segunda = latents_brutos[:, :, cut:, :, :].clone()
 
 
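
The :cut+1 change makes the two halves share one latent frame at the seam (for 11 frames, primeira covers 0..5 and segunda 5..10, matching the comment above), giving the downstream decode/concat step an overlap to blend instead of two disjoint halves. A small sketch of the resulting shapes:

import torch

total = 11                     # e.g. 11 latent frames, as in the comment
cut = total // 2               # 5
latents_brutos = torch.randn(1, 128, total, 16, 16)

primeira = latents_brutos[:, :, :cut + 1, :, :].clone()  # frames 0..5  -> 6 frames
segunda = latents_brutos[:, :, cut:, :, :].clone()       # frames 5..10 -> 6 frames

assert primeira.shape[2] + segunda.shape[2] == total + 1   # one frame counted twice
assert torch.equal(primeira[:, :, -1], segunda[:, :, 0])   # identical seam frame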
@@ -876,7 +783,7 @@ class VideoService:
             "decode_timestep": self.config["decode_timestep"],
             "decode_noise_scale": self.config["decode_noise_scale"],
             "stochastic_sampling": self.config["stochastic_sampling"],
-            "image_cond_noise_scale": 0.
+            "image_cond_noise_scale": 0.05,
             "is_video": True,
             "vae_per_channel_normalize": True,
             "mixed_precision": (self.config["precision"] == "mixed_precision"),
@@ -941,7 +848,7 @@ class VideoService:
         del base_latents; gc.collect(); torch.cuda.empty_cache()
 
         par = 0
-        latents_cpu_up = upsampled_latents.to("cpu", non_blocking=True)
+        latents_cpu_up = upsampled_latents.detach().to("cpu", non_blocking=True)
         torch.cuda.empty_cache()
         try:
             torch.cuda.ipc_collect()
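
The added .detach() before .to("cpu", non_blocking=True) keeps the autograd graph, and the GPU activations it references, from being dragged along with the CPU copy. A sketch of the full offload pattern this hunk (and the later latents_vae hunk) uses, wrapped in a hypothetical helper; note that non_blocking=True only overlaps the transfer when the destination is pinned memory, otherwise the copy is effectively synchronous:

import gc
import torch

def offload_to_cpu(t: torch.Tensor) -> torch.Tensor:
    """Hypothetical helper mirroring the hunk: detach, copy to CPU, free VRAM."""
    cpu_t = t.detach().to("cpu", non_blocking=True)
    if torch.cuda.is_available():
        torch.cuda.synchronize()      # ensure the async copy finished before freeing
        torch.cuda.empty_cache()      # return cached allocator blocks to the driver
        try:
            torch.cuda.ipc_collect()  # also reclaim CUDA IPC handles, as the diff does
        except Exception:
            pass
    gc.collect()
    return cpu_t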
@@ -959,13 +866,12 @@ class VideoService:
 
         #latents_parts_up = [latents_cpu_up]
 
-
+
         for latents in latents_parts_up:
 
             # # --- STAGE 3: TEXTURE REFINEMENT (SECOND PASS) ---
             print("\n--- STARTING STAGE 3: TEXTURE REFINEMENT (SECOND PASS) ---")
-
-            par+=1
+
             second_pass_config = self.config.get("second_pass", {}).copy()
             # --- <START OF THE EXACT SECOND-PASS CALCULATION LOGIC> ---
             # Uses the first-pass dimensions doubled, as in the original pipeline
@@ -984,7 +890,6 @@ class VideoService:
 
             second_pass_kwargs = call_kwargs.copy()
             second_pass_kwargs.update({
-                "conditioning_items": None,
                 "output_type": "latent",
                 "width": second_pass_width,
                 "height": second_pass_height,
@@ -1025,7 +930,7 @@ class VideoService:
 
         for latents_vae in latents_list:
 
-            latents_cpu_vae = latents_vae.to("cpu", non_blocking=True)
+            latents_cpu_vae = latents_vae.detach().to("cpu", non_blocking=True)
             torch.cuda.empty_cache()
             try:
                 torch.cuda.ipc_collect()
@@ -1036,14 +941,14 @@ class VideoService:
 
 
             lat_a, lat_b = self._dividir_latentes(latents_cpu_vae)
-
-
+            print(f"[DEBUG] Partition A: {tuple(lat_a.shape)}")
+            print(f"[DEBUG] Partition B: {tuple(lat_b.shape)}")
 
             latents_parts_vae = [lat_a, lat_b]
 
 
             for latents in latents_parts_vae:
-                print(f"[DEBUG] Partition {par}: {tuple(latents.shape)}")
+                #print(f"[DEBUG] Partition {par}: {tuple(latents.shape)}")
 
                 par = par + 1
                 output_video_path = os.path.join(temp_dir, f"output_{used_seed}_{par}.mp4")
@@ -1077,10 +982,10 @@ class VideoService:
 
         total_partes = len(partes_mp4)
         if (total_partes>1):
-            final_vid = os.path.join(results_dir, f"
+            final_vid = os.path.join(results_dir, f"concat_fim_{used_seed}.mp4")
             #partes_mp4_fade = self._gerar_lista_com_transicoes(pasta=results_dir, video_paths=partes_mp4, crossfade_frames=8)
-            final_vid = video_encode_tool_singleton.concatenate_videos(video_paths=partes_mp4, output_path="
-
+            final_vid = video_encode_tool_singleton.concatenate_videos(video_paths=partes_mp4, output_path="concate_fim.mp4", workspace_dir=results_dir)
+            self._concat_mp4s_no_reencode(partes_mp4, final_vid)
         else:
             final_vid = partes_mp4[0]
 
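
Review note: the new branch assigns final_vid twice; concatenate_videos already returns a concatenated file, and self._concat_mp4s_no_reencode(partes_mp4, final_vid) then writes over that same path. The helper's body is not shown in this diff; a lossless MP4 concat of this kind is conventionally done with ffmpeg's concat demuxer plus stream copy, sketched here under that assumption:

# Sketch only: assumes all parts share codec/parameters, which the concat
# demuxer requires for -c copy; _concat_mp4s_no_reencode's real body is not
# part of this diff.
import subprocess
import tempfile

def concat_mp4s_no_reencode(video_paths, output_path):
    # The concat demuxer reads a text file listing the input files...
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
        for p in video_paths:
            f.write(f"file '{p}'\n")
        list_path = f.name
    # ...and -c copy remuxes the streams without re-encoding them.
    subprocess.run(
        ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
         "-i", list_path, "-c", "copy", output_path],
        check=True,
    )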