Test

Paused

App Files Files Community

EuuIia committed on Oct 3

Commit

9b10e93

verified ·

1 Parent(s): 47475ad

Upload ltx_server.py

Browse files

Files changed (1) hide show

api/ltx_server.py +61 -176

api/ltx_server.py CHANGED Viewed

@@ -1,5 +1,4 @@
-# ltx_server.py — VideoService com logs de depuração detalhados (init→MP4)
-# Opção external_decode: True (default) decodifica latentes com VAE fora da pipeline.
 # --- 1. IMPORTAÇÕES ---
 import torch
@@ -26,7 +25,6 @@ def _query_gpu_processes_via_nvml(device_index: int) -> List[Dict]:
     try:
         import psutil
         import pynvml as nvml
-        print("[DEBUG] NVML: inicializando para consulta de processos...")
         nvml.nvmlInit()
         handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
         try:
@@ -51,29 +49,23 @@ def _query_gpu_processes_via_nvml(device_index: int) -> List[Dict]:
             except Exception:
                 pass
             results.append({"pid": pid, "name": name, "user": user, "used_mb": used_mb})
-        print("[DEBUG] NVML: finalizando...")
         nvml.nvmlShutdown()
         return results
-    except Exception as e:
-        print(f"[DEBUG] NVML indisponível ou falhou: {e}")
         return []
 def _query_gpu_processes_via_nvidiasmi(device_index: int) -> List[Dict]:
     cmd = f"nvidia-smi -i {device_index} --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
     try:
-        print(f"[DEBUG] Rodando: {cmd}")
         out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.STDOUT, text=True, timeout=2.0)
-    except Exception as e:
-        print(f"[DEBUG] nvidia-smi falhou: {e}")
         return []
     results = []
     for line in out.strip().splitlines():
         parts = [p.strip() for p in line.split(",")]
         if len(parts) >= 3:
             try:
-                pid = int(parts[0])
-                name = parts[1]
-                used_mb = int(parts[2])
                 user = "unknown"
                 try:
                     import psutil
@@ -164,9 +156,7 @@ class VideoService:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"[DEBUG] Device selecionado: {self.device}")
         self.last_memory_reserved_mb = 0.0
-        self._tmp_dirs = set()
-        self._tmp_files = set()
-        self._last_outputs = []
         self.pipeline, self.latent_upsampler = self._load_models()
         print(f"[DEBUG] Pipeline e Upsampler carregados. Upsampler ativo? {bool(self.latent_upsampler)}")
@@ -195,9 +185,7 @@ class VideoService:
         total_memory_mb = total_memory_b / (1024 ** 2)
         peak_reserved_mb = torch.cuda.max_memory_reserved(device_index) / (1024 ** 2)
         delta_mb = current_reserved_mb - getattr(self, "last_memory_reserved_mb", 0.0)
-        processes = _query_gpu_processes_via_nvml(device_index)
-        if not processes:
-            processes = _query_gpu_processes_via_nvidiasmi(device_index)
         print(f"\n--- [LOG GPU] {stage_name} (cuda:{device_index}) ---")
         print(f"  - Reservado: {current_reserved_mb:.2f} MB / {total_memory_mb:.2f} MB  (Δ={delta_mb:+.2f} MB)")
         if peak_reserved_mb > getattr(self, "last_memory_reserved_mb", 0.0):
@@ -208,43 +196,33 @@ class VideoService:
     def _register_tmp_dir(self, d: str):
         if d and os.path.isdir(d):
-            self._tmp_dirs.add(d)
-            print(f"[DEBUG] Registrado tmp dir: {d}")
     def _register_tmp_file(self, f: str):
         if f and os.path.exists(f):
-            self._tmp_files.add(f)
-            print(f"[DEBUG] Registrado tmp file: {f}")
     def finalize(self, keep_paths=None, extra_paths=None, clear_gpu=True):
         print("[DEBUG] Finalize: iniciando limpeza...")
-        keep = set(keep_paths or [])
-        extras = set(extra_paths or [])
         removed_files = 0
         for f in list(self._tmp_files | extras):
             try:
                 if f not in keep and os.path.isfile(f):
-                    os.remove(f)
-                    removed_files += 1
-                    print(f"[DEBUG] Removido arquivo tmp: {f}")
             except Exception as e:
                 print(f"[DEBUG] Falha removendo arquivo {f}: {e}")
             finally:
                 self._tmp_files.discard(f)
         removed_dirs = 0
         for d in list(self._tmp_dirs):
             try:
                 if d not in keep and os.path.isdir(d):
-                    shutil.rmtree(d, ignore_errors=True)
-                    removed_dirs += 1
-                    print(f"[DEBUG] Removido diretório tmp: {d}")
             except Exception as e:
                 print(f"[DEBUG] Falha removendo diretório {d}: {e}")
             finally:
                 self._tmp_dirs.discard(d)
         print(f"[DEBUG] Finalize: arquivos removidos={removed_files}, dirs removidos={removed_dirs}")
         gc.collect()
         try:
@@ -256,7 +234,6 @@ class VideoService:
                     pass
         except Exception as e:
             print(f"[DEBUG] Finalize: limpeza GPU falhou: {e}")
         try:
             self._log_gpu_memory("Após finalize")
         except Exception as e:
@@ -283,7 +260,7 @@ class VideoService:
     def _load_models(self):
         t0 = time.perf_counter()
         LTX_REPO = "Lightricks/LTX-Video"
-        print("[DEBUG] Baixando checkpoint principal (hf_hub_download)...")
         distilled_model_path = hf_hub_download(
             repo_id=LTX_REPO,
             filename=self.config["checkpoint_path"],
@@ -294,7 +271,7 @@ class VideoService:
         self.config["checkpoint_path"] = distilled_model_path
         print(f"[DEBUG] Checkpoint em: {distilled_model_path}")
-        print("[DEBUG] Baixando upscaler espacial (hf_hub_download)...")
         spatial_upscaler_path = hf_hub_download(
             repo_id=LTX_REPO,
             filename=self.config["spatial_upscaler_model_path"],
@@ -339,15 +316,13 @@ class VideoService:
             try:
                 if p.dtype == f8:
                     with torch.no_grad():
-                        p.data = p.data.to(torch.bfloat16)
-                        p_cnt += 1
             except Exception:
                 pass
         for _, b in module.named_buffers(recurse=True):
             try:
                 if hasattr(b, "dtype") and b.dtype == f8:
-                    b.data = b.data.to(torch.bfloat16)
-                    b_cnt += 1
             except Exception:
                 pass
         print(f"[DEBUG] FP8→BF16: params_promoted={p_cnt}, buffers_promoted={b_cnt}")
@@ -385,38 +360,32 @@ class VideoService:
         print(f"[DEBUG] Cond shape={tuple(out.shape)} dtype={out.dtype} device={out.device}")
         return out
     def _decode_one_latent_to_pixel(self, latent_chw: torch.Tensor) -> torch.Tensor:
         """
-        Decodifica um latente (C,H,W) para pixel (C,H,W) no intervalo [0,1].
-        Usa pipeline.decode_latents se existir, senão pipeline.vae.decode.
         """
-        if self.device == "cuda":
-            ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype)
-        else:
-            ctx = contextlib.nullcontext()
         with ctx:
             if hasattr(self.pipeline, "decode_latents"):
                 img_bchw = self.pipeline.decode_latents(latent_chw.unsqueeze(0))
             elif hasattr(self.pipeline, "vae") and hasattr(self.pipeline.vae, "decode"):
                 img_bchw = self.pipeline.vae.decode(latent_chw.unsqueeze(0))
             else:
-                raise RuntimeError("Nenhum decoder encontrado (decode_latents/vae.decode).")
-        img_chw = img_bchw[0]
-        # Normaliza para [0,1] caso venha em [-1,1]
         if img_chw.min() < 0:
             img_chw = (img_chw.clamp(-1, 1) + 1.0) / 2.0
         else:
             img_chw = img_chw.clamp(0, 1)
         return img_chw
     def _pixels_to_uint8_np(self, pixel_chw: torch.Tensor, padding_values) -> np.ndarray:
         """
-        Converte (C,H,W) float [0,1] em (H,W,C) uint8, aplicando crop do padding.
         """
         pad_left, pad_right, pad_top, pad_bottom = padding_values
-        H, W = pixel_chw.shape[1], pixel_chw.shape[2]
         h_end = H - pad_bottom if pad_bottom > 0 else H
         w_end = W - pad_right if pad_right > 0 else W
         pixel_chw = pixel_chw[:, pad_top:h_end, pad_left:w_end]
@@ -426,14 +395,13 @@ class VideoService:
                         .cpu()
                         .numpy())
         return frame_hwc_u8
     def encode_latents_to_mp4(self, latents: torch.Tensor, output_path: str, fps: int, padding_values,
                               progress_callback=None):
         """
-        Pipeline final: latentes (B,C,T,H,W) -> decodifica cada quadro -> escreve MP4 incremental.
-        Segue o padrão do encoder no outro app (frame a frame sem array 4D gigante).
         """
-        T = latents.shape[2]
         print(f"[DEBUG] encode_latents_to_mp4: frames={T} out={output_path}")
         with imageio.get_writer(output_path, fps=fps, codec="libx264", quality=8) as writer:
             for i in range(T):
@@ -444,55 +412,7 @@ class VideoService:
                 if progress_callback:
                     progress_callback(i + 1, T)
                 if i % getattr(self, "frame_log_every", 8) == 0:
-                    print(f"[DEBUG] encode frame {i}/{T}")
-    def _decode_latents_to_video(self, latents: torch.Tensor, output_video_path: str, frame_rate: int,
-                                 padding_values, progress_callback=None):
-        print(f"[DEBUG] Decodificando latentes → vídeo: {output_video_path}")
-        pad_left, pad_right, pad_top, pad_bottom = padding_values
-        T = latents.shape[2]
-        print(f"[DEBUG] Latentes shape={tuple(latents.shape)} frames={T}")
-        start = time.perf_counter()
-        with imageio.get_writer(output_video_path, fps=frame_rate, codec="libx264", quality=8) as writer:
-            for i in range(T):
-                latent_chw = latents[0, :, i].to(self.device)
-                with torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext():
-                    pixel_bchw = None
-                    if hasattr(self.pipeline, "decode_latents"):
-                        pixel_bchw = self.pipeline.decode_latents(latent_chw.unsqueeze(0))
-                        if i % self.frame_log_every == 0:
-                            print(f"[DEBUG] decode_latents frame={i}")
-                    elif hasattr(self.pipeline, "vae") and hasattr(self.pipeline.vae, "decode"):
-                        pixel_bchw = self.pipeline.vae.decode(latent_chw.unsqueeze(0))
-                        if i % self.frame_log_every == 0:
-                            print(f"[DEBUG] vae.decode frame={i}")
-                    else:
-                        raise RuntimeError("Pipeline não possui decode_latents/vae.decode.")
-                pixel_chw = pixel_bchw[0]
-                if pixel_chw.min() < 0:
-                    pixel_chw = (pixel_chw.clamp(-1, 1) + 1.0) / 2.0
-                else:
-                    pixel_chw = pixel_chw.clamp(0, 1)
-                H, W = pixel_chw.shape[2]
-                h_end = H - pad_bottom if pad_bottom > 0 else H
-                w_end = W - pad_right if pad_right > 0 else W
-                pixel_chw = pixel_chw[:, pad_top:h_end, pad_left:w_end]
-                frame_hwc_u8 = (pixel_chw.permute(1, 2, 0)
-                                .mul(255)
-                                .to(torch.uint8)
-                                .cpu()
-                                .numpy())
-                writer.append_data(frame_hwc_u8)
-                if progress_callback:
-                    progress_callback(i + 1, T)
-                if i % self.frame_log_every == 0:
-                    print(f"[DEBUG] frame {i}/{T} escrito.")
-        print(f"[DEBUG] Decodificação+escrita concluída em {time.perf_counter()-start:.3f}s")
     def generate(
         self,
@@ -515,13 +435,13 @@ class VideoService:
         guidance_scale=3.0,
         improve_texture=True,
         progress_callback=None,
         external_decode=True,
     ):
         t_all = time.perf_counter()
         print(f"[DEBUG] generate() begin mode={mode} external_decode={external_decode} improve_texture={improve_texture}")
         if self.device == "cuda":
-            torch.cuda.empty_cache()
-            torch.cuda.reset_peak_memory_stats()
         self._log_gpu_memory("Início da Geração")
         if mode == "image-to-video" and not start_image_filepath:
@@ -530,11 +450,9 @@ class VideoService:
             raise ValueError("O vídeo de entrada é obrigatório para o modo video-to-video")
         used_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
-        seed_everething(used_seed)
-        print(f"[DEBUG] Seed usado: {used_seed}")
-        FPS = 24.0
-        MAX_NUM_FRAMES = 257
         target_frames_rounded = round(duration * FPS)
         n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
         actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))
@@ -561,6 +479,7 @@ class VideoService:
                 conditioning_items.append(ConditioningItem(end_tensor, last_frame_index, float(end_image_weight)))
             print(f"[DEBUG] Conditioning items: {len(conditioning_items)}")
         call_kwargs = {
             "prompt": prompt,
             "negative_prompt": negative_prompt,
@@ -569,7 +488,7 @@ class VideoService:
             "num_frames": actual_num_frames,
             "frame_rate": int(FPS),
             "generator": generator,
-            "output_type": "latent" if external_decode else "pt",
             "conditioning_items": conditioning_items if conditioning_items else None,
             "media_items": None,
             "decode_timestep": self.config["decode_timestep"],
@@ -583,7 +502,7 @@ class VideoService:
             "enhance_prompt": False,
             "skip_layer_strategy": SkipLayerStrategy.AttentionValues,
         }
-        print(f"[DEBUG] call_kwargs.output_type={call_kwargs['output_type']} skip_layer_strategy={call_kwargs['skip_layer_strategy']}")
         if mode == "video-to-video":
             media = load_media_file(
@@ -597,7 +516,6 @@ class VideoService:
             print(f"[DEBUG] media_items shape={tuple(media.shape)}")
         latents = None
-        result_tensor = None
         multi_scale_pipeline = None
         try:
@@ -626,18 +544,14 @@ class VideoService:
                     result = multi_scale_pipeline(**multi_scale_call_kwargs)
                 print(f"[DEBUG] multi_scale_pipeline tempo={time.perf_counter()-t_ms:.3f}s")
-                if external_decode:
-                    if hasattr(result, "latents"):
-                        latents = result.latents
-                    elif hasattr(result, "images") and isinstance(result.images, torch.Tensor):
-                        latents = result.images
-                    else:
-                        latents = result
-                    print(f"[DEBUG] Latentes obtidos (multi-escala): shape={tuple(latents.shape)}")
                 else:
-                    result_tensor = result.images if hasattr(result, "images") else result
-                    print(f"[DEBUG] Pixels obtidos (multi-escala): shape={tuple(result_tensor.shape)}")
-                    log_tensor_info(result_tensor, "Saída Multi-Scale (pixel)")
             else:
                 single_pass_kwargs = call_kwargs.copy()
                 first_pass_config = self.config.get("first_pass", {})
@@ -649,12 +563,9 @@ class VideoService:
                         "skip_block_list": first_pass_config.get("skip_block_list"),
                     }
                 )
-                schedule = first_pass_config.get("timesteps")
-                if schedule is None:
-                    schedule = first_pass_config.get("guidance_timesteps")
                 if mode == "video-to-video":
-                    schedule = [0.7]
-                    print("[INFO] Modo video-to-video (etapa única): timesteps=[0.7]")
                 if isinstance(schedule, (list, tuple)) and len(schedule) > 0:
                     single_pass_kwargs["timesteps"] = schedule
                     single_pass_kwargs["guidance_timesteps"] = schedule
@@ -667,51 +578,28 @@ class VideoService:
                     result = self.pipeline(**single_pass_kwargs)
                 print(f"[DEBUG] single-pass tempo={time.perf_counter()-t_sp:.3f}s")
-                if external_decode:
-                    if hasattr(result, "latents"):
-                        latents = result.latents
-                    elif hasattr(result, "images") and isinstance(result.images, torch.Tensor):
-                        latents = result.images
-                    else:
-                        latents = result
-                    print(f"[DEBUG] Latentes obtidos (single-pass): shape={tuple(latents.shape)}")
                 else:
-                    result_tensor = result.images if hasattr(result, "images") else result
-                    print(f"[DEBUG] Pixels obtidos (single-pass): shape={tuple(result_tensor.shape)}")
-            temp_dir = tempfile.mkdtemp(prefix="ltxv_")
-            self._register_tmp_dir(temp_dir)
-            results_dir = "/app/output"
-            os.makedirs(results_dir, exist_ok=True)
             output_video_path = os.path.join(temp_dir, f"output_{used_seed}.mp4")
             final_output_path = None
-            if external_decode:
-                print("[DEBUG] Codificando a partir dos latentes (VAE externo) → MP4...")
-                self.encode_latents_to_mp4(
-                    latents=latents,
-                    output_path=output_video_path,
-                    fps=call_kwargs["frame_rate"],
-                    padding_values=padding_values,
-                    progress_callback=progress_callback
-                )
-            else:
-                print("[DEBUG] Escrevendo vídeo a partir de pixels (sem latentes)...")
-                with imageio.get_writer(output_video_path, fps=call_kwargs["frame_rate"], codec="libx264", quality=8) as writer:
-                    T = result_tensor.shape[2]
-                    for i in range(T):
-                        frame_chw = result_tensor[0, :, i]
-                        frame_hwc_u8 = (frame_chw.permute(1, 2, 0)
-                                        .clamp(0, 1)
-                                        .mul(255)
-                                        .to(torch.uint8)
-                                        .cpu()
-                                        .numpy())
-                        writer.append_data(frame_hwc_u8)
-                        if progress_callback:
-                            progress_callback(i + 1, T)
-                        if i % self.frame_log_every == 0:
-                            print(f"[DEBUG] frame {i}/{T} escrito (pixel).")
             candidate_final = os.path.join(results_dir, f"output_{used_seed}.mp4")
             try:
@@ -736,10 +624,6 @@ class VideoService:
                 del latents
             except Exception:
                 pass
-            try:
-                del result_tensor
-            except Exception:
-                pass
             try:
                 del multi_scale_pipeline
             except Exception:
@@ -763,3 +647,4 @@ class VideoService:
 print("Criando instância do VideoService. O carregamento do modelo começará agora...")
 video_generation_service = VideoService()

+# ltx_server.py — VideoService (sempre output_type="latent") com VAE→pixels→MP4 no fim
 # --- 1. IMPORTAÇÕES ---
 import torch
     try:
         import psutil
         import pynvml as nvml
         nvml.nvmlInit()
         handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
         try:
             except Exception:
                 pass
             results.append({"pid": pid, "name": name, "user": user, "used_mb": used_mb})
         nvml.nvmlShutdown()
         return results
+    except Exception:
         return []
 def _query_gpu_processes_via_nvidiasmi(device_index: int) -> List[Dict]:
     cmd = f"nvidia-smi -i {device_index} --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
     try:
         out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.STDOUT, text=True, timeout=2.0)
+    except Exception:
         return []
     results = []
     for line in out.strip().splitlines():
         parts = [p.strip() for p in line.split(",")]
         if len(parts) >= 3:
             try:
+                pid = int(parts[0]); name = parts[1]; used_mb = int(parts[2])
                 user = "unknown"
                 try:
                     import psutil
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"[DEBUG] Device selecionado: {self.device}")
         self.last_memory_reserved_mb = 0.0
+        self._tmp_dirs = set(); self._tmp_files = set(); self._last_outputs = []
         self.pipeline, self.latent_upsampler = self._load_models()
         print(f"[DEBUG] Pipeline e Upsampler carregados. Upsampler ativo? {bool(self.latent_upsampler)}")
         total_memory_mb = total_memory_b / (1024 ** 2)
         peak_reserved_mb = torch.cuda.max_memory_reserved(device_index) / (1024 ** 2)
         delta_mb = current_reserved_mb - getattr(self, "last_memory_reserved_mb", 0.0)
+        processes = _query_gpu_processes_via_nvml(device_index) or _query_gpu_processes_via_nvidiasmi(device_index)
         print(f"\n--- [LOG GPU] {stage_name} (cuda:{device_index}) ---")
         print(f"  - Reservado: {current_reserved_mb:.2f} MB / {total_memory_mb:.2f} MB  (Δ={delta_mb:+.2f} MB)")
         if peak_reserved_mb > getattr(self, "last_memory_reserved_mb", 0.0):
     def _register_tmp_dir(self, d: str):
         if d and os.path.isdir(d):
+            self._tmp_dirs.add(d); print(f"[DEBUG] Registrado tmp dir: {d}")
     def _register_tmp_file(self, f: str):
         if f and os.path.exists(f):
+            self._tmp_files.add(f); print(f"[DEBUG] Registrado tmp file: {f}")
     def finalize(self, keep_paths=None, extra_paths=None, clear_gpu=True):
         print("[DEBUG] Finalize: iniciando limpeza...")
+        keep = set(keep_paths or []); extras = set(extra_paths or [])
         removed_files = 0
         for f in list(self._tmp_files | extras):
             try:
                 if f not in keep and os.path.isfile(f):
+                    os.remove(f); removed_files += 1; print(f"[DEBUG] Removido arquivo tmp: {f}")
             except Exception as e:
                 print(f"[DEBUG] Falha removendo arquivo {f}: {e}")
             finally:
                 self._tmp_files.discard(f)
         removed_dirs = 0
         for d in list(self._tmp_dirs):
             try:
                 if d not in keep and os.path.isdir(d):
+                    shutil.rmtree(d, ignore_errors=True); removed_dirs += 1; print(f"[DEBUG] Removido diretório tmp: {d}")
             except Exception as e:
                 print(f"[DEBUG] Falha removendo diretório {d}: {e}")
             finally:
                 self._tmp_dirs.discard(d)
         print(f"[DEBUG] Finalize: arquivos removidos={removed_files}, dirs removidos={removed_dirs}")
         gc.collect()
         try:
                     pass
         except Exception as e:
             print(f"[DEBUG] Finalize: limpeza GPU falhou: {e}")
         try:
             self._log_gpu_memory("Após finalize")
         except Exception as e:
     def _load_models(self):
         t0 = time.perf_counter()
         LTX_REPO = "Lightricks/LTX-Video"
+        print("[DEBUG] Baixando checkpoint principal...")
         distilled_model_path = hf_hub_download(
             repo_id=LTX_REPO,
             filename=self.config["checkpoint_path"],
         self.config["checkpoint_path"] = distilled_model_path
         print(f"[DEBUG] Checkpoint em: {distilled_model_path}")
+        print("[DEBUG] Baixando upscaler espacial...")
         spatial_upscaler_path = hf_hub_download(
             repo_id=LTX_REPO,
             filename=self.config["spatial_upscaler_model_path"],
             try:
                 if p.dtype == f8:
                     with torch.no_grad():
+                        p.data = p.data.to(torch.bfloat16); p_cnt += 1
             except Exception:
                 pass
         for _, b in module.named_buffers(recurse=True):
             try:
                 if hasattr(b, "dtype") and b.dtype == f8:
+                    b.data = b.data.to(torch.bfloat16); b_cnt += 1
             except Exception:
                 pass
         print(f"[DEBUG] FP8→BF16: params_promoted={p_cnt}, buffers_promoted={b_cnt}")
         print(f"[DEBUG] Cond shape={tuple(out.shape)} dtype={out.dtype} device={out.device}")
         return out
+    # === Decodificação “simples”: latentes → pixels → MP4 ===
     def _decode_one_latent_to_pixel(self, latent_chw: torch.Tensor) -> torch.Tensor:
         """
+        Decodifica um latente (C,H,W) para pixel (C,H,W) em [0,1].
         """
+        ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext()
         with ctx:
             if hasattr(self.pipeline, "decode_latents"):
                 img_bchw = self.pipeline.decode_latents(latent_chw.unsqueeze(0))
             elif hasattr(self.pipeline, "vae") and hasattr(self.pipeline.vae, "decode"):
                 img_bchw = self.pipeline.vae.decode(latent_chw.unsqueeze(0))
             else:
+                raise RuntimeError("Nenhum decoder (decode_latents/vae.decode) disponível.")
+        img_chw = img_bchw[0]
         if img_chw.min() < 0:
             img_chw = (img_chw.clamp(-1, 1) + 1.0) / 2.0
         else:
             img_chw = img_chw.clamp(0, 1)
         return img_chw
     def _pixels_to_uint8_np(self, pixel_chw: torch.Tensor, padding_values) -> np.ndarray:
         """
+        Converte (C,H,W) float [0,1] em (H,W,C) uint8 com crop do padding.
         """
         pad_left, pad_right, pad_top, pad_bottom = padding_values
+        H, W = pixel_chw.shape[1], pixel_chw.shape[2]
         h_end = H - pad_bottom if pad_bottom > 0 else H
         w_end = W - pad_right if pad_right > 0 else W
         pixel_chw = pixel_chw[:, pad_top:h_end, pad_left:w_end]
                         .cpu()
                         .numpy())
         return frame_hwc_u8
     def encode_latents_to_mp4(self, latents: torch.Tensor, output_path: str, fps: int, padding_values,
                               progress_callback=None):
         """
+        Latentes (B,C,T,H,W) → decodifica quadro a quadro → escreve MP4 incremental.
         """
+        T = latents.shape[2]
         print(f"[DEBUG] encode_latents_to_mp4: frames={T} out={output_path}")
         with imageio.get_writer(output_path, fps=fps, codec="libx264", quality=8) as writer:
             for i in range(T):
                 if progress_callback:
                     progress_callback(i + 1, T)
                 if i % getattr(self, "frame_log_every", 8) == 0:
+                    print(f"[DEBUG] frame {i}/{T} codificado")
     def generate(
         self,
         guidance_scale=3.0,
         improve_texture=True,
         progress_callback=None,
+        # Sempre latent→VAE→MP4 (simples)
         external_decode=True,
     ):
         t_all = time.perf_counter()
         print(f"[DEBUG] generate() begin mode={mode} external_decode={external_decode} improve_texture={improve_texture}")
         if self.device == "cuda":
+            torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
         self._log_gpu_memory("Início da Geração")
         if mode == "image-to-video" and not start_image_filepath:
             raise ValueError("O vídeo de entrada é obrigatório para o modo video-to-video")
         used_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
+        seed_everething(used_seed); print(f"[DEBUG] Seed usado: {used_seed}")
+        FPS = 24.0; MAX_NUM_FRAMES = 257
         target_frames_rounded = round(duration * FPS)
         n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
         actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))
                 conditioning_items.append(ConditioningItem(end_tensor, last_frame_index, float(end_image_weight)))
             print(f"[DEBUG] Conditioning items: {len(conditioning_items)}")
+        # Sempre pedimos latentes (simples)
         call_kwargs = {
             "prompt": prompt,
             "negative_prompt": negative_prompt,
             "num_frames": actual_num_frames,
             "frame_rate": int(FPS),
             "generator": generator,
+            "output_type": "latent",
             "conditioning_items": conditioning_items if conditioning_items else None,
             "media_items": None,
             "decode_timestep": self.config["decode_timestep"],
             "enhance_prompt": False,
             "skip_layer_strategy": SkipLayerStrategy.AttentionValues,
         }
+        print(f"[DEBUG] output_type={call_kwargs['output_type']} skip_layer_strategy={call_kwargs['skip_layer_strategy']}")
         if mode == "video-to-video":
             media = load_media_file(
             print(f"[DEBUG] media_items shape={tuple(media.shape)}")
         latents = None
         multi_scale_pipeline = None
         try:
                     result = multi_scale_pipeline(**multi_scale_call_kwargs)
                 print(f"[DEBUG] multi_scale_pipeline tempo={time.perf_counter()-t_ms:.3f}s")
+                # Captura latentes
+                if hasattr(result, "latents"):
+                    latents = result.latents
+                elif hasattr(result, "images") and isinstance(result.images, torch.Tensor):
+                    latents = result.images
                 else:
+                    latents = result
+                print(f"[DEBUG] Latentes (multi-escala): shape={tuple(latents.shape)}")
             else:
                 single_pass_kwargs = call_kwargs.copy()
                 first_pass_config = self.config.get("first_pass", {})
                         "skip_block_list": first_pass_config.get("skip_block_list"),
                     }
                 )
+                schedule = first_pass_config.get("timesteps") or first_pass_config.get("guidance_timesteps")
                 if mode == "video-to-video":
+                    schedule = [0.7]; print("[INFO] Modo video-to-video (etapa única): timesteps=[0.7]")
                 if isinstance(schedule, (list, tuple)) and len(schedule) > 0:
                     single_pass_kwargs["timesteps"] = schedule
                     single_pass_kwargs["guidance_timesteps"] = schedule
                     result = self.pipeline(**single_pass_kwargs)
                 print(f"[DEBUG] single-pass tempo={time.perf_counter()-t_sp:.3f}s")
+                if hasattr(result, "latents"):
+                    latents = result.latents
+                elif hasattr(result, "images") and isinstance(result.images, torch.Tensor):
+                    latents = result.images
                 else:
+                    latents = result
+                print(f"[DEBUG] Latentes (single-pass): shape={tuple(latents.shape)}")
+            # Staging e escrita MP4 (simples: VAE→pixels→MP4)
+            temp_dir = tempfile.mkdtemp(prefix="ltxv_"); self._register_tmp_dir(temp_dir)
+            results_dir = "/app/output"; os.makedirs(results_dir, exist_ok=True)
             output_video_path = os.path.join(temp_dir, f"output_{used_seed}.mp4")
             final_output_path = None
+            print("[DEBUG] Codificando a partir dos latentes (VAE externo) → MP4...")
+            self.encode_latents_to_mp4(
+                latents=latents,
+                output_path=output_video_path,
+                fps=call_kwargs["frame_rate"],
+                padding_values=padding_values,
+                progress_callback=progress_callback,
+            )
             candidate_final = os.path.join(results_dir, f"output_{used_seed}.mp4")
             try:
                 del latents
             except Exception:
                 pass
             try:
                 del multi_scale_pipeline
             except Exception:
 print("Criando instância do VideoService. O carregamento do modelo começará agora...")
 video_generation_service = VideoService()