Spaces:
Paused
Paused
Upload ltx_server.py
Browse files- api/ltx_server.py +21 -65
api/ltx_server.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
# ltx_server.py — VideoService (
|
|
|
|
| 2 |
|
| 3 |
# --- 1. IMPORTAÇÕES ---
|
| 4 |
import torch
|
|
@@ -20,6 +21,10 @@ import contextlib
|
|
| 20 |
import time
|
| 21 |
import traceback
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# --- 2. GERENCIAMENTO DE DEPENDÊNCIAS E SETUP ---
|
| 24 |
def _query_gpu_processes_via_nvml(device_index: int) -> List[Dict]:
|
| 25 |
try:
|
|
@@ -65,7 +70,7 @@ def _query_gpu_processes_via_nvidiasmi(device_index: int) -> List[Dict]:
|
|
| 65 |
parts = [p.strip() for p in line.split(",")]
|
| 66 |
if len(parts) >= 3:
|
| 67 |
try:
|
| 68 |
-
pid = int(parts[0]); name = parts[
|
| 69 |
user = "unknown"
|
| 70 |
try:
|
| 71 |
import psutil
|
|
@@ -360,60 +365,7 @@ class VideoService:
|
|
| 360 |
print(f"[DEBUG] Cond shape={tuple(out.shape)} dtype={out.dtype} device={out.device}")
|
| 361 |
return out
|
| 362 |
|
| 363 |
-
#
|
| 364 |
-
def _decode_one_latent_to_pixel(self, latent_chw: torch.Tensor) -> torch.Tensor:
|
| 365 |
-
"""
|
| 366 |
-
Decodifica um latente (C,H,W) para pixel (C,H,W) em [0,1].
|
| 367 |
-
"""
|
| 368 |
-
ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext()
|
| 369 |
-
with ctx:
|
| 370 |
-
if hasattr(self.pipeline, "decode_latents"):
|
| 371 |
-
img_bchw = self.pipeline.decode_latents(latent_chw.unsqueeze(0))
|
| 372 |
-
elif hasattr(self.pipeline, "vae") and hasattr(self.pipeline.vae, "decode"):
|
| 373 |
-
img_bchw = self.pipeline.vae.decode(latent_chw.unsqueeze(0))
|
| 374 |
-
else:
|
| 375 |
-
raise RuntimeError("Nenhum decoder (decode_latents/vae.decode) disponível.")
|
| 376 |
-
img_chw = img_bchw[0]
|
| 377 |
-
if img_chw.min() < 0:
|
| 378 |
-
img_chw = (img_chw.clamp(-1, 1) + 1.0) / 2.0
|
| 379 |
-
else:
|
| 380 |
-
img_chw = img_chw.clamp(0, 1)
|
| 381 |
-
return img_chw
|
| 382 |
-
|
| 383 |
-
def _pixels_to_uint8_np(self, pixel_chw: torch.Tensor, padding_values) -> np.ndarray:
|
| 384 |
-
"""
|
| 385 |
-
Converte (C,H,W) float [0,1] em (H,W,C) uint8 com crop do padding.
|
| 386 |
-
"""
|
| 387 |
-
pad_left, pad_right, pad_top, pad_bottom = padding_values
|
| 388 |
-
H, W = pixel_chw.shape[1], pixel_chw.shape[2]
|
| 389 |
-
h_end = H - pad_bottom if pad_bottom > 0 else H
|
| 390 |
-
w_end = W - pad_right if pad_right > 0 else W
|
| 391 |
-
pixel_chw = pixel_chw[:, pad_top:h_end, pad_left:w_end]
|
| 392 |
-
frame_hwc_u8 = (pixel_chw.permute(1, 2, 0)
|
| 393 |
-
.mul(255)
|
| 394 |
-
.to(torch.uint8)
|
| 395 |
-
.cpu()
|
| 396 |
-
.numpy())
|
| 397 |
-
return frame_hwc_u8
|
| 398 |
-
|
| 399 |
-
def encode_latents_to_mp4(self, latents: torch.Tensor, output_path: str, fps: int, padding_values,
|
| 400 |
-
progress_callback=None):
|
| 401 |
-
"""
|
| 402 |
-
Latentes (B,C,T,H,W) → decodifica quadro a quadro → escreve MP4 incremental.
|
| 403 |
-
"""
|
| 404 |
-
T = latents.shape[2]
|
| 405 |
-
print(f"[DEBUG] encode_latents_to_mp4: frames={T} out={output_path}")
|
| 406 |
-
with imageio.get_writer(output_path, fps=fps, codec="libx264", quality=8) as writer:
|
| 407 |
-
for i in range(T):
|
| 408 |
-
latent_chw = latents[0, :, i].to(self.device)
|
| 409 |
-
pixel_chw = self._decode_one_latent_to_pixel(latent_chw)
|
| 410 |
-
frame_hwc_u8 = self._pixels_to_uint8_np(pixel_chw, padding_values)
|
| 411 |
-
writer.append_data(frame_hwc_u8)
|
| 412 |
-
if progress_callback:
|
| 413 |
-
progress_callback(i + 1, T)
|
| 414 |
-
if i % getattr(self, "frame_log_every", 8) == 0:
|
| 415 |
-
print(f"[DEBUG] frame {i}/{T} codificado")
|
| 416 |
-
|
| 417 |
def generate(
|
| 418 |
self,
|
| 419 |
prompt,
|
|
@@ -435,7 +387,7 @@ class VideoService:
|
|
| 435 |
guidance_scale=3.0,
|
| 436 |
improve_texture=True,
|
| 437 |
progress_callback=None,
|
| 438 |
-
# Sempre latent→VAE→MP4 (simples)
|
| 439 |
external_decode=True,
|
| 440 |
):
|
| 441 |
t_all = time.perf_counter()
|
|
@@ -586,19 +538,23 @@ class VideoService:
|
|
| 586 |
latents = result
|
| 587 |
print(f"[DEBUG] Latentes (single-pass): shape={tuple(latents.shape)}")
|
| 588 |
|
| 589 |
-
# Staging e escrita MP4 (simples: VAE→pixels→MP4)
|
| 590 |
temp_dir = tempfile.mkdtemp(prefix="ltxv_"); self._register_tmp_dir(temp_dir)
|
| 591 |
results_dir = "/app/output"; os.makedirs(results_dir, exist_ok=True)
|
| 592 |
output_video_path = os.path.join(temp_dir, f"output_{used_seed}.mp4")
|
| 593 |
final_output_path = None
|
| 594 |
|
| 595 |
-
print("[DEBUG]
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 602 |
)
|
| 603 |
|
| 604 |
candidate_final = os.path.join(results_dir, f"output_{used_seed}.mp4")
|
|
|
|
| 1 |
+
# ltx_server.py — VideoService (beta 1.0)
|
| 2 |
+
# Sempre output_type="latent"; no final: VAE (bloco inteiro) → pixels → MP4.
|
| 3 |
|
| 4 |
# --- 1. IMPORTAÇÕES ---
|
| 5 |
import torch
|
|
|
|
| 21 |
import time
|
| 22 |
import traceback
|
| 23 |
|
| 24 |
+
# Singletons do projeto para VAE e Encoder
|
| 25 |
+
from aduc_framework.tools.video_encode_tool import video_encode_tool_singleton
|
| 26 |
+
from aduc_framework.managers.vae_manager import vae_manager_singleton
|
| 27 |
+
|
| 28 |
# --- 2. GERENCIAMENTO DE DEPENDÊNCIAS E SETUP ---
|
| 29 |
def _query_gpu_processes_via_nvml(device_index: int) -> List[Dict]:
|
| 30 |
try:
|
|
|
|
| 70 |
parts = [p.strip() for p in line.split(",")]
|
| 71 |
if len(parts) >= 3:
|
| 72 |
try:
|
| 73 |
+
pid = int(parts[0]); name = parts[1]; used_mb = int(parts[2])
|
| 74 |
user = "unknown"
|
| 75 |
try:
|
| 76 |
import psutil
|
|
|
|
| 365 |
print(f"[DEBUG] Cond shape={tuple(out.shape)} dtype={out.dtype} device={out.device}")
|
| 366 |
return out
|
| 367 |
|
| 368 |
+
# --- 6. GERAÇÃO ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
def generate(
|
| 370 |
self,
|
| 371 |
prompt,
|
|
|
|
| 387 |
guidance_scale=3.0,
|
| 388 |
improve_texture=True,
|
| 389 |
progress_callback=None,
|
| 390 |
+
# Sempre latent → VAE → MP4 (simples)
|
| 391 |
external_decode=True,
|
| 392 |
):
|
| 393 |
t_all = time.perf_counter()
|
|
|
|
| 538 |
latents = result
|
| 539 |
print(f"[DEBUG] Latentes (single-pass): shape={tuple(latents.shape)}")
|
| 540 |
|
| 541 |
+
# Staging e escrita MP4 (simples: VAE → pixels → MP4)
|
| 542 |
temp_dir = tempfile.mkdtemp(prefix="ltxv_"); self._register_tmp_dir(temp_dir)
|
| 543 |
results_dir = "/app/output"; os.makedirs(results_dir, exist_ok=True)
|
| 544 |
output_video_path = os.path.join(temp_dir, f"output_{used_seed}.mp4")
|
| 545 |
final_output_path = None
|
| 546 |
|
| 547 |
+
print("[DEBUG] Decodificando bloco de latentes com VAE → tensor de pixels...")
|
| 548 |
+
# Se desejar “desocupar” a GPU antes do decode, pode-se mover p/ CPU e limpar:
|
| 549 |
+
# latents_cpu = latents.detach().to("cpu", non_blocking=True); torch.cuda.empty_cache(); torch.cuda.ipc_collect(); latents = latents_cpu.to(self.device)
|
| 550 |
+
pixel_tensor = vae_manager_singleton.decode(latents.to(self.device, non_blocking=True))
|
| 551 |
+
log_tensor_info(pixel_tensor, "Pixel tensor (VAE saída)")
|
| 552 |
+
|
| 553 |
+
print("[DEBUG] Codificando MP4 a partir do tensor de pixels (bloco inteiro)...")
|
| 554 |
+
video_encode_tool_singleton.save_video_from_tensor(
|
| 555 |
+
pixel_tensor,
|
| 556 |
+
output_video_path,
|
| 557 |
+
fps=call_kwargs["frame_rate"]
|
| 558 |
)
|
| 559 |
|
| 560 |
candidate_final = os.path.join(results_dir, f"output_{used_seed}.mp4")
|