Test

Paused

App Files Files Community

EuuIia committed on Oct 3

Commit

a6e974e

verified ·

1 Parent(s): 03c32c6

Update video_service.py

Browse files

Files changed (1) hide show

video_service.py +97 -9

video_service.py CHANGED Viewed

@@ -5,7 +5,9 @@ import torch
 import numpy as np
 import random
 import os
 import yaml
 from pathlib import Path
 import imageio
 import tempfile
@@ -85,23 +87,109 @@ class VideoService:
             torch.cuda.empty_cache()
             self._log_gpu_memory("Após carregar modelos")
         print("VideoService pronto para uso.")
     def _log_gpu_memory(self, stage_name: str):
         # Print a GPU memory report labelled with stage_name: reserved vs. total
         # memory in MB, the change since the previous report, and the peak.
         # Does nothing unless self.device is "cuda".
-        if self.device != "cuda": return
-        current_reserved_b = torch.cuda.memory_reserved()
         current_reserved_mb = current_reserved_b / (1024 ** 2)
         # Total memory is read from device 0 regardless of the active device.
-        total_memory_b = torch.cuda.get_device_properties(0).total_memory
         total_memory_mb = total_memory_b / (1024 ** 2)
-        peak_reserved_mb = torch.cuda.max_memory_reserved() / (1024 ** 2)
         # Delta is relative to the value stored at the end of the previous call.
-        delta_mb = current_reserved_mb - self.last_memory_reserved_mb
-        print(f"\n--- [LOG DE MEMÓRIA GPU] - {stage_name} ---")
         print(f"  - Uso Atual (Reservado): {current_reserved_mb:.2f} MB / {total_memory_mb:.2f} MB")
         print(f"  - Variação desde o último log: {delta_mb:+.2f} MB")
         # The peak line is printed only when it exceeds the previously logged value.
-        if peak_reserved_mb > self.last_memory_reserved_mb:
             print(f"  - Pico de Uso (nesta operação): {peak_reserved_mb:.2f} MB")
         print("--------------------------------------------------\n")
         # Remember the current value as the baseline for the next report.
         self.last_memory_reserved_mb = current_reserved_mb
     def _load_config(self):
         config_file_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled.yaml"
         with open(config_file_path, "r") as file:

 import numpy as np
 import random
 import os
+import shlex
 import yaml
+from typing import List, Dict
 from pathlib import Path
 import imageio
 import tempfile
             torch.cuda.empty_cache()
             self._log_gpu_memory("Após carregar modelos")
         print("VideoService pronto para uso.")
+    def _query_gpu_processes_via_nvml(device_index: int) -> List[Dict]:
+        try:
+            import psutil
+            import pynvml as nvml
+            nvml.nvmlInit()
+            handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
+            # Try v3, then fall back to the generic name if binding differs
+            try:
+                procs = nvml.nvmlDeviceGetComputeRunningProcesses_v3(handle)
+            except Exception:
+                procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
+            results = []
+            for p in procs:
+                pid = int(p.pid)
+                used_mb = None
+                try:
+                    # NVML returns bytes; some bindings may use NVML_VALUE_NOT_AVAILABLE
+                    if getattr(p, "usedGpuMemory", None) is not None and p.usedGpuMemory not in (0,):
+                        used_mb = max(0, int(p.usedGpuMemory) // (1024 * 1024))
+                except Exception:
+                    used_mb = None
+                name = "unknown"
+                user = "unknown"
+                try:
+                    pr = psutil.Process(pid)
+                    name = pr.name()
+                    user = pr.username()
+                except Exception:
+                    pass
+                results.append({"pid": pid, "name": name, "user": user, "used_mb": used_mb})
+            nvml.nvmlShutdown()
+            return results
+        except Exception:
+            return []
+    def _query_gpu_processes_via_nvidiasmi(device_index: int) -> List[Dict]:
+        # CSV, no header, no units gives lines: "PID,process_name,used_memory"
+        cmd = f"nvidia-smi -i {device_index} --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
+        try:
+            out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.STDOUT, text=True, timeout=2.0)
+        except Exception:
+            return []
+        results = []
+        for line in out.strip().splitlines():
+            parts = [p.strip() for p in line.split(",")]
+            if len(parts) >= 3:
+                try:
+                    pid = int(parts[0])
+                    name = parts[1]
+                    used_mb = int(parts[2])
+                    user = "unknown"
+                    try:
+                        import psutil
+                        pr = psutil.Process(pid)
+                        user = pr.username()
+                    except Exception:
+                        pass
+                    results.append({"pid": pid, "name": name, "user": user, "used_mb": used_mb})
+                except Exception:
+                    continue
+        return results
+    def _gpu_process_table(processes: List[Dict], current_pid: int) -> str:
+        if not processes:
+            return "  - Processos ativos: (nenhum)\n"
+        # sort by used_mb desc, then pid
+        processes = sorted(processes, key=lambda x: (x.get("used_mb") or 0), reverse=True)
+        lines = ["  - Processos ativos (PID | USER | NAME | VRAM MB):"]
+        for p in processes:
+            star = "*" if p["pid"] == current_pid else " "
+            used_str = str(p["used_mb"]) if p.get("used_mb") is not None else "N/A"
+            lines.append(f"    {star} {p['pid']} | {p['user']} | {p['name']} | {used_str}")
+        return "\n".join(lines) + "\n"
+    # Integration into the existing method:
     def _log_gpu_memory(self, stage_name: str):
+        import torch
+        if self.device != "cuda":
+            return
+        device_index = torch.cuda.current_device() if torch.cuda.is_available() else 0
+        current_reserved_b = torch.cuda.memory_reserved(device_index)
         current_reserved_mb = current_reserved_b / (1024 ** 2)
+        total_memory_b = torch.cuda.get_device_properties(device_index).total_memory
         total_memory_mb = total_memory_b / (1024 ** 2)
+        peak_reserved_mb = torch.cuda.max_memory_reserved(device_index) / (1024 ** 2)
+        delta_mb = current_reserved_mb - getattr(self, "last_memory_reserved_mb", 0.0)
+        # Coleta de processos: tenta NVML, depois fallback para nvidia-smi
+        processes = _query_gpu_processes_via_nvml(device_index)
+        if not processes:
+            processes = _query_gpu_processes_via_nvidiasmi(device_index)
+        print(f"\n--- [LOG DE MEMÓRIA GPU] - {stage_name} (cuda:{device_index}) ---")
         print(f"  - Uso Atual (Reservado): {current_reserved_mb:.2f} MB / {total_memory_mb:.2f} MB")
         print(f"  - Variação desde o último log: {delta_mb:+.2f} MB")
+        if peak_reserved_mb > getattr(self, "last_memory_reserved_mb", 0.0):
             print(f"  - Pico de Uso (nesta operação): {peak_reserved_mb:.2f} MB")
+        # Imprime tabela de processos
+        print(_gpu_process_table(processes, os.getpid()), end="")
         print("--------------------------------------------------\n")
         self.last_memory_reserved_mb = current_reserved_mb
     def _load_config(self):
         config_file_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled.yaml"
         with open(config_file_path, "r") as file: