EuuIia committed on
Commit a6e974e · verified · 1 parent: 03c32c6

Update video_service.py

Files changed (1)
  1. video_service.py +97 -9
video_service.py CHANGED
@@ -5,7 +5,9 @@ import torch
 import numpy as np
 import random
 import os
+import shlex
 import yaml
+from typing import List, Dict
 from pathlib import Path
 import imageio
 import tempfile
@@ -85,23 +87,109 @@ class VideoService:
         torch.cuda.empty_cache()
         self._log_gpu_memory("Após carregar modelos")
         print("VideoService pronto para uso.")
-
+
+    def _query_gpu_processes_via_nvml(self, device_index: int) -> List[Dict]:
+        try:
+            import psutil
+            import pynvml as nvml
+            nvml.nvmlInit()
+            handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
+            # Try v3, then fall back to the generic name if the binding differs
+            try:
+                procs = nvml.nvmlDeviceGetComputeRunningProcesses_v3(handle)
+            except Exception:
+                procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
+            results = []
+            for p in procs:
+                pid = int(p.pid)
+                used_mb = None
+                try:
+                    # NVML returns bytes; some bindings may use NVML_VALUE_NOT_AVAILABLE
+                    if getattr(p, "usedGpuMemory", None) is not None and p.usedGpuMemory not in (0,):
+                        used_mb = max(0, int(p.usedGpuMemory) // (1024 * 1024))
+                except Exception:
+                    used_mb = None
+                name = "unknown"
+                user = "unknown"
+                try:
+                    pr = psutil.Process(pid)
+                    name = pr.name()
+                    user = pr.username()
+                except Exception:
+                    pass
+                results.append({"pid": pid, "name": name, "user": user, "used_mb": used_mb})
+            nvml.nvmlShutdown()
+            return results
+        except Exception:
+            return []
+
+    def _query_gpu_processes_via_nvidiasmi(self, device_index: int) -> List[Dict]:
+        # CSV, no header, no units gives lines: "PID,process_name,used_memory"
+        cmd = f"nvidia-smi -i {device_index} --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
+        try:
+            out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.STDOUT, text=True, timeout=2.0)
+        except Exception:
+            return []
+        results = []
+        for line in out.strip().splitlines():
+            parts = [p.strip() for p in line.split(",")]
+            if len(parts) >= 3:
+                try:
+                    pid = int(parts[0])
+                    name = parts[1]
+                    used_mb = int(parts[2])
+                    user = "unknown"
+                    try:
+                        import psutil
+                        pr = psutil.Process(pid)
+                        user = pr.username()
+                    except Exception:
+                        pass
+                    results.append({"pid": pid, "name": name, "user": user, "used_mb": used_mb})
+                except Exception:
+                    continue
+        return results
+
+    def _gpu_process_table(self, processes: List[Dict], current_pid: int) -> str:
+        if not processes:
+            return " - Processos ativos: (nenhum)\n"
+        # Sort by used_mb descending so the heaviest consumers come first
+        processes = sorted(processes, key=lambda x: (x.get("used_mb") or 0), reverse=True)
+        lines = [" - Processos ativos (PID | USER | NAME | VRAM MB):"]
+        for p in processes:
+            star = "*" if p["pid"] == current_pid else " "
+            used_str = str(p["used_mb"]) if p.get("used_mb") is not None else "N/A"
+            lines.append(f"   {star} {p['pid']} | {p['user']} | {p['name']} | {used_str}")
+        return "\n".join(lines) + "\n"
+
+    # Integration into the existing method:
     def _log_gpu_memory(self, stage_name: str):
-        if self.device != "cuda": return
-        current_reserved_b = torch.cuda.memory_reserved()
+        import torch
+        if self.device != "cuda":
+            return
+        device_index = torch.cuda.current_device() if torch.cuda.is_available() else 0
+        current_reserved_b = torch.cuda.memory_reserved(device_index)
         current_reserved_mb = current_reserved_b / (1024 ** 2)
-        total_memory_b = torch.cuda.get_device_properties(0).total_memory
+        total_memory_b = torch.cuda.get_device_properties(device_index).total_memory
         total_memory_mb = total_memory_b / (1024 ** 2)
-        peak_reserved_mb = torch.cuda.max_memory_reserved() / (1024 ** 2)
-        delta_mb = current_reserved_mb - self.last_memory_reserved_mb
-        print(f"\n--- [LOG DE MEMÓRIA GPU] - {stage_name} ---")
+        peak_reserved_mb = torch.cuda.max_memory_reserved(device_index) / (1024 ** 2)
+        delta_mb = current_reserved_mb - getattr(self, "last_memory_reserved_mb", 0.0)
+
+        # Process collection: try NVML first, then fall back to nvidia-smi
+        processes = self._query_gpu_processes_via_nvml(device_index)
+        if not processes:
+            processes = self._query_gpu_processes_via_nvidiasmi(device_index)
+
+        print(f"\n--- [LOG DE MEMÓRIA GPU] - {stage_name} (cuda:{device_index}) ---")
         print(f" - Uso Atual (Reservado): {current_reserved_mb:.2f} MB / {total_memory_mb:.2f} MB")
         print(f" - Variação desde o último log: {delta_mb:+.2f} MB")
-        if peak_reserved_mb > self.last_memory_reserved_mb:
+        if peak_reserved_mb > getattr(self, "last_memory_reserved_mb", 0.0):
             print(f" - Pico de Uso (nesta operação): {peak_reserved_mb:.2f} MB")
+        # Print the per-process table
+        print(self._gpu_process_table(processes, os.getpid()), end="")
         print("--------------------------------------------------\n")
         self.last_memory_reserved_mb = current_reserved_mb
-
+
     def _load_config(self):
         config_file_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled.yaml"
         with open(config_file_path, "r") as file:
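Note on dependencies: the import hunk above only adds shlex and typing, but _query_gpu_processes_via_nvidiasmi also calls subprocess.check_output; unless subprocess is already imported in the first lines of video_service.py (outside this diff), it needs a module-level import as well. pynvml and psutil are soft dependencies: every call that touches them is wrapped in try/except, so when they are missing the logger falls back to parsing nvidia-smi output and, failing that, prints an empty process table instead of interrupting video generation.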
 
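For a quick environment check, the same probing order the new helpers use can be exercised outside the service. The sketch below is illustrative, not part of the commit: it assumes an NVIDIA driver is installed, treats pynvml as optional, and falls back to the same nvidia-smi query the fallback helper shells out to.

# Standalone sketch (illustrative, not part of this commit): probes GPU compute
# processes the same way the new helpers do -- pynvml first, then the nvidia-smi
# CSV query -- so the environment can be checked before relying on the in-service log.
import shlex
import subprocess


def probe_gpu_processes(device_index: int = 0) -> list:
    """Return a list of {"pid", "name", "used_mb"} dicts, or [] if nothing can be queried."""
    # Preferred path: NVML bindings (no subprocess, memory reported in bytes).
    try:
        import pynvml as nvml
        nvml.nvmlInit()
        try:
            handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
            procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
            return [
                {
                    "pid": int(p.pid),
                    "name": "unknown",  # resolving the name would need psutil
                    "used_mb": int(p.usedGpuMemory) // (1024 * 1024)
                    if getattr(p, "usedGpuMemory", None) else None,
                }
                for p in procs
            ]
        finally:
            nvml.nvmlShutdown()
    except Exception:
        pass  # pynvml missing or NVML unavailable; fall through to nvidia-smi
    # Fallback path: nvidia-smi prints "pid, process_name, used_memory" (MiB) per line.
    cmd = (
        f"nvidia-smi -i {device_index} "
        "--query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
    )
    try:
        out = subprocess.check_output(shlex.split(cmd), text=True, timeout=2.0)
    except Exception:
        return []
    results = []
    for line in out.strip().splitlines():
        parts = [p.strip() for p in line.split(",")]
        if len(parts) >= 3 and parts[0].isdigit() and parts[2].isdigit():
            results.append({"pid": int(parts[0]), "name": parts[1], "used_mb": int(parts[2])})
    return results


if __name__ == "__main__":
    for proc in probe_gpu_processes(0):
        print(proc)

On a healthy setup the NVML branch should list at least the Python process hosting VideoService once the models are loaded; an empty result usually means neither pynvml nor nvidia-smi is reachable from the current environment.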