# MagicNodes/mod/mg_cleanup.py
# MG_CleanUp node (DZRobo): best-effort RAM/VRAM cleanup for ComfyUI workflows.
import os
import gc
import time
import platform
import ctypes
from ctypes import wintypes
import torch
import torch.nn.functional as F
import comfy.model_management as model_management
import comfy.sample as _sample
import comfy.samplers as _samplers
import comfy.utils as _utils
try:
    import psutil  # type: ignore  # optional: more accurate RAM readings when installed
except Exception:  # pragma: no cover
    psutil = None  # type: ignore
def _get_ram_mb() -> float:
    """Return this process's resident memory (private bytes when available) in MB; 0.0 on failure."""
try:
if psutil is not None:
p = psutil.Process(os.getpid())
rss = float(p.memory_info().rss)
try:
private = getattr(p.memory_full_info(), "private", None)
if isinstance(private, (int, float)) and private > 0:
rss = float(private)
except Exception:
pass
return rss / (1024.0 * 1024.0)
except Exception:
pass
return 0.0
def _get_vram_mb_per_device() -> list[tuple[int, float, float]]:
    """Return [(device_index, reserved_mb, allocated_mb)] for every visible CUDA device."""
out = []
try:
if torch.cuda.is_available():
for d in range(torch.cuda.device_count()):
try:
reserved = float(torch.cuda.memory_reserved(d)) / (1024.0 * 1024.0)
allocated = float(torch.cuda.memory_allocated(d)) / (1024.0 * 1024.0)
except Exception:
reserved = 0.0
allocated = 0.0
out.append((d, reserved, allocated))
except Exception:
pass
return out
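# Example return (illustrative): [(0, 2048.0, 1536.5)] means CUDA device 0 has
# ~2 GiB reserved by PyTorch's caching allocator but only ~1.5 GiB actually
# allocated; the gap is what torch.cuda.empty_cache() can hand back to the driver.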
def _trim_working_set_windows():
    """Ask Windows to trim this process's working set; silently a no-op elsewhere."""
try:
if platform.system().lower().startswith("win"):
kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined]
proc = kernel32.GetCurrentProcess()
kernel32.SetProcessWorkingSetSize(proc, ctypes.c_size_t(-1), ctypes.c_size_t(-1))
except Exception:
pass
def _enable_win_privileges(names):
"""Best-effort enable a set of Windows privileges for the current process."""
try:
if not platform.system().lower().startswith('win'):
return False
advapi32 = ctypes.windll.advapi32 # type: ignore[attr-defined]
kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined]
token = wintypes.HANDLE()
TOKEN_ADJUST_PRIVILEGES = 0x20
TOKEN_QUERY = 0x8
if not advapi32.OpenProcessToken(kernel32.GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, ctypes.byref(token)):
return False
class LUID(ctypes.Structure):
_fields_ = [("LowPart", wintypes.DWORD), ("HighPart", wintypes.LONG)]
class LUID_AND_ATTRIBUTES(ctypes.Structure):
_fields_ = [("Luid", LUID), ("Attributes", wintypes.DWORD)]
class TOKEN_PRIVILEGES(ctypes.Structure):
_fields_ = [("PrivilegeCount", wintypes.DWORD), ("Privileges", LUID_AND_ATTRIBUTES * 1)]
SE_PRIVILEGE_ENABLED = 0x2
success = False
for name in names:
luid = LUID()
if not advapi32.LookupPrivilegeValueW(None, ctypes.c_wchar_p(name), ctypes.byref(luid)):
continue
tp = TOKEN_PRIVILEGES()
tp.PrivilegeCount = 1
tp.Privileges[0].Luid = luid
tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED
if advapi32.AdjustTokenPrivileges(token, False, ctypes.byref(tp), 0, None, None):
success = True
return success
except Exception:
return False
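# Example (illustrative): the standby-list purge below requires
# SeProfileSingleProcessPrivilege, which this helper tries to enable first:
#   _enable_win_privileges(['SeProfileSingleProcessPrivilege'])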
def _system_cache_trim_windows():
"""Attempt to purge standby/file caches on Windows (requires privileges)."""
try:
if not platform.system().lower().startswith('win'):
return False
_enable_win_privileges([
'SeIncreaseQuotaPrivilege',
'SeProfileSingleProcessPrivilege',
'SeDebugPrivilege',
])
try:
kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined]
SIZE_T = ctypes.c_size_t
kernel32.SetSystemFileCacheSize(SIZE_T(-1), SIZE_T(-1), wintypes.DWORD(0))
except Exception:
pass
try:
ntdll = ctypes.windll.ntdll # type: ignore[attr-defined]
SystemMemoryListInformation = 0x50
MemoryPurgeStandbyList = ctypes.c_ulong(4)
ntdll.NtSetSystemInformation(wintypes.ULONG(SystemMemoryListInformation), ctypes.byref(MemoryPurgeStandbyList), ctypes.sizeof(MemoryPurgeStandbyList))
except Exception:
pass
return True
except Exception:
return False
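# Note: SetSystemFileCacheSize((SIZE_T)-1, (SIZE_T)-1, 0) asks Windows to flush
# the file cache, and the NtSetSystemInformation call purges the standby list.
# Both silently fail without elevation, so the True return only means
# "we are on Windows and tried", not "the cache was actually purged".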
def cleanup_memory(sync_cuda: bool = True, hard_trim: bool = True) -> dict:
"""Run a best-effort cleanup of RAM/VRAM. Returns stats dict with before/after deltas."""
stats: dict = {"ram_before_mb": 0.0, "ram_after_mb": 0.0, "ram_freed_mb": 0.0, "gpu": []}
stats["ram_before_mb"] = _get_ram_mb()
gpu_before = _get_vram_mb_per_device()
try:
if sync_cuda and torch.cuda.is_available():
torch.cuda.synchronize()
except Exception:
pass
    try:
        # let ComfyUI release its cached models/buffers first
        if hasattr(model_management, 'soft_empty_cache'):
            model_management.soft_empty_cache()
    except Exception:
        pass
try:
gc.collect()
except Exception:
pass
try:
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
except Exception:
pass
    try:
        time.sleep(0)  # yield the time slice so other threads can drop references
    except Exception:
        pass
    if hard_trim:
        try:
            if hasattr(model_management, 'unload_all_models'):
                model_management.unload_all_models()
        except Exception:
            pass
        try:
            for _ in range(2):  # extra passes: the unload above can free reference cycles
                time.sleep(0)
                gc.collect()
        except Exception:
            pass
try:
if hasattr(_utils, 'cleanup_lru_caches'):
_utils.cleanup_lru_caches()
except Exception:
pass
        try:
            if platform.system().lower().startswith('win'):
                _trim_working_set_windows()
                psapi = ctypes.windll.psapi  # type: ignore[attr-defined]
                kernel32 = ctypes.windll.kernel32  # type: ignore[attr-defined]
                psapi.EmptyWorkingSet(kernel32.GetCurrentProcess())
        except Exception:
            pass
        try:
            if platform.system().lower().startswith('linux'):
                libc = ctypes.CDLL('libc.so.6')
                libc.malloc_trim(0)  # glibc: return freed heap pages to the OS
        except Exception:
            pass
try:
_system_cache_trim_windows()
except Exception:
pass
stats["ram_after_mb"] = _get_ram_mb()
stats["ram_freed_mb"] = max(0.0, stats["ram_before_mb"] - stats["ram_after_mb"])
gpu_after = _get_vram_mb_per_device()
device_map = {d: (r, a) for d, r, a in gpu_before}
gpu_stats = []
for d, r_after, a_after in gpu_after:
r_before, a_before = device_map.get(d, (0.0, 0.0))
gpu_stats.append({
"device": d,
"reserved_before_mb": r_before,
"reserved_after_mb": r_after,
"reserved_freed_mb": max(0.0, r_before - r_after),
"allocated_before_mb": a_before,
"allocated_after_mb": a_after,
"allocated_freed_mb": max(0.0, a_before - a_after),
})
stats["gpu"] = gpu_stats
return stats
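# Usage sketch (illustrative; mirrors what the node below does):
#   stats = cleanup_memory(sync_cuda=True, hard_trim=False)
#   print(f"RAM freed: {stats['ram_freed_mb']:.1f} MB")
#   for g in stats["gpu"]:
#       print(f"GPU{g['device']}: reserved freed {g['reserved_freed_mb']:.1f} MB")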
class MG_CleanUp:
    """Pass-through LATENT node that aggressively releases RAM/VRAM and can emit a tiny preview."""
@classmethod
def INPUT_TYPES(cls):
return {
"required": {
"samples": ("LATENT", {}),
},
"optional": {
"hard_trim": ("BOOLEAN", {"default": True, "tooltip": "Aggressively free RAM/VRAM and ask OS to return pages to the system."}),
"sync_cuda": ("BOOLEAN", {"default": True, "tooltip": "Synchronize CUDA before cleanup to flush pending kernels."}),
"hires_only_threshold": ("INT", {"default": 0, "min": 0, "max": 16384, "step": 64, "tooltip": "Apply only when latent longest side >= threshold (0 == always)."}),
}
}
RETURN_TYPES = ("LATENT", "IMAGE")
RETURN_NAMES = ("samples", "Preview")
FUNCTION = "apply"
CATEGORY = "MagicNodes"
def apply(self, samples, hard_trim=True, sync_cuda=True, hires_only_threshold=0,
model=None, positive=None, negative=None, vae=None):
        img_prev = None
        # Optional preview: downsample the latent to 32x32, run a single low-denoise
        # ddim step, and decode it so the node has something visual to show.
        try:
            if (model is not None) and (positive is not None) and (negative is not None) and (vae is not None):
lat = samples.get("samples", None)
if lat is not None and isinstance(lat, torch.Tensor) and lat.ndim == 4:
z = lat
B, C, H, W = z.shape
target = 32
z_ds = z if (H == target and W == target) else F.interpolate(z, size=(target, target), mode='bilinear', align_corners=False)
lat_img = _sample.fix_empty_latent_channels(model, z_ds) if hasattr(_sample, 'fix_empty_latent_channels') else z_ds
batch_inds = samples.get("batch_index", None)
noise = _sample.prepare_noise(lat_img, int(0), batch_inds)
steps = 1
out = _sample.sample(
model, noise, int(steps), float(1.0), "ddim", "normal",
positive, negative, lat_img,
denoise=float(0.10), disable_noise=False, start_step=None, last_step=None,
force_full_denoise=False, noise_mask=None, callback=None,
disable_pbar=not _utils.PROGRESS_BAR_ENABLED, seed=int(0)
)
try:
img_prev = vae.decode(out)
if len(img_prev.shape) == 5:
img_prev = img_prev.reshape(-1, img_prev.shape[-3], img_prev.shape[-2], img_prev.shape[-1])
except Exception:
img_prev = None
except Exception:
img_prev = None
        try:
            do_cleanup = True
            try:
                # gate on latent size: only clean when the longest side crosses the threshold
                if int(hires_only_threshold) > 0:
z = samples.get("samples", None)
if z is not None and hasattr(z, "shape") and len(z.shape) >= 4:
_, _, H, W = z.shape
if max(int(H), int(W)) < int(hires_only_threshold):
do_cleanup = False
except Exception:
pass
if do_cleanup:
print("=== CleanUP RAM and GPU ===")
stats = cleanup_memory(sync_cuda=bool(sync_cuda), hard_trim=bool(hard_trim))
try:
print(f"RAM freed: {stats['ram_freed_mb']:.1f} MB (before {stats['ram_before_mb']:.1f} -> after {stats['ram_after_mb']:.1f})")
except Exception:
pass
try:
for g in stats.get("gpu", []):
print(
f"GPU{g['device']}: reserved freed {g['reserved_freed_mb']:.1f} MB ( {g['reserved_before_mb']:.1f} -> {g['reserved_after_mb']:.1f} ), "
f"allocated freed {g['allocated_freed_mb']:.1f} MB ( {g['allocated_before_mb']:.1f} -> {g['allocated_after_mb']:.1f} )"
)
except Exception:
pass
# Second pass after short delay to catch late releasers
try:
time.sleep(0.150)
stats2 = cleanup_memory(sync_cuda=False, hard_trim=bool(hard_trim))
if stats2 and float(stats2.get('ram_freed_mb', 0.0)) > 0.0:
print(f"2nd pass: RAM freed +{stats2['ram_freed_mb']:.1f} MB")
try:
for g in stats2.get('gpu', []):
if float(g.get('reserved_freed_mb', 0.0)) > 0.0 or float(g.get('allocated_freed_mb', 0.0)) > 0.0:
print(f"2nd pass GPU{g['device']}: reserved +{g['reserved_freed_mb']:.1f} MB, allocated +{g['allocated_freed_mb']:.1f} MB")
except Exception:
pass
except Exception:
pass
print("done.")
except Exception:
pass
        if img_prev is None:
            # fallback: emit a tiny black image so the IMAGE output is always valid
            try:
                device = model_management.intermediate_device() if hasattr(model_management, 'intermediate_device') else 'cpu'
img_prev = torch.zeros((1, 32, 32, 3), dtype=torch.float32, device=device)
except Exception:
img_prev = torch.zeros((1, 32, 32, 3))
return (samples, img_prev)
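

# Registration sketch (assumption: the surrounding package exports these from
# its __init__.py; ComfyUI itself discovers nodes via NODE_CLASS_MAPPINGS):
#   NODE_CLASS_MAPPINGS = {"MG_CleanUp": MG_CleanUp}
#   NODE_DISPLAY_NAME_MAPPINGS = {"MG_CleanUp": "MG CleanUp"}

# Minimal smoke test, a sketch that assumes a ComfyUI environment where the
# comfy.* imports above resolve; it only exercises cleanup_memory(), not the node.
if __name__ == "__main__":
    ballast = torch.zeros((64, 1024, 1024))  # ~256 MB so gc has something visible to free
    del ballast
    s = cleanup_memory(sync_cuda=torch.cuda.is_available(), hard_trim=False)
    print(f"RAM freed: {s['ram_freed_mb']:.1f} MB; CUDA devices reported: {len(s['gpu'])}")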