Coverage for integrations/service_tools/vram_manager.py: 76.2%
265 statements
coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2VRAM Manager — GPU memory tracking, allocation, and offload strategy.
4Tracks which tools have reserved GPU memory and decides whether new
5tools can fit. Provides offload mode suggestions (gpu / cpu_offload / cpu_only).
7Pattern from: integrations/vision/minicpm_installer.py (detect_gpu)
8 ltx2_server.py (VRAM stats, cpu_offload, tiling)
9"""
11import logging
12import os
13import sys
14from typing import Any, Dict, Optional, Tuple
16logger = logging.getLogger(__name__)
# VRAM budget table: tool_name -> (min_vram_gb, model_size_gb)
VRAM_BUDGETS: Dict[str, Tuple[float, float]] = {
    "acestep": (6.0, 4.0),
    "diffrhythm": (6.0, 4.0),  # singing voice synthesis
    "wan2gp": (8.0, 8.0),
    "ltx2": (6.0, 4.0),
    "minicpm": (6.0, 4.0),
    # STT engines
    "whisper": (2.0, 1.5),
    "whisper_base": (0.5, 0.2),  # faster-whisper base (CPU-friendly)
    "whisper_medium": (2.0, 1.5),  # faster-whisper medium
    "whisper_large": (4.0, 3.0),  # faster-whisper large-v3-turbo
    # TTS engines
    "tts_chatterbox_turbo": (5.6, 3.8),  # English, [laugh]/[chuckle] tags
    "tts_f5": (2.5, 1.3),  # English+Chinese, voice cloning
    "tts_indic_parler": (2.0, 1.8),  # 21 Indic languages + English
    "tts_cosyvoice3": (4.0, 3.5),  # zh/ja/ko/de/es/fr/it/ru, zero-shot
    "tts_chatterbox_ml": (14.0, 12.0),  # 23 languages, needs 16GB+
    "tts_kokoro": (0.5, 0.2),  # 82M neural English, CPU or GPU
    "tts_neutts": (0.7, 0.4),  # NeuTTS Air 748M, Q4 GGUF ~600MB —
    #   CPU-friendly (RTF<0.5 on i5), GPU optional. Budget covers
    #   GGUF-on-CPU + neucodec onnx workspace; auto-tightens via
    #   record_actual_usage on first successful load.
    "tts_omnivoice": (3.5, 3.0),  # 646 langs, Qwen3-0.6B+diffusion —
    #   stub budget, auto-tightens via record_actual_usage on first
    #   successful load.
    # Mid-VRAM coverage tier (1–3 GB) — bridges the gap between F5/Indic
    # Parler/Kokoro (≤2.5 GB) and the heavy clone engines so EVERY
    # SUPPORTED_LANG_DICT code has at least one engine with vram_gb ≤ 3.
    "tts_melotts": (1.5, 1.0),  # en/es/fr/zh/ja/ko, neural CPU/GPU
    "tts_xtts_v2": (2.5, 1.8),  # 17 langs, voice cloning (Coqui)
    "tts_mms_tts": (1.0, 0.7),  # ~50+ langs (per-lang VITS, Meta)
}
class VRAMManager:
    """GPU memory tracking and allocation decisions."""

    def __init__(self):
        self._allocations: Dict[str, float] = {}  # tool → GB reserved
        self._gpu_info: Optional[Dict] = None
        self._gpu_info_ts: float = 0.0  # timestamp of last nvidia-smi call
        # Bundled mode: GPU state is stable (one model loaded at startup).
        # Poll every 120s instead of 30s to reduce subprocess overhead.
        _bundled = os.environ.get('NUNBA_BUNDLED') == '1'
        self._refresh_ttl: float = 120.0 if _bundled else 30.0
        # Serializes allocate() + can_fit() so two concurrent model loads
        # can't both pass can_fit() and overcommit the GPU. Previously this
        # was a non-atomic read-modify-write across _allocations on the hot
        # path (TOCTOU: read free → read budget → mutate dict). During a
        # cold startup with parallel LLM + TTS + VLM spawns, both loads
        # could see 5 GB free, both decide 4 GB fits, and both allocate →
        # 8 GB claimed on a 5 GB device → CUDA OOM.
        import threading as _threading  # noqa: E402 (runtime deferred)
        self._alloc_lock = _threading.RLock()

        # Measured VRAM usage telemetry: tool → actual model_size_gb seen
        # after a successful load. Populated via record_actual_usage() —
        # worker subprocesses self-report post-load GPU usage, the parent
        # stores the value and uses it in preference to the VRAM_BUDGETS
        # estimate the next time the tool is considered. Enables
        # conservative stub budgets (e.g. new OmniVoice at 3.0 GB) to
        # auto-tighten after the first real load without a code change.
        self._measured: Dict[str, float] = {}
        self._measured_path = self._resolve_measured_path()
        self._load_measured()
    # ── Measured-usage telemetry ─────────────────────────────────

    @staticmethod
    def _resolve_measured_path():
        from pathlib import Path
        # Prefer the project agent_data dir, fall back to ~/.hevolve
        cwd_path = Path.cwd() / 'agent_data' / 'vram_measured.json'
        try:
            cwd_path.parent.mkdir(parents=True, exist_ok=True)
            return cwd_path
        except Exception:
            fallback = Path.home() / '.hevolve' / 'vram_measured.json'
            fallback.parent.mkdir(parents=True, exist_ok=True)
            return fallback
    def _load_measured(self) -> None:
        import json
        try:
            if self._measured_path.exists():
                data = json.loads(
                    self._measured_path.read_text(encoding='utf-8')
                )
                self._measured = {
                    str(k): float(v)
                    for k, v in data.items()
                    if isinstance(v, (int, float)) and v > 0
                }
        except Exception as e:
            logger.debug(f"VRAM measured load failed (ignoring): {e}")
            self._measured = {}
    def _persist_measured(self) -> None:
        """Atomic JSON write — tmp-then-rename so we can't half-write."""
        import json
        try:
            tmp = self._measured_path.with_suffix('.json.tmp')
            tmp.write_text(
                json.dumps(self._measured, indent=2),
                encoding='utf-8',
            )
            tmp.replace(self._measured_path)
        except Exception as e:
            logger.debug(f"VRAM measured persist failed: {e}")
    def record_actual_usage(self, tool_name: str, measured_gb: float) -> None:
        """Worker-reported post-load GPU usage.

        Called from ToolWorker._wait_ready after parsing the worker's
        '__WORKER_VRAM_GB__ <n>' marker. Values are persisted so the
        measurement survives restarts and tightens the budget used by
        can_fit() / allocate() on subsequent loads.

        Safety rails:
        - Ignore non-positive values (worker emits 0.0 when it can't
          measure — e.g. CPU-only, Metal, broken nvidia-smi).
        - Reject values above 64 GB — protects against obviously bad
          telemetry (negative deltas from concurrent workers, runaway
          leaks).
        - Compare vs VRAM_BUDGETS declared size — log a prominent
          warning if measured > declared * 1.5 (the declared budget
          is wrong and won't fit on the target GPU class).
        """
        with self._alloc_lock:
            if not tool_name or measured_gb is None:
                return
            try:
                gb = float(measured_gb)
            except (TypeError, ValueError):
                return
            if gb <= 0 or gb > 64.0:
                logger.debug(
                    f"VRAM measurement for {tool_name} out of range ({gb}) — ignored"
                )
                return
            prev = self._measured.get(tool_name)
            self._measured[tool_name] = round(gb, 2)
            self._persist_measured()

            declared = VRAM_BUDGETS.get(tool_name)
            if declared and gb > declared[1] * 1.5:
                logger.warning(
                    f"{tool_name} measured {gb:.1f} GB — 50%+ over declared "
                    f"{declared[1]:.1f} GB. Consider raising VRAM_BUDGETS. "
                    f"can_fit() will use the measurement from now on."
                )
            elif prev is None:
                logger.info(
                    f"{tool_name}: first measured VRAM = {gb:.2f} GB "
                    f"(budget was {declared[1] if declared else '—'} GB)"
                )
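    # Illustrative worker-side sketch (assumed; the real emitter lives in the
    # tool worker process, not in this module). A CUDA-backed worker might
    # report its post-load usage roughly like this:
    #
    #     import torch
    #     used_gb = torch.cuda.memory_allocated(0) / (1024 ** 3)
    #     print(f"__WORKER_VRAM_GB__ {used_gb:.2f}", flush=True)
    #
    # ToolWorker._wait_ready parses that marker and forwards the value to
    # record_actual_usage(), which persists it for later can_fit() calls.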
    def get_effective_budget(
        self,
        tool_name: str,
    ) -> Optional[Tuple[float, float]]:
        """Return (min_vram_gb, model_size_gb) using measured value if any.

        Measurement is tighter than the declared budget in the common case
        (stub budget is conservative), so we swap in the measured
        model_size. When the measurement exceeds the declared model_size
        we honor the measurement — the tool really does need that much.

        min_vram (headroom) is never lowered below the declared minimum,
        because overhead like activation buffers is not captured in the
        static post-load measurement.
        """
        declared = VRAM_BUDGETS.get(tool_name)
        if not declared:
            return None
        measured_size = self._measured.get(tool_name)
        if measured_size is None:
            return declared
        min_vram, _declared_size = declared
        # Require at least measured_size + 0.3 GB overhead headroom
        effective_min = max(min_vram, measured_size + 0.3)
        return (effective_min, measured_size)
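    # Worked example (illustrative): "tts_omnivoice" is declared (3.5, 3.0).
    # After a first load that measures 1.9 GB, get_effective_budget() returns
    # (max(3.5, 1.9 + 0.3), 1.9) == (3.5, 1.9): allocate() now books 1.9 GB
    # instead of 3.0, while the 3.5 GB free-VRAM gate in can_fit() is kept.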
    def get_measured_usage(self) -> Dict[str, float]:
        """Return a copy of current measured-usage telemetry (tool → GB)."""
        return dict(self._measured)
    # ── GPU Detection ────────────────────────────────────────────

    def detect_gpu(self) -> Dict:
        """Detect GPU and return info dict.

        Priority: nvidia-smi (no deps) → rocm-smi (AMD) → PyTorch (if already
        loaded) → macOS Metal.
        Returns: {name, total_gb, free_gb, cuda_available}
        """
        if self._gpu_info is not None:
            return self._gpu_info

        info = {
            "name": None,
            "total_gb": 0.0,
            "free_gb": 0.0,
            "cuda_available": False,
        }

        # run_bounded wraps Popen + explicit pipe close on timeout so the
        # child's _readerthread can't orphan — see core/subprocess_safe.py
        # for the failure mode (2026-04-15 wmic 27-min hang, same class).
        from core.subprocess_safe import run_bounded

        # nvidia-smi can be slow when the GPU is under heavy compute load
        # (driver call queues serialize behind kernel launches, NVML init
        # contends with active CUDA contexts). 5s was too tight on 8 GB
        # systems running concurrent VLM benchmarks: it hit the
        # subprocess_safe kill-pipes path every cycle and flooded the log.
        # 15s gives slow systems breathing room without leaving zombie
        # nvidia-smi processes around. Override via env for truly degraded
        # systems.
        _nvsmi_timeout = float(os.environ.get(
            'HEVOLVE_NVIDIA_SMI_TIMEOUT', '15'))
        # 1) nvidia-smi — zero-dependency, works on any NVIDIA GPU system
        try:
            result = run_bounded(
                ["nvidia-smi", "--query-gpu=name,memory.total,memory.free",
                 "--format=csv,noheader,nounits"],
                timeout=_nvsmi_timeout,
            )
            if result.returncode == 0 and result.stdout.strip():
                line = result.stdout.strip().split("\n")[0]
                parts = [p.strip() for p in line.split(",")]
                if len(parts) >= 3:
                    total_mb = float(parts[1])
                    free_mb = float(parts[2])
                    info.update({
                        "name": parts[0],
                        "total_gb": round(total_mb / 1024, 2),
                        "free_gb": round(free_mb / 1024, 2),
                        "cuda_available": True,
                    })
                    logger.info(
                        f"GPU (nvidia-smi): {info['name']} — "
                        f"{info['total_gb']} GB total, {info['free_gb']} GB free"
                    )
                    self._gpu_info = info
                    return info
        except FileNotFoundError:
            pass  # nvidia-smi not on PATH — no NVIDIA GPU or drivers
        except Exception as e:
            logger.debug(f"nvidia-smi failed: {e}")
        # 1b) rocm-smi — AMD GPUs via ROCm. Same loaded-GPU rationale
        #     as nvidia-smi above; honour the same env override.
        try:
            result = run_bounded(
                ["rocm-smi", "--showmeminfo", "vram", "--csv"],
                timeout=_nvsmi_timeout,
            )
            if result.returncode == 0 and result.stdout.strip():
                # Parse CSV output: header line then data lines
                lines = [ln.strip() for ln in result.stdout.strip().split("\n") if ln.strip()]
                for line in lines[1:]:  # skip header
                    parts = [p.strip() for p in line.split(",")]
                    if len(parts) >= 3:
                        try:
                            total_bytes = float(parts[1])
                            used_bytes = float(parts[2])
                            total_gb = round(total_bytes / (1024 ** 3), 2)
                            free_gb = round((total_bytes - used_bytes) / (1024 ** 3), 2)
                            info.update({
                                "name": "AMD GPU (ROCm)",
                                "total_gb": total_gb,
                                "free_gb": free_gb,
                                "cuda_available": False,
                                "rocm_available": True,
                            })
                            logger.info(
                                f"GPU (rocm-smi): {info['name']} — "
                                f"{info['total_gb']} GB total, {info['free_gb']} GB free"
                            )
                            self._gpu_info = info
                            return info
                        except (ValueError, IndexError):
                            continue
        except FileNotFoundError:
            pass  # rocm-smi not on PATH — no AMD GPU or ROCm drivers
        except Exception as e:
            logger.debug(f"rocm-smi failed: {e}")
        # 2) PyTorch — only if already imported (don't trigger a 2GB import)
        if "torch" in sys.modules:
            try:
                import torch
                # Detect the frozen-build torch stub (version 0.0.0, _is_stub=True).
                # Replace with real torch so CUDA detection works across all
                # deployments (Nunba frozen, HART OS standalone, cloud).
                if getattr(torch, '_is_stub', False):
                    import importlib
                    _stale = [k for k in sys.modules if k == 'torch' or k.startswith('torch.')]
                    for _k in _stale:
                        del sys.modules[_k]
                    torch = importlib.import_module('torch')
                    logger.info(f"Replaced torch stub with real torch {torch.__version__}")
                if torch.cuda.is_available():
                    props = torch.cuda.get_device_properties(0)
                    total = props.total_memory / (1024 ** 3)
                    allocated = torch.cuda.memory_allocated(0) / (1024 ** 3)
                    info.update({
                        "name": torch.cuda.get_device_name(0),
                        "total_gb": round(total, 2),
                        "free_gb": round(total - allocated, 2),
                        "cuda_available": True,
                    })
                    logger.info(
                        f"GPU (PyTorch): {info['name']} — "
                        f"{info['total_gb']} GB total, {info['free_gb']} GB free"
                    )
                    self._gpu_info = info
                    return info
            except Exception as e:
                logger.debug(f"PyTorch GPU detection failed: {e}")
        # 3) macOS Metal
        if sys.platform == "darwin":
            try:
                import platform
                info.update({
                    "name": f"Apple Metal ({'Apple Silicon' if platform.machine() == 'arm64' else 'Intel'})",
                    "total_gb": 0.0,  # shared memory — hard to measure
                    "free_gb": 0.0,
                    "cuda_available": False,
                    "metal_available": True,
                })
            except Exception:
                pass

        if not info["cuda_available"]:
            logger.info("No NVIDIA GPU detected (nvidia-smi not found or no CUDA device)")

        self._gpu_info = info
        return info
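    # Typical return value on an NVIDIA system (illustrative numbers):
    #     {"name": "NVIDIA GeForce RTX 3060", "total_gb": 12.0,
    #      "free_gb": 10.4, "cuda_available": True}
    # The ROCm path adds "rocm_available": True with cuda_available False;
    # the Metal path adds "metal_available": True and leaves total_gb /
    # free_gb at 0.0 because unified memory is not measured here.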
    def refresh_gpu_info(self) -> Dict:
        """Re-detect GPU with TTL cache (avoids nvidia-smi spam from multiple threads)."""
        import time as _t
        now = _t.monotonic()
        if self._gpu_info is not None and (now - self._gpu_info_ts) < self._refresh_ttl:
            return self._gpu_info  # recent enough — skip subprocess
        self._gpu_info = None
        result = self.detect_gpu()
        self._gpu_info_ts = _t.monotonic()
        return result
    # ── VRAM queries ─────────────────────────────────────────────

    def get_free_vram(self) -> float:
        """Return free VRAM in GB — actual free from nvidia-smi.

        nvidia-smi already reports real free VRAM (total - all processes).
        Do NOT subtract our allocations — that double-counts and reports
        0 GB when there is actually VRAM free, causing false OOM decisions.
        """
        info = self.detect_gpu()
        if not info["cuda_available"]:
            return 0.0
        return info["free_gb"]

    def get_total_vram(self) -> float:
        """Return total VRAM in GB."""
        return self.detect_gpu().get("total_gb", 0.0)
    # ── Allocation ───────────────────────────────────────────────

    def can_fit(self, tool_name: str) -> bool:
        """Check if a tool can fit in remaining VRAM.

        Uses the measured budget (after the first successful load) if present,
        otherwise falls back to the VRAM_BUDGETS declared value.
        """
        if tool_name in self._allocations:
            return True  # already allocated
        effective = self.get_effective_budget(tool_name)
        if not effective:
            return True  # unknown tool — assume it fits
        min_vram, _model_size = effective
        gpu = self.detect_gpu()
        if not gpu["cuda_available"]:
            return False  # no GPU at all
        return self.get_free_vram() >= min_vram

    def allocate(self, tool_name: str) -> bool:
        """Reserve VRAM for a tool. Returns False if it won't fit.

        Lock-serialized: check-then-mutate must be atomic so two
        parallel allocations can't both win can_fit(). can_fit() is
        called under the same RLock, so the check and the reservation
        cannot interleave with another thread's allocate().
        """
        with self._alloc_lock:
            if tool_name in self._allocations:
                return True
            if not self.can_fit(tool_name):
                logger.warning(f"VRAM rejected: {tool_name} won't fit "
                               f"(free={self.get_free_vram():.1f}GB)")
                return False
            effective = self.get_effective_budget(tool_name)
            model_gb = effective[1] if effective else 0.0
            self._allocations[tool_name] = model_gb
            logger.info(f"Allocated {model_gb} GB VRAM for {tool_name}")
            return True

    def release(self, tool_name: str) -> None:
        """Release VRAM reservation for a tool."""
        with self._alloc_lock:
            freed = self._allocations.pop(tool_name, 0.0)
            if freed:
                logger.info(f"Released {freed} GB VRAM from {tool_name}")
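    # Intended call pattern (illustrative sketch; load_model is a hypothetical
    # caller-side loader, not part of this module):
    #
    #     if vram_manager.allocate("ltx2"):        # reserve before loading
    #         try:
    #             load_model()
    #         except Exception:
    #             vram_manager.release("ltx2")      # roll back the reservation
    #             raise
    #     else:
    #         mode = vram_manager.suggest_offload_mode("ltx2")  # degrade instead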
    def get_allocations(self) -> Dict[str, float]:
        """Return current VRAM allocations {tool → GB}."""
        return dict(self._allocations)
    def get_allocations_display(self) -> Dict[str, Any]:
        """Return VRAM allocations with rich model details for UI display.

        Each value is either a float (GB) for unknown models, or a dict with
        name, gb, device, and extra details (quant, context, mmproj) for known models.
        The frontend VRAMBar can render either format.
        """
        import re as _re
        raw = dict(self._allocations)
        try:
            from integrations.service_tools.model_catalog import get_catalog
            catalog = get_catalog()
            enriched = {}
            for key, gb in raw.items():
                display_key = key
                detail = {'gb': gb}
                for mid, entry in catalog._models.items():
                    if entry.loaded and (
                        mid == key or
                        entry.model_type == key or
                        mid.startswith(f'{key}-')
                    ):
                        display_key = entry.name
                        detail = {
                            'gb': gb,
                            'device': entry.device or 'gpu',
                            'backend': entry.backend,
                            'model_id': mid,
                        }
                        # Extract quant from filename (e.g. Q4_K_XL from Qwen3.5-4B-UD-Q4_K_XL.gguf)
                        fname = entry.files.get('model', '') or entry.files.get('file_name', '')
                        if not fname and entry.repo_id:
                            fname = entry.repo_id.split('/')[-1] if '/' in entry.repo_id else ''
                        quant_match = _re.search(r'(Q\d+_K(?:_[A-Z]+)?|F16|F32|INT[48]|GPTQ|AWQ)', fname, _re.I)
                        if quant_match:
                            detail['quant'] = quant_match.group(1)
                        # Context length from capabilities or tags
                        ctx = entry.capabilities.get('context_length') or entry.capabilities.get('n_ctx')
                        if ctx:
                            detail['context'] = ctx
                        # mmproj for vision models
                        if entry.capabilities.get('vision') or 'vision' in (entry.tags if hasattr(entry, 'tags') else []):
                            detail['vision'] = True
                        break
                enriched[display_key] = detail
            # NOTE: LLM quant/context/mmproj enrichment is handled by
            # Nunba's orchestrator shim (models/orchestrator.py), not here.
            # HARTOS must not import from Nunba (upward dependency).
            return enriched
        except Exception:
            return raw
    # ── Offload strategy ─────────────────────────────────────────

    def suggest_offload_mode(self, tool_name: str) -> str:
        """Suggest the best offload mode for a tool.

        Returns: 'gpu' | 'cpu_offload' | 'cpu_only'
        """
        gpu = self.detect_gpu()
        if not gpu["cuda_available"]:
            return "cpu_only"

        budget = VRAM_BUDGETS.get(tool_name)
        if not budget:
            return "gpu"  # unknown tool, try GPU

        min_vram, model_size = budget
        free = self.get_free_vram()

        if free >= model_size:
            return "gpu"
        elif free >= model_size * 0.5:
            return "cpu_offload"
        else:
            return "cpu_only"
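    # Threshold example (illustrative): "wan2gp" declares model_size 8.0 GB.
    # With 9 GB free -> "gpu"; with 5 GB free (>= 4.0, half the model) ->
    # "cpu_offload"; with 3 GB free -> "cpu_only".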
    # ── Pressure detection ────────────────────────────────────────

    def get_actual_free_vram(self) -> float:
        """Return ACTUAL free VRAM by refreshing nvidia-smi (not cached advisory).

        Unlike get_free_vram(), this goes through refresh_gpu_info(), which
        re-reads hardware state whenever the TTL cache has expired.
        Used by ModelLifecycleManager for real-time pressure detection.
        """
        self.refresh_gpu_info()
        info = self._gpu_info or {}
        return info.get('free_gb', 0.0)
    def get_vram_usage_pct(self) -> float:
        """Return current VRAM usage as percentage (0-100).

        Refreshes GPU info first for accuracy.
        """
        self.refresh_gpu_info()
        info = self._gpu_info or {}
        total = info.get('total_gb', 0)
        free = info.get('free_gb', 0)
        if total <= 0:
            return 0.0
        return ((total - free) / total) * 100
    # ── CUDA Cache Clearing ─────────────────────────────────────

    @staticmethod
    def clear_cuda_cache() -> bool:
        """Clear GPU cache (CUDA or MPS) if torch is loaded. Returns True if cleared."""
        if 'torch' in sys.modules:
            try:
                import torch
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    return True
                elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
                    torch.mps.empty_cache()
                    return True
            except Exception:
                pass
        return False
    # ── Allocation drift detection ───────────────────────────────

    def detect_allocation_drift(self) -> Dict:
        """Compare advisory allocations vs actual VRAM usage.

        Returns drift info — positive drift means something is using
        more VRAM than we budgeted (possible leak or untracked process).
        """
        self.refresh_gpu_info()
        info = self._gpu_info or {}
        total = info.get('total_gb', 0)
        actual_free = info.get('free_gb', 0)
        actual_used = total - actual_free if total > 0 else 0

        advisory_used = sum(self._allocations.values())
        # Some baseline VRAM is always used by OS/drivers (~0.5-1.5GB typically)
        os_baseline = min(1.5, total * 0.1) if total > 0 else 0

        drift_gb = actual_used - advisory_used - os_baseline

        return {
            'actual_used_gb': round(actual_used, 2),
            'advisory_used_gb': round(advisory_used, 2),
            'os_baseline_gb': round(os_baseline, 2),
            'drift_gb': round(drift_gb, 2),
            'drift_pct': round((drift_gb / total * 100) if total > 0 else 0, 1),
            'untracked_process': drift_gb > 1.0,  # >1GB unaccounted = suspicious
        }
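    # Worked example (illustrative): a 12 GB card with 7.2 GB actually in use
    # and 4.0 GB of advisory allocations gives os_baseline = min(1.5, 1.2) =
    # 1.2, drift_gb = 7.2 - 4.0 - 1.2 = 2.0, drift_pct = 16.7, and
    # untracked_process = True (more than 1 GB unaccounted for).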
    # ── Dashboard ────────────────────────────────────────────────

    def get_status(self) -> Dict:
        """Full VRAM status for dashboard."""
        gpu = self.detect_gpu()
        drift = self.detect_allocation_drift()
        return {
            "gpu": gpu,
            "allocations": self.get_allocations_display(),
            "total_allocated_gb": round(sum(self._allocations.values()), 2),
            "effective_free_gb": round(self.get_free_vram(), 2),
            "drift": drift,
        }
# Global singleton
vram_manager = VRAMManager()


# ── Module-level convenience functions ──────────────────────────
# Allow: from integrations.service_tools.vram_manager import detect_gpu, clear_cuda_cache

def detect_gpu() -> Dict:
    """Detect GPU via the singleton VRAMManager. See VRAMManager.detect_gpu."""
    return vram_manager.detect_gpu()


def clear_cuda_cache() -> bool:
    """Clear GPU cache via the singleton VRAMManager. See VRAMManager.clear_cuda_cache."""
    return VRAMManager.clear_cuda_cache()


def get_vram_manager() -> VRAMManager:
    """Return the global VRAMManager singleton."""
    return vram_manager
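# Illustrative usage from a caller module (call sites assumed, not defined here):
#
#     from integrations.service_tools.vram_manager import get_vram_manager
#
#     vm = get_vram_manager()
#     if vm.can_fit("whisper_large") and vm.allocate("whisper_large"):
#         ...  # load the model, then vm.release("whisper_large") on unload
#     print(vm.get_status()["effective_free_gb"])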