Coverage for integrations / service_tools / vram_manager.py: 76.2%

265 statements  

coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2VRAM Manager — GPU memory tracking, allocation, and offload strategy. 

3 

4Tracks which tools have reserved GPU memory and decides whether new 

5tools can fit. Provides offload mode suggestions (gpu / cpu_offload / cpu_only). 

6 

7Pattern from: integrations/vision/minicpm_installer.py (detect_gpu) 

8 ltx2_server.py (VRAM stats, cpu_offload, tiling) 

9""" 

10 

11import logging 

12import os 

13import sys 

14from typing import Any, Dict, Optional, Tuple 

15 

16logger = logging.getLogger(__name__) 

17 

18# VRAM budget table: tool_name -> (min_vram_gb, model_size_gb) 

19VRAM_BUDGETS: Dict[str, Tuple[float, float]] = { 

20 "acestep": (6.0, 4.0), 

21 "diffrhythm": (6.0, 4.0), # singing voice synthesis 

22 "wan2gp": (8.0, 8.0), 

23 "ltx2": (6.0, 4.0), 

24 "minicpm": (6.0, 4.0), 

25 # STT engines 

26 "whisper": (2.0, 1.5), 

27 "whisper_base": (0.5, 0.2), # faster-whisper base (CPU-friendly) 

28 "whisper_medium": (2.0, 1.5), # faster-whisper medium 

29 "whisper_large": (4.0, 3.0), # faster-whisper large-v3-turbo 

30 # TTS engines 

31 "tts_chatterbox_turbo": (5.6, 3.8), # English, [laugh]/[chuckle] tags 

32 "tts_f5": (2.5, 1.3), # English+Chinese, voice cloning 

33 "tts_indic_parler": (2.0, 1.8), # 21 Indic languages + English 

34 "tts_cosyvoice3": (4.0, 3.5), # zh/ja/ko/de/es/fr/it/ru, zero-shot 

35 "tts_chatterbox_ml": (14.0, 12.0), # 23 languages, needs 16GB+ 

36 "tts_kokoro": (0.5, 0.2), # 82M neural English, CPU or GPU 

37 "tts_neutts": (0.7, 0.4), # NeuTTS Air 748M, Q4 GGUF ~600MB 

38 # — CPU-friendly (RTF<0.5 on i5), 

39 # GPU optional. Budget covers 

40 # GGUF-on-CPU + neucodec onnx 

41 # workspace; auto-tightens via 

42 # record_actual_usage on first 

43 # successful load. 

44 "tts_omnivoice": (3.5, 3.0), # 646 langs, Qwen3-0.6B+diffusion 

45 # — stub budget, auto-tightens 

46 # via record_actual_usage on 

47 # first successful load. 

48 # Mid-VRAM coverage tier (1–3 GB) — bridges the gap between F5/Indic 

49 # Parler/Kokoro (≤2.5 GB) and the heavy clone engines so EVERY 

50 # SUPPORTED_LANG_DICT code has at least one engine with vram_gb≤3. 

51 "tts_melotts": (1.5, 1.0), # en/es/fr/zh/ja/ko, neural CPU/GPU 

52 "tts_xtts_v2": (2.5, 1.8), # 17 langs, voice cloning (Coqui) 

53 "tts_mms_tts": (1.0, 0.7), # ~50+ langs (per-lang VITS, Meta) 

54} 
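# How to read an entry: min_vram_gb is the free-VRAM gate checked before a
# load, model_size_gb is the footprint reserved once loaded. Illustrative
# lookup using a key from the table above:
#
#     min_vram_gb, model_size_gb = VRAM_BUDGETS["whisper_large"]
#     # (4.0, 3.0): needs at least 4 GB free; ~3 GB stays resident.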


class VRAMManager:
    """GPU memory tracking and allocation decisions."""

    def __init__(self):
        self._allocations: Dict[str, float] = {}  # tool → GB reserved
        self._gpu_info: Optional[Dict] = None
        self._gpu_info_ts: float = 0.0  # timestamp of last nvidia-smi call
        # Bundled mode: GPU state is stable (one model loaded at startup).
        # Poll every 120s instead of 30s to reduce subprocess overhead.
        _bundled = os.environ.get('NUNBA_BUNDLED') == '1'
        self._refresh_ttl: float = 120.0 if _bundled else 30.0
        # Serializes allocate() + can_fit() so two concurrent model loads
        # can't both pass can_fit() and overcommit the GPU. Previously this
        # was a non-atomic read-modify-write across _allocations on the hot
        # path (TOCTOU: read free → read budget → mutate dict). During a
        # cold startup with parallel LLM + TTS + VLM spawns, both loaders
        # could see 5GB free, both decide 4GB fits, both allocate → 8GB
        # claimed on a 5GB device → CUDA OOM.
        import threading as _threading  # deferred import
        self._alloc_lock = _threading.RLock()

        # Measured VRAM usage telemetry: tool → actual model_size_gb seen
        # after a successful load. Populated via record_actual_usage() —
        # worker subprocesses self-report post-load GPU usage, the parent
        # stores the value and uses it in preference to the VRAM_BUDGETS
        # estimate the next time the tool is considered. This lets
        # conservative stub budgets (e.g. new OmniVoice at 3.0 GB)
        # auto-tighten after the first real load without a code change.
        self._measured: Dict[str, float] = {}
        self._measured_path = self._resolve_measured_path()
        self._load_measured()

    # ── Measured-usage telemetry ─────────────────────────────────

    @staticmethod
    def _resolve_measured_path():
        from pathlib import Path
        # Prefer the project agent_data dir, fall back to ~/.hevolve
        cwd_path = Path.cwd() / 'agent_data' / 'vram_measured.json'
        try:
            cwd_path.parent.mkdir(parents=True, exist_ok=True)
            return cwd_path
        except Exception:
            fallback = Path.home() / '.hevolve' / 'vram_measured.json'
            fallback.parent.mkdir(parents=True, exist_ok=True)
            return fallback

    def _load_measured(self) -> None:
        import json
        try:
            if self._measured_path.exists():
                data = json.loads(
                    self._measured_path.read_text(encoding='utf-8')
                )
                self._measured = {
                    str(k): float(v)
                    for k, v in data.items()
                    if isinstance(v, (int, float)) and v > 0
                }
        except Exception as e:
            logger.debug(f"VRAM measured load failed (ignoring): {e}")
            self._measured = {}

    def _persist_measured(self) -> None:
        """Atomic JSON write — tmp-then-rename so we can't half-write."""
        import json
        try:
            tmp = self._measured_path.with_suffix('.json.tmp')
            tmp.write_text(
                json.dumps(self._measured, indent=2),
                encoding='utf-8',
            )
            tmp.replace(self._measured_path)
        except Exception as e:
            logger.debug(f"VRAM measured persist failed: {e}")

    def record_actual_usage(self, tool_name: str, measured_gb: float) -> None:
        """Worker-reported post-load GPU usage.

        Called from ToolWorker._wait_ready after parsing the worker's
        '__WORKER_VRAM_GB__ <n>' marker. Values are persisted so the
        measurement survives restarts and tightens the budget used by
        can_fit() / allocate() on subsequent loads.

        Safety rails:
        - Ignore non-positive values (the worker emits 0.0 when it can't
          measure — e.g. CPU-only, Metal, broken nvidia-smi).
        - Reject values outside (0, 64] GB — protects against obviously
          bad telemetry (negative deltas from concurrent workers,
          runaway leaks).
        - Compare vs the VRAM_BUDGETS declared size — log a prominent
          warning if measured > declared * 1.5 (the declared budget
          is wrong and won't fit on the target GPU class).
        """
        with self._alloc_lock:
            if not tool_name or measured_gb is None:
                return
            try:
                gb = float(measured_gb)
            except (TypeError, ValueError):
                return
            if gb <= 0 or gb > 64.0:
                logger.debug(
                    f"VRAM measurement for {tool_name} out of range ({gb}) — ignored"
                )
                return
            prev = self._measured.get(tool_name)
            self._measured[tool_name] = round(gb, 2)
            self._persist_measured()

            declared = VRAM_BUDGETS.get(tool_name)
            if declared and gb > declared[1] * 1.5:
                logger.warning(
                    f"{tool_name} measured {gb:.1f} GB — 50%+ over declared "
                    f"{declared[1]:.1f} GB. Consider raising VRAM_BUDGETS. "
                    f"can_fit() will use the measurement from now on."
                )
            elif prev is None:
                logger.info(
                    f"{tool_name}: first measured VRAM = {gb:.2f} GB "
                    f"(budget was {declared[1] if declared else '—'} GB)"
                )
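
    # Sketch of the reporting protocol (the marker string is real, from the
    # docstring above; the emit/parse lines are illustrative, as the actual
    # code lives in the worker and in ToolWorker._wait_ready):
    #
    #     # worker side, after a successful model load:
    #     print(f"__WORKER_VRAM_GB__ {used_gb:.2f}", flush=True)
    #
    #     # parent side, when scanning the worker's stdout:
    #     if line.startswith("__WORKER_VRAM_GB__"):
    #         vram_manager.record_actual_usage(tool, float(line.split()[1]))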

    def get_effective_budget(
        self,
        tool_name: str,
    ) -> Optional[Tuple[float, float]]:
        """Return (min_vram_gb, model_size_gb), using the measured value if any.

        The measurement is tighter than the declared budget in the common
        case (stub budgets are conservative), so we swap in the measured
        model_size. When the measurement exceeds the declared model_size
        we honor the measurement — the tool really does need that much.

        min_vram (headroom) is never lowered below the declared minimum,
        because overhead like activation buffers is not captured in the
        static post-load measurement.
        """
        declared = VRAM_BUDGETS.get(tool_name)
        if not declared:
            return None
        measured_size = self._measured.get(tool_name)
        if measured_size is None:
            return declared
        min_vram, _declared_size = declared
        # Require at least measured_size + 0.3 GB of overhead headroom
        effective_min = max(min_vram, measured_size + 0.3)
        return (effective_min, measured_size)
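
    # Worked example (numbers illustrative): tts_omnivoice declares
    # (3.5, 3.0). If the first real load reports 2.1 GB measured,
    # get_effective_budget("tts_omnivoice") returns
    # (max(3.5, 2.1 + 0.3), 2.1) == (3.5, 2.1): the resident size
    # tightens from 3.0 to 2.1 GB while the 3.5 GB headroom gate stays.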

    def get_measured_usage(self) -> Dict[str, float]:
        """Return a copy of current measured-usage telemetry (tool → GB)."""
        return dict(self._measured)

    # ── GPU Detection ────────────────────────────────────────────

    def detect_gpu(self) -> Dict:
        """Detect GPU and return an info dict.

        Priority: nvidia-smi (no deps) → PyTorch (if already loaded) → macOS Metal.
        Returns: {name, total_gb, free_gb, cuda_available}
        """
        if self._gpu_info is not None:
            return self._gpu_info

        info = {
            "name": None,
            "total_gb": 0.0,
            "free_gb": 0.0,
            "cuda_available": False,
        }

        # run_bounded wraps Popen + explicit pipe close on timeout so the
        # child's _readerthread can't orphan — see core/subprocess_safe.py
        # for the failure mode (2026-04-15 wmic 27-min hang, same class).
        from core.subprocess_safe import run_bounded

        # nvidia-smi can be slow when the GPU is under heavy compute load
        # (driver calls queue behind kernel launches, and NVML init
        # contends with active CUDA contexts). 5s was too tight on
        # 8GB systems running concurrent VLM benchmarks: it hit the
        # subprocess_safe kill-pipes path every cycle and flooded the
        # log. 15s gives slow systems breathing room without leaving
        # zombie nvidia-smi processes around. Override via env for
        # truly degraded systems.
        _nvsmi_timeout = float(os.environ.get(
            'HEVOLVE_NVIDIA_SMI_TIMEOUT', '15'))

        # 1) nvidia-smi — zero-dependency, works on any NVIDIA GPU system
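        # With --format=csv,noheader,nounits the query prints one line per
        # GPU of the form "<name>, <total MiB>, <free MiB>", e.g. for an
        # illustrative device:
        #
        #     NVIDIA GeForce RTX 3060, 12288, 11010
        #
        # The parser below reads the first line only (GPU 0).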

        try:
            result = run_bounded(
                ["nvidia-smi", "--query-gpu=name,memory.total,memory.free",
                 "--format=csv,noheader,nounits"],
                timeout=_nvsmi_timeout,
            )
            if result.returncode == 0 and result.stdout.strip():
                line = result.stdout.strip().split("\n")[0]
                parts = [p.strip() for p in line.split(",")]
                if len(parts) >= 3:
                    total_mb = float(parts[1])
                    free_mb = float(parts[2])
                    info.update({
                        "name": parts[0],
                        "total_gb": round(total_mb / 1024, 2),
                        "free_gb": round(free_mb / 1024, 2),
                        "cuda_available": True,
                    })
                    logger.info(
                        f"GPU (nvidia-smi): {info['name']} — "
                        f"{info['total_gb']} GB total, {info['free_gb']} GB free"
                    )
                    self._gpu_info = info
                    return info
        except FileNotFoundError:
            pass  # nvidia-smi not on PATH — no NVIDIA GPU or drivers
        except Exception as e:
            logger.debug(f"nvidia-smi failed: {e}")

        # 1b) rocm-smi — AMD GPUs via ROCm. Same loaded-GPU rationale
        # as nvidia-smi above; honour the same env override.
        try:
            result = run_bounded(
                ["rocm-smi", "--showmeminfo", "vram", "--csv"],
                timeout=_nvsmi_timeout,
            )
            if result.returncode == 0 and result.stdout.strip():
                # Parse CSV output: a header line, then one data line per GPU
                lines = [ln.strip() for ln in result.stdout.strip().split("\n") if ln.strip()]
                for line in lines[1:]:  # skip header
                    parts = [p.strip() for p in line.split(",")]
                    if len(parts) >= 3:
                        try:
                            total_bytes = float(parts[1])
                            used_bytes = float(parts[2])
                            total_gb = round(total_bytes / (1024 ** 3), 2)
                            free_gb = round((total_bytes - used_bytes) / (1024 ** 3), 2)
                            info.update({
                                "name": "AMD GPU (ROCm)",
                                "total_gb": total_gb,
                                "free_gb": free_gb,
                                "cuda_available": False,
                                "rocm_available": True,
                            })
                            logger.info(
                                f"GPU (rocm-smi): {info['name']} — "
                                f"{info['total_gb']} GB total, {info['free_gb']} GB free"
                            )
                            self._gpu_info = info
                            return info
                        except (ValueError, IndexError):
                            continue
        except FileNotFoundError:
            pass  # rocm-smi not on PATH — no AMD GPU or ROCm drivers
        except Exception as e:
            logger.debug(f"rocm-smi failed: {e}")

        # 2) PyTorch — only if already imported (don't trigger a 2GB import)
        if "torch" in sys.modules:
            try:
                import torch
                # Detect the frozen-build torch stub (version 0.0.0, _is_stub=True).
                # Replace it with real torch so CUDA detection works across all
                # deployments (Nunba frozen, HART OS standalone, cloud).
                if getattr(torch, '_is_stub', False):
                    import importlib
                    _stale = [k for k in sys.modules if k == 'torch' or k.startswith('torch.')]
                    for _k in _stale:
                        del sys.modules[_k]
                    torch = importlib.import_module('torch')
                    logger.info(f"Replaced torch stub with real torch {torch.__version__}")
                if torch.cuda.is_available():
                    props = torch.cuda.get_device_properties(0)
                    total = props.total_memory / (1024 ** 3)
                    allocated = torch.cuda.memory_allocated(0) / (1024 ** 3)
                    info.update({
                        "name": torch.cuda.get_device_name(0),
                        "total_gb": round(total, 2),
                        "free_gb": round(total - allocated, 2),
                        "cuda_available": True,
                    })
                    logger.info(
                        f"GPU (PyTorch): {info['name']} — "
                        f"{info['total_gb']} GB total, {info['free_gb']} GB free"
                    )
                    self._gpu_info = info
                    return info
            except Exception as e:
                logger.debug(f"PyTorch GPU detection failed: {e}")

        # 3) macOS Metal
        if sys.platform == "darwin":
            try:
                import platform
                info.update({
                    "name": f"Apple Metal ({'Apple Silicon' if platform.machine() == 'arm64' else 'Intel'})",
                    "total_gb": 0.0,  # shared memory — hard to measure
                    "free_gb": 0.0,
                    "cuda_available": False,
                    "metal_available": True,
                })
            except Exception:
                pass

        if not info["cuda_available"]:
            logger.info("No NVIDIA GPU detected (nvidia-smi not found or no CUDA device)")

        self._gpu_info = info
        return info

    def refresh_gpu_info(self) -> Dict:
        """Re-detect GPU with a TTL cache (avoids nvidia-smi spam from multiple threads)."""
        import time as _t
        now = _t.monotonic()
        if self._gpu_info is not None and (now - self._gpu_info_ts) < self._refresh_ttl:
            return self._gpu_info  # recent enough — skip the subprocess
        self._gpu_info = None
        result = self.detect_gpu()
        self._gpu_info_ts = _t.monotonic()
        return result

    # ── VRAM queries ─────────────────────────────────────────────

    def get_free_vram(self) -> float:
        """Return free VRAM in GB — the actual free figure from nvidia-smi.

        nvidia-smi already reports real free VRAM (total - all processes).
        Do NOT subtract our allocations — that double-counts and reports
        0GB when gigabytes are actually free, causing false OOM decisions.
        """
        info = self.detect_gpu()
        if not info["cuda_available"]:
            return 0.0
        return info["free_gb"]

    def get_total_vram(self) -> float:
        """Return total VRAM in GB."""
        return self.detect_gpu().get("total_gb", 0.0)

    # ── Allocation ───────────────────────────────────────────────

    def can_fit(self, tool_name: str) -> bool:
        """Check whether a tool can fit in the remaining VRAM.

        Uses the measured budget (post first successful load) if present,
        otherwise falls back to the VRAM_BUDGETS declared value.
        """
        if tool_name in self._allocations:
            return True  # already allocated
        effective = self.get_effective_budget(tool_name)
        if not effective:
            return True  # unknown tool — assume it fits
        min_vram, _model_size = effective
        gpu = self.detect_gpu()
        if not gpu["cuda_available"]:
            return False  # no GPU at all
        return self.get_free_vram() >= min_vram

    def allocate(self, tool_name: str) -> bool:
        """Reserve VRAM for a tool. Returns False if it won't fit.

        Lock-serialized: the check-then-mutate must be atomic so two
        parallel allocations can't both win can_fit(). can_fit() is
        called under the same RLock so the 'free' read sees prior
        pending allocations, not raw GPU stats.
        """
        with self._alloc_lock:
            if tool_name in self._allocations:
                return True
            if not self.can_fit(tool_name):
                logger.warning(f"VRAM rejected: {tool_name} won't fit "
                               f"(free={self.get_free_vram():.1f}GB)")
                return False
            effective = self.get_effective_budget(tool_name)
            model_gb = effective[1] if effective else 0.0
            self._allocations[tool_name] = model_gb
            logger.info(f"Allocated {model_gb} GB VRAM for {tool_name}")
            return True
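
    # Minimal concurrency sketch of the TOCTOU fix described in __init__
    # (illustrative thread code, not part of this module). allocate() holds
    # _alloc_lock across the check and the reservation, so the two attempts
    # below are applied one at a time instead of racing:
    #
    #     import threading
    #     t1 = threading.Thread(target=vram_manager.allocate, args=("ltx2",))
    #     t2 = threading.Thread(target=vram_manager.allocate, args=("wan2gp",))
    #     t1.start(); t2.start(); t1.join(); t2.join()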

    def release(self, tool_name: str) -> None:
        """Release the VRAM reservation for a tool."""
        with self._alloc_lock:
            freed = self._allocations.pop(tool_name, 0.0)
            if freed:
                logger.info(f"Released {freed} GB VRAM from {tool_name}")

    def get_allocations(self) -> Dict[str, float]:
        """Return current VRAM allocations {tool → GB}."""
        return dict(self._allocations)

    def get_allocations_display(self) -> Dict[str, Any]:
        """Return VRAM allocations with rich model details for UI display.

        Each value is either a float (GB) for unknown models, or a dict with
        name, gb, device, and extra details (quant, context, mmproj) for
        known models. The frontend VRAMBar can render either format.
        """
        import re as _re
        raw = dict(self._allocations)
        try:
            from integrations.service_tools.model_catalog import get_catalog
            catalog = get_catalog()
            enriched = {}
            for key, gb in raw.items():
                display_key = key
                detail = {'gb': gb}
                for mid, entry in catalog._models.items():
                    if entry.loaded and (
                        mid == key or
                        entry.model_type == key or
                        mid.startswith(f'{key}-')
                    ):
                        display_key = entry.name
                        detail = {
                            'gb': gb,
                            'device': entry.device or 'gpu',
                            'backend': entry.backend,
                            'model_id': mid,
                        }
                        # Extract quant from the filename (e.g. Q4_K_XL from Qwen3.5-4B-UD-Q4_K_XL.gguf)
                        fname = entry.files.get('model', '') or entry.files.get('file_name', '')
                        if not fname and entry.repo_id:
                            fname = entry.repo_id.split('/')[-1] if '/' in entry.repo_id else ''
                        quant_match = _re.search(r'(Q\d+_K(?:_[A-Z]+)?|F16|F32|INT[48]|GPTQ|AWQ)', fname, _re.I)
                        if quant_match:
                            detail['quant'] = quant_match.group(1)
                        # Context length from capabilities or tags
                        ctx = entry.capabilities.get('context_length') or entry.capabilities.get('n_ctx')
                        if ctx:
                            detail['context'] = ctx
                        # mmproj flag for vision models
                        if entry.capabilities.get('vision') or 'vision' in (entry.tags if hasattr(entry, 'tags') else []):
                            detail['vision'] = True
                        break
                enriched[display_key] = detail
            # NOTE: LLM quant/context/mmproj enrichment is handled by
            # Nunba's orchestrator shim (models/orchestrator.py), not here.
            # HARTOS must not import from Nunba (upward dependency).
            return enriched
        except Exception:
            return raw
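
    # The two return shapes, with illustrative values:
    #
    #     # catalog enrichment succeeded — dict per entry:
    #     {"Qwen3.5-4B": {"gb": 4.0, "device": "gpu", "backend": "llamacpp",
    #                     "model_id": "qwen35-4b", "quant": "Q4_K_XL",
    #                     "context": 32768}}
    #
    #     # catalog unavailable (except path) — raw {tool → GB} floats:
    #     {"ltx2": 4.0}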

    # ── Offload strategy ─────────────────────────────────────────

    def suggest_offload_mode(self, tool_name: str) -> str:
        """Suggest the best offload mode for a tool.

        Returns: 'gpu' | 'cpu_offload' | 'cpu_only'
        """
        gpu = self.detect_gpu()
        if not gpu["cuda_available"]:
            return "cpu_only"

        budget = VRAM_BUDGETS.get(tool_name)
        if not budget:
            return "gpu"  # unknown tool, try GPU

        _min_vram, model_size = budget
        free = self.get_free_vram()

        if free >= model_size:
            return "gpu"
        elif free >= model_size * 0.5:
            return "cpu_offload"
        else:
            return "cpu_only"
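
    # Worked example: "wan2gp" has model_size 8.0 GB. With 9.0 GB free the
    # suggestion is "gpu"; with 5.0 GB free (>= 8.0 * 0.5) it is
    # "cpu_offload"; with 3.0 GB free it is "cpu_only".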

    # ── Pressure detection ───────────────────────────────────────

    def get_actual_free_vram(self) -> float:
        """Return ACTUAL free VRAM by refreshing nvidia-smi (not the cached advisory).

        Unlike get_free_vram(), this re-reads hardware state on each call
        (subject to the refresh TTL). Used by ModelLifecycleManager for
        real-time pressure detection.
        """
        self.refresh_gpu_info()
        info = self._gpu_info or {}
        return info.get('free_gb', 0.0)

    def get_vram_usage_pct(self) -> float:
        """Return current VRAM usage as a percentage (0-100).

        Refreshes GPU info first for accuracy.
        """
        self.refresh_gpu_info()
        info = self._gpu_info or {}
        total = info.get('total_gb', 0)
        free = info.get('free_gb', 0)
        if total <= 0:
            return 0.0
        return ((total - free) / total) * 100

    # ── CUDA Cache Clearing ──────────────────────────────────────

    @staticmethod
    def clear_cuda_cache() -> bool:
        """Clear the GPU cache (CUDA or MPS) if torch is loaded. Returns True if cleared."""
        if 'torch' in sys.modules:
            try:
                import torch
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    return True
                elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
                    torch.mps.empty_cache()
                    return True
            except Exception:
                pass
        return False

    # ── Allocation drift detection ───────────────────────────────

    def detect_allocation_drift(self) -> Dict:
        """Compare advisory allocations vs actual VRAM usage.

        Returns drift info — positive drift means something is using
        more VRAM than we budgeted (a possible leak or untracked process).
        """
        self.refresh_gpu_info()
        info = self._gpu_info or {}
        total = info.get('total_gb', 0)
        actual_free = info.get('free_gb', 0)
        actual_used = total - actual_free if total > 0 else 0

        advisory_used = sum(self._allocations.values())
        # Some baseline VRAM is always used by the OS/drivers (~0.5-1.5GB typically)
        os_baseline = min(1.5, total * 0.1) if total > 0 else 0

        drift_gb = actual_used - advisory_used - os_baseline

        return {
            'actual_used_gb': round(actual_used, 2),
            'advisory_used_gb': round(advisory_used, 2),
            'os_baseline_gb': round(os_baseline, 2),
            'drift_gb': round(drift_gb, 2),
            'drift_pct': round((drift_gb / total * 100) if total > 0 else 0, 1),
            'untracked_process': drift_gb > 1.0,  # >1GB unaccounted = suspicious
        }
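
    # Worked example (numbers illustrative): a 12 GB card reporting 4 GB
    # free gives actual_used = 8.0. With 5.5 GB of advisory allocations and
    # os_baseline = min(1.5, 12 * 0.1) = 1.2, drift_gb = 8.0 - 5.5 - 1.2
    # = 1.3, drift_pct = 10.8, and untracked_process is True (> 1.0 GB).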

    # ── Dashboard ────────────────────────────────────────────────

    def get_status(self) -> Dict:
        """Full VRAM status for the dashboard."""
        gpu = self.detect_gpu()
        drift = self.detect_allocation_drift()
        return {
            "gpu": gpu,
            "allocations": self.get_allocations_display(),
            "total_allocated_gb": round(sum(self._allocations.values()), 2),
            "effective_free_gb": round(self.get_free_vram(), 2),
            "drift": drift,
        }


# Global singleton
vram_manager = VRAMManager()


# ── Module-level convenience functions ──────────────────────────
# Allow: from integrations.service_tools.vram_manager import detect_gpu, clear_cuda_cache

def detect_gpu() -> Dict:
    """Detect the GPU via the singleton VRAMManager. See VRAMManager.detect_gpu."""
    return vram_manager.detect_gpu()


def clear_cuda_cache() -> bool:
    """Clear the GPU cache via the singleton VRAMManager. See VRAMManager.clear_cuda_cache."""
    return VRAMManager.clear_cuda_cache()


def get_vram_manager() -> VRAMManager:
    """Return the global VRAMManager singleton."""
    return vram_manager
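

# Minimal end-to-end sketch of the intended call pattern (illustrative:
# the tool choice and load step are placeholders, not part of this module):
#
#     from integrations.service_tools.vram_manager import get_vram_manager
#
#     mgr = get_vram_manager()
#     mgr.detect_gpu()                          # {name, total_gb, free_gb, ...}
#     mode = mgr.suggest_offload_mode("ltx2")   # 'gpu' | 'cpu_offload' | 'cpu_only'
#     if mode == "gpu" and mgr.allocate("ltx2"):
#         try:
#             ...  # load and run the model
#         finally:
#             mgr.release("ltx2")
#             mgr.clear_cuda_cache()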