Coverage for integrations / service_tools / vram_manager.py: 76.2%

265 statements  

coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2VRAM Manager — GPU memory tracking, allocation, and offload strategy. 

3 

4Tracks which tools have reserved GPU memory and decides whether new 

5tools can fit. Provides offload mode suggestions (gpu / cpu_offload / cpu_only). 

6 

7Pattern from: integrations/vision/minicpm_installer.py (detect_gpu) 

8 ltx2_server.py (VRAM stats, cpu_offload, tiling) 

9""" 

10 

11import logging 

12import os 

13import sys 

14from typing import Any, Dict, Optional, Tuple 

15 

16logger = logging.getLogger(__name__) 

17 

18# VRAM budget table: tool_name -> (min_vram_gb, model_size_gb) 

19VRAM_BUDGETS: Dict[str, Tuple[float, float]] = { 

20 "acestep": (6.0, 4.0), 

21 "diffrhythm": (6.0, 4.0), # singing voice synthesis 

22 "wan2gp": (8.0, 8.0), 

23 "ltx2": (6.0, 4.0), 

24 "minicpm": (6.0, 4.0), 

25 # STT engines 

26 "whisper": (2.0, 1.5), 

27 "whisper_base": (0.5, 0.2), # faster-whisper base (CPU-friendly) 

28 "whisper_medium": (2.0, 1.5), # faster-whisper medium 

29 "whisper_large": (4.0, 3.0), # faster-whisper large-v3-turbo 

30 # TTS engines 

31 "tts_chatterbox_turbo": (5.6, 3.8), # English, [laugh]/[chuckle] tags 

32 "tts_f5": (2.5, 1.3), # English+Chinese, voice cloning 

33 "tts_indic_parler": (2.0, 1.8), # 21 Indic languages + English 

34 "tts_cosyvoice3": (4.0, 3.5), # zh/ja/ko/de/es/fr/it/ru, zero-shot 

35 "tts_chatterbox_ml": (14.0, 12.0), # 23 languages, needs 16GB+ 

36 "tts_kokoro": (0.5, 0.2), # 82M neural English, CPU or GPU 

37 "tts_neutts": (0.7, 0.4), # NeuTTS Air 748M, Q4 GGUF ~600MB 

38 # — CPU-friendly (RTF<0.5 on i5), 

39 # GPU optional. Budget covers 

40 # GGUF-on-CPU + neucodec onnx 

41 # workspace; auto-tightens via 

42 # record_actual_usage on first 

43 # successful load. 

44 "tts_omnivoice": (3.5, 3.0), # 646 langs, Qwen3-0.6B+diffusion 

45 # — stub budget, auto-tightens 

46 # via record_actual_usage on 

47 # first successful load. 

48 # Mid-VRAM coverage tier (1–3 GB) — bridges the gap between F5/Indic 

49 # Parler/Kokoro (≤2.5 GB) and the heavy clone engines so EVERY 

50 # SUPPORTED_LANG_DICT code has at least one engine with vram_gb≤3. 

51 "tts_melotts": (1.5, 1.0), # en/es/fr/zh/ja/ko, neural CPU/GPU 

52 "tts_xtts_v2": (2.5, 1.8), # 17 langs, voice cloning (Coqui) 

53 "tts_mms_tts": (1.0, 0.7), # ~50+ langs (per-lang VITS, Meta) 

54} 
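# How to read an entry: min_vram_gb is the free-VRAM gate checked before a
# load, model_size_gb is the footprint reserved once loaded. Illustrative
# lookup using a key from the table above:
#
#     min_vram_gb, model_size_gb = VRAM_BUDGETS["whisper_large"]
#     # (4.0, 3.0): needs at least 4 GB free; ~3 GB stays resident.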


class VRAMManager:
    """GPU memory tracking and allocation decisions."""

    def __init__(self):
        self._allocations: Dict[str, float] = {}  # tool → GB reserved
        self._gpu_info: Optional[Dict] = None
        self._gpu_info_ts: float = 0.0  # timestamp of last nvidia-smi call
        # Bundled mode: GPU state is stable (one model loaded at startup).
        # Poll every 120s instead of 30s to reduce subprocess overhead.
        _bundled = os.environ.get('NUNBA_BUNDLED') == '1'
        self._refresh_ttl: float = 120.0 if _bundled else 30.0
        # Serializes allocate() + can_fit() so two concurrent model loads
        # can't both pass can_fit() and overcommit the GPU. Previously this
        # was a non-atomic read-modify-write across _allocations on the hot
        # path (TOCTOU: read free → read budget → mutate dict). During a
        # cold startup with parallel LLM + TTS + VLM spawns, both loaders
        # could see 5GB free, both decide 4GB fits, both allocate → 8GB
        # claimed on a 5GB device → CUDA OOM.
        import threading as _threading  # deferred import
        self._alloc_lock = _threading.RLock()

        # Measured VRAM usage telemetry: tool → actual model_size_gb seen
        # after a successful load. Populated via record_actual_usage() —
        # worker subprocesses self-report post-load GPU usage, the parent
        # stores the value and uses it in preference to the VRAM_BUDGETS
        # estimate the next time the tool is considered. This lets
        # conservative stub budgets (e.g. new OmniVoice at 3.0 GB)
        # auto-tighten after the first real load without a code change.
        self._measured: Dict[str, float] = {}
        self._measured_path = self._resolve_measured_path()
        self._load_measured()

    # ── Measured-usage telemetry ─────────────────────────────────

    @staticmethod
    def _resolve_measured_path():
        from pathlib import Path
        # Prefer the project agent_data dir, fall back to ~/.hevolve
        cwd_path = Path.cwd() / 'agent_data' / 'vram_measured.json'
        try:
            cwd_path.parent.mkdir(parents=True, exist_ok=True)
            return cwd_path
        except Exception:
            fallback = Path.home() / '.hevolve' / 'vram_measured.json'
            fallback.parent.mkdir(parents=True, exist_ok=True)
            return fallback

    def _load_measured(self) -> None:
        import json
        try:
            if self._measured_path.exists():
                data = json.loads(
                    self._measured_path.read_text(encoding='utf-8')
                )
                self._measured = {
                    str(k): float(v)
                    for k, v in data.items()
                    if isinstance(v, (int, float)) and v > 0
                }
        except Exception as e:
            logger.debug(f"VRAM measured load failed (ignoring): {e}")
            self._measured = {}

    def _persist_measured(self) -> None:
        """Atomic JSON write — tmp-then-rename so we can't half-write."""
        import json
        try:
            tmp = self._measured_path.with_suffix('.json.tmp')
            tmp.write_text(
                json.dumps(self._measured, indent=2),
                encoding='utf-8',
            )
            tmp.replace(self._measured_path)
        except Exception as e:
            logger.debug(f"VRAM measured persist failed: {e}")

    def record_actual_usage(self, tool_name: str, measured_gb: float) -> None:
        """Worker-reported post-load GPU usage.

        Called from ToolWorker._wait_ready after parsing the worker's
        '__WORKER_VRAM_GB__ <n>' marker. Values are persisted so the
        measurement survives restarts and tightens the budget used by
        can_fit() / allocate() on subsequent loads.

        Safety rails:
        - Ignore non-positive values (the worker emits 0.0 when it can't
          measure — e.g. CPU-only, Metal, broken nvidia-smi).
        - Reject values outside (0, 64] GB — protects against obviously
          bad telemetry (negative deltas from concurrent workers,
          runaway leaks).
        - Compare vs the VRAM_BUDGETS declared size — log a prominent
          warning if measured > declared * 1.5 (the declared budget
          is wrong and won't fit on the target GPU class).
        """
        with self._alloc_lock:
            if not tool_name or measured_gb is None:
                return
            try:
                gb = float(measured_gb)
            except (TypeError, ValueError):
                return
            if gb <= 0 or gb > 64.0:
                logger.debug(
                    f"VRAM measurement for {tool_name} out of range ({gb}) — ignored"
                )
                return
            prev = self._measured.get(tool_name)
            self._measured[tool_name] = round(gb, 2)
            self._persist_measured()

            declared = VRAM_BUDGETS.get(tool_name)
            if declared and gb > declared[1] * 1.5:
                logger.warning(
                    f"{tool_name} measured {gb:.1f} GB — 50%+ over declared "
                    f"{declared[1]:.1f} GB. Consider raising VRAM_BUDGETS. "
                    f"can_fit() will use the measurement from now on."
                )
            elif prev is None:
                logger.info(
                    f"{tool_name}: first measured VRAM = {gb:.2f} GB "
                    f"(budget was {declared[1] if declared else '—'} GB)"
                )
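
    # Sketch of the reporting protocol (the marker string is real, from the
    # docstring above; the emit/parse lines are illustrative, as the actual
    # code lives in the worker and in ToolWorker._wait_ready):
    #
    #     # worker side, after a successful model load:
    #     print(f"__WORKER_VRAM_GB__ {used_gb:.2f}", flush=True)
    #
    #     # parent side, when scanning the worker's stdout:
    #     if line.startswith("__WORKER_VRAM_GB__"):
    #         vram_manager.record_actual_usage(tool, float(line.split()[1]))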

    def get_effective_budget(
        self,
        tool_name: str,
    ) -> Optional[Tuple[float, float]]:
        """Return (min_vram_gb, model_size_gb), using the measured value if any.

        The measurement is tighter than the declared budget in the common
        case (stub budgets are conservative), so we swap in the measured
        model_size. When the measurement exceeds the declared model_size
        we honor the measurement — the tool really does need that much.

        min_vram (headroom) is never lowered below the declared minimum,
        because overhead like activation buffers is not captured in the
        static post-load measurement.
        """
        declared = VRAM_BUDGETS.get(tool_name)
        if not declared:
            return None
        measured_size = self._measured.get(tool_name)
        if measured_size is None:
            return declared
        min_vram, _declared_size = declared
        # Require at least measured_size + 0.3 GB of overhead headroom
        effective_min = max(min_vram, measured_size + 0.3)
        return (effective_min, measured_size)
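
    # Worked example (numbers illustrative): tts_omnivoice declares
    # (3.5, 3.0). If the first real load reports 2.1 GB measured,
    # get_effective_budget("tts_omnivoice") returns
    # (max(3.5, 2.1 + 0.3), 2.1) == (3.5, 2.1): the resident size
    # tightens from 3.0 to 2.1 GB while the 3.5 GB headroom gate stays.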

    def get_measured_usage(self) -> Dict[str, float]:
        """Return a copy of current measured-usage telemetry (tool → GB)."""
        return dict(self._measured)

    # ── GPU Detection ────────────────────────────────────────────

    def detect_gpu(self) -> Dict:
        """Detect GPU and return an info dict.

        Priority: nvidia-smi (no deps) → PyTorch (if already loaded) → macOS Metal.
        Returns: {name, total_gb, free_gb, cuda_available}
        """
        if self._gpu_info is not None:
            return self._gpu_info

        info = {
            "name": None,
            "total_gb": 0.0,
            "free_gb": 0.0,
            "cuda_available": False,
        }

        # run_bounded wraps Popen + explicit pipe close on timeout so the
        # child's _readerthread can't orphan — see core/subprocess_safe.py
        # for the failure mode (2026-04-15 wmic 27-min hang, same class).
        from core.subprocess_safe import run_bounded

        # nvidia-smi can be slow when the GPU is under heavy compute load
        # (driver calls queue behind kernel launches, and NVML init
        # contends with active CUDA contexts). 5s was too tight on
        # 8GB systems running concurrent VLM benchmarks: it hit the
        # subprocess_safe kill-pipes path every cycle and flooded the
        # log. 15s gives slow systems breathing room without leaving
        # zombie nvidia-smi processes around. Override via env for
        # truly degraded systems.
        _nvsmi_timeout = float(os.environ.get(
            'HEVOLVE_NVIDIA_SMI_TIMEOUT', '15'))

        # 1) nvidia-smi — zero-dependency, works on any NVIDIA GPU system
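        # With --format=csv,noheader,nounits the query prints one line per
        # GPU of the form "<name>, <total MiB>, <free MiB>", e.g. for an
        # illustrative device:
        #
        #     NVIDIA GeForce RTX 3060, 12288, 11010
        #
        # The parser below reads the first line only (GPU 0).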

        try:
            result = run_bounded(
                ["nvidia-smi", "--query-gpu=name,memory.total,memory.free",
                 "--format=csv,noheader,nounits"],
                timeout=_nvsmi_timeout,
            )
            if result.returncode == 0 and result.stdout.strip():
                line = result.stdout.strip().split("\n")[0]
                parts = [p.strip() for p in line.split(",")]
                if len(parts) >= 3:
                    total_mb = float(parts[1])
                    free_mb = float(parts[2])
                    info.update({
                        "name": parts[0],
                        "total_gb": round(total_mb / 1024, 2),
                        "free_gb": round(free_mb / 1024, 2),
                        "cuda_available": True,
                    })
                    logger.info(
                        f"GPU (nvidia-smi): {info['name']} — "
                        f"{info['total_gb']} GB total, {info['free_gb']} GB free"
                    )
                    self._gpu_info = info
                    return info
        except FileNotFoundError:
            pass  # nvidia-smi not on PATH — no NVIDIA GPU or drivers
        except Exception as e:
            logger.debug(f"nvidia-smi failed: {e}")

        # 1b) rocm-smi — AMD GPUs via ROCm. Same loaded-GPU rationale
        # as nvidia-smi above; honour the same env override.
        try:
            result = run_bounded(
                ["rocm-smi", "--showmeminfo", "vram", "--csv"],
                timeout=_nvsmi_timeout,
            )
            if result.returncode == 0 and result.stdout.strip():
                # Parse CSV output: a header line, then one data line per GPU
                lines = [ln.strip() for ln in result.stdout.strip().split("\n") if ln.strip()]
                for line in lines[1:]:  # skip header
                    parts = [p.strip() for p in line.split(",")]
                    if len(parts) >= 3:
                        try:
                            total_bytes = float(parts[1])
                            used_bytes = float(parts[2])
                            total_gb = round(total_bytes / (1024 ** 3), 2)
                            free_gb = round((total_bytes - used_bytes) / (1024 ** 3), 2)
                            info.update({
                                "name": "AMD GPU (ROCm)",
                                "total_gb": total_gb,
                                "free_gb": free_gb,
                                "cuda_available": False,
                                "rocm_available": True,
                            })
                            logger.info(
                                f"GPU (rocm-smi): {info['name']} — "
                                f"{info['total_gb']} GB total, {info['free_gb']} GB free"
                            )
                            self._gpu_info = info
                            return info
                        except (ValueError, IndexError):
                            continue
        except FileNotFoundError:
            pass  # rocm-smi not on PATH — no AMD GPU or ROCm drivers
        except Exception as e:
            logger.debug(f"rocm-smi failed: {e}")

        # 2) PyTorch — only if already imported (don't trigger a 2GB import)
        if "torch" in sys.modules:
            try:
                import torch
                # Detect the frozen-build torch stub (version 0.0.0, _is_stub=True).
                # Replace it with real torch so CUDA detection works across all
                # deployments (Nunba frozen, HART OS standalone, cloud).
                if getattr(torch, '_is_stub', False):
                    import importlib
                    _stale = [k for k in sys.modules if k == 'torch' or k.startswith('torch.')]
                    for _k in _stale:
                        del sys.modules[_k]
                    torch = importlib.import_module('torch')
                    logger.info(f"Replaced torch stub with real torch {torch.__version__}")
                if torch.cuda.is_available():
                    props = torch.cuda.get_device_properties(0)
                    total = props.total_memory / (1024 ** 3)
                    allocated = torch.cuda.memory_allocated(0) / (1024 ** 3)
                    info.update({
                        "name": torch.cuda.get_device_name(0),
                        "total_gb": round(total, 2),
                        "free_gb": round(total - allocated, 2),
                        "cuda_available": True,
                    })
                    logger.info(
                        f"GPU (PyTorch): {info['name']} — "
                        f"{info['total_gb']} GB total, {info['free_gb']} GB free"
                    )
                    self._gpu_info = info
                    return info
            except Exception as e:
                logger.debug(f"PyTorch GPU detection failed: {e}")

        # 3) macOS Metal
        if sys.platform == "darwin":
            try:
                import platform
                info.update({
                    "name": f"Apple Metal ({'Apple Silicon' if platform.machine() == 'arm64' else 'Intel'})",
                    "total_gb": 0.0,  # shared memory — hard to measure
                    "free_gb": 0.0,
                    "cuda_available": False,
                    "metal_available": True,
                })
            except Exception:
                pass

        if not info["cuda_available"]:
            logger.info("No NVIDIA GPU detected (nvidia-smi not found or no CUDA device)")

        self._gpu_info = info
        return info

    def refresh_gpu_info(self) -> Dict:
        """Re-detect GPU with a TTL cache (avoids nvidia-smi spam from multiple threads)."""
        import time as _t
        now = _t.monotonic()
        if self._gpu_info is not None and (now - self._gpu_info_ts) < self._refresh_ttl:
            return self._gpu_info  # recent enough — skip the subprocess
        self._gpu_info = None
        result = self.detect_gpu()
        self._gpu_info_ts = _t.monotonic()
        return result

    # ── VRAM queries ─────────────────────────────────────────────

    def get_free_vram(self) -> float:
        """Return free VRAM in GB — the actual free figure from nvidia-smi.

        nvidia-smi already reports real free VRAM (total - all processes).
        Do NOT subtract our allocations — that double-counts and reports
        0GB when gigabytes are actually free, causing false OOM decisions.
        """
        info = self.detect_gpu()
        if not info["cuda_available"]:
            return 0.0
        return info["free_gb"]

    def get_total_vram(self) -> float:
        """Return total VRAM in GB."""
        return self.detect_gpu().get("total_gb", 0.0)

    # ── Allocation ───────────────────────────────────────────────

    def can_fit(self, tool_name: str) -> bool:
        """Check whether a tool can fit in the remaining VRAM.

        Uses the measured budget (post first successful load) if present,
        otherwise falls back to the VRAM_BUDGETS declared value.
        """
        if tool_name in self._allocations:
            return True  # already allocated
        effective = self.get_effective_budget(tool_name)
        if not effective:
            return True  # unknown tool — assume it fits
        min_vram, _model_size = effective
        gpu = self.detect_gpu()
        if not gpu["cuda_available"]:
            return False  # no GPU at all
        return self.get_free_vram() >= min_vram

    def allocate(self, tool_name: str) -> bool:
        """Reserve VRAM for a tool. Returns False if it won't fit.

        Lock-serialized: the check-then-mutate must be atomic so two
        parallel allocations can't both win can_fit(). can_fit() is
        called under the same RLock so the 'free' read sees prior
        pending allocations, not raw GPU stats.
        """
        with self._alloc_lock:
            if tool_name in self._allocations:
                return True
            if not self.can_fit(tool_name):
                logger.warning(f"VRAM rejected: {tool_name} won't fit "
                               f"(free={self.get_free_vram():.1f}GB)")
                return False
            effective = self.get_effective_budget(tool_name)
            model_gb = effective[1] if effective else 0.0
            self._allocations[tool_name] = model_gb
            logger.info(f"Allocated {model_gb} GB VRAM for {tool_name}")
            return True
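
    # Minimal concurrency sketch of the TOCTOU fix described in __init__
    # (illustrative thread code, not part of this module). allocate() holds
    # _alloc_lock across the check and the reservation, so the two attempts
    # below are applied one at a time instead of racing:
    #
    #     import threading
    #     t1 = threading.Thread(target=vram_manager.allocate, args=("ltx2",))
    #     t2 = threading.Thread(target=vram_manager.allocate, args=("wan2gp",))
    #     t1.start(); t2.start(); t1.join(); t2.join()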

    def release(self, tool_name: str) -> None:
        """Release the VRAM reservation for a tool."""
        with self._alloc_lock:
            freed = self._allocations.pop(tool_name, 0.0)
            if freed:
                logger.info(f"Released {freed} GB VRAM from {tool_name}")

    def get_allocations(self) -> Dict[str, float]:
        """Return current VRAM allocations {tool → GB}."""
        return dict(self._allocations)

    def get_allocations_display(self) -> Dict[str, Any]:
        """Return VRAM allocations with rich model details for UI display.

        Each value is either a float (GB) for unknown models, or a dict with
        name, gb, device, and extra details (quant, context, mmproj) for
        known models. The frontend VRAMBar can render either format.
        """
        import re as _re
        raw = dict(self._allocations)
        try:
            from integrations.service_tools.model_catalog import get_catalog
            catalog = get_catalog()
            enriched = {}
            for key, gb in raw.items():
                display_key = key
                detail = {'gb': gb}
                for mid, entry in catalog._models.items():
                    if entry.loaded and (
                        mid == key or
                        entry.model_type == key or
                        mid.startswith(f'{key}-')
                    ):
                        display_key = entry.name
                        detail = {
                            'gb': gb,
                            'device': entry.device or 'gpu',
                            'backend': entry.backend,
                            'model_id': mid,
                        }
                        # Extract quant from the filename (e.g. Q4_K_XL from Qwen3.5-4B-UD-Q4_K_XL.gguf)
                        fname = entry.files.get('model', '') or entry.files.get('file_name', '')
                        if not fname and entry.repo_id:
                            fname = entry.repo_id.split('/')[-1] if '/' in entry.repo_id else ''
                        quant_match = _re.search(r'(Q\d+_K(?:_[A-Z]+)?|F16|F32|INT[48]|GPTQ|AWQ)', fname, _re.I)
                        if quant_match:
                            detail['quant'] = quant_match.group(1)
                        # Context length from capabilities or tags
                        ctx = entry.capabilities.get('context_length') or entry.capabilities.get('n_ctx')
                        if ctx:
                            detail['context'] = ctx
                        # mmproj flag for vision models
                        if entry.capabilities.get('vision') or 'vision' in (entry.tags if hasattr(entry, 'tags') else []):
                            detail['vision'] = True
                        break
                enriched[display_key] = detail
            # NOTE: LLM quant/context/mmproj enrichment is handled by
            # Nunba's orchestrator shim (models/orchestrator.py), not here.
            # HARTOS must not import from Nunba (upward dependency).
            return enriched
        except Exception:
            return raw
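
    # The two return shapes, with illustrative values:
    #
    #     # catalog enrichment succeeded — dict per entry:
    #     {"Qwen3.5-4B": {"gb": 4.0, "device": "gpu", "backend": "llamacpp",
    #                     "model_id": "qwen35-4b", "quant": "Q4_K_XL",
    #                     "context": 32768}}
    #
    #     # catalog unavailable (except path) — raw {tool → GB} floats:
    #     {"ltx2": 4.0}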

    # ── Offload strategy ─────────────────────────────────────────

    def suggest_offload_mode(self, tool_name: str) -> str:
        """Suggest the best offload mode for a tool.

        Returns: 'gpu' | 'cpu_offload' | 'cpu_only'
        """
        gpu = self.detect_gpu()
        if not gpu["cuda_available"]:
            return "cpu_only"

        budget = VRAM_BUDGETS.get(tool_name)
        if not budget:
            return "gpu"  # unknown tool, try GPU

        _min_vram, model_size = budget
        free = self.get_free_vram()

        if free >= model_size:
            return "gpu"
        elif free >= model_size * 0.5:
            return "cpu_offload"
        else:
            return "cpu_only"
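
    # Worked example: "wan2gp" has model_size 8.0 GB. With 9.0 GB free the
    # suggestion is "gpu"; with 5.0 GB free (>= 8.0 * 0.5) it is
    # "cpu_offload"; with 3.0 GB free it is "cpu_only".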

    # ── Pressure detection ───────────────────────────────────────

    def get_actual_free_vram(self) -> float:
        """Return ACTUAL free VRAM by refreshing nvidia-smi (not the cached advisory).

        Unlike get_free_vram(), this re-reads hardware state on each call
        (subject to the refresh TTL). Used by ModelLifecycleManager for
        real-time pressure detection.
        """
        self.refresh_gpu_info()
        info = self._gpu_info or {}
        return info.get('free_gb', 0.0)

    def get_vram_usage_pct(self) -> float:
        """Return current VRAM usage as a percentage (0-100).

        Refreshes GPU info first for accuracy.
        """
        self.refresh_gpu_info()
        info = self._gpu_info or {}
        total = info.get('total_gb', 0)
        free = info.get('free_gb', 0)
        if total <= 0:
            return 0.0
        return ((total - free) / total) * 100

    # ── CUDA Cache Clearing ──────────────────────────────────────

    @staticmethod
    def clear_cuda_cache() -> bool:
        """Clear the GPU cache (CUDA or MPS) if torch is loaded. Returns True if cleared."""
        if 'torch' in sys.modules:
            try:
                import torch
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    return True
                elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
                    torch.mps.empty_cache()
                    return True
            except Exception:
                pass
        return False

    # ── Allocation drift detection ───────────────────────────────

    def detect_allocation_drift(self) -> Dict:
        """Compare advisory allocations vs actual VRAM usage.

        Returns drift info — positive drift means something is using
        more VRAM than we budgeted (a possible leak or untracked process).
        """
        self.refresh_gpu_info()
        info = self._gpu_info or {}
        total = info.get('total_gb', 0)
        actual_free = info.get('free_gb', 0)
        actual_used = total - actual_free if total > 0 else 0

        advisory_used = sum(self._allocations.values())
        # Some baseline VRAM is always used by the OS/drivers (~0.5-1.5GB typically)
        os_baseline = min(1.5, total * 0.1) if total > 0 else 0

        drift_gb = actual_used - advisory_used - os_baseline

        return {
            'actual_used_gb': round(actual_used, 2),
            'advisory_used_gb': round(advisory_used, 2),
            'os_baseline_gb': round(os_baseline, 2),
            'drift_gb': round(drift_gb, 2),
            'drift_pct': round((drift_gb / total * 100) if total > 0 else 0, 1),
            'untracked_process': drift_gb > 1.0,  # >1GB unaccounted = suspicious
        }
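
    # Worked example (numbers illustrative): a 12 GB card reporting 4 GB
    # free gives actual_used = 8.0. With 5.5 GB of advisory allocations and
    # os_baseline = min(1.5, 12 * 0.1) = 1.2, drift_gb = 8.0 - 5.5 - 1.2
    # = 1.3, drift_pct = 10.8, and untracked_process is True (> 1.0 GB).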

    # ── Dashboard ────────────────────────────────────────────────

    def get_status(self) -> Dict:
        """Full VRAM status for the dashboard."""
        gpu = self.detect_gpu()
        drift = self.detect_allocation_drift()
        return {
            "gpu": gpu,
            "allocations": self.get_allocations_display(),
            "total_allocated_gb": round(sum(self._allocations.values()), 2),
            "effective_free_gb": round(self.get_free_vram(), 2),
            "drift": drift,
        }


# Global singleton
vram_manager = VRAMManager()


# ── Module-level convenience functions ──────────────────────────
# Allow: from integrations.service_tools.vram_manager import detect_gpu, clear_cuda_cache

def detect_gpu() -> Dict:
    """Detect the GPU via the singleton VRAMManager. See VRAMManager.detect_gpu."""
    return vram_manager.detect_gpu()


def clear_cuda_cache() -> bool:
    """Clear the GPU cache via the singleton VRAMManager. See VRAMManager.clear_cuda_cache."""
    return VRAMManager.clear_cuda_cache()


def get_vram_manager() -> VRAMManager:
    """Return the global VRAMManager singleton."""
    return vram_manager
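

# Minimal end-to-end sketch of the intended call pattern (illustrative:
# the tool choice and load step are placeholders, not part of this module):
#
#     from integrations.service_tools.vram_manager import get_vram_manager
#
#     mgr = get_vram_manager()
#     mgr.detect_gpu()                          # {name, total_gb, free_gb, ...}
#     mode = mgr.suggest_offload_mode("ltx2")   # 'gpu' | 'cpu_offload' | 'cpu_only'
#     if mode == "gpu" and mgr.allocate("ltx2"):
#         try:
#             ...  # load and run the model
#         finally:
#             mgr.release("ltx2")
#             mgr.clear_cuda_cache()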