Coverage for integrations / service_tools / model_lifecycle.py: 61.6%

983 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Model Lifecycle Manager — intelligent load/unload/offload for ML models. 

3 

4Daemon-driven (like AgentDaemon), tick-based loop that: 

5 1. Tracks per-model access patterns (last_access, count, device) 

6 2. Detects VRAM/RAM pressure via real-time nvidia-smi refresh 

7 3. Evicts idle models (LRU, configurable timeout per model) 

8 4. Offloads GPU models to CPU when VRAM pressure detected 

9 5. Reports usage deltas to FederatedAggregator for hive learning 

10 6. Applies hive-learned placement hints (pre-cache popular models) 

11 7. Continuous process health monitoring with dead-process recovery 

12 8. OOM crash detection + auto-restart with resource downgrade 

13 9. Model swap queue for sequential multi-model workloads 

14 10. Pressure alerts emitted to EventBus for frontend display 

15 

16Integration: 

17 - Extends RuntimeToolManager via lifecycle hooks (composition, not inheritance) 

18 - Uses VRAMManager for GPU tracking (adds refresh_gpu_info calls) 

19 - Monitored by NodeWatchdog 

20 - Reports to FederatedAggregator 

21 - Exposed via /api/tools/lifecycle endpoint 

22 

23Terminology: Uses NodeTierLevel CAPABILITY tiers (embedded/lite/standard/full/compute_host), 

24NOT topology modes (flat/regional/central) or model tiers (fast/balanced/expert). 

25""" 

26import collections 

27import json 

28import logging 

29import os 

30import shutil 

31import threading 

32import time 

33from contextlib import contextmanager 

34from dataclasses import dataclass, field 

35from enum import Enum 

36from pathlib import Path 

37from typing import Dict, List, Optional, Tuple 

38 

39logger = logging.getLogger(__name__) 

40 

41LIFECYCLE_STATE_FILE = Path.home() / '.hevolve' / 'lifecycle_state.json' 

42 

43# Hints older than this are discarded on reload. A 24h cutoff keeps 

44# "this user spoke Tamil yesterday" relevant while throwing away 

45# "this user spoke Tamil in January" (long-stale entries would pin 

46# models that the user actually doesn't use anymore). 

47LIFECYCLE_STALENESS_S: float = 24 * 3600.0 

48 

49 

50# ═══════════════════════════════════════════════════════════════ 

51# Enums and State 

52# ═══════════════════════════════════════════════════════════════ 

53 

54class ModelDevice(Enum): 

55 UNLOADED = "unloaded" 

56 GPU = "gpu" 

57 CPU = "cpu" 

58 CPU_OFFLOAD = "cpu_offload" 

59 

60 

61class ModelPriority(Enum): 

62 ACTIVE = "active" 

63 WARM = "warm" 

64 IDLE = "idle" 

65 EVICTABLE = "evictable" 

66 

67 

68# Priority sort order: ACTIVE (never touch) → WARM → IDLE → EVICTABLE (first to go) 

69_PRIORITY_RANK = { 

70 ModelPriority.ACTIVE: 0, 

71 ModelPriority.WARM: 1, 

72 ModelPriority.IDLE: 2, 

73 ModelPriority.EVICTABLE: 3, 

74} 

75 

76 

77@dataclass 

78class ModelState: 

79 """Per-model lifecycle state.""" 

80 name: str 

81 device: ModelDevice = ModelDevice.UNLOADED 

82 priority: ModelPriority = ModelPriority.IDLE 

83 

84 # Access tracking 

85 last_access_time: float = 0.0 

86 load_time: float = 0.0 

87 access_count: int = 0 

88 access_count_session: int = 0 

89 

90 # Resource tracking 

91 vram_gb: float = 0.0 

92 ram_gb: float = 0.0 

93 

94 # Configuration 

95 idle_timeout_s: float = 300.0 

96 is_sidecar: bool = False 

97 supports_cpu_offload: bool = False 

98 

99 # Hive hints 

100 hive_popularity: float = 0.0 

101 hive_boost: bool = False 

102 

103 # Inference guard 

104 active_inference_count: int = 0 

105 

106 # ─── Eviction policy flags ──────────────────────────────────── 

107 # These are the 2026-04-12 fix for the "llama-server kept dying 

108 # every 5 minutes" incident. The default idle eviction loop 

109 # (_evict_idle_models → _update_priorities) used to demote ANY 

110 # idle model to EVICTABLE after its timeout, unconditionally, 

111 # which killed the 4B main LLM mid-session even though nobody 

112 # was competing for VRAM. 

113 

114 #: True ⇒ the model is NEVER evicted, regardless of idle time, 

115 #: VRAM pressure, or RAM pressure. Reserved for the draft 0.8B 

116 #: classifier which is the first-contact path for EVERY chat 

117 #: message. Killing the draft means every reply cold-starts a 

118 #: 500 MB model + mmproj, so pinning it is always the right call. 

119 #: Use sparingly — pinned models consume their VRAM budget 

120 #: forever, so this must only tag models where the eviction 

121 #: cost always outweighs the freed VRAM. 

122 pinned: bool = False 

123 

124 #: True ⇒ the model is evicted ONLY when VRAM pressure is 

125 #: detected (phase 3 of _tick), never by passive idle timeout 

126 #: (phase 7). Applied to main-tier chat LLMs (2B / 4B) so a 

127 #: 30-second idle doesn't kill the server the user is actively 

128 #: talking to. Passive idle eviction is intended for STT / TTS / 

129 #: VLM one-shot models where a cold start is cheap and the 

130 #: VRAM is better spent on the model you're about to use next. 

131 pressure_evict_only: bool = False 

132 

133 # Crash recovery tracking 

134 crash_count: int = 0 # Consecutive crashes (resets on successful access) 

135 last_crash_time: float = 0.0 # Timestamp of last crash 

136 last_exit_code: Optional[int] = None # Last process exit code (137/9=OOM kill) 

137 restart_backoff_s: float = 0.0 # Current backoff delay (exponential) 

138 downgraded: bool = False # True if restarted on lower resource tier 

139 

140 def to_dict(self) -> dict: 

141 now = time.time() 

142 return { 

143 'name': self.name, 

144 'device': self.device.value, 

145 'priority': self.priority.value, 

146 'last_access_time': self.last_access_time, 

147 'load_time': self.load_time, 

148 'access_count': self.access_count, 

149 'idle_seconds': round(now - self.last_access_time, 1) if self.last_access_time else 0, 

150 'vram_gb': self.vram_gb, 

151 'ram_gb': self.ram_gb, 

152 'idle_timeout_s': self.idle_timeout_s, 

153 'hive_popularity': self.hive_popularity, 

154 'hive_boost': self.hive_boost, 

155 'active_inference_count': self.active_inference_count, 

156 'crash_count': self.crash_count, 

157 'last_exit_code': self.last_exit_code, 

158 'downgraded': self.downgraded, 

159 'healthy': self.device == ModelDevice.UNLOADED or self.crash_count == 0, 

160 } 

161 

162 

163# ═══════════════════════════════════════════════════════════════ 

164# Configuration Tables 

165# ═══════════════════════════════════════════════════════════════ 

166 

167# Default idle timeouts per model (seconds). Expensive-to-reload = longer. 

168DEFAULT_IDLE_TIMEOUTS: Dict[str, float] = { 

169 'whisper': 300.0, 

170 'tts_audio_suite': 600.0, 

171 'minicpm': 900.0, 

172 'wan2gp': 600.0, 

173 'ltx2': 600.0, 

174 'acestep': 600.0, 

175 'omniparser': 300.0, 

176 'clip': 180.0, 

177 'sentence_transformers': 180.0, 

178 'mobilevlm': 300.0, 

179} 

180 

181# CPU offload capability: (can_offload, cpu_ram_gb, method) 

182# method: 'torch_to_cpu' | 'restart_cpu' | 'none' 

183CPU_OFFLOAD_TABLE: Dict[str, Tuple[bool, float, str]] = { 

184 'whisper': (True, 0.5, 'torch_to_cpu'), 

185 'tts_audio_suite': (True, 2.0, 'restart_cpu'), 

186 'minicpm': (False, 0.0, 'none'), 

187 'wan2gp': (False, 0.0, 'none'), 

188 'ltx2': (True, 4.0, 'restart_cpu'), 

189 'acestep': (False, 0.0, 'none'), 

190 'omniparser': (True, 2.0, 'torch_to_cpu'), 

191 'clip': (True, 0.5, 'torch_to_cpu'), 

192 'sentence_transformers': (True, 0.3, 'torch_to_cpu'), 

193 'mobilevlm': (True, 1.0, 'torch_to_cpu'), 

194} 

195 

196# Capability tier requirements per model (uses NodeTierLevel, NOT topology) 

197# Imported lazily to avoid circular deps 

198MODEL_MIN_TIER: Dict[str, str] = { 

199 'whisper': 'standard', 

200 'tts_audio_suite': 'standard', 

201 'minicpm': 'full', 

202 'wan2gp': 'full', 

203 'ltx2': 'full', 

204 'acestep': 'full', 

205 'omniparser': 'full', 

206 'clip': 'standard', 

207 'sentence_transformers': 'standard', 

208 'mobilevlm': 'standard', 

209} 

210 

211 

212# ═══════════════════════════════════════════════════════════════ 

213# ModelLifecycleManager 

214# ═══════════════════════════════════════════════════════════════ 

215 

216class ModelLifecycleManager: 

217 """Daemon-driven model lifecycle: load, track, offload, evict, report. 

218 

219 Follows AgentDaemon pattern: tick-based loop, heartbeat to watchdog, 

220 never blocks user requests. 

221 

222 Singleton via get_model_lifecycle_manager(). 

223 """ 

224 

225 def __init__(self): 

226 self._models: Dict[str, ModelState] = {} 

227 self._lock = threading.Lock() 

228 self._running = False 

229 self._thread: Optional[threading.Thread] = None 

230 # Flat/bundled mode: models load once at startup, no dynamic swapping. 

231 # Poll every 120s (just health check), not 15s (active management). 

232 _bundled = os.environ.get('NUNBA_BUNDLED') == '1' 

233 _default_interval = '120' if _bundled else '15' 

234 self._interval = int(os.environ.get('HEVOLVE_LIFECYCLE_INTERVAL', _default_interval)) 

235 self._tick_count = 0 

236 

237 # Pressure thresholds (configurable via env) 

238 self._vram_pressure_pct = float( 

239 os.environ.get('HEVOLVE_VRAM_PRESSURE_PCT', '85')) 

240 self._ram_pressure_pct = float( 

241 os.environ.get('HEVOLVE_RAM_PRESSURE_PCT', '90')) 

242 self._cpu_pressure_pct = float( 

243 os.environ.get('HEVOLVE_CPU_PRESSURE_PCT', '80')) 

244 self._disk_free_min_gb = float( 

245 os.environ.get('HEVOLVE_DISK_FREE_MIN_GB', '2.0')) 

246 

247 # Throttle flags (read by AgentDaemon to reduce dispatch rate) 

248 self._cpu_throttle_active = False 

249 self._disk_throttle_active = False 

250 

251 # Hive placement hints from FederatedAggregator 

252 self._hive_hints: Dict[str, float] = {} 

253 

254 # Cached node tier 

255 self._node_tier = None 

256 

257 # ── Crash recovery ──────────────────────────────────────── 

258 self._max_crash_restarts = int( 

259 os.environ.get('HEVOLVE_MAX_CRASH_RESTARTS', '3')) 

260 self._base_backoff_s = 5.0 # First retry after 5s 

261 self._max_backoff_s = 300.0 # Cap at 5 min 

262 self._restart_pending: Dict[str, float] = {} # name → retry_after timestamp 

263 

264 # ── Swap queue ──────────────────────────────────────────── 

265 # When model B is needed but A occupies the GPU, A gets evicted 

266 # and B loads. A is queued for restore when B finishes/idles. 

267 self._swap_queue: collections.deque = collections.deque(maxlen=8) 

268 # Each entry: {'name': str, 'device': str, 'evicted_for': str, 'timestamp': float} 

269 

270 # ── Pressure alert state (debounce) ─────────────────────── 

271 self._last_pressure_alert: Dict[str, float] = {} # type → timestamp 

272 self._pressure_alert_cooldown = 60.0 # seconds between alerts of same type 

273 

274 # ── G3: Direct llama-server supervision ─────────────────── 

275 # When Nunba's LlamaConfig is unavailable (standalone, Docker, 

276 # or bundled mode where Nunba's supervisor failed) we launch 

277 # llama-server ourselves via _launch_llama_server_direct. We 

278 # must also supervise it — otherwise a dead server is invisible 

279 # to _check_llm_health and never gets restarted. 

280 self._direct_llama_proc = None # subprocess.Popen 

281 self._direct_llama_port: Optional[int] = None 

282 self._direct_llama_mode: Optional[str] = None # 'gpu' | 'cpu' 

283 self._direct_llama_log_fh = None 

284 self._direct_llama_last_restart: float = 0.0 

285 self._direct_llama_restart_cooldown_s: float = 10.0 

286 

287 # ── Cross-restart persistence ───────────────────────────── 

288 # Load any persisted per-model access hints from the previous 

289 # Nunba boot. Entries older than LIFECYCLE_STALENESS_S are 

290 # dropped — last_access from a week ago is worse than no hint. 

291 self._last_persist_time: float = 0.0 

292 self._persist_interval_s: float = 60.0 # throttle disk writes 

293 self._persisted_hints: Dict[str, dict] = {} 

294 try: 

295 self._persisted_hints = self._load_persisted_state() 

296 except Exception as e: 

297 logger.debug(f"Lifecycle: persisted state load skipped: {e}") 

298 

299 # ── Daemon lifecycle ────────────────────────────────────── 

300 

301 def start(self): 

302 """Start the lifecycle daemon and register hooks.""" 

303 with self._lock: 

304 if self._running: 

305 return 

306 self._running = True 

307 

308 # Register hooks with RuntimeToolManager 

309 try: 

310 from .runtime_manager import runtime_tool_manager 

311 runtime_tool_manager.register_lifecycle_hook( 

312 'on_tool_started', self._on_tool_started) 

313 runtime_tool_manager.register_lifecycle_hook( 

314 'on_tool_stopped', self._on_tool_stopped) 

315 except Exception as e: 

316 logger.debug(f"Lifecycle hook registration skipped: {e}") 

317 

318 # Sync from current RTM state 

319 self._sync_from_rtm() 

320 

321 # Detect node tier 

322 self._detect_tier() 

323 

324 # VRAM budget check: warn if total GPU VRAM is insufficient for 

325 # pinned models. On 4GB GPUs, 0.8B (1.5GB) + 4B (3.5GB) = 5GB 

326 # exceeds the budget — the pressure system will handle it, but 

327 # the user should know the main model may be CPU-offloaded. 

328 try: 

329 from .vram_manager import get_vram_manager 

330 vm = get_vram_manager() 

331 total_vram = vm.total_vram_gb if vm else 0 

332 pinned_vram = sum( 

333 s.vram_gb for s in self._models.values() 

334 if s.pinned and s.vram_gb > 0 

335 ) 

336 all_model_vram = sum( 

337 s.vram_gb for s in self._models.values() 

338 if s.vram_gb > 0 

339 ) 

340 if total_vram > 0 and all_model_vram > total_vram: 

341 logger.warning( 

342 f"VRAM budget exceeded: models need {all_model_vram:.1f}GB " 

343 f"but GPU has {total_vram:.1f}GB. " 

344 f"Pressure eviction will offload non-pinned models to CPU. " 

345 f"Pinned models ({pinned_vram:.1f}GB) are protected." 

346 ) 

347 if pinned_vram > total_vram: 

348 logger.error( 

349 f"CRITICAL: Pinned models alone need {pinned_vram:.1f}GB " 

350 f"but GPU only has {total_vram:.1f}GB. " 

351 f"Consider reducing pinned model count or using CPU inference." 

352 ) 

353 except Exception: 

354 pass # VRAM check is advisory, not blocking 

355 

356 self._thread = threading.Thread(target=self._loop, daemon=True) 

357 self._thread.start() 

358 logger.info(f"Model lifecycle manager started (interval={self._interval}s)") 

359 

360 def stop(self): 

361 with self._lock: 

362 self._running = False 

363 if self._thread: 

364 self._thread.join(timeout=10) 

365 

366 def _loop(self): 

367 while self._running: 

368 self._wd_sleep(self._interval) 

369 if not self._running: 

370 break 

371 self._wd_heartbeat() 

372 try: 

373 self._tick() 

374 except Exception as e: 

375 logger.debug(f"Model lifecycle tick error: {e}") 

376 

377 def _wd_heartbeat(self): 

378 """Send heartbeat to watchdog between potentially blocking phases.""" 

379 try: 

380 from security.node_watchdog import get_watchdog 

381 wd = get_watchdog() 

382 if wd: 

383 wd.heartbeat('model_lifecycle') 

384 except Exception: 

385 pass 

386 

387 def _wd_sleep(self, seconds: float) -> None: 

388 """Sleep while keeping the model_lifecycle heartbeat fresh. 

389 

390 Delegates to ``NodeWatchdog.sleep_with_heartbeat``. The 

391 2026-04-11 incident showed model_lifecycle frozen for 3606s 

392 (exactly one hour) — the investigation points at 

393 _report_to_federation or a nvidia-smi subprocess hang inside 

394 one of the 13 tick phases, but the sleep path is also a 

395 candidate whenever HEVOLVE_MODEL_LIFECYCLE_INTERVAL is set 

396 above the 300s frozen threshold. Using this helper collapses 

397 both problem classes to one primitive. 

398 """ 

399 try: 

400 from security.node_watchdog import get_watchdog 

401 wd = get_watchdog() 

402 if wd is not None: 

403 wd.sleep_with_heartbeat( 

404 'model_lifecycle', seconds, 

405 stop_check=lambda: not self._running, 

406 ) 

407 return 

408 except Exception: 

409 pass 

410 time.sleep(seconds) 

411 

412 def _tick(self): 

413 """Single lifecycle pass with heartbeat checkpoints between phases.""" 

414 self._tick_count += 1 

415 

416 # Guardrail: circuit breaker 

417 try: 

418 from security.hive_guardrails import HiveCircuitBreaker 

419 if HiveCircuitBreaker.is_halted(): 

420 return 

421 except (ImportError, AttributeError): 

422 pass 

423 

424 # Phase 1: Refresh real GPU state (may call nvidia-smi subprocess) 

425 self._refresh_memory_state() 

426 self._wd_heartbeat() 

427 

428 # Phase 2: Update priorities 

429 self._update_priorities() 

430 

431 # Phase 3: VRAM pressure response 

432 if self._detect_vram_pressure(): 

433 self._respond_to_vram_pressure() 

434 self._wd_heartbeat() 

435 

436 # Phase 4: RAM pressure response 

437 if self._detect_ram_pressure(): 

438 self._respond_to_ram_pressure() 

439 

440 # Phase 5: CPU pressure response 

441 cpu_pressure = self._detect_cpu_pressure() 

442 self._cpu_throttle_active = cpu_pressure 

443 if cpu_pressure: 

444 self._respond_to_cpu_pressure() 

445 

446 # Phase 6: Disk pressure response 

447 disk_pressure = self._detect_disk_pressure() 

448 self._disk_throttle_active = disk_pressure 

449 self._wd_heartbeat() 

450 

451 # Phase 7: Background idle eviction 

452 self._evict_idle_models() 

453 self._wd_heartbeat() 

454 

455 # Phase 8: Apply hive hints (every 4th tick, ~60s) 

456 if self._tick_count % 4 == 0: 

457 self._apply_hive_hints() 

458 self._wd_heartbeat() 

459 

460 # Phase 9: Report to federation (every 6th tick, ~90s). 

461 # Investigation found this the most likely culprit for the 

462 # 1-hour model_lifecycle freeze in the 2026-04-11 incident — 

463 # the federation HTTP call had no explicit timeout and could 

464 # hang indefinitely on a dead remote. Heartbeat immediately 

465 # afterwards so a slow (but not infinite) call doesn't bleed 

466 # into phases 10-13. 

467 if self._tick_count % 6 == 0: 

468 self._report_to_federation() 

469 self._wd_heartbeat() 

470 

471 # Phase 10: Process health check + crash recovery 

472 self._check_process_health() 

473 self._wd_heartbeat() 

474 

475 # Phase 11: Process pending crash restarts (with backoff) 

476 self._process_restart_queue() 

477 self._wd_heartbeat() 

478 

479 # Phase 12: Swap queue — restore evicted models when space frees up 

480 self._process_swap_queue() 

481 self._wd_heartbeat() 

482 

483 # Phase 13: Pressure alerts to frontend (debounced) 

484 self._emit_pressure_alerts() 

485 

486 # Phase 14: Persist access hints (throttled — J211). 

487 # Cheap I/O (single JSON file, atomic rename) so the rate limit 

488 # lives in _persist_state_to_disk itself rather than here. 

489 try: 

490 self._persist_state_to_disk() 

491 except Exception: 

492 pass 

493 

494 # ── Hook callbacks (from RuntimeToolManager) ────────────── 

495 

496 def _on_tool_started(self, tool_name: str, **kwargs): 

497 """Called when RTM starts a tool.""" 

498 device_str = kwargs.get('device', 'gpu') 

499 offload_mode = kwargs.get('offload_mode', 'gpu') 

500 is_inprocess = kwargs.get('inprocess', False) 

501 

502 if offload_mode == 'cpu_only': 

503 device = ModelDevice.CPU 

504 elif offload_mode == 'cpu_offload': 

505 device = ModelDevice.CPU_OFFLOAD 

506 else: 

507 device = ModelDevice.GPU 

508 

509 offload_info = CPU_OFFLOAD_TABLE.get(tool_name, (False, 0.0, 'none')) 

510 timeout = DEFAULT_IDLE_TIMEOUTS.get(tool_name, 300.0) 

511 

512 # Override timeout from env 

513 env_timeout = os.environ.get( 

514 f'HEVOLVE_{tool_name.upper()}_IDLE_TIMEOUT') 

515 if env_timeout: 

516 try: 

517 timeout = float(env_timeout) 

518 except ValueError: 

519 pass 

520 

521 from .vram_manager import VRAM_BUDGETS 

522 budget = VRAM_BUDGETS.get(tool_name, (0, 0)) 

523 vram_gb = budget[1] if device == ModelDevice.GPU else 0.0 

524 ram_gb = offload_info[1] if device == ModelDevice.CPU else 0.0 

525 

526 now = time.time() 

527 with self._lock: 

528 state = ModelState( 

529 name=tool_name, 

530 device=device, 

531 priority=ModelPriority.WARM, 

532 last_access_time=now, 

533 load_time=now, 

534 idle_timeout_s=timeout, 

535 is_sidecar=not is_inprocess, 

536 supports_cpu_offload=offload_info[0], 

537 vram_gb=vram_gb, 

538 ram_gb=ram_gb, 

539 ) 

540 # Apply persisted-from-previous-boot hints so warm-start 

541 # preference survives restart. This is the J211 fix for 

542 # the cold-load penalty after Nunba is killed + relaunched. 

543 hint = self._persisted_hints.pop(tool_name, None) 

544 if hint: 

545 try: 

546 state.access_count = int(hint.get('access_count', 0) or 0) 

547 # prior last_access_time preserved as a hint; the 

548 # current boot's load time overrides so idle math 

549 # starts fresh. The prior timestamp is only used to 

550 # reason about popularity (see _apply_hive_hints). 

551 prior_access = float(hint.get('last_access_time', 0.0) or 0.0) 

552 if prior_access and (now - prior_access) < LIFECYCLE_STALENESS_S: 

553 # Boost priority for recently-used models so the 

554 # first request after restart doesn't get stuck 

555 # behind a cold-load. 

556 state.priority = ModelPriority.WARM 

557 state.hive_boost = True 

558 if hint.get('pinned'): 

559 state.pinned = True 

560 # J214: a pending pressure_evict_only staged via 

561 # set_pressure_evict_only() applies at start-time 

562 # so the TTS backend the user just picked never 

563 # enters the idle-sweep phase on its first load. 

564 if 'pressure_evict_only' in hint: 

565 state.pressure_evict_only = bool( 

566 hint['pressure_evict_only']) 

567 except Exception: 

568 pass 

569 self._models[tool_name] = state 

570 

571 # Notify UI: model loaded — include capabilities from catalog 

572 # so UI knows what features are now available 

573 self._emit_event('model.loaded', { 

574 'model': tool_name, 

575 'device': device_str, 

576 'vram_gb': vram_gb, 

577 'capabilities': self._get_model_capabilities(tool_name), 

578 }) 

579 

580 def _on_tool_stopped(self, tool_name: str, **kwargs): 

581 """Called when RTM stops a tool.""" 

582 with self._lock: 

583 state = self._models.get(tool_name) 

584 if state: 

585 state.device = ModelDevice.UNLOADED 

586 state.priority = ModelPriority.IDLE 

587 state.vram_gb = 0.0 

588 state.ram_gb = 0.0 

589 state.active_inference_count = 0 

590 

591 # Persist the release — even if we crash immediately after, 

592 # the next boot has a fresh snapshot of which models were in use. 

593 try: 

594 self._persist_state_to_disk(force=True) 

595 except Exception: 

596 pass 

597 

598 # Notify UI: model unloaded — these capabilities are now unavailable 

599 self._emit_event('model.unloaded', { 

600 'model': tool_name, 

601 'capabilities': self._get_model_capabilities(tool_name), 

602 }) 

603 

604 # ── Cross-restart persistence (J211) ────────────────────── 

605 # 

606 # The lifecycle state file was declared as LIFECYCLE_STATE_FILE at 

607 # module top but never written until the 2026-04-18 live audit 

608 # flagged the gap. We persist a compact, user-scoped JSON doc: 

609 # 

610 # { 

611 # "version": 1, 

612 # "saved_at": 1713456789.12, 

613 # "models": { 

614 # "indic_parler": { 

615 # "last_access_time": 1713450000.0, 

616 # "access_count": 17, 

617 # "pinned": false 

618 # }, 

619 # ... 

620 # } 

621 # } 

622 # 

623 # Reload happens ONCE in __init__. Save happens throttled inside 

624 # _tick (every _persist_interval_s) and on every model stop event. 

625 # Stale entries (older than LIFECYCLE_STALENESS_S) are discarded at 

626 # load-time so a 6-month-old hint doesn't distort today's boot. 

627 # No secrets / credentials / PII are persisted — only tool names 

628 # and access counters. 

629 

630 def _load_persisted_state(self) -> Dict[str, dict]: 

631 """Read the on-disk hint doc and return the map of models. 

632 

633 Returns an empty dict if the file is missing, malformed, or 

634 older than LIFECYCLE_STALENESS_S (entries are filtered on 

635 that threshold to avoid pinning stale models). 

636 """ 

637 try: 

638 if not LIFECYCLE_STATE_FILE.exists(): 

639 return {} 

640 with open(LIFECYCLE_STATE_FILE, 'r', encoding='utf-8') as f: 

641 data = json.load(f) 

642 except Exception: 

643 return {} 

644 

645 models = data.get('models') if isinstance(data, dict) else None 

646 if not isinstance(models, dict): 

647 return {} 

648 

649 now = time.time() 

650 fresh: Dict[str, dict] = {} 

651 for name, hint in models.items(): 

652 if not isinstance(hint, dict): 

653 continue 

654 last = float(hint.get('last_access_time', 0.0) or 0.0) 

655 # Stale entries get dropped — a week-old hint is worse than 

656 # no hint (it would keep a model the user no longer uses). 

657 if last and (now - last) > LIFECYCLE_STALENESS_S: 

658 continue 

659 fresh[name] = hint 

660 if fresh: 

661 logger.info( 

662 f"Lifecycle: loaded {len(fresh)} persisted hint(s) " 

663 f"from {LIFECYCLE_STATE_FILE}" 

664 ) 

665 return fresh 

666 

667 def _persist_state_to_disk(self, force: bool = False) -> None: 

668 """Write current access hints to disk. 

669 

670 Throttled to at most one write per _persist_interval_s unless 

671 ``force=True`` (used from _on_tool_stopped so a user-visible 

672 release is recorded immediately). Writes are atomic via 

673 rename-tmpfile-to-target so a crash mid-write can't leave a 

674 half-written JSON blob behind. 

675 """ 

676 now = time.time() 

677 if not force and (now - self._last_persist_time) < self._persist_interval_s: 

678 return 

679 

680 with self._lock: 

681 payload = { 

682 'version': 1, 

683 'saved_at': now, 

684 'models': { 

685 name: { 

686 'last_access_time': s.last_access_time, 

687 'access_count': s.access_count, 

688 'pinned': bool(s.pinned), 

689 } 

690 for name, s in self._models.items() 

691 if s.last_access_time > 0 

692 }, 

693 } 

694 

695 try: 

696 LIFECYCLE_STATE_FILE.parent.mkdir(parents=True, exist_ok=True) 

697 tmp = LIFECYCLE_STATE_FILE.with_suffix('.json.tmp') 

698 with open(tmp, 'w', encoding='utf-8') as f: 

699 json.dump(payload, f, ensure_ascii=False, indent=2) 

700 # Atomic rename — survives a crash mid-write. On Windows, 

701 # os.replace handles "target exists" correctly. 

702 os.replace(tmp, LIFECYCLE_STATE_FILE) 

703 self._last_persist_time = now 

704 except Exception as e: 

705 logger.debug(f"Lifecycle: persist skipped ({e})") 

706 

707 # ── Access tracking (called by tool wrappers) ───────────── 

708 

709 @contextmanager 

710 def inference_guard(self, tool_name: str): 

711 """Context manager — prevents eviction during active inference.""" 

712 with self._lock: 

713 state = self._models.get(tool_name) 

714 if state: 

715 state.active_inference_count += 1 

716 state.last_access_time = time.time() 

717 state.access_count += 1 

718 state.access_count_session += 1 

719 try: 

720 yield 

721 finally: 

722 with self._lock: 

723 state = self._models.get(tool_name) 

724 if state: 

725 state.active_inference_count = max( 

726 0, state.active_inference_count - 1) 

727 

728 def notify_access(self, tool_name: str): 

729 """Lightweight access notification. Updates timestamps + counters. 

730 

731 Also resets crash count on successful access — confirms recovery. 

732 """ 

733 with self._lock: 

734 state = self._models.get(tool_name) 

735 if state: 

736 state.last_access_time = time.time() 

737 state.access_count += 1 

738 state.access_count_session += 1 

739 # Successful access = process is healthy, reset crash state 

740 if state.crash_count > 0: 

741 logger.info(f"Model {tool_name} recovered after " 

742 f"{state.crash_count} crash(es)") 

743 state.crash_count = 0 

744 state.restart_backoff_s = 0.0 

745 state.downgraded = False 

746 

747 # ── Tick phases ─────────────────────────────────────────── 

748 

749 def _refresh_memory_state(self): 

750 """Re-read actual GPU state and sync with RTM running state.""" 

751 try: 

752 from .vram_manager import vram_manager 

753 vram_manager.refresh_gpu_info() 

754 except Exception: 

755 pass 

756 

757 # Sync with RTM's actual process state 

758 try: 

759 from .runtime_manager import runtime_tool_manager, TOOL_CONFIGS 

760 for tool_name in TOOL_CONFIGS: 

761 is_alive = runtime_tool_manager._is_server_alive(tool_name) 

762 with self._lock: 

763 state = self._models.get(tool_name) 

764 if state and not is_alive and state.device != ModelDevice.UNLOADED: 

765 state.device = ModelDevice.UNLOADED 

766 state.priority = ModelPriority.IDLE 

767 state.vram_gb = 0.0 

768 state.ram_gb = 0.0 

769 except Exception: 

770 pass 

771 

772 def _update_priorities(self): 

773 """Recalculate priority for every tracked model. 

774 

775 Respects two eviction-policy flags on each ModelState: 

776 

777 * ``pinned`` — the model is always ACTIVE; no matter how 

778 long it idles, it is never demoted to EVICTABLE. Applied 

779 to the draft 0.8B classifier which every chat message 

780 hits first. 

781 

782 * ``pressure_evict_only`` — the model never enters the 

783 EVICTABLE priority through the passive idle path. It 

784 can still hit EVICTABLE temporarily in 

785 ``_respond_to_vram_pressure`` / ``_respond_to_ram_pressure`` 

786 which is exactly what "evict under pressure" means. 

787 Applied to main-tier chat LLMs (2B / 4B) so a 30-second 

788 idle doesn't kill the server the user is actively 

789 talking to. 

790 """ 

791 now = time.time() 

792 with self._lock: 

793 for state in self._models.values(): 

794 if state.device == ModelDevice.UNLOADED: 

795 continue 

796 

797 # Pinned models are permanently ACTIVE. They stay loaded 

798 # for the life of the process and never feel eviction. 

799 if state.pinned: 

800 state.priority = ModelPriority.ACTIVE 

801 continue 

802 

803 if state.active_inference_count > 0: 

804 state.priority = ModelPriority.ACTIVE 

805 continue 

806 

807 idle_s = (now - state.last_access_time 

808 if state.last_access_time else float('inf')) 

809 

810 if idle_s < state.idle_timeout_s * 0.5: 

811 state.priority = ModelPriority.WARM 

812 elif idle_s < state.idle_timeout_s: 

813 # Hive boost extends warm period 

814 if state.hive_boost: 

815 state.priority = ModelPriority.WARM 

816 else: 

817 state.priority = ModelPriority.IDLE 

818 else: 

819 # Past the idle timeout. A normal model becomes 

820 # EVICTABLE and is swept up by _evict_idle_models 

821 # in phase 7. A pressure_evict_only model is 

822 # capped at IDLE so it survives the passive sweep 

823 # and only gets unloaded when VRAM pressure forces 

824 # the hand in phase 3. 

825 if state.pressure_evict_only: 

826 state.priority = ModelPriority.IDLE 

827 else: 

828 state.priority = ModelPriority.EVICTABLE 

829 

830 def _detect_vram_pressure(self) -> bool: 

831 """True if current VRAM usage exceeds threshold.""" 

832 try: 

833 from .vram_manager import vram_manager 

834 pct = vram_manager.get_vram_usage_pct() 

835 return pct >= self._vram_pressure_pct 

836 except Exception: 

837 return False 

838 

839 def _detect_ram_pressure(self) -> bool: 

840 """True if system RAM usage exceeds threshold.""" 

841 try: 

842 import psutil 

843 return psutil.virtual_memory().percent >= self._ram_pressure_pct 

844 except ImportError: 

845 return False 

846 

847 def _detect_cpu_pressure(self) -> bool: 

848 """True if CPU usage exceeds threshold.""" 

849 try: 

850 import psutil 

851 pct = psutil.cpu_percent(interval=None) 

852 return pct >= self._cpu_pressure_pct 

853 except ImportError: 

854 return False 

855 

856 def _detect_disk_pressure(self) -> bool: 

857 """True if free disk space is below minimum threshold.""" 

858 try: 

859 code_root = os.environ.get( 

860 'HEVOLVE_CODE_ROOT', 

861 os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 

862 usage = shutil.disk_usage(code_root) 

863 free_gb = usage.free / (1024 ** 3) 

864 return free_gb < self._disk_free_min_gb 

865 except Exception: 

866 return False 

867 

868 def _respond_to_vram_pressure(self): 

869 """Evict or offload GPU models to relieve VRAM pressure. 

870 

871 Strategy: EVICTABLE first (LRU) → IDLE offload to CPU → WARM offload. 

872 Never touches ACTIVE models. Never touches pinned models — the 

873 ``pinned`` guard is a second safety net on top of the 

874 ``_update_priorities`` rule that forces pinned models into 

875 ACTIVE priority. It catches out-of-band callers that might 

876 invoke this method between priority-update phases. 

877 """ 

878 with self._lock: 

879 candidates = sorted( 

880 [s for s in self._models.values() 

881 if s.device in (ModelDevice.GPU, ModelDevice.CPU_OFFLOAD) 

882 and s.priority != ModelPriority.ACTIVE 

883 and not s.pinned], 

884 key=lambda s: (_PRIORITY_RANK.get(s.priority, 99), 

885 s.last_access_time) 

886 ) 

887 # Work on a copy of names to avoid modifying dict during iteration 

888 candidate_names = [(s.name, s.priority, s.supports_cpu_offload) 

889 for s in candidates] 

890 

891 for name, priority, can_offload in reversed(candidate_names): 

892 if not self._detect_vram_pressure(): 

893 break # Pressure relieved 

894 

895 if priority == ModelPriority.EVICTABLE: 

896 self._do_unload(name) 

897 elif can_offload: 

898 self._do_offload_to_cpu(name) 

899 elif priority in (ModelPriority.IDLE, ModelPriority.EVICTABLE): 

900 self._do_unload(name) 

901 

902 def _respond_to_ram_pressure(self): 

903 """Unload CPU models under RAM pressure (LRU first). 

904 

905 Pinned models are excluded as a second safety net — the 

906 ``pinned`` guard ensures a draft classifier that was 

907 offloaded to CPU for any reason still survives RAM pressure. 

908 """ 

909 with self._lock: 

910 candidates = sorted( 

911 [s for s in self._models.values() 

912 if s.device == ModelDevice.CPU 

913 and s.priority != ModelPriority.ACTIVE 

914 and not s.pinned], 

915 key=lambda s: s.last_access_time 

916 ) 

917 names = [s.name for s in candidates] 

918 

919 for name in names: 

920 if not self._detect_ram_pressure(): 

921 break 

922 self._do_unload(name) 

923 

924 def _respond_to_cpu_pressure(self): 

925 """Reduce system load by evicting non-essential models. 

926 

927 Under CPU pressure, background model processes consume cycles. 

928 Evict EVICTABLE and IDLE models to free CPU for user-facing 

929 work. Pinned models are exempt as a second safety net. 

930 """ 

931 with self._lock: 

932 evictable = [ 

933 s.name for s in self._models.values() 

934 if s.priority in (ModelPriority.EVICTABLE, ModelPriority.IDLE) 

935 and s.device != ModelDevice.UNLOADED 

936 and s.active_inference_count == 0 

937 and not s.pinned 

938 ] 

939 for name in evictable: 

940 logger.info(f"Lifecycle: evicting '{name}' due to CPU pressure") 

941 self._do_unload(name) 

942 

943 def _evict_idle_models(self): 

944 """Background GC: unload models past their idle timeout. 

945 

946 Belt-and-suspenders: the ``pinned`` and ``pressure_evict_only`` 

947 guards live in :meth:`_update_priorities` which never demotes 

948 those models to EVICTABLE, so they don't appear in the 

949 candidate list below. We still check ``pinned`` explicitly 

950 here as a second safety net — if a future code path assigns 

951 EVICTABLE directly (e.g. manual admin eviction), pinning 

952 should still beat it. 

953 """ 

954 now = time.time() 

955 with self._lock: 

956 evictable = [ 

957 s.name for s in self._models.values() 

958 if s.priority == ModelPriority.EVICTABLE 

959 and s.device != ModelDevice.UNLOADED 

960 and s.active_inference_count == 0 

961 and not s.pinned 

962 ] 

963 

964 for name in evictable: 

965 with self._lock: 

966 state = self._models.get(name) 

967 if not state or state.pinned: 

968 continue 

969 idle_s = now - state.last_access_time if state.last_access_time else float('inf') 

970 if idle_s > state.idle_timeout_s: 

971 logger.info( 

972 f"Lifecycle: evicting idle model '{name}' " 

973 f"(idle {idle_s:.0f}s > timeout {state.idle_timeout_s:.0f}s)") 

974 self._do_unload(name) 

975 

976 # ── Load/Unload/Offload operations ──────────────────────── 

977 

978 def _do_unload(self, tool_name: str): 

979 """Unload a model via RuntimeToolManager.""" 

980 try: 

981 from .runtime_manager import runtime_tool_manager 

982 runtime_tool_manager.stop_tool(tool_name) 

983 # on_tool_stopped hook will update our state 

984 logger.info(f"Lifecycle: unloaded '{tool_name}'") 

985 except Exception as e: 

986 logger.debug(f"Lifecycle unload error for {tool_name}: {e}") 

987 

988 def _do_offload_to_cpu(self, tool_name: str) -> bool: 

989 """Migrate a GPU model to CPU.""" 

990 offload_info = CPU_OFFLOAD_TABLE.get(tool_name) 

991 if not offload_info or not offload_info[0]: 

992 return False 

993 

994 with self._lock: 

995 state = self._models.get(tool_name) 

996 if not state or state.device == ModelDevice.CPU: 

997 return False 

998 if state.active_inference_count > 0: 

999 return False 

1000 

1001 _, cpu_ram_gb, method = offload_info 

1002 success = False 

1003 

1004 if method == 'torch_to_cpu': 

1005 success = self._offload_torch_to_cpu(tool_name) 

1006 elif method == 'restart_cpu': 

1007 success = self._offload_via_restart(tool_name) 

1008 

1009 if success: 

1010 with self._lock: 

1011 state = self._models.get(tool_name) 

1012 if state: 

1013 old_vram = state.vram_gb 

1014 state.device = ModelDevice.CPU 

1015 state.vram_gb = 0.0 

1016 state.ram_gb = cpu_ram_gb 

1017 try: 

1018 from .vram_manager import vram_manager 

1019 vram_manager.release(tool_name) 

1020 except Exception: 

1021 pass 

1022 logger.info(f"Lifecycle: offloaded '{tool_name}' to CPU") 

1023 

1024 return success 

1025 

1026 def _offload_torch_to_cpu(self, tool_name: str) -> bool: 

1027 """Move in-process model to CPU / unload to free VRAM.""" 

1028 if tool_name == 'whisper': 

1029 try: 

1030 # faster-whisper (CTranslate2) — used by Nunba. No .cpu() method, 

1031 # must set to None and clear cache to free VRAM. 

1032 from .whisper_tool import unload_whisper 

1033 unload_whisper() 

1034 return True 

1035 except Exception as e: 

1036 logger.debug(f"Whisper offload failed: {e}") 

1037 return False 

1038 

1039 def _offload_via_restart(self, tool_name: str) -> bool: 

1040 """Stop and restart a sidecar tool with cpu_only offload mode.""" 

1041 try: 

1042 from .runtime_manager import runtime_tool_manager, TOOL_CONFIGS 

1043 runtime_tool_manager.stop_tool(tool_name) 

1044 config = TOOL_CONFIGS.get(tool_name) 

1045 if config and not config.get('is_inprocess'): 

1046 result = runtime_tool_manager._start_sidecar( 

1047 tool_name, config, 'cpu_only') 

1048 return result.get('running', False) 

1049 except Exception as e: 

1050 logger.debug(f"Restart offload for {tool_name} failed: {e}") 

1051 return False 

1052 

1053 # ── Phase 10-13: Process Health, Crash Recovery, Swap, Alerts ── 

1054 

1055 # Exit codes that indicate OOM kill 

1056 _OOM_EXIT_CODES = { 

1057 137, # Linux SIGKILL (128 + 9) — typical OOM killer 

1058 -9, # Python representation of SIGKILL 

1059 9, # Raw SIGKILL 

1060 3221225477, # Windows 0xC0000005 — access violation (often OOM-related) 

1061 3221225725, # Windows 0xC00000FD — stack overflow 

1062 } 

1063 

1064 def _check_process_health(self): 

1065 """Phase 10: Detect dead processes and classify crash vs clean exit. 

1066 

1067 For each model we think is loaded, verify the actual process is alive. 

1068 If dead: record exit code, classify OOM vs clean, queue restart. 

1069 """ 

1070 dead_models = [] 

1071 

1072 # Check RTM-managed sidecar processes 

1073 try: 

1074 from .runtime_manager import runtime_tool_manager, TOOL_CONFIGS 

1075 for tool_name in TOOL_CONFIGS: 

1076 with self._lock: 

1077 state = self._models.get(tool_name) 

1078 if not state or state.device == ModelDevice.UNLOADED: 

1079 continue 

1080 

1081 proc = runtime_tool_manager._processes.get(tool_name) 

1082 config = TOOL_CONFIGS.get(tool_name, {}) 

1083 

1084 if config.get('is_inprocess'): 

1085 # In-process models — check module-level state 

1086 if not runtime_tool_manager._is_server_alive(tool_name): 

1087 dead_models.append((tool_name, None, 'inprocess')) 

1088 elif proc is not None: 

1089 exit_code = proc.poll() 

1090 if exit_code is not None: 

1091 dead_models.append((tool_name, exit_code, 'sidecar')) 

1092 else: 

1093 # No process object but state says loaded — stale state 

1094 dead_models.append((tool_name, None, 'orphan')) 

1095 except Exception as e: 

1096 logger.debug(f"Health check RTM scan error: {e}") 

1097 

1098 # Check LLM (llama.cpp) process — not managed by RTM 

1099 try: 

1100 self._check_llm_health(dead_models) 

1101 except Exception as e: 

1102 # CRITICAL: previous code was `except Exception: pass` which 

1103 # silently masked LLM watchdog failures. The user's "main LLM 

1104 # cannot die" rule requires every health-check failure to be 

1105 # visible in the log. Log + traceback so the next iteration 

1106 # of root-cause work has signal instead of silence. 

1107 logger.exception( 

1108 f"[LLM-WATCHDOG] _check_llm_health raised {type(e).__name__}: " 

1109 f"{e!s} — main LLM health is now UNKNOWN; " 

1110 f"watchdog will retry on next tick" 

1111 ) 

1112 

1113 # Process each dead model 

1114 for tool_name, exit_code, proc_type in dead_models: 

1115 logger.warning( 

1116 f"[LLM-WATCHDOG] Dead model queued for handling: " 

1117 f"tool={tool_name} exit_code={exit_code} proc_type={proc_type}" 

1118 ) 

1119 self._handle_dead_process(tool_name, exit_code, proc_type) 

1120 

1121 def _check_llm_health(self, dead_models: list): 

1122 """Check llama.cpp server health (separate from RTM sidecar tools). 

1123 

1124 Supervises BOTH: 

1125 1. Nunba-managed server via LlamaConfig (primary in bundled mode). 

1126 2. Direct-launch server (self._direct_llama_proc) when Nunba's 

1127 LlamaConfig module isn't available (G3 — standalone / Docker / 

1128 bundled-without-supervisor). 

1129 

1130 Either source adding to ``dead_models`` triggers _handle_dead_process, 

1131 which queues a restart through _process_restart_queue. 

1132 """ 

1133 with self._lock: 

1134 state = self._models.get('llm') 

1135 # The MAIN LLM is the system's primary engine — per the user's 

1136 # explicit "main LLM cannot die" invariant we MUST keep checking 

1137 # whether it is alive even when our local state thinks it is 

1138 # UNLOADED. An UNLOADED state can come from: 

1139 # (a) crash_count exceeded max → _handle_dead_process bailed out 

1140 # (b) eviction under VRAM pressure 

1141 # (c) a stale state from a prior boot where Nunba "adopted" an 

1142 # externally-running server (server_process=None) and never 

1143 # refreshed device when that adopted server died 

1144 # In every one of those cases, the previous early-return left the 

1145 # main engine permanently dead. We instead log + continue, so 

1146 # the HTTP probe below can decide. 

1147 if not state: 

1148 # STATELESS PROBE PATH (added 2026-05-08 after live evidence 

1149 # showed llama-server died at 18:47 and the watchdog was blind 

1150 # because Nunba's LlamaConfig.start_server() spawns the server 

1151 # outside RuntimeToolManager — so RTM's `_on_tool_started('llm')` 

1152 # callback never fires and `self._models['llm']` stays absent). 

1153 # Even without a registered state we MUST keep the main engine 

1154 # alive: probe llama-server via Nunba's LlamaConfig directly, 

1155 # and if it's down, queue a restart via the existing 

1156 # `_handle_dead_process` path (which will re-spawn via 

1157 # `LlamaConfig.start_server`). The fake state we synthesize 

1158 # here lives only inside `_handle_dead_process` (it reads 

1159 # state.crash_count etc.), so we register a minimal one 

1160 # before queueing. 

1161 try: 

1162 from llama.llama_config import LlamaConfig 

1163 _cfg = LlamaConfig() 

1164 if not _cfg.check_server_running(): 

1165 logger.warning( 

1166 f"[LLM-WATCHDOG] STATELESS-PROBE: llama-server " 

1167 f"unreachable on port " 

1168 f"{_cfg.config.get('server_port')!r} and no 'llm' " 

1169 f"state registered with lifecycle. Registering a " 

1170 f"minimal state and queueing restart so the main " 

1171 f"engine cannot stay dead silently." 

1172 ) 

1173 # Register a minimal state so _handle_dead_process can 

1174 # update crash_count + restart_backoff etc. 

1175 with self._lock: 

1176 self._models['llm'] = ModelState( 

1177 name='llm', 

1178 device=ModelDevice.UNLOADED, 

1179 priority=ModelPriority.IDLE, 

1180 vram_gb=0.0, 

1181 ram_gb=0.0, 

1182 last_access_time=time.time(), 

1183 crash_count=0, 

1184 pressure_evict_only=True, 

1185 ) 

1186 dead_models.append(('llm', None, 'stateless_probe')) 

1187 return 

1188 else: 

1189 # Healthy but unregistered — register state lazily so 

1190 # subsequent ticks take the fast path with full info. 

1191 logger.info( 

1192 f"[LLM-WATCHDOG] STATELESS-PROBE: llama-server is " 

1193 f"alive on port {_cfg.config.get('server_port')!r} " 

1194 f"but had no registered state — registering now so " 

1195 f"future ticks can supervise it." 

1196 ) 

1197 with self._lock: 

1198 self._models['llm'] = ModelState( 

1199 name='llm', 

1200 device=ModelDevice.GPU, # Best-effort default 

1201 priority=ModelPriority.ACTIVE, 

1202 vram_gb=0.0, 

1203 ram_gb=0.0, 

1204 last_access_time=time.time(), 

1205 crash_count=0, 

1206 pressure_evict_only=True, 

1207 ) 

1208 return 

1209 except ImportError: 

1210 logger.debug( 

1211 "[LLM-WATCHDOG] No 'llm' state and LlamaConfig not " 

1212 "importable (Docker / standalone mode) — relying on " 

1213 "G3 direct-launch supervision below." 

1214 ) 

1215 except Exception as e: 

1216 logger.exception( 

1217 f"[LLM-WATCHDOG] STATELESS-PROBE raised " 

1218 f"{type(e).__name__}: {e!s} — main engine health is " 

1219 f"now UNKNOWN; will retry on next tick" 

1220 ) 

1221 return 

1222 if state and state.device == ModelDevice.UNLOADED: 

1223 logger.warning( 

1224 f"[LLM-WATCHDOG] state.device=UNLOADED — investigating " 

1225 f"whether the main engine should be re-loaded " 

1226 f"(crash_count={state.crash_count}, " 

1227 f"last_exit_code={state.last_exit_code}). " 

1228 f"Continuing health check despite UNLOADED — " 

1229 f"main LLM cannot stay dead." 

1230 ) 

1231 

1232 nunba_handled = False 

1233 try: 

1234 from llama.llama_config import LlamaConfig 

1235 config = LlamaConfig() 

1236 if config.server_process is not None: 

1237 nunba_handled = True 

1238 exit_code = config.server_process.poll() 

1239 if exit_code is not None: 

1240 logger.warning( 

1241 f"[LLM-WATCHDOG] Nunba-spawned llama-server died: " 

1242 f"PID={config.server_process.pid} " 

1243 f"exit_code={exit_code}" 

1244 ) 

1245 dead_models.append(('llm', exit_code, 'llm_server')) 

1246 else: 

1247 logger.debug( 

1248 f"[LLM-WATCHDOG] Nunba-spawned llama-server alive " 

1249 f"PID={config.server_process.pid}" 

1250 ) 

1251 else: 

1252 # No Popen handle — either we adopted an external server, OR 

1253 # state thinks it's loaded but the spawning Python process 

1254 # already exited (handle gone with it). Either way, an 

1255 # HTTP probe is the only authoritative signal. We do NOT 

1256 # gate on `state.device != UNLOADED` here — the main LLM 

1257 # must be probed even when state is stale (see Bug B in 

1258 # the 2026-05-08 incident: adopt-then-die left the system 

1259 # half-alive because the watchdog stopped probing once 

1260 # state went UNLOADED). 

1261 alive = config.check_server_running() 

1262 if not alive: 

1263 logger.warning( 

1264 f"[LLM-WATCHDOG] Adopted/stateless llama-server is " 

1265 f"UNREACHABLE via HTTP probe — queueing restart. " 

1266 f"state.device={state.device.value} " 

1267 f"server_port={config.config.get('server_port')}" 

1268 ) 

1269 dead_models.append(('llm', None, 'llm_server')) 

1270 nunba_handled = True 

1271 else: 

1272 nunba_handled = True 

1273 logger.debug( 

1274 "[LLM-WATCHDOG] Adopted llama-server alive (HTTP 200)" 

1275 ) 

1276 except ImportError as e: 

1277 # Previous code: `except ImportError: pass` — silent. Surface 

1278 # this as a log line so when Nunba is bundled-without-supervisor 

1279 # we can see the LlamaConfig path is unavailable and the G3 

1280 # direct-launch path below is being used by design. 

1281 logger.info( 

1282 f"[LLM-WATCHDOG] LlamaConfig import unavailable " 

1283 f"({e!s}) — falling through to G3 direct-launch supervision" 

1284 ) 

1285 except Exception as e: 

1286 # Previous code only caught ImportError — a real LlamaConfig() 

1287 # instantiation error (e.g. corrupted llama_config.json, 

1288 # filesystem permission error) would propagate up and be 

1289 # swallowed by the outer except in _check_process_health. 

1290 # Log + re-raise so caller's CRITICAL log fires. 

1291 logger.exception( 

1292 f"[LLM-WATCHDOG] LlamaConfig health-check raised " 

1293 f"{type(e).__name__}: {e!s}" 

1294 ) 

1295 raise 

1296 

1297 # G3: Direct-launch supervision (fires when Nunba not in charge). 

1298 if nunba_handled: 

1299 return 

1300 proc = self._direct_llama_proc 

1301 port = self._direct_llama_port 

1302 if proc is not None: 

1303 exit_code = proc.poll() 

1304 if exit_code is not None: 

1305 logger.warning( 

1306 f"[G3] direct llama-server died: PID={proc.pid} " 

1307 f"exit_code={exit_code} port={port}") 

1308 # Clear stale handle so _handle_dead_process → restart 

1309 # can cleanly relaunch via _launch_llama_server_direct. 

1310 try: 

1311 fh = self._direct_llama_log_fh 

1312 if fh is not None and not fh.closed: 

1313 fh.close() 

1314 except Exception: 

1315 pass 

1316 self._direct_llama_proc = None 

1317 self._direct_llama_log_fh = None 

1318 dead_models.append(('llm', exit_code, 'llm_server_direct')) 

1319 elif port is not None and state.device != ModelDevice.UNLOADED: 

1320 # We once launched directly but lost the handle — verify via HTTP. 

1321 # A missing handle + unreachable port == dead server. 

1322 try: 

1323 import urllib.request as _ur 

1324 import urllib.error as _ue 

1325 url = f'http://127.0.0.1:{port}/health' 

1326 try: 

1327 with _ur.urlopen(url, timeout=2) as resp: 

1328 if resp.status != 200: 

1329 dead_models.append( 

1330 ('llm', None, 'llm_server_direct')) 

1331 except (_ue.URLError, _ue.HTTPError, OSError): 

1332 dead_models.append(('llm', None, 'llm_server_direct')) 

1333 except Exception as e: 

1334 logger.debug(f"[G3] direct HTTP health check skipped: {e}") 

1335 

1336 def _handle_dead_process(self, tool_name: str, exit_code: Optional[int], 

1337 proc_type: str): 

1338 """Classify crash, update state, queue restart if appropriate.""" 

1339 is_oom = exit_code in self._OOM_EXIT_CODES if exit_code is not None else False 

1340 crash_type = 'oom' if is_oom else ('crash' if exit_code else 'disappeared') 

1341 

1342 logger.warning( 

1343 f"Dead process detected: {tool_name} " 

1344 f"(exit_code={exit_code}, type={crash_type}, proc={proc_type})") 

1345 

1346 with self._lock: 

1347 state = self._models.get(tool_name) 

1348 if not state: 

1349 return 

1350 

1351 old_device = state.device 

1352 old_vram = state.vram_gb 

1353 

1354 # Update state to unloaded 

1355 state.device = ModelDevice.UNLOADED 

1356 state.priority = ModelPriority.IDLE 

1357 state.crash_count += 1 

1358 state.last_crash_time = time.time() 

1359 state.last_exit_code = exit_code 

1360 state.vram_gb = 0.0 

1361 state.ram_gb = 0.0 

1362 state.active_inference_count = 0 

1363 

1364 # Exponential backoff: 5s, 10s, 20s, 40s... capped at 300s 

1365 state.restart_backoff_s = min( 

1366 self._base_backoff_s * (2 ** (state.crash_count - 1)), 

1367 self._max_backoff_s 

1368 ) 

1369 

1370 # MAIN LLM EXEMPTION (user invariant 2026-05-08): 

1371 # "the LLM is the main engine, it cannot die — we observe with 

1372 # watchdogs etc." For every other tool we honor max_crash_restarts 

1373 # so a permanently-broken sidecar doesn't loop forever. For 'llm' 

1374 # specifically, we keep retrying — but at the capped backoff 

1375 # interval (300s) so we don't burn CPU on a hopeless box. The 

1376 # alternative (give up) leaves the system permanently degraded 

1377 # and was the root cause of the 2026-05-08 11:40 → 13:56+ silent 

1378 # outage. 

1379 if tool_name == 'llm': 

1380 should_restart = True 

1381 if state.crash_count > self._max_crash_restarts: 

1382 # Past the configured ceiling — log loudly so on-call 

1383 # sees the persistent failure, but keep trying at the 

1384 # max backoff. Reset crash_count to the ceiling so 

1385 # restart_backoff_s stays clamped at _max_backoff_s. 

1386 logger.error( 

1387 f"[LLM-WATCHDOG] LLM has crashed {state.crash_count} " 

1388 f"times (> max_crash_restarts={self._max_crash_restarts})" 

1389 f" — continuing to retry every {self._max_backoff_s}s " 

1390 f"because the main engine cannot stay dead. " 

1391 f"Investigate llama-server log for repeated crash " 

1392 f"cause." 

1393 ) 

1394 else: 

1395 should_restart = state.crash_count <= self._max_crash_restarts 

1396 

1397 # Release VRAM allocation 

1398 try: 

1399 from .vram_manager import vram_manager 

1400 vram_manager.release(tool_name) 

1401 except Exception: 

1402 pass 

1403 

1404 # Release from RTM process table 

1405 try: 

1406 from .runtime_manager import runtime_tool_manager 

1407 runtime_tool_manager._processes.pop(tool_name, None) 

1408 runtime_tool_manager._ports.pop(tool_name, None) 

1409 except Exception: 

1410 pass 

1411 

1412 # Sync catalog state 

1413 try: 

1414 from integrations.service_tools.model_orchestrator import get_orchestrator 

1415 get_orchestrator().notify_unloaded( 

1416 self._guess_model_type(tool_name), tool_name) 

1417 except Exception: 

1418 pass 

1419 

1420 # Emit crash event 

1421 self._emit_event('model.crash', { 

1422 'model': tool_name, 

1423 'crash_type': crash_type, 

1424 'exit_code': exit_code, 

1425 'crash_count': state.crash_count if state else 0, 

1426 'will_restart': should_restart, 

1427 }) 

1428 

1429 # Queue restart with backoff (if under max retries) 

1430 if should_restart: 

1431 # IDEMPOTENCE GUARD (2026-05-09 starvation fix): 

1432 # If a restart is already queued or in progress, do NOT 

1433 # overwrite it. Re-queuing every dead-detection tick was 

1434 # the 2026-05-09 starvation bug — retry_after kept getting 

1435 # pushed forward by the latest tick's `now + backoff_s`, 

1436 # so _process_restart_queue's `now >= retry_after` check 

1437 # never succeeded. Live evidence: 220 `Queued restart for 

1438 # llm` lines, 0 `_restart_llm starting` lines over 7.5h. 

1439 # Keep the original retry_after so the spawn fires. 

1440 with self._lock: 

1441 existing = self._restart_pending.get(tool_name) 

1442 if existing and isinstance(existing, dict): 

1443 if existing.get('in_progress'): 

1444 logger.debug( 

1445 f"[LIFECYCLE] {tool_name} restart IN PROGRESS — " 

1446 f"suppressing re-queue from dead-detection tick" 

1447 ) 

1448 else: 

1449 eta = max(0, existing.get('retry_after', 0) - time.time()) 

1450 logger.debug( 

1451 f"[LIFECYCLE] {tool_name} restart already queued " 

1452 f"(fires in {eta:.0f}s) — keeping original schedule, " 

1453 f"not re-queuing" 

1454 ) 

1455 return 

1456 retry_after = time.time() + state.restart_backoff_s 

1457 downgrade = is_oom # OOM → restart on lower resource tier 

1458 self._restart_pending[tool_name] = { 

1459 'retry_after': retry_after, 

1460 'downgrade': downgrade, 

1461 'old_device': old_device.value if old_device else 'gpu', 

1462 'in_progress': False, 

1463 } 

1464 logger.info( 

1465 f"Queued restart for {tool_name} in {state.restart_backoff_s:.0f}s" 

1466 f" (attempt {state.crash_count}/{self._max_crash_restarts})" 

1467 f"{' [DOWNGRADE]' if downgrade else ''}") 

1468 else: 

1469 logger.error( 

1470 f"Model {tool_name} exceeded max restarts " 

1471 f"({self._max_crash_restarts}), giving up. " 

1472 f"Manual intervention required.") 

1473 self._emit_event('model.restart_exhausted', { 

1474 'model': tool_name, 

1475 'crash_count': state.crash_count if state else 0, 

1476 }) 

1477 

1478 def _process_restart_queue(self): 

1479 """Phase 11: Process pending crash restarts with exponential backoff.""" 

1480 now = time.time() 

1481 ready = [] 

1482 with self._lock: 

1483 for name, info in list(self._restart_pending.items()): 

1484 if not isinstance(info, dict): 

1485 continue 

1486 if info.get('in_progress'): 

1487 # Already being spawned by a prior tick — skip. Without 

1488 # this check, two ticks could pull the same entry and 

1489 # double-spawn (race that motivated the IDEMPOTENCE 

1490 # GUARD in _handle_dead_process — both halves needed). 

1491 continue 

1492 if now >= info.get('retry_after', 0): 

1493 # Mark in_progress instead of deleting. Retain the 

1494 # entry so concurrent ticks see in_progress=True and 

1495 # skip. We pop the entry only AFTER spawn completes 

1496 # (success or exception), in the finally block below. 

1497 info['in_progress'] = True 

1498 self._restart_pending[name] = info 

1499 ready.append((name, info)) 

1500 

1501 for name, info in ready: 

1502 # IDEMPOTENCE: pre-spawn HTTP probe. If the server came up 

1503 # via an external respawn (user restarted llama-server in a 

1504 # different terminal) between when this restart was queued 

1505 # and now, skip the spawn — don't kill the user's process 

1506 # via stop_server() and re-spawn our own. Same probe also 

1507 # protects against the race where an earlier _restart_llm 

1508 # already succeeded but a stale queue entry survived. 

1509 if name == 'llm': 

1510 try: 

1511 from llama.llama_config import LlamaConfig 

1512 if LlamaConfig().check_server_running(): 

1513 logger.info( 

1514 f"[LIFECYCLE] {name} already healthy on HTTP " 

1515 f"probe — skipping queued spawn (idempotent " 

1516 f"no-op; likely came up via external respawn " 

1517 f"or earlier _restart_llm)" 

1518 ) 

1519 # Detect ACTUAL device — Nunba defaults to GPU but 

1520 # an externally-respawned server (or a Nunba retry 

1521 # after a CUDA-context loss from sleep/hibernate) 

1522 # may be on CPU. Self-review caught the v1 of this 

1523 # code unconditionally stamping GPU, which would 

1524 # mislead the catalog/dispatcher. vram_manager has 

1525 # an internal cache so this is sub-millisecond after 

1526 # first call (no nvidia-smi cost per tick). 

1527 try: 

1528 from integrations.service_tools.vram_manager import ( 

1529 vram_manager as _vmm, 

1530 ) 

1531 _gpu_info = _vmm.detect_gpu() or {} 

1532 _device = (ModelDevice.GPU 

1533 if _gpu_info.get('cuda_available') 

1534 else ModelDevice.CPU) 

1535 except Exception: 

1536 # If vram_manager can't be reached, prefer GPU 

1537 # since that's the default Nunba launch mode — 

1538 # next full _check_llm_health tick will resync 

1539 # if wrong. 

1540 _device = ModelDevice.GPU 

1541 with self._lock: 

1542 self._restart_pending.pop(name, None) 

1543 # Sync watchdog state so STATELESS-PROBE doesn't 

1544 # immediately re-detect "dead" and re-queue. 

1545 st = self._models.get(name) 

1546 if st: 

1547 st.device = _device 

1548 st.priority = ModelPriority.ACTIVE 

1549 st.last_access_time = time.time() 

1550 continue 

1551 except Exception: 

1552 pass # Probe failed — proceed with spawn 

1553 downgrade = info.get('downgrade', False) 

1554 old_device = info.get('old_device', 'gpu') 

1555 

1556 # Decide restart mode 

1557 if downgrade and old_device == 'gpu': 

1558 restart_mode = 'cpu_offload' # OOM on GPU → try CPU offload 

1559 elif downgrade and old_device == 'cpu_offload': 

1560 restart_mode = 'cpu_only' # OOM on offload → pure CPU 

1561 else: 

1562 restart_mode = old_device # Same mode as before 

1563 

1564 logger.info(f"Restarting {name} in {restart_mode} mode" 

1565 f" (was {old_device}, downgrade={downgrade})") 

1566 

1567 success = False 

1568 try: 

1569 if name == 'llm': 

1570 success = self._restart_llm(restart_mode) 

1571 else: 

1572 success = self._restart_rtm_tool(name, restart_mode) 

1573 finally: 

1574 # ALWAYS clear the in_progress entry after spawn returns. 

1575 # If we succeeded, no further pending needed. If we failed 

1576 # and the LLM-EXEMPTION + dead-detection wants another 

1577 # attempt, the next _check_process_health tick re-queues 

1578 # via _handle_dead_process — and that path's IDEMPOTENCE 

1579 # GUARD already prevents starvation by NOT overwriting 

1580 # pending if one already exists. So pop here is safe. 

1581 # Without this pop, in_progress=True would persist and 

1582 # the entry would be skipped forever by future ticks. 

1583 with self._lock: 

1584 self._restart_pending.pop(name, None) 

1585 

1586 if success: 

1587 with self._lock: 

1588 state = self._models.get(name) 

1589 if state: 

1590 state.downgraded = downgrade 

1591 self._emit_event('model.restarted', { 

1592 'model': name, 'mode': restart_mode, 'downgraded': downgrade}) 

1593 else: 

1594 # Failure path: bump crash_count + backoff, then re-queue 

1595 # ONLY if no entry is currently pending (other ticks may 

1596 # have re-queued via _handle_dead_process while we were 

1597 # spawning). Same idempotence rule as _handle_dead_process. 

1598 with self._lock: 

1599 state = self._models.get(name) 

1600 already_pending = name in self._restart_pending 

1601 if (state and not already_pending 

1602 and state.crash_count <= self._max_crash_restarts): 

1603 state.crash_count += 1 

1604 state.restart_backoff_s = min( 

1605 state.restart_backoff_s * 2, self._max_backoff_s) 

1606 self._restart_pending[name] = { 

1607 'retry_after': now + state.restart_backoff_s, 

1608 'downgrade': downgrade, 

1609 'old_device': restart_mode, 

1610 'in_progress': False, 

1611 } 

1612 

1613 def _restart_llm(self, mode: str) -> bool: 

1614 """Restart llama.cpp server in specified mode. 

1615 

1616 Primary path: Nunba's LlamaConfig (reads ~/.nunba/llama_config.json). 

1617 Fallback: direct subprocess launch (Docker/standalone without Nunba). 

1618 """ 

1619 logger.info( 

1620 f"[LLM-WATCHDOG] _restart_llm starting: mode={mode!r} " 

1621 f"(primary path: Nunba LlamaConfig.start_server)" 

1622 ) 

1623 # Primary: Nunba manages the server 

1624 try: 

1625 from llama.llama_config import LlamaConfig 

1626 config = LlamaConfig() 

1627 logger.info( 

1628 f"[LLM-WATCHDOG] Calling LlamaConfig.stop_server() to clean " 

1629 f"up any half-alive server state before restart" 

1630 ) 

1631 config.stop_server() 

1632 config.config['use_gpu'] = (mode == 'gpu') 

1633 config._save_config() 

1634 logger.info( 

1635 f"[LLM-WATCHDOG] Calling LlamaConfig.start_server() " 

1636 f"(use_gpu={config.config['use_gpu']}, " 

1637 f"port={config.config.get('server_port')})" 

1638 ) 

1639 ok = config.start_server() 

1640 if ok: 

1641 logger.info( 

1642 f"[LLM-WATCHDOG] Nunba LLM restart SUCCEEDED on port " 

1643 f"{config.config.get('server_port')}" 

1644 ) 

1645 else: 

1646 logger.error( 

1647 f"[LLM-WATCHDOG] Nunba LLM restart RETURNED FALSE " 

1648 f"from start_server() — caller will re-queue with " 

1649 f"backoff. Check llama_server_<port>.log for the " 

1650 f"underlying spawn failure." 

1651 ) 

1652 return ok 

1653 except ImportError: 

1654 logger.info( 

1655 "[LLM-WATCHDOG] Nunba LlamaConfig not importable — " 

1656 "falling through to direct llama-server launch (G3 path)" 

1657 ) 

1658 except Exception as e: 

1659 logger.exception( 

1660 f"[LLM-WATCHDOG] Nunba LLM restart raised " 

1661 f"{type(e).__name__}: {e!s} — caller will re-queue with " 

1662 f"backoff" 

1663 ) 

1664 return False 

1665 

1666 # Fallback: direct launch (Docker/standalone) 

1667 return self._launch_llama_server_direct(mode) 

1668 

1669 def _launch_llama_server_direct(self, mode: str) -> bool: 

1670 """Launch llama-server directly without Nunba. 

1671 

1672 Uses model_catalog for model selection, port_registry for port, 

1673 compute_config for parallel slots, vram_manager for GPU detection. 

1674 """ 

1675 import subprocess as _sp 

1676 

1677 # Find llama-server binary 

1678 server_bin = self._find_llama_server_binary() 

1679 if not server_bin: 

1680 logger.error("llama-server binary not found") 

1681 return False 

1682 

1683 # Find model + mmproj from catalog 

1684 model_path, mmproj_path = self._find_model_files() 

1685 if not model_path: 

1686 logger.error("No GGUF model file found") 

1687 return False 

1688 

1689 # Port from registry 

1690 try: 

1691 from core.port_registry import get_port 

1692 port = get_port('llm') 

1693 except Exception: 

1694 port = int(os.environ.get('HEVOLVE_LLM_PORT', 8080)) 

1695 

1696 # Build command 

1697 gpu_layers = 99 if mode == 'gpu' else 0 

1698 ctx_size = int(os.environ.get('HEVOLVE_LLM_CTX_SIZE', 8192)) 

1699 

1700 cmd = [ 

1701 server_bin, 

1702 '--model', model_path, 

1703 '--port', str(port), 

1704 '--ctx-size', str(ctx_size), 

1705 '--n-gpu-layers', str(gpu_layers), 

1706 '--threads', '4', 

1707 '--flash-attn', 'on', 

1708 ] 

1709 if mmproj_path: 

1710 cmd.extend(['--mmproj', mmproj_path]) 

1711 

1712 log_path = os.path.join( 

1713 os.environ.get('TEMP', '/tmp'), f'llama_{port}.log') 

1714 log_fh = None 

1715 try: 

1716 log_fh = open(log_path, 'w') 

1717 _popen_kwargs = dict(stdout=log_fh, stderr=_sp.STDOUT) 

1718 if os.name == 'nt': 

1719 _popen_kwargs['creationflags'] = _sp.CREATE_NO_WINDOW 

1720 proc = _sp.Popen(cmd, **_popen_kwargs) 

1721 # Store handle so it stays open for the child process 

1722 self._direct_log_fh = log_fh 

1723 log_fh = None # Prevent close in finally 

1724 # G3: Track the Popen so _check_llm_health can detect death 

1725 # and _process_restart_queue can resurrect it. 

1726 self._direct_llama_proc = proc 

1727 self._direct_llama_port = port 

1728 self._direct_llama_mode = mode 

1729 self._direct_llama_log_fh = self._direct_log_fh 

1730 self._direct_llama_last_restart = time.time() 

1731 logger.info(f"llama-server started: PID={proc.pid} port={port} " 

1732 f"gpu_layers={gpu_layers} log={log_path}") 

1733 return True 

1734 except Exception as e: 

1735 logger.error(f"Direct llama-server launch failed: {e}") 

1736 return False 

1737 finally: 

1738 if log_fh is not None: 

1739 log_fh.close() 

1740 

1741 @staticmethod 

1742 def _find_llama_server_binary() -> Optional[str]: 

1743 """Find llama-server binary across known locations.""" 

1744 home = str(Path.home()) 

1745 for base in ['.nunba', '.trueflow']: 

1746 for sub in ['llama.cpp/build/bin/Release', 'llama.cpp']: 

1747 for ext in ['', '.exe']: 

1748 p = os.path.join(home, base, sub, f'llama-server{ext}') 

1749 if os.path.isfile(p): 

1750 return p 

1751 # Check PATH 

1752 binary = shutil.which('llama-server') 

1753 if binary: 

1754 return binary 

1755 return None 

1756 

1757 @staticmethod 

1758 def _find_model_files() -> Tuple[Optional[str], Optional[str]]: 

1759 """Find GGUF model + mmproj from known locations. 

1760 

1761 Searches ~/.nunba/models/ and ~/.trueflow/models/ for the largest 

1762 Qwen GGUF file (prefers 4B over 0.8B for the main LLM port). 

1763 """ 

1764 home = str(Path.home()) 

1765 model_dirs = [ 

1766 os.path.join(home, '.nunba', 'models'), 

1767 os.path.join(home, '.trueflow', 'models'), 

1768 ] 

1769 best_model = None 

1770 best_size = 0 

1771 mmproj = None 

1772 

1773 for d in model_dirs: 

1774 if not os.path.isdir(d): 

1775 continue 

1776 for f in os.listdir(d): 

1777 fp = os.path.join(d, f) 

1778 if not os.path.isfile(fp): 

1779 continue 

1780 if f.startswith('mmproj') and f.endswith('.gguf') and not mmproj: 

1781 mmproj = fp 

1782 elif f.endswith('.gguf') and not f.startswith('mmproj'): 

1783 size = os.path.getsize(fp) 

1784 if size > best_size: 

1785 best_size = size 

1786 best_model = fp 

1787 

1788 return best_model, mmproj 

1789 

1790 def _restart_rtm_tool(self, tool_name: str, mode: str) -> bool: 

1791 """Restart a RuntimeToolManager-managed tool.""" 

1792 try: 

1793 from .runtime_manager import runtime_tool_manager, TOOL_CONFIGS 

1794 config = TOOL_CONFIGS.get(tool_name) 

1795 if not config: 

1796 return False 

1797 if config.get('is_inprocess'): 

1798 result = runtime_tool_manager._start_inprocess(tool_name, config) 

1799 else: 

1800 result = runtime_tool_manager._start_sidecar( 

1801 tool_name, config, mode) 

1802 return result.get('running', False) 

1803 except Exception as e: 

1804 logger.error(f"RTM restart failed for {tool_name}: {e}") 

1805 return False 

1806 

1807 # ── Swap Queue ─────────────────────────────────────────────── 

1808 

1809 def request_swap(self, needed_model: str, needed_type: str = 'gpu', 

1810 evict_target: Optional[str] = None) -> bool: 

1811 """Request a model swap: evict lowest-priority GPU model to make room. 

1812 

1813 Called by orchestrator when a model can't fit alongside current loads. 

1814 The evicted model is queued for restoration when the new model idles. 

1815 

1816 Returns True if swap was initiated, False if nothing can be evicted. 

1817 """ 

1818 with self._lock: 

1819 # Find eviction candidate: lowest priority, non-ACTIVE GPU model 

1820 if evict_target: 

1821 candidates = [evict_target] 

1822 else: 

1823 # LLMs are owned by llama-server (separate process); evicting 

1824 # them from the registry doesn't free VRAM (see line 1562). 

1825 # Skip them as swap candidates so we never burn down the 

1826 # active LLM in exchange for a TTS that still won't fit. 

1827 gpu_models = sorted( 

1828 [s for s in self._models.values() 

1829 if s.device == ModelDevice.GPU 

1830 and s.priority != ModelPriority.ACTIVE 

1831 and s.active_inference_count == 0 

1832 and s.name != needed_model 

1833 and not s.name.startswith('llm-') 

1834 and s.model_type != 'llm'], 

1835 key=lambda s: ( 

1836 _PRIORITY_RANK.get(s.priority, 99), 

1837 s.last_access_time, 

1838 ) 

1839 ) 

1840 candidates = [s.name for s in gpu_models] 

1841 

1842 if not candidates: 

1843 logger.warning(f"Swap failed for {needed_model}: no evictable GPU model") 

1844 return False 

1845 

1846 evicted = candidates[-1] # Lowest priority, oldest access 

1847 logger.info(f"Swap: evicting '{evicted}' to make room for '{needed_model}'") 

1848 

1849 # Record in swap queue BEFORE eviction 

1850 self._swap_queue.append({ 

1851 'name': evicted, 

1852 'device': 'gpu', 

1853 'evicted_for': needed_model, 

1854 'timestamp': time.time(), 

1855 }) 

1856 

1857 # Evict 

1858 self._do_unload(evicted) 

1859 

1860 self._emit_event('model.swapped', { 

1861 'evicted': evicted, 

1862 'loaded': needed_model, 

1863 'swap_queue_depth': len(self._swap_queue), 

1864 }) 

1865 return True 

1866 

1867 def ensure_inference_headroom(self, needed_gb: float = 1.0, 

1868 requester: str = 'tts') -> bool: 

1869 """Ensure enough free VRAM for inference buffers. 

1870 

1871 Called BEFORE synthesis/inference — not model loading. 

1872 Offloads models to CPU even if they're ACTIVE, as long as the 

1873 requester has higher priority (e.g., TTS > idle STT). 

1874 

1875 Whisper STT runs continuously but is mostly filtering silence — 

1876 safe to offload for a few seconds during TTS synthesis. 

1877 

1878 Returns True if headroom was created (or already existed). 

1879 """ 

1880 try: 

1881 from .vram_manager import vram_manager 

1882 free_gb = vram_manager.get_free_vram() 

1883 if free_gb >= needed_gb: 

1884 return True 

1885 

1886 logger.info(f"Inference headroom: {free_gb:.1f}GB free, need {needed_gb}GB " 

1887 f"for {requester}") 

1888 

1889 # NEVER evict the LLM — it runs as a separate process (llama-server) 

1890 # that the lifecycle manager can't actually unload. "Evicting" it just 

1891 # removes the registry entry while llama-server keeps all its VRAM. 

1892 # Instead, skip headroom check — the model will use whatever is free. 

1893 if free_gb < needed_gb: 

1894 logger.info(f"Inference headroom: proceeding anyway — model will " 

1895 f"use available {free_gb:.1f}GB (llama-server owns the rest)") 

1896 return True 

1897 

1898 # Standard swap first (evicts IDLE/EVICTABLE models) 

1899 if self.request_swap(needed_model=requester, needed_type='gpu'): 

1900 return True 

1901 

1902 # Force-offload any GPU model that isn't the requester. 

1903 # Whisper registers itself on load (whisper_tool.py). 

1904 with self._lock: 

1905 gpu_models = [ 

1906 s for s in self._models.values() 

1907 if s.device == ModelDevice.GPU 

1908 and s.name != requester 

1909 ] 

1910 for model in gpu_models: 

1911 logger.info(f"Inference headroom: offloading {model.name} to CPU " 

1912 f"for {requester}") 

1913 success = self._do_offload_to_cpu(model.name) 

1914 if success: 

1915 self._swap_queue.append({ 

1916 'name': model.name, 

1917 'device': 'gpu', 

1918 'evicted_for': requester, 

1919 'timestamp': time.time(), 

1920 }) 

1921 with self._lock: 

1922 model.device = ModelDevice.CPU 

1923 return True 

1924 else: 

1925 logger.warning(f"Inference headroom: offload of {model.name} failed") 

1926 

1927 logger.warning(f"Inference headroom: no model to evict for {requester}") 

1928 return False 

1929 except Exception as e: 

1930 logger.debug(f"Inference headroom check failed: {e}") 

1931 return False 

1932 

1933 def _process_swap_queue(self): 

1934 """Phase 12: Restore evicted models when the model that displaced them idles. 

1935 

1936 If model B evicted model A, and B is now IDLE/EVICTABLE, restore A if VRAM allows. 

1937 """ 

1938 if not self._swap_queue: 

1939 return 

1940 

1941 restored = [] 

1942 for entry in list(self._swap_queue): 

1943 evicted_for = entry.get('evicted_for') 

1944 evicted_name = entry.get('name') 

1945 

1946 with self._lock: 

1947 # Check if the model that caused the eviction has become idle 

1948 displacer = self._models.get(evicted_for) 

1949 if displacer and displacer.priority in ( 

1950 ModelPriority.IDLE, ModelPriority.EVICTABLE): 

1951 pass # Displacer is idle — consider restoring 

1952 elif displacer and displacer.device == ModelDevice.UNLOADED: 

1953 pass # Displacer already gone — restore 

1954 else: 

1955 continue # Displacer still active — skip 

1956 

1957 # Check if VRAM has room 

1958 try: 

1959 from .vram_manager import vram_manager, VRAM_BUDGETS 

1960 budget = VRAM_BUDGETS.get(evicted_name, (0, 0)) 

1961 if vram_manager.get_free_vram() < budget[1]: 

1962 continue # Still no room 

1963 except Exception: 

1964 continue 

1965 

1966 # Restore the evicted model 

1967 logger.info(f"Swap queue: restoring '{evicted_name}' " 

1968 f"(displaced by '{evicted_for}' which is now idle)") 

1969 success = self._restart_rtm_tool(evicted_name, entry.get('device', 'gpu')) 

1970 if success: 

1971 restored.append(entry) 

1972 

1973 for entry in restored: 

1974 try: 

1975 self._swap_queue.remove(entry) 

1976 except ValueError: 

1977 pass 

1978 

1979 # ── Pressure Alerts ────────────────────────────────────────── 

1980 

1981 def _emit_pressure_alerts(self): 

1982 """Phase 13: Emit debounced pressure events to EventBus for frontend.""" 

1983 now = time.time() 

1984 alerts = [] 

1985 

1986 if self._detect_vram_pressure(): 

1987 alerts.append(('vram', 'GPU memory is under pressure — models may be evicted')) 

1988 if self._detect_ram_pressure(): 

1989 alerts.append(('ram', 'System memory is under pressure — performance may degrade')) 

1990 if self._detect_cpu_pressure(): 

1991 alerts.append(('cpu', 'CPU is under heavy load — inference may be slower')) 

1992 if self._detect_disk_pressure(): 

1993 alerts.append(('disk', 'Low disk space — downloads and caching disabled')) 

1994 

1995 for ptype, message in alerts: 

1996 last = self._last_pressure_alert.get(ptype, 0) 

1997 if now - last >= self._pressure_alert_cooldown: 

1998 self._last_pressure_alert[ptype] = now 

1999 self._emit_event('system.pressure', { 

2000 'type': ptype, 

2001 'message': message, 

2002 'timestamp': now, 

2003 'throttle_factor': self._calculate_throttle_factor(), 

2004 }) 

2005 

2006 # ── Event emission helper ──────────────────────────────────── 

2007 

2008 def _get_model_capabilities(self, tool_name: str) -> dict: 

2009 """Read capabilities from model_catalog (single source of truth).""" 

2010 try: 

2011 from .model_catalog import get_catalog 

2012 catalog = get_catalog() 

2013 # Try direct lookup, then prefix search 

2014 entry = catalog.get(tool_name) 

2015 if not entry: 

2016 for eid in catalog.list_ids(): 

2017 if tool_name in eid: 

2018 entry = catalog.get(eid) 

2019 break 

2020 if entry: 

2021 return entry.capabilities 

2022 except Exception: 

2023 pass 

2024 return {} 

2025 

2026 def _emit_event(self, event_type: str, data: dict): 

2027 """Emit an event to the EventBus (non-blocking, safe to fail).""" 

2028 try: 

2029 from core.platform.events import emit_event 

2030 emit_event(event_type, data) 

2031 except Exception: 

2032 pass # EventBus not bootstrapped — silent 

2033 

2034 def _guess_model_type(self, tool_name: str) -> str: 

2035 """Map tool name to model_type for catalog sync.""" 

2036 from integrations.service_tools.model_catalog import ModelType 

2037 if tool_name == 'llm' or 'llama' in tool_name: 

2038 return ModelType.LLM 

2039 if 'tts' in tool_name or 'voice' in tool_name: 

2040 return ModelType.TTS 

2041 if 'whisper' in tool_name or 'stt' in tool_name: 

2042 return ModelType.STT 

2043 if 'minicpm' in tool_name or 'vlm' in tool_name or 'vision' in tool_name: 

2044 return ModelType.VLM 

2045 return tool_name 

2046 

2047 # ── Hive intelligence ───────────────────────────────────── 

2048 

2049 def _apply_hive_hints(self): 

2050 """Apply hive-learned placement hints to local priorities.""" 

2051 try: 

2052 from integrations.agent_engine.federated_aggregator import ( 

2053 get_federated_aggregator) 

2054 fed = get_federated_aggregator() 

2055 aggregated = fed.aggregate_lifecycle() 

2056 if not aggregated: 

2057 return 

2058 

2059 popularity = aggregated.get('popularity', {}) 

2060 with self._lock: 

2061 for name, score in popularity.items(): 

2062 state = self._models.get(name) 

2063 if state: 

2064 state.hive_popularity = score 

2065 state.hive_boost = score > 0.6 

2066 else: 

2067 self._hive_hints[name] = score 

2068 except Exception as e: 

2069 logger.debug(f"Hive hints error: {e}") 

2070 

2071 def _report_to_federation(self): 

2072 """Send local model usage stats to FederatedAggregator.""" 

2073 try: 

2074 from integrations.agent_engine.federated_aggregator import ( 

2075 get_federated_aggregator) 

2076 

2077 node_id = '' 

2078 try: 

2079 from security.node_integrity import get_node_identity 

2080 node_id = get_node_identity().get('node_id', '') 

2081 except Exception: 

2082 pass 

2083 

2084 models_stats = {} 

2085 with self._lock: 

2086 for name, state in self._models.items(): 

2087 if state.device != ModelDevice.UNLOADED: 

2088 rate = (state.access_count_session / 

2089 max(1, self._interval * 6)) 

2090 idle_s = (time.time() - state.last_access_time 

2091 if state.last_access_time else 0) 

2092 models_stats[name] = { 

2093 'device': state.device.value, 

2094 'access_rate': round(rate, 4), 

2095 'idle_s': round(idle_s, 1), 

2096 } 

2097 state.access_count_session = 0 

2098 

2099 if models_stats: 

2100 delta = { 

2101 'models': models_stats, 

2102 'node_id': node_id, 

2103 'timestamp': time.time(), 

2104 } 

2105 fed = get_federated_aggregator() 

2106 fed.receive_lifecycle_delta(node_id, delta) 

2107 except Exception as e: 

2108 logger.debug(f"Lifecycle federation report error: {e}") 

2109 

2110 # ── Tier awareness ──────────────────────────────────────── 

2111 

2112 def _detect_tier(self): 

2113 """Cache node capability tier.""" 

2114 try: 

2115 from security.system_requirements import get_tier 

2116 self._node_tier = get_tier() 

2117 except Exception: 

2118 self._node_tier = None 

2119 

2120 def _is_tier_appropriate(self, model_name: str) -> bool: 

2121 """Check if this model is appropriate for our capability tier.""" 

2122 if self._node_tier is None: 

2123 return True 

2124 try: 

2125 from security.system_requirements import NodeTierLevel, _TIER_RANK 

2126 min_tier_str = MODEL_MIN_TIER.get(model_name, 'standard') 

2127 min_tier = NodeTierLevel(min_tier_str) 

2128 return _TIER_RANK[self._node_tier] >= _TIER_RANK[min_tier] 

2129 except Exception: 

2130 return True 

2131 

2132 # ── Helpers ─────────────────────────────────────────────── 

2133 

2134 def _sync_from_rtm(self): 

2135 """Initialize model states from RuntimeToolManager's current state.""" 

2136 try: 

2137 from .runtime_manager import runtime_tool_manager, TOOL_CONFIGS 

2138 for tool_name in TOOL_CONFIGS: 

2139 if runtime_tool_manager._is_server_alive(tool_name): 

2140 self._on_tool_started(tool_name, device='gpu') 

2141 except Exception: 

2142 pass 

2143 

2144 def manual_offload(self, model_name: str) -> dict: 

2145 """Manual GPU→CPU offload (admin API).""" 

2146 with self._lock: 

2147 state = self._models.get(model_name) 

2148 if not state: 

2149 return {'error': f'Model {model_name} not tracked'} 

2150 if state.device == ModelDevice.UNLOADED: 

2151 return {'error': f'Model {model_name} is not loaded'} 

2152 if state.device == ModelDevice.CPU: 

2153 return {'message': f'{model_name} already on CPU'} 

2154 

2155 success = self._do_offload_to_cpu(model_name) 

2156 return {'success': success, 'model': model_name, 

2157 'device': 'cpu' if success else state.device.value} 

2158 

2159 def set_priority(self, model_name: str, priority_str: str) -> dict: 

2160 """Manual priority override (admin API).""" 

2161 try: 

2162 priority = ModelPriority(priority_str) 

2163 except ValueError: 

2164 return {'error': f'Invalid priority: {priority_str}'} 

2165 

2166 with self._lock: 

2167 state = self._models.get(model_name) 

2168 if not state: 

2169 return {'error': f'Model {model_name} not tracked'} 

2170 state.priority = priority 

2171 return {'model': model_name, 'priority': priority_str} 

2172 

2173 def set_pressure_evict_only(self, model_name: str, value: bool) -> dict: 

2174 """Toggle the ``pressure_evict_only`` flag for a tracked model. 

2175 

2176 When True, the model is evicted ONLY on VRAM pressure (phase 3 

2177 of ``_tick``) and never by the idle-timeout sweep (phase 7). 

2178 Used by ``TTSEngine.set_language`` to pin the ACTIVE TTS 

2179 backend so a background model load can't idle-out the engine 

2180 the user is actively speaking to. 

2181 

2182 If the model isn't tracked yet (first call can precede the 

2183 RTM ``on_tool_started`` hook when a language is chosen before 

2184 the first synth), the call no-ops with ``tracked=False`` so 

2185 the caller can decide whether to retry on next tick. This is 

2186 additive: the flag lands when the RTM hook fires, via the 

2187 ``_persisted_hints`` path. 

2188 

2189 Returns a small dict for admin-API / journey-test inspection; 

2190 the caller typically discards it. 

2191 """ 

2192 with self._lock: 

2193 state = self._models.get(model_name) 

2194 if state is None: 

2195 # Stage the pin so it's applied the moment the tool 

2196 # starts — treating this as a pre-start hint matches 

2197 # the pattern we already use for persisted hints. 

2198 self._persisted_hints.setdefault(model_name, {}).update({ 

2199 'pressure_evict_only': bool(value), 

2200 }) 

2201 return { 

2202 'model': model_name, 

2203 'pressure_evict_only': bool(value), 

2204 'tracked': False, 

2205 'staged_as_hint': True, 

2206 } 

2207 state.pressure_evict_only = bool(value) 

2208 return { 

2209 'model': model_name, 

2210 'pressure_evict_only': bool(value), 

2211 'tracked': True, 

2212 } 

2213 

2214 def get_system_pressure(self) -> dict: 

2215 """Return current pressure state for dispatch throttling. 

2216 

2217 Called by AgentDaemon to decide whether to reduce concurrency. 

2218 Detects each pressure source once and passes to throttle calculation. 

2219 """ 

2220 vram = self._detect_vram_pressure() 

2221 ram = self._detect_ram_pressure() 

2222 cpu = self._detect_cpu_pressure() 

2223 disk = self._detect_disk_pressure() 

2224 return { 

2225 'vram_pressure': vram, 

2226 'ram_pressure': ram, 

2227 'cpu_pressure': cpu, 

2228 'disk_pressure': disk, 

2229 'throttle_factor': self._calculate_throttle_factor( 

2230 cpu_on=cpu, ram_on=ram, vram_on=vram, disk_on=disk), 

2231 } 

2232 

2233 def _calculate_throttle_factor(self, *, cpu_on: bool = None, 

2234 ram_on: bool = None, vram_on: bool = None, 

2235 disk_on: bool = None) -> float: 

2236 """Compute dispatch throttle: 0.0 = fully throttled, 1.0 = no throttling. 

2237 

2238 Accepts pre-computed pressure booleans to avoid re-detecting. 

2239 Falls back to live detection if not provided (standalone calls). 

2240 """ 

2241 factor = 1.0 

2242 

2243 # CPU pressure — use granular percentage for proportional throttling 

2244 try: 

2245 import psutil 

2246 cpu_pct = psutil.cpu_percent(interval=None) 

2247 if cpu_pct >= 95: 

2248 factor *= 0.1 

2249 elif cpu_pct >= 90: 

2250 factor *= 0.3 

2251 elif cpu_pct >= self._cpu_pressure_pct: 

2252 factor *= 0.5 

2253 except ImportError: 

2254 pass 

2255 

2256 # RAM pressure 

2257 if ram_on if ram_on is not None else self._detect_ram_pressure(): 

2258 factor *= 0.5 

2259 

2260 # VRAM pressure 

2261 if vram_on if vram_on is not None else self._detect_vram_pressure(): 

2262 factor *= 0.7 

2263 

2264 # Disk pressure (critical — stop all heavy work) 

2265 if disk_on if disk_on is not None else self._detect_disk_pressure(): 

2266 factor *= 0.2 

2267 

2268 return max(0.0, min(1.0, factor)) 

2269 

2270 def get_status(self) -> dict: 

2271 """Full lifecycle dashboard.""" 

2272 with self._lock: 

2273 models = {name: s.to_dict() for name, s in self._models.items()} 

2274 

2275 vram_status = {} 

2276 try: 

2277 from .vram_manager import vram_manager 

2278 vram_status = vram_manager.get_status() 

2279 except Exception: 

2280 pass 

2281 

2282 # Crash recovery state 

2283 restart_queue = {} 

2284 for name, info in self._restart_pending.items(): 

2285 if isinstance(info, dict): 

2286 restart_queue[name] = { 

2287 'retry_in_s': max(0, round(info.get('retry_after', 0) - time.time(), 1)), 

2288 'downgrade': info.get('downgrade', False), 

2289 } 

2290 

2291 swap_q = [dict(e) for e in self._swap_queue] 

2292 

2293 return { 

2294 'running': self._running, 

2295 'tick_count': self._tick_count, 

2296 'interval_s': self._interval, 

2297 'models': models, 

2298 'vram': vram_status, 

2299 'vram_pressure': self._detect_vram_pressure(), 

2300 'ram_pressure': self._detect_ram_pressure(), 

2301 'cpu_pressure': self._detect_cpu_pressure(), 

2302 'disk_pressure': self._detect_disk_pressure(), 

2303 'throttle_factor': self._calculate_throttle_factor(), 

2304 'hive_hints': dict(self._hive_hints), 

2305 'node_tier': (self._node_tier.value 

2306 if self._node_tier else 'unknown'), 

2307 'restart_pending': restart_queue, 

2308 'swap_queue': swap_q, 

2309 } 

2310 

2311 

2312# ═══════════════════════════════════════════════════════════════ 

2313# Singleton 

2314# ═══════════════════════════════════════════════════════════════ 

2315 

2316_lifecycle_manager = None 

2317_lifecycle_lock = threading.Lock() 

2318 

2319 

2320def get_model_lifecycle_manager() -> ModelLifecycleManager: 

2321 global _lifecycle_manager 

2322 if _lifecycle_manager is None: 

2323 with _lifecycle_lock: 

2324 if _lifecycle_manager is None: 

2325 _lifecycle_manager = ModelLifecycleManager() 

2326 return _lifecycle_manager 

2327 

2328 

2329# ═══════════════════════════════════════════════════════════════ 

2330# Draft eviction on user-language change 

2331# ═══════════════════════════════════════════════════════════════ 

2332# When the user switches TO a script the 0.8B draft can't handle 

2333# (NON_LATIN_SCRIPT_LANGS), the running draft's ~1.2GB VRAM is dead 

2334# weight — speculative_dispatcher's language guard will never route 

2335# to it. Evict so Indic Parler (~2GB) can fit alongside main 4B on 

2336# 8GB GPUs. Subscribes to core.user_lang.on_lang_change — runs in a 

2337# daemon thread (fired by the subscriber bus), /chat hot path not 

2338# stalled. 

2339 

2340def _evict_draft_on_non_latin_switch(old_lang, new_lang) -> None: 

2341 """on_lang_change subscriber — evicts 0.8B draft when the user 

2342 moves to a non-Latin script. No-op otherwise.""" 

2343 try: 

2344 from core.constants import NON_LATIN_SCRIPT_LANGS 

2345 except ImportError: 

2346 return 

2347 _new_key = (new_lang or '').split('-')[0].lower()[:2] 

2348 if not _new_key or _new_key not in NON_LATIN_SCRIPT_LANGS: 

2349 return 

2350 try: 

2351 _mgr = get_model_lifecycle_manager() 

2352 if _mgr and hasattr(_mgr, 'request_swap'): 

2353 _mgr.request_swap( 

2354 target='draft', 

2355 reason=f'language_switch_to_{_new_key}', 

2356 ) 

2357 except Exception: 

2358 pass 

2359 

2360 

2361try: 

2362 from core.user_lang import on_lang_change as _on_lang_change 

2363 _on_lang_change(_evict_draft_on_non_latin_switch) 

2364except ImportError: 

2365 # user_lang unavailable standalone (e.g., isolated test env); 

2366 # callers may still invoke request_swap directly. 

2367 pass