Coverage for integrations/service_tools/model_lifecycle.py: 61.6%

983 statements  

coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Model Lifecycle Manager — intelligent load/unload/offload for ML models. 

3 

4Daemon-driven (like AgentDaemon): a tick-based loop that:

5 1. Tracks per-model access patterns (last_access, count, device) 

6 2. Detects VRAM/RAM pressure via real-time nvidia-smi refresh 

7 3. Evicts idle models (LRU, configurable timeout per model) 

8 4. Offloads GPU models to CPU when VRAM pressure detected 

9 5. Reports usage deltas to FederatedAggregator for hive learning 

10 6. Applies hive-learned placement hints (pre-cache popular models) 

11 7. Continuous process health monitoring with dead-process recovery 

12 8. OOM crash detection + auto-restart with resource downgrade 

13 9. Model swap queue for sequential multi-model workloads 

14 10. Pressure alerts emitted to EventBus for frontend display 

15 

16Integration: 

17 - Extends RuntimeToolManager via lifecycle hooks (composition, not inheritance) 

18 - Uses VRAMManager for GPU tracking (adds refresh_gpu_info calls) 

19 - Monitored by NodeWatchdog 

20 - Reports to FederatedAggregator 

21 - Exposed via /api/tools/lifecycle endpoint 

22 

23Terminology: Uses NodeTierLevel CAPABILITY tiers (embedded/lite/standard/full/compute_host), 

24NOT topology modes (flat/regional/central) or model tiers (fast/balanced/expert). 

25""" 

26import collections 

27import json 

28import logging 

29import os 

30import shutil 

31import threading 

32import time 

33from contextlib import contextmanager 

34from dataclasses import dataclass, field 

35from enum import Enum 

36from pathlib import Path 

37from typing import Dict, List, Optional, Tuple 

38 

39logger = logging.getLogger(__name__) 

40 

41LIFECYCLE_STATE_FILE = Path.home() / '.hevolve' / 'lifecycle_state.json' 

42 

43# Hints older than this are discarded on reload. A 24h cutoff keeps 

44# "this user spoke Tamil yesterday" relevant while throwing away 

45# "this user spoke Tamil in January" (long-stale entries would pin 

46# models that the user actually doesn't use anymore). 

47LIFECYCLE_STALENESS_S: float = 24 * 3600.0 
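# Worked example of the cutoff: with LIFECYCLE_STALENESS_S = 86_400.0 s, a hint
# whose last_access_time is 20 hours old (72_000 s) survives the reload filter
# in _load_persisted_state(), while one 30 hours old (108_000 s) is dropped.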

48 

49 

50# ═══════════════════════════════════════════════════════════════ 

51# Enums and State 

52# ═══════════════════════════════════════════════════════════════ 

53 

54class ModelDevice(Enum): 

55 UNLOADED = "unloaded" 

56 GPU = "gpu" 

57 CPU = "cpu" 

58 CPU_OFFLOAD = "cpu_offload" 

59 

60 

61class ModelPriority(Enum): 

62 ACTIVE = "active" 

63 WARM = "warm" 

64 IDLE = "idle" 

65 EVICTABLE = "evictable" 

66 

67 

68# Priority sort order: ACTIVE (never touch) → WARM → IDLE → EVICTABLE (first to go) 

69_PRIORITY_RANK = { 

70 ModelPriority.ACTIVE: 0, 

71 ModelPriority.WARM: 1, 

72 ModelPriority.IDLE: 2, 

73 ModelPriority.EVICTABLE: 3, 

74} 
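# Illustrative helper (hypothetical, not called by the manager): the pressure
# handlers below order eviction candidates by this rank and then by
# last_access_time, so the most-evictable, least-recently-used model is
# considered first (ACTIVE and pinned models are filtered out before sorting).
def _eviction_order_example(states):
    return sorted(
        states,
        key=lambda s: (-_PRIORITY_RANK.get(s.priority, 99), s.last_access_time),
    )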

75 

76 

77@dataclass 

78class ModelState: 

79 """Per-model lifecycle state.""" 

80 name: str 

81 device: ModelDevice = ModelDevice.UNLOADED 

82 priority: ModelPriority = ModelPriority.IDLE 

83 

84 # Access tracking 

85 last_access_time: float = 0.0 

86 load_time: float = 0.0 

87 access_count: int = 0 

88 access_count_session: int = 0 

89 

90 # Resource tracking 

91 vram_gb: float = 0.0 

92 ram_gb: float = 0.0 

93 

94 # Configuration 

95 idle_timeout_s: float = 300.0 

96 is_sidecar: bool = False 

97 supports_cpu_offload: bool = False 

98 

99 # Hive hints 

100 hive_popularity: float = 0.0 

101 hive_boost: bool = False 

102 

103 # Inference guard 

104 active_inference_count: int = 0 

105 

106 # ─── Eviction policy flags ──────────────────────────────────── 

107 # These are the 2026-04-12 fix for the "llama-server kept dying 

108 # every 5 minutes" incident. The default idle eviction loop 

109 # (_evict_idle_models → _update_priorities) used to demote ANY 

110 # idle model to EVICTABLE after its timeout, unconditionally, 

111 # which killed the 4B main LLM mid-session even though nobody 

112 # was competing for VRAM. 

113 

114 #: True ⇒ the model is NEVER evicted, regardless of idle time, 

115 #: VRAM pressure, or RAM pressure. Reserved for the draft 0.8B 

116 #: classifier which is the first-contact path for EVERY chat 

117 #: message. Killing the draft means every reply cold-starts a 

118 #: 500 MB model + mmproj, so pinning it is always the right call. 

119 #: Use sparingly — pinned models consume their VRAM budget 

120 #: forever, so this must only tag models where the eviction 

121 #: cost always outweighs the freed VRAM. 

122 pinned: bool = False 

123 

124 #: True ⇒ the model is evicted ONLY when VRAM pressure is 

125 #: detected (phase 3 of _tick), never by passive idle timeout 

126 #: (phase 7). Applied to main-tier chat LLMs (2B / 4B) so a 

127 #: 30-second idle doesn't kill the server the user is actively 

128 #: talking to. Passive idle eviction is intended for STT / TTS / 

129 #: VLM one-shot models where a cold start is cheap and the 

130 #: VRAM is better spent on the model you're about to use next. 

131 pressure_evict_only: bool = False 
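    # Summary of the two flags above (illustrative, derived from the notes):
    #
    #   flags set                   phase 7 idle sweep       phase 3/4 pressure
    #   pinned=True                 never evicted            never evicted
    #   pressure_evict_only=True    never evicted            may be evicted/offloaded
    #   neither flag                evicted after timeout    may be evicted/offloaded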

132 

133 # Crash recovery tracking 

134 crash_count: int = 0 # Consecutive crashes (resets on successful access) 

135 last_crash_time: float = 0.0 # Timestamp of last crash 

136 last_exit_code: Optional[int] = None # Last process exit code (137/9=OOM kill) 

137 restart_backoff_s: float = 0.0 # Current backoff delay (exponential) 

138 downgraded: bool = False # True if restarted on lower resource tier 

139 

140 def to_dict(self) -> dict: 

141 now = time.time() 

142 return { 

143 'name': self.name, 

144 'device': self.device.value, 

145 'priority': self.priority.value, 

146 'last_access_time': self.last_access_time, 

147 'load_time': self.load_time, 

148 'access_count': self.access_count, 

149 'idle_seconds': round(now - self.last_access_time, 1) if self.last_access_time else 0, 

150 'vram_gb': self.vram_gb, 

151 'ram_gb': self.ram_gb, 

152 'idle_timeout_s': self.idle_timeout_s, 

153 'hive_popularity': self.hive_popularity, 

154 'hive_boost': self.hive_boost, 

155 'active_inference_count': self.active_inference_count, 

156 'crash_count': self.crash_count, 

157 'last_exit_code': self.last_exit_code, 

158 'downgraded': self.downgraded, 

159 'healthy': self.device == ModelDevice.UNLOADED or self.crash_count == 0, 

160 } 

161 

162 

163# ═══════════════════════════════════════════════════════════════ 

164# Configuration Tables 

165# ═══════════════════════════════════════════════════════════════ 

166 

167# Default idle timeouts per model (seconds). Expensive-to-reload = longer. 

168DEFAULT_IDLE_TIMEOUTS: Dict[str, float] = { 

169 'whisper': 300.0, 

170 'tts_audio_suite': 600.0, 

171 'minicpm': 900.0, 

172 'wan2gp': 600.0, 

173 'ltx2': 600.0, 

174 'acestep': 600.0, 

175 'omniparser': 300.0, 

176 'clip': 180.0, 

177 'sentence_transformers': 180.0, 

178 'mobilevlm': 300.0, 

179} 
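# Per-tool override via environment, read in _on_tool_started() below
# (values are seconds; non-numeric values are ignored). For example:
#
#     HEVOLVE_WHISPER_IDLE_TIMEOUT=600      # keep Whisper warm for 10 minutes
#     HEVOLVE_CLIP_IDLE_TIMEOUT=60          # drop CLIP after 1 minute idle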

180 

181# CPU offload capability: (can_offload, cpu_ram_gb, method) 

182# method: 'torch_to_cpu' | 'restart_cpu' | 'none' 

183CPU_OFFLOAD_TABLE: Dict[str, Tuple[bool, float, str]] = { 

184 'whisper': (True, 0.5, 'torch_to_cpu'), 

185 'tts_audio_suite': (True, 2.0, 'restart_cpu'), 

186 'minicpm': (False, 0.0, 'none'), 

187 'wan2gp': (False, 0.0, 'none'), 

188 'ltx2': (True, 4.0, 'restart_cpu'), 

189 'acestep': (False, 0.0, 'none'), 

190 'omniparser': (True, 2.0, 'torch_to_cpu'), 

191 'clip': (True, 0.5, 'torch_to_cpu'), 

192 'sentence_transformers': (True, 0.3, 'torch_to_cpu'), 

193 'mobilevlm': (True, 1.0, 'torch_to_cpu'), 

194} 

195 

196# Capability tier requirements per model (uses NodeTierLevel, NOT topology) 

197# Imported lazily to avoid circular deps 

198MODEL_MIN_TIER: Dict[str, str] = { 

199 'whisper': 'standard', 

200 'tts_audio_suite': 'standard', 

201 'minicpm': 'full', 

202 'wan2gp': 'full', 

203 'ltx2': 'full', 

204 'acestep': 'full', 

205 'omniparser': 'full', 

206 'clip': 'standard', 

207 'sentence_transformers': 'standard', 

208 'mobilevlm': 'standard', 

209} 
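# Illustrative gate (hypothetical helper; the real check goes through the
# lazily-imported NodeTierLevel): a model is eligible only when the node's
# capability tier is at or above its entry here. The tier order and the
# 'standard' fallback below are assumptions taken from the module docstring.
_TIER_ORDER_EXAMPLE = ['embedded', 'lite', 'standard', 'full', 'compute_host']

def _tier_allows_example(node_tier: str, model_name: str) -> bool:
    required = MODEL_MIN_TIER.get(model_name, 'standard')
    try:
        return (_TIER_ORDER_EXAMPLE.index(node_tier)
                >= _TIER_ORDER_EXAMPLE.index(required))
    except ValueError:
        return False  # unknown tier name: refuse rather than guess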

210 

211 

212# ═══════════════════════════════════════════════════════════════ 

213# ModelLifecycleManager 

214# ═══════════════════════════════════════════════════════════════ 

215 

216class ModelLifecycleManager: 

217 """Daemon-driven model lifecycle: load, track, offload, evict, report. 

218 

219 Follows AgentDaemon pattern: tick-based loop, heartbeat to watchdog, 

220 never blocks user requests. 

221 

222 Singleton via get_model_lifecycle_manager(). 

223 """ 

224 

225 def __init__(self): 

226 self._models: Dict[str, ModelState] = {} 

227 self._lock = threading.Lock() 

228 self._running = False 

229 self._thread: Optional[threading.Thread] = None 

230 # Flat/bundled mode: models load once at startup, no dynamic swapping. 

231 # Poll every 120s (just health check), not 15s (active management). 

232 _bundled = os.environ.get('NUNBA_BUNDLED') == '1' 

233 _default_interval = '120' if _bundled else '15' 

234 self._interval = int(os.environ.get('HEVOLVE_LIFECYCLE_INTERVAL', _default_interval)) 

235 self._tick_count = 0 

236 

237 # Pressure thresholds (configurable via env) 

238 self._vram_pressure_pct = float( 

239 os.environ.get('HEVOLVE_VRAM_PRESSURE_PCT', '85')) 

240 self._ram_pressure_pct = float( 

241 os.environ.get('HEVOLVE_RAM_PRESSURE_PCT', '90')) 

242 self._cpu_pressure_pct = float( 

243 os.environ.get('HEVOLVE_CPU_PRESSURE_PCT', '80')) 

244 self._disk_free_min_gb = float( 

245 os.environ.get('HEVOLVE_DISK_FREE_MIN_GB', '2.0')) 

246 

247 # Throttle flags (read by AgentDaemon to reduce dispatch rate) 

248 self._cpu_throttle_active = False 

249 self._disk_throttle_active = False 

250 

251 # Hive placement hints from FederatedAggregator 

252 self._hive_hints: Dict[str, float] = {} 

253 

254 # Cached node tier 

255 self._node_tier = None 

256 

257 # ── Crash recovery ──────────────────────────────────────── 

258 self._max_crash_restarts = int( 

259 os.environ.get('HEVOLVE_MAX_CRASH_RESTARTS', '3')) 

260 self._base_backoff_s = 5.0 # First retry after 5s 

261 self._max_backoff_s = 300.0 # Cap at 5 min 
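        # Worked schedule from these two knobs (5s base, doubling per
        # consecutive crash, capped): crash #1 -> 5s, #2 -> 10s, #3 -> 20s,
        # #4 -> 40s, #5 -> 80s, #6 -> 160s, #7+ -> 300s (cap).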

262 self._restart_pending: Dict[str, float] = {} # name → retry_after timestamp 

263 

264 # ── Swap queue ──────────────────────────────────────────── 

265 # When model B is needed but A occupies the GPU, A gets evicted 

266 # and B loads. A is queued for restore when B finishes/idles. 

267 self._swap_queue: collections.deque = collections.deque(maxlen=8) 

268 # Each entry: {'name': str, 'device': str, 'evicted_for': str, 'timestamp': float} 

269 

270 # ── Pressure alert state (debounce) ─────────────────────── 

271 self._last_pressure_alert: Dict[str, float] = {} # type → timestamp 

272 self._pressure_alert_cooldown = 60.0 # seconds between alerts of same type 

273 

274 # ── G3: Direct llama-server supervision ─────────────────── 

275 # When Nunba's LlamaConfig is unavailable (standalone, Docker, 

276 # or bundled mode where Nunba's supervisor failed) we launch 

277 # llama-server ourselves via _launch_llama_server_direct. We 

278 # must also supervise it — otherwise a dead server is invisible 

279 # to _check_llm_health and never gets restarted. 

280 self._direct_llama_proc = None # subprocess.Popen 

281 self._direct_llama_port: Optional[int] = None 

282 self._direct_llama_mode: Optional[str] = None # 'gpu' | 'cpu' 

283 self._direct_llama_log_fh = None 

284 self._direct_llama_last_restart: float = 0.0 

285 self._direct_llama_restart_cooldown_s: float = 10.0 

286 

287 # ── Cross-restart persistence ───────────────────────────── 

288 # Load any persisted per-model access hints from the previous 

289 # Nunba boot. Entries older than LIFECYCLE_STALENESS_S are 

290 # dropped — last_access from a week ago is worse than no hint. 

291 self._last_persist_time: float = 0.0 

292 self._persist_interval_s: float = 60.0 # throttle disk writes 

293 self._persisted_hints: Dict[str, dict] = {} 

294 try: 

295 self._persisted_hints = self._load_persisted_state() 

296 except Exception as e: 

297 logger.debug(f"Lifecycle: persisted state load skipped: {e}") 

298 

299 # ── Daemon lifecycle ────────────────────────────────────── 

300 

301 def start(self): 

302 """Start the lifecycle daemon and register hooks.""" 

303 with self._lock: 

304 if self._running: 

305 return 

306 self._running = True 

307 

308 # Register hooks with RuntimeToolManager 

309 try: 

310 from .runtime_manager import runtime_tool_manager 

311 runtime_tool_manager.register_lifecycle_hook( 

312 'on_tool_started', self._on_tool_started) 

313 runtime_tool_manager.register_lifecycle_hook( 

314 'on_tool_stopped', self._on_tool_stopped) 

315 except Exception as e: 

316 logger.debug(f"Lifecycle hook registration skipped: {e}") 

317 

318 # Sync from current RTM state 

319 self._sync_from_rtm() 

320 

321 # Detect node tier 

322 self._detect_tier() 

323 

324 # VRAM budget check: warn if total GPU VRAM is insufficient for 

325 # pinned models. On 4GB GPUs, 0.8B (1.5GB) + 4B (3.5GB) = 5GB 

326 # exceeds the budget — the pressure system will handle it, but 

327 # the user should know the main model may be CPU-offloaded. 

328 try: 

329 from .vram_manager import get_vram_manager 

330 vm = get_vram_manager() 

331 total_vram = vm.total_vram_gb if vm else 0 

332 pinned_vram = sum( 

333 s.vram_gb for s in self._models.values() 

334 if s.pinned and s.vram_gb > 0 

335 ) 

336 all_model_vram = sum( 

337 s.vram_gb for s in self._models.values() 

338 if s.vram_gb > 0 

339 ) 

340 if total_vram > 0 and all_model_vram > total_vram: 

341 logger.warning( 

342 f"VRAM budget exceeded: models need {all_model_vram:.1f}GB " 

343 f"but GPU has {total_vram:.1f}GB. " 

344 f"Pressure eviction will offload non-pinned models to CPU. " 

345 f"Pinned models ({pinned_vram:.1f}GB) are protected." 

346 ) 

347 if pinned_vram > total_vram: 

348 logger.error( 

349 f"CRITICAL: Pinned models alone need {pinned_vram:.1f}GB " 

350 f"but GPU only has {total_vram:.1f}GB. " 

351 f"Consider reducing pinned model count or using CPU inference." 

352 ) 

353 except Exception: 

354 pass # VRAM check is advisory, not blocking 

355 

356 self._thread = threading.Thread(target=self._loop, daemon=True) 

357 self._thread.start() 

358 logger.info(f"Model lifecycle manager started (interval={self._interval}s)") 

359 

360 def stop(self): 

361 with self._lock: 

362 self._running = False 

363 if self._thread: 

364 self._thread.join(timeout=10) 

365 

366 def _loop(self): 

367 while self._running: 

368 self._wd_sleep(self._interval) 

369 if not self._running: 

370 break 

371 self._wd_heartbeat() 

372 try: 

373 self._tick() 

374 except Exception as e: 

375 logger.debug(f"Model lifecycle tick error: {e}") 

376 

377 def _wd_heartbeat(self): 

378 """Send heartbeat to watchdog between potentially blocking phases.""" 

379 try: 

380 from security.node_watchdog import get_watchdog 

381 wd = get_watchdog() 

382 if wd: 

383 wd.heartbeat('model_lifecycle') 

384 except Exception: 

385 pass 

386 

387 def _wd_sleep(self, seconds: float) -> None: 

388 """Sleep while keeping the model_lifecycle heartbeat fresh. 

389 

390 Delegates to ``NodeWatchdog.sleep_with_heartbeat``. The 

391 2026-04-11 incident showed model_lifecycle frozen for 3606s 

392 (exactly one hour) — the investigation points at 

393 _report_to_federation or an nvidia-smi subprocess hang inside 

394 one of the 13 tick phases, but the sleep path is also a 

395 candidate whenever HEVOLVE_LIFECYCLE_INTERVAL is set 

396 above the 300s frozen threshold. Using this helper collapses 

397 both problem classes to one primitive. 

398 """ 

399 try: 

400 from security.node_watchdog import get_watchdog 

401 wd = get_watchdog() 

402 if wd is not None: 

403 wd.sleep_with_heartbeat( 

404 'model_lifecycle', seconds, 

405 stop_check=lambda: not self._running, 

406 ) 

407 return 

408 except Exception: 

409 pass 

410 time.sleep(seconds) 

411 

412 def _tick(self): 

413 """Single lifecycle pass with heartbeat checkpoints between phases.""" 

414 self._tick_count += 1 

415 

416 # Guardrail: circuit breaker 

417 try: 

418 from security.hive_guardrails import HiveCircuitBreaker 

419 if HiveCircuitBreaker.is_halted(): 

420 return 

421 except (ImportError, AttributeError): 

422 pass 

423 

424 # Phase 1: Refresh real GPU state (may call nvidia-smi subprocess) 

425 self._refresh_memory_state() 

426 self._wd_heartbeat() 

427 

428 # Phase 2: Update priorities 

429 self._update_priorities() 

430 

431 # Phase 3: VRAM pressure response 

432 if self._detect_vram_pressure(): 

433 self._respond_to_vram_pressure() 

434 self._wd_heartbeat() 

435 

436 # Phase 4: RAM pressure response 

437 if self._detect_ram_pressure(): 

438 self._respond_to_ram_pressure() 

439 

440 # Phase 5: CPU pressure response 

441 cpu_pressure = self._detect_cpu_pressure() 

442 self._cpu_throttle_active = cpu_pressure 

443 if cpu_pressure: 

444 self._respond_to_cpu_pressure() 

445 

446 # Phase 6: Disk pressure response 

447 disk_pressure = self._detect_disk_pressure() 

448 self._disk_throttle_active = disk_pressure 

449 self._wd_heartbeat() 

450 

451 # Phase 7: Background idle eviction 

452 self._evict_idle_models() 

453 self._wd_heartbeat() 

454 

455 # Phase 8: Apply hive hints (every 4th tick, ~60s) 

456 if self._tick_count % 4 == 0: 

457 self._apply_hive_hints() 

458 self._wd_heartbeat() 

459 

460 # Phase 9: Report to federation (every 6th tick, ~90s). 

461 # Investigation found this the most likely culprit for the 

462 # 1-hour model_lifecycle freeze in the 2026-04-11 incident — 

463 # the federation HTTP call had no explicit timeout and could 

464 # hang indefinitely on a dead remote. Heartbeat immediately 

465 # afterwards so a slow (but not infinite) call doesn't bleed 

466 # into phases 10-13. 

467 if self._tick_count % 6 == 0: 

468 self._report_to_federation() 

469 self._wd_heartbeat() 

470 

471 # Phase 10: Process health check + crash recovery 

472 self._check_process_health() 

473 self._wd_heartbeat() 

474 

475 # Phase 11: Process pending crash restarts (with backoff) 

476 self._process_restart_queue() 

477 self._wd_heartbeat() 

478 

479 # Phase 12: Swap queue — restore evicted models when space frees up 

480 self._process_swap_queue() 

481 self._wd_heartbeat() 

482 

483 # Phase 13: Pressure alerts to frontend (debounced) 

484 self._emit_pressure_alerts() 

485 

486 # Phase 14: Persist access hints (throttled — J211). 

487 # Cheap I/O (single JSON file, atomic rename) so the rate limit 

488 # lives in _persist_state_to_disk itself rather than here. 

489 try: 

490 self._persist_state_to_disk() 

491 except Exception: 

492 pass 

493 

494 # ── Hook callbacks (from RuntimeToolManager) ────────────── 

495 

496 def _on_tool_started(self, tool_name: str, **kwargs): 

497 """Called when RTM starts a tool.""" 

498 device_str = kwargs.get('device', 'gpu') 

499 offload_mode = kwargs.get('offload_mode', 'gpu') 

500 is_inprocess = kwargs.get('inprocess', False) 

501 

502 if offload_mode == 'cpu_only': 

503 device = ModelDevice.CPU 

504 elif offload_mode == 'cpu_offload': 

505 device = ModelDevice.CPU_OFFLOAD 

506 else: 

507 device = ModelDevice.GPU 

508 

509 offload_info = CPU_OFFLOAD_TABLE.get(tool_name, (False, 0.0, 'none')) 

510 timeout = DEFAULT_IDLE_TIMEOUTS.get(tool_name, 300.0) 

511 

512 # Override timeout from env 

513 env_timeout = os.environ.get( 

514 f'HEVOLVE_{tool_name.upper()}_IDLE_TIMEOUT') 

515 if env_timeout: 

516 try: 

517 timeout = float(env_timeout) 

518 except ValueError: 

519 pass 

520 

521 from .vram_manager import VRAM_BUDGETS 

522 budget = VRAM_BUDGETS.get(tool_name, (0, 0)) 

523 vram_gb = budget[1] if device == ModelDevice.GPU else 0.0 

524 ram_gb = offload_info[1] if device == ModelDevice.CPU else 0.0 

525 

526 now = time.time() 

527 with self._lock: 

528 state = ModelState( 

529 name=tool_name, 

530 device=device, 

531 priority=ModelPriority.WARM, 

532 last_access_time=now, 

533 load_time=now, 

534 idle_timeout_s=timeout, 

535 is_sidecar=not is_inprocess, 

536 supports_cpu_offload=offload_info[0], 

537 vram_gb=vram_gb, 

538 ram_gb=ram_gb, 

539 ) 

540 # Apply persisted-from-previous-boot hints so warm-start 

541 # preference survives restart. This is the J211 fix for 

542 # the cold-load penalty after Nunba is killed + relaunched. 

543 hint = self._persisted_hints.pop(tool_name, None) 

544 if hint: 

545 try: 

546 state.access_count = int(hint.get('access_count', 0) or 0) 

547 # prior last_access_time preserved as a hint; the 

548 # current boot's load time overrides so idle math 

549 # starts fresh. The prior timestamp is only used to 

550 # reason about popularity (see _apply_hive_hints). 

551 prior_access = float(hint.get('last_access_time', 0.0) or 0.0) 

552 if prior_access and (now - prior_access) < LIFECYCLE_STALENESS_S: 

553 # Boost priority for recently-used models so the 

554 # first request after restart doesn't get stuck 

555 # behind a cold-load. 

556 state.priority = ModelPriority.WARM 

557 state.hive_boost = True 

558 if hint.get('pinned'): 

559 state.pinned = True 

560 # J214: a pending pressure_evict_only staged via 

561 # set_pressure_evict_only() applies at start-time 

562 # so the TTS backend the user just picked never 

563 # enters the idle-sweep phase on its first load. 

564 if 'pressure_evict_only' in hint: 

565 state.pressure_evict_only = bool( 

566 hint['pressure_evict_only']) 

567 except Exception: 

568 pass 

569 self._models[tool_name] = state 

570 

571 # Notify UI: model loaded — include capabilities from catalog 

572 # so UI knows what features are now available 

573 self._emit_event('model.loaded', { 

574 'model': tool_name, 

575 'device': device_str, 

576 'vram_gb': vram_gb, 

577 'capabilities': self._get_model_capabilities(tool_name), 

578 }) 

579 

580 def _on_tool_stopped(self, tool_name: str, **kwargs): 

581 """Called when RTM stops a tool.""" 

582 with self._lock: 

583 state = self._models.get(tool_name) 

584 if state: 

585 state.device = ModelDevice.UNLOADED 

586 state.priority = ModelPriority.IDLE 

587 state.vram_gb = 0.0 

588 state.ram_gb = 0.0 

589 state.active_inference_count = 0 

590 

591 # Persist the release — even if we crash immediately after, 

592 # the next boot has a fresh snapshot of which models were in use. 

593 try: 

594 self._persist_state_to_disk(force=True) 

595 except Exception: 

596 pass 

597 

598 # Notify UI: model unloaded — these capabilities are now unavailable 

599 self._emit_event('model.unloaded', { 

600 'model': tool_name, 

601 'capabilities': self._get_model_capabilities(tool_name), 

602 }) 

603 

604 # ── Cross-restart persistence (J211) ────────────────────── 

605 # 

606 # The lifecycle state file was declared as LIFECYCLE_STATE_FILE at 

607 # module top but never written until the 2026-04-18 live audit 

608 # flagged the gap. We persist a compact, user-scoped JSON doc: 

609 # 

610 # { 

611 # "version": 1, 

612 # "saved_at": 1713456789.12, 

613 # "models": { 

614 # "indic_parler": { 

615 # "last_access_time": 1713450000.0, 

616 # "access_count": 17, 

617 # "pinned": false 

618 # }, 

619 # ... 

620 # } 

621 # } 

622 # 

623 # Reload happens ONCE in __init__. Save happens throttled inside 

624 # _tick (every _persist_interval_s) and on every model stop event. 

625 # Stale entries (older than LIFECYCLE_STALENESS_S) are discarded at 

626 # load-time so a 6-month-old hint doesn't distort today's boot. 

627 # No secrets / credentials / PII are persisted — only tool names 

628 # and access counters. 

629 

630 def _load_persisted_state(self) -> Dict[str, dict]: 

631 """Read the on-disk hint doc and return the map of models. 

632 

633 Returns an empty dict if the file is missing, malformed, or 

634 older than LIFECYCLE_STALENESS_S (entries are filtered on 

635 that threshold to avoid pinning stale models). 

636 """ 

637 try: 

638 if not LIFECYCLE_STATE_FILE.exists(): 

639 return {} 

640 with open(LIFECYCLE_STATE_FILE, 'r', encoding='utf-8') as f: 

641 data = json.load(f) 

642 except Exception: 

643 return {} 

644 

645 models = data.get('models') if isinstance(data, dict) else None 

646 if not isinstance(models, dict): 

647 return {} 

648 

649 now = time.time() 

650 fresh: Dict[str, dict] = {} 

651 for name, hint in models.items(): 

652 if not isinstance(hint, dict): 

653 continue 

654 last = float(hint.get('last_access_time', 0.0) or 0.0) 

655 # Stale entries get dropped — a week-old hint is worse than 

656 # no hint (it would keep a model the user no longer uses). 

657 if last and (now - last) > LIFECYCLE_STALENESS_S: 

658 continue 

659 fresh[name] = hint 

660 if fresh: 

661 logger.info( 

662 f"Lifecycle: loaded {len(fresh)} persisted hint(s) " 

663 f"from {LIFECYCLE_STATE_FILE}" 

664 ) 

665 return fresh 

666 

667 def _persist_state_to_disk(self, force: bool = False) -> None: 

668 """Write current access hints to disk. 

669 

670 Throttled to at most one write per _persist_interval_s unless 

671 ``force=True`` (used from _on_tool_stopped so a user-visible 

672 release is recorded immediately). Writes are atomic via 

673 rename-tmpfile-to-target so a crash mid-write can't leave a 

674 half-written JSON blob behind. 

675 """ 

676 now = time.time() 

677 if not force and (now - self._last_persist_time) < self._persist_interval_s: 

678 return 

679 

680 with self._lock: 

681 payload = { 

682 'version': 1, 

683 'saved_at': now, 

684 'models': { 

685 name: { 

686 'last_access_time': s.last_access_time, 

687 'access_count': s.access_count, 

688 'pinned': bool(s.pinned), 

689 } 

690 for name, s in self._models.items() 

691 if s.last_access_time > 0 

692 }, 

693 } 

694 

695 try: 

696 LIFECYCLE_STATE_FILE.parent.mkdir(parents=True, exist_ok=True) 

697 tmp = LIFECYCLE_STATE_FILE.with_suffix('.json.tmp') 

698 with open(tmp, 'w', encoding='utf-8') as f: 

699 json.dump(payload, f, ensure_ascii=False, indent=2) 

700 # Atomic rename — survives a crash mid-write. On Windows, 

701 # os.replace handles "target exists" correctly. 

702 os.replace(tmp, LIFECYCLE_STATE_FILE) 

703 self._last_persist_time = now 

704 except Exception as e: 

705 logger.debug(f"Lifecycle: persist skipped ({e})") 

706 

707 # ── Access tracking (called by tool wrappers) ───────────── 

708 

709 @contextmanager 

710 def inference_guard(self, tool_name: str): 

711 """Context manager — prevents eviction during active inference.""" 

712 with self._lock: 

713 state = self._models.get(tool_name) 

714 if state: 

715 state.active_inference_count += 1 

716 state.last_access_time = time.time() 

717 state.access_count += 1 

718 state.access_count_session += 1 

719 try: 

720 yield 

721 finally: 

722 with self._lock: 

723 state = self._models.get(tool_name) 

724 if state: 

725 state.active_inference_count = max( 

726 0, state.active_inference_count - 1) 

727 

728 def notify_access(self, tool_name: str): 

729 """Lightweight access notification. Updates timestamps + counters. 

730 

731 Also resets crash count on successful access — confirms recovery. 

732 """ 

733 with self._lock: 

734 state = self._models.get(tool_name) 

735 if state: 

736 state.last_access_time = time.time() 

737 state.access_count += 1 

738 state.access_count_session += 1 

739 # Successful access = process is healthy, reset crash state 

740 if state.crash_count > 0: 

741 logger.info(f"Model {tool_name} recovered after " 

742 f"{state.crash_count} crash(es)") 

743 state.crash_count = 0 

744 state.restart_backoff_s = 0.0 

745 state.downgraded = False 

746 

747 # ── Tick phases ─────────────────────────────────────────── 

748 

749 def _refresh_memory_state(self): 

750 """Re-read actual GPU state and sync with RTM running state.""" 

751 try: 

752 from .vram_manager import vram_manager 

753 vram_manager.refresh_gpu_info() 

754 except Exception: 

755 pass 

756 

757 # Sync with RTM's actual process state 

758 try: 

759 from .runtime_manager import runtime_tool_manager, TOOL_CONFIGS 

760 for tool_name in TOOL_CONFIGS: 

761 is_alive = runtime_tool_manager._is_server_alive(tool_name) 

762 with self._lock: 

763 state = self._models.get(tool_name) 

764 if state and not is_alive and state.device != ModelDevice.UNLOADED: 

765 state.device = ModelDevice.UNLOADED 

766 state.priority = ModelPriority.IDLE 

767 state.vram_gb = 0.0 

768 state.ram_gb = 0.0 

769 except Exception: 

770 pass 

771 

772 def _update_priorities(self): 

773 """Recalculate priority for every tracked model. 

774 

775 Respects two eviction-policy flags on each ModelState: 

776 

777 * ``pinned`` — the model is always ACTIVE; no matter how 

778 long it idles, it is never demoted to EVICTABLE. Applied 

779 to the draft 0.8B classifier which every chat message 

780 hits first. 

781 

782 * ``pressure_evict_only`` — the model never enters the 

783 EVICTABLE priority through the passive idle path. It 

784 can still hit EVICTABLE temporarily in 

785 ``_respond_to_vram_pressure`` / ``_respond_to_ram_pressure`` 

786 which is exactly what "evict under pressure" means. 

787 Applied to main-tier chat LLMs (2B / 4B) so a 30-second 

788 idle doesn't kill the server the user is actively 

789 talking to. 

790 """ 

791 now = time.time() 

792 with self._lock: 

793 for state in self._models.values(): 

794 if state.device == ModelDevice.UNLOADED: 

795 continue 

796 

797 # Pinned models are permanently ACTIVE. They stay loaded 

798 # for the life of the process and never feel eviction. 

799 if state.pinned: 

800 state.priority = ModelPriority.ACTIVE 

801 continue 

802 

803 if state.active_inference_count > 0: 

804 state.priority = ModelPriority.ACTIVE 

805 continue 

806 

807 idle_s = (now - state.last_access_time 

808 if state.last_access_time else float('inf')) 

809 

810 if idle_s < state.idle_timeout_s * 0.5: 

811 state.priority = ModelPriority.WARM 

812 elif idle_s < state.idle_timeout_s: 

813 # Hive boost extends warm period 

814 if state.hive_boost: 

815 state.priority = ModelPriority.WARM 

816 else: 

817 state.priority = ModelPriority.IDLE 

818 else: 

819 # Past the idle timeout. A normal model becomes 

820 # EVICTABLE and is swept up by _evict_idle_models 

821 # in phase 7. A pressure_evict_only model is 

822 # capped at IDLE so it survives the passive sweep 

823 # and only gets unloaded when VRAM pressure forces 

824 # the hand in phase 3. 

825 if state.pressure_evict_only: 

826 state.priority = ModelPriority.IDLE 

827 else: 

828 state.priority = ModelPriority.EVICTABLE 

829 

830 def _detect_vram_pressure(self) -> bool: 

831 """True if current VRAM usage exceeds threshold.""" 

832 try: 

833 from .vram_manager import vram_manager 

834 pct = vram_manager.get_vram_usage_pct() 

835 return pct >= self._vram_pressure_pct 

836 except Exception: 

837 return False 

838 

839 def _detect_ram_pressure(self) -> bool: 

840 """True if system RAM usage exceeds threshold.""" 

841 try: 

842 import psutil 

843 return psutil.virtual_memory().percent >= self._ram_pressure_pct 

844 except ImportError: 

845 return False 

846 

847 def _detect_cpu_pressure(self) -> bool: 

848 """True if CPU usage exceeds threshold.""" 

849 try: 

850 import psutil 

851 pct = psutil.cpu_percent(interval=None) 

852 return pct >= self._cpu_pressure_pct 

853 except ImportError: 

854 return False 

855 

856 def _detect_disk_pressure(self) -> bool: 

857 """True if free disk space is below minimum threshold.""" 

858 try: 

859 code_root = os.environ.get( 

860 'HEVOLVE_CODE_ROOT', 

861 os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 

862 usage = shutil.disk_usage(code_root) 

863 free_gb = usage.free / (1024 ** 3) 

864 return free_gb < self._disk_free_min_gb 

865 except Exception: 

866 return False 

867 

868 def _respond_to_vram_pressure(self): 

869 """Evict or offload GPU models to relieve VRAM pressure. 

870 

871 Strategy: EVICTABLE first (LRU) → IDLE offload to CPU → WARM offload. 

872 Never touches ACTIVE models. Never touches pinned models — the 

873 ``pinned`` guard is a second safety net on top of the 

874 ``_update_priorities`` rule that forces pinned models into 

875 ACTIVE priority. It catches out-of-band callers that might 

876 invoke this method between priority-update phases. 

877 """ 

878 with self._lock: 

879 candidates = sorted( 

880 [s for s in self._models.values() 

881 if s.device in (ModelDevice.GPU, ModelDevice.CPU_OFFLOAD) 

882 and s.priority != ModelPriority.ACTIVE 

883 and not s.pinned], 

884 key=lambda s: (-_PRIORITY_RANK.get(s.priority, 99), 
885 s.last_access_time)  # most-evictable first, LRU within class 

886 ) 

887 # Work on a copy of names to avoid modifying dict during iteration 

888 candidate_names = [(s.name, s.priority, s.supports_cpu_offload) 

889 for s in candidates] 

890 

891 for name, priority, can_offload in candidate_names: 

892 if not self._detect_vram_pressure(): 

893 break # Pressure relieved 

894 

895 if priority == ModelPriority.EVICTABLE: 

896 self._do_unload(name) 

897 elif can_offload: 

898 self._do_offload_to_cpu(name) 

899 elif priority in (ModelPriority.IDLE, ModelPriority.EVICTABLE): 

900 self._do_unload(name) 

901 

902 def _respond_to_ram_pressure(self): 

903 """Unload CPU models under RAM pressure (LRU first). 

904 

905 Pinned models are excluded as a second safety net — the 

906 ``pinned`` guard ensures a draft classifier that was 

907 offloaded to CPU for any reason still survives RAM pressure. 

908 """ 

909 with self._lock: 

910 candidates = sorted( 

911 [s for s in self._models.values() 

912 if s.device == ModelDevice.CPU 

913 and s.priority != ModelPriority.ACTIVE 

914 and not s.pinned], 

915 key=lambda s: s.last_access_time 

916 ) 

917 names = [s.name for s in candidates] 

918 

919 for name in names: 

920 if not self._detect_ram_pressure(): 

921 break 

922 self._do_unload(name) 

923 

924 def _respond_to_cpu_pressure(self): 

925 """Reduce system load by evicting non-essential models. 

926 

927 Under CPU pressure, background model processes consume cycles. 

928 Evict EVICTABLE and IDLE models to free CPU for user-facing 

929 work. Pinned models are exempt as a second safety net. 

930 """ 

931 with self._lock: 

932 evictable = [ 

933 s.name for s in self._models.values() 

934 if s.priority in (ModelPriority.EVICTABLE, ModelPriority.IDLE) 

935 and s.device != ModelDevice.UNLOADED 

936 and s.active_inference_count == 0 

937 and not s.pinned 

938 ] 

939 for name in evictable: 

940 logger.info(f"Lifecycle: evicting '{name}' due to CPU pressure") 

941 self._do_unload(name) 

942 

943 def _evict_idle_models(self): 

944 """Background GC: unload models past their idle timeout. 

945 

946 Belt-and-suspenders: the ``pinned`` and ``pressure_evict_only`` 

947 guards live in :meth:`_update_priorities` which never demotes 

948 those models to EVICTABLE, so they don't appear in the 

949 candidate list below. We still check ``pinned`` explicitly 

950 here as a second safety net — if a future code path assigns 

951 EVICTABLE directly (e.g. manual admin eviction), pinning 

952 should still beat it. 

953 """ 

954 now = time.time() 

955 with self._lock: 

956 evictable = [ 

957 s.name for s in self._models.values() 

958 if s.priority == ModelPriority.EVICTABLE 

959 and s.device != ModelDevice.UNLOADED 

960 and s.active_inference_count == 0 

961 and not s.pinned 

962 ] 

963 

964 for name in evictable: 

965 with self._lock: 

966 state = self._models.get(name) 

967 if not state or state.pinned: 

968 continue 

969 idle_s = now - state.last_access_time if state.last_access_time else float('inf') 

970 if idle_s > state.idle_timeout_s: 

971 logger.info( 

972 f"Lifecycle: evicting idle model '{name}' " 

973 f"(idle {idle_s:.0f}s > timeout {state.idle_timeout_s:.0f}s)") 

974 self._do_unload(name) 

975 

976 # ── Load/Unload/Offload operations ──────────────────────── 

977 

978 def _do_unload(self, tool_name: str): 

979 """Unload a model via RuntimeToolManager.""" 

980 try: 

981 from .runtime_manager import runtime_tool_manager 

982 runtime_tool_manager.stop_tool(tool_name) 

983 # on_tool_stopped hook will update our state 

984 logger.info(f"Lifecycle: unloaded '{tool_name}'") 

985 except Exception as e: 

986 logger.debug(f"Lifecycle unload error for {tool_name}: {e}") 

987 

988 def _do_offload_to_cpu(self, tool_name: str) -> bool: 

989 """Migrate a GPU model to CPU.""" 

990 offload_info = CPU_OFFLOAD_TABLE.get(tool_name) 

991 if not offload_info or not offload_info[0]: 

992 return False 

993 

994 with self._lock: 

995 state = self._models.get(tool_name) 

996 if not state or state.device == ModelDevice.CPU: 

997 return False 

998 if state.active_inference_count > 0: 

999 return False 

1000 

1001 _, cpu_ram_gb, method = offload_info 

1002 success = False 

1003 

1004 if method == 'torch_to_cpu': 

1005 success = self._offload_torch_to_cpu(tool_name) 

1006 elif method == 'restart_cpu': 

1007 success = self._offload_via_restart(tool_name) 

1008 

1009 if success: 

1010 with self._lock: 

1011 state = self._models.get(tool_name) 

1012 if state: 

1013 old_vram = state.vram_gb 

1014 state.device = ModelDevice.CPU 

1015 state.vram_gb = 0.0 

1016 state.ram_gb = cpu_ram_gb 

1017 try: 

1018 from .vram_manager import vram_manager 

1019 vram_manager.release(tool_name) 

1020 except Exception: 

1021 pass 

1022 logger.info(f"Lifecycle: offloaded '{tool_name}' to CPU") 

1023 

1024 return success 

1025 

1026 def _offload_torch_to_cpu(self, tool_name: str) -> bool: 

1027 """Move in-process model to CPU / unload to free VRAM.""" 

1028 if tool_name == 'whisper': 

1029 try: 

1030 # faster-whisper (CTranslate2) — used by Nunba. No .cpu() method, 

1031 # must set to None and clear cache to free VRAM. 

1032 from .whisper_tool import unload_whisper 

1033 unload_whisper() 

1034 return True 

1035 except Exception as e: 

1036 logger.debug(f"Whisper offload failed: {e}") 

1037 return False 

1038 

1039 def _offload_via_restart(self, tool_name: str) -> bool: 

1040 """Stop and restart a sidecar tool with cpu_only offload mode.""" 

1041 try: 

1042 from .runtime_manager import runtime_tool_manager, TOOL_CONFIGS 

1043 runtime_tool_manager.stop_tool(tool_name) 

1044 config = TOOL_CONFIGS.get(tool_name) 

1045 if config and not config.get('is_inprocess'): 

1046 result = runtime_tool_manager._start_sidecar( 

1047 tool_name, config, 'cpu_only') 

1048 return result.get('running', False) 

1049 except Exception as e: 

1050 logger.debug(f"Restart offload for {tool_name} failed: {e}") 

1051 return False 

1052 

1053 # ── Phase 10-13: Process Health, Crash Recovery, Swap, Alerts ── 

1054 

1055 # Exit codes that indicate OOM kill 

1056 _OOM_EXIT_CODES = { 

1057 137, # Linux SIGKILL (128 + 9) — typical OOM killer 

1058 -9, # Python representation of SIGKILL 

1059 9, # Raw SIGKILL 

1060 3221225477, # Windows 0xC0000005 — access violation (often OOM-related) 

1061 3221225725, # Windows 0xC00000FD — stack overflow 

1062 } 
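    # Illustrative classification (hypothetical helper mirroring the check in
    # _handle_dead_process): subprocess poll() reports SIGKILL as -9 on POSIX,
    # while a shell wrapper reports 137 (128 + 9); both land in the set above.
    @classmethod
    def _looks_like_oom_example(cls, exit_code):
        return exit_code is not None and exit_code in cls._OOM_EXIT_CODES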

1063 

1064 def _check_process_health(self): 

1065 """Phase 10: Detect dead processes and classify crash vs clean exit. 

1066 

1067 For each model we think is loaded, verify the actual process is alive. 

1068 If dead: record exit code, classify OOM vs clean, queue restart. 

1069 """ 

1070 dead_models = [] 

1071 

1072 # Check RTM-managed sidecar processes 

1073 try: 

1074 from .runtime_manager import runtime_tool_manager, TOOL_CONFIGS 

1075 for tool_name in TOOL_CONFIGS: 

1076 with self._lock: 

1077 state = self._models.get(tool_name) 

1078 if not state or state.device == ModelDevice.UNLOADED: 

1079 continue 

1080 

1081 proc = runtime_tool_manager._processes.get(tool_name) 

1082 config = TOOL_CONFIGS.get(tool_name, {}) 

1083 

1084 if config.get('is_inprocess'): 

1085 # In-process models — check module-level state 

1086 if not runtime_tool_manager._is_server_alive(tool_name): 

1087 dead_models.append((tool_name, None, 'inprocess')) 

1088 elif proc is not None: 

1089 exit_code = proc.poll() 

1090 if exit_code is not None: 

1091 dead_models.append((tool_name, exit_code, 'sidecar')) 

1092 else: 

1093 # No process object but state says loaded — stale state 

1094 dead_models.append((tool_name, None, 'orphan')) 

1095 except Exception as e: 

1096 logger.debug(f"Health check RTM scan error: {e}") 

1097 

1098 # Check LLM (llama.cpp) process — not managed by RTM 

1099 try: 

1100 self._check_llm_health(dead_models) 

1101 except Exception as e: 

1102 # CRITICAL: previous code was `except Exception: pass` which 

1103 # silently masked LLM watchdog failures. The user's "main LLM 

1104 # cannot die" rule requires every health-check failure to be 

1105 # visible in the log. Log + traceback so the next iteration 

1106 # of root-cause work has signal instead of silence. 

1107 logger.exception( 

1108 f"[LLM-WATCHDOG] _check_llm_health raised {type(e).__name__}: " 

1109 f"{e!s} — main LLM health is now UNKNOWN; " 

1110 f"watchdog will retry on next tick" 

1111 ) 

1112 

1113 # Process each dead model 

1114 for tool_name, exit_code, proc_type in dead_models: 

1115 logger.warning( 

1116 f"[LLM-WATCHDOG] Dead model queued for handling: " 

1117 f"tool={tool_name} exit_code={exit_code} proc_type={proc_type}" 

1118 ) 

1119 self._handle_dead_process(tool_name, exit_code, proc_type) 

1120 

1121 def _check_llm_health(self, dead_models: list): 

1122 """Check llama.cpp server health (separate from RTM sidecar tools). 

1123 

1124 Supervises BOTH: 

1125 1. Nunba-managed server via LlamaConfig (primary in bundled mode). 

1126 2. Direct-launch server (self._direct_llama_proc) when Nunba's 

1127 LlamaConfig module isn't available (G3 — standalone / Docker / 

1128 bundled-without-supervisor). 

1129 

1130 Either source adding to ``dead_models`` triggers _handle_dead_process, 

1131 which queues a restart through _process_restart_queue. 

1132 """ 

1133 with self._lock: 

1134 state = self._models.get('llm') 

1135 # The MAIN LLM is the system's primary engine — per the user's 

1136 # explicit "main LLM cannot die" invariant we MUST keep checking 

1137 # whether it is alive even when our local state thinks it is 

1138 # UNLOADED. An UNLOADED state can come from: 

1139 # (a) crash_count exceeded max → _handle_dead_process bailed out 

1140 # (b) eviction under VRAM pressure 

1141 # (c) a stale state from a prior boot where Nunba "adopted" an 

1142 # externally-running server (server_process=None) and never 

1143 # refreshed device when that adopted server died 

1144 # In every one of those cases, the previous early-return left the 

1145 # main engine permanently dead. We instead log + continue, so 

1146 # the HTTP probe below can decide. 

1147 if not state: 

1148 # STATELESS PROBE PATH (added 2026-05-08 after live evidence 

1149 # showed llama-server died at 18:47 and the watchdog was blind 

1150 # because Nunba's LlamaConfig.start_server() spawns the server 

1151 # outside RuntimeToolManager — so RTM's `_on_tool_started('llm')` 

1152 # callback never fires and `self._models['llm']` stays absent). 

1153 # Even without a registered state we MUST keep the main engine 

1154 # alive: probe llama-server via Nunba's LlamaConfig directly, 

1155 # and if it's down, queue a restart via the existing 

1156 # `_handle_dead_process` path (which will re-spawn via 

1157 # `LlamaConfig.start_server`). The fake state we synthesize 

1158 # here lives only inside `_handle_dead_process` (it reads 

1159 # state.crash_count etc.), so we register a minimal one 

1160 # before queueing. 

1161 try: 

1162 from llama.llama_config import LlamaConfig 

1163 _cfg = LlamaConfig() 

1164 if not _cfg.check_server_running(): 

1165 logger.warning( 

1166 f"[LLM-WATCHDOG] STATELESS-PROBE: llama-server " 

1167 f"unreachable on port " 

1168 f"{_cfg.config.get('server_port')!r} and no 'llm' " 

1169 f"state registered with lifecycle. Registering a " 

1170 f"minimal state and queueing restart so the main " 

1171 f"engine cannot stay dead silently." 

1172 ) 

1173 # Register a minimal state so _handle_dead_process can 

1174 # update crash_count + restart_backoff etc. 

1175 with self._lock: 

1176 self._models['llm'] = ModelState( 

1177 name='llm', 

1178 device=ModelDevice.UNLOADED, 

1179 priority=ModelPriority.IDLE, 

1180 vram_gb=0.0, 

1181 ram_gb=0.0, 

1182 last_access_time=time.time(), 

1183 crash_count=0, 

1184 pressure_evict_only=True, 

1185 ) 

1186 dead_models.append(('llm', None, 'stateless_probe')) 

1187 return 

1188 else: 

1189 # Healthy but unregistered — register state lazily so 

1190 # subsequent ticks take the fast path with full info. 

1191 logger.info( 

1192 f"[LLM-WATCHDOG] STATELESS-PROBE: llama-server is " 

1193 f"alive on port {_cfg.config.get('server_port')!r} " 

1194 f"but had no registered state — registering now so " 

1195 f"future ticks can supervise it." 

1196 ) 

1197 with self._lock: 

1198 self._models['llm'] = ModelState( 

1199 name='llm', 

1200 device=ModelDevice.GPU, # Best-effort default 

1201 priority=ModelPriority.ACTIVE, 

1202 vram_gb=0.0, 

1203 ram_gb=0.0, 

1204 last_access_time=time.time(), 

1205 crash_count=0, 

1206 pressure_evict_only=True, 

1207 ) 

1208 return 

1209 except ImportError: 

1210 logger.debug( 

1211 "[LLM-WATCHDOG] No 'llm' state and LlamaConfig not " 

1212 "importable (Docker / standalone mode) — relying on " 

1213 "G3 direct-launch supervision below." 

1214 ) 

1215 except Exception as e: 

1216 logger.exception( 

1217 f"[LLM-WATCHDOG] STATELESS-PROBE raised " 

1218 f"{type(e).__name__}: {e!s} — main engine health is " 

1219 f"now UNKNOWN; will retry on next tick" 

1220 ) 

1221 return 

1222 if state and state.device == ModelDevice.UNLOADED: 

1223 logger.warning( 

1224 f"[LLM-WATCHDOG] state.device=UNLOADED — investigating " 

1225 f"whether the main engine should be re-loaded " 

1226 f"(crash_count={state.crash_count}, " 

1227 f"last_exit_code={state.last_exit_code}). " 

1228 f"Continuing health check despite UNLOADED — " 

1229 f"main LLM cannot stay dead." 

1230 ) 

1231 

1232 nunba_handled = False 

1233 try: 

1234 from llama.llama_config import LlamaConfig 

1235 config = LlamaConfig() 

1236 if config.server_process is not None: 

1237 nunba_handled = True 

1238 exit_code = config.server_process.poll() 

1239 if exit_code is not None: 

1240 logger.warning( 

1241 f"[LLM-WATCHDOG] Nunba-spawned llama-server died: " 

1242 f"PID={config.server_process.pid} " 

1243 f"exit_code={exit_code}" 

1244 ) 

1245 dead_models.append(('llm', exit_code, 'llm_server')) 

1246 else: 

1247 logger.debug( 

1248 f"[LLM-WATCHDOG] Nunba-spawned llama-server alive " 

1249 f"PID={config.server_process.pid}" 

1250 ) 

1251 else: 

1252 # No Popen handle — either we adopted an external server, OR 

1253 # state thinks it's loaded but the spawning Python process 

1254 # already exited (handle gone with it). Either way, an 

1255 # HTTP probe is the only authoritative signal. We do NOT 

1256 # gate on `state.device != UNLOADED` here — the main LLM 

1257 # must be probed even when state is stale (see Bug B in 

1258 # the 2026-05-08 incident: adopt-then-die left the system 

1259 # half-alive because the watchdog stopped probing once 

1260 # state went UNLOADED). 

1261 alive = config.check_server_running() 

1262 if not alive: 

1263 logger.warning( 

1264 f"[LLM-WATCHDOG] Adopted/stateless llama-server is " 

1265 f"UNREACHABLE via HTTP probe — queueing restart. " 

1266 f"state.device={state.device.value} " 

1267 f"server_port={config.config.get('server_port')}" 

1268 ) 

1269 dead_models.append(('llm', None, 'llm_server')) 

1270 nunba_handled = True 

1271 else: 

1272 nunba_handled = True 

1273 logger.debug( 

1274 "[LLM-WATCHDOG] Adopted llama-server alive (HTTP 200)" 

1275 ) 

1276 except ImportError as e: 

1277 # Previous code: `except ImportError: pass` — silent. Surface 

1278 # this as a log line so when Nunba is bundled-without-supervisor 

1279 # we can see the LlamaConfig path is unavailable and the G3 

1280 # direct-launch path below is being used by design. 

1281 logger.info( 

1282 f"[LLM-WATCHDOG] LlamaConfig import unavailable " 

1283 f"({e!s}) — falling through to G3 direct-launch supervision" 

1284 ) 

1285 except Exception as e: 

1286 # Previous code only caught ImportError — a real LlamaConfig() 

1287 # instantiation error (e.g. corrupted llama_config.json, 

1288 # filesystem permission error) would propagate up and be 

1289 # swallowed by the outer except in _check_process_health. 

1290 # Log + re-raise so caller's CRITICAL log fires. 

1291 logger.exception( 

1292 f"[LLM-WATCHDOG] LlamaConfig health-check raised " 

1293 f"{type(e).__name__}: {e!s}" 

1294 ) 

1295 raise 

1296 

1297 # G3: Direct-launch supervision (fires when Nunba not in charge). 

1298 if nunba_handled: 

1299 return 

1300 proc = self._direct_llama_proc 

1301 port = self._direct_llama_port 

1302 if proc is not None: 

1303 exit_code = proc.poll() 

1304 if exit_code is not None: 

1305 logger.warning( 

1306 f"[G3] direct llama-server died: PID={proc.pid} " 

1307 f"exit_code={exit_code} port={port}") 

1308 # Clear stale handle so _handle_dead_process → restart 

1309 # can cleanly relaunch via _launch_llama_server_direct. 

1310 try: 

1311 fh = self._direct_llama_log_fh 

1312 if fh is not None and not fh.closed: 

1313 fh.close() 

1314 except Exception: 

1315 pass 

1316 self._direct_llama_proc = None 

1317 self._direct_llama_log_fh = None 

1318 dead_models.append(('llm', exit_code, 'llm_server_direct')) 

1319 elif port is not None and (state is None or state.device != ModelDevice.UNLOADED): 

1320 # We once launched directly but lost the handle — verify via HTTP. 

1321 # A missing handle + unreachable port == dead server. 

1322 try: 

1323 import urllib.request as _ur 

1324 import urllib.error as _ue 

1325 url = f'http://127.0.0.1:{port}/health' 

1326 try: 

1327 with _ur.urlopen(url, timeout=2) as resp: 

1328 if resp.status != 200: 

1329 dead_models.append( 

1330 ('llm', None, 'llm_server_direct')) 

1331 except (_ue.URLError, _ue.HTTPError, OSError): 

1332 dead_models.append(('llm', None, 'llm_server_direct')) 

1333 except Exception as e: 

1334 logger.debug(f"[G3] direct HTTP health check skipped: {e}") 

1335 

1336 def _handle_dead_process(self, tool_name: str, exit_code: Optional[int], 

1337 proc_type: str): 

1338 """Classify crash, update state, queue restart if appropriate.""" 

1339 is_oom = exit_code in self._OOM_EXIT_CODES if exit_code is not None else False 

1340 crash_type = 'oom' if is_oom else ('crash' if exit_code else 'disappeared') 

1341 

1342 logger.warning( 

1343 f"Dead process detected: {tool_name} " 

1344 f"(exit_code={exit_code}, type={crash_type}, proc={proc_type})") 

1345 

1346 with self._lock: 

1347 state = self._models.get(tool_name) 

1348 if not state: 

1349 return 

1350 

1351 old_device = state.device 

1352 old_vram = state.vram_gb 

1353 

1354 # Update state to unloaded 

1355 state.device = ModelDevice.UNLOADED 

1356 state.priority = ModelPriority.IDLE 

1357 state.crash_count += 1 

1358 state.last_crash_time = time.time() 

1359 state.last_exit_code = exit_code 

1360 state.vram_gb = 0.0 

1361 state.ram_gb = 0.0 

1362 state.active_inference_count = 0 

1363 

1364 # Exponential backoff: 5s, 10s, 20s, 40s... capped at 300s 

1365 state.restart_backoff_s = min( 

1366 self._base_backoff_s * (2 ** (state.crash_count - 1)), 

1367 self._max_backoff_s 

1368 ) 

1369 

1370 # MAIN LLM EXEMPTION (user invariant 2026-05-08): 

1371 # "the LLM is the main engine, it cannot die — we observe with 

1372 # watchdogs etc." For every other tool we honor max_crash_restarts 

1373 # so a permanently-broken sidecar doesn't loop forever. For 'llm' 

1374 # specifically, we keep retrying — but at the capped backoff 

1375 # interval (300s) so we don't burn CPU on a hopeless box. The 

1376 # alternative (give up) leaves the system permanently degraded 

1377 # and was the root cause of the 2026-05-08 11:40 → 13:56+ silent 

1378 # outage. 

1379 if tool_name == 'llm': 

1380 should_restart = True 

1381 if state.crash_count > self._max_crash_restarts: 

1382 # Past the configured ceiling — log loudly so on-call 

1383 # sees the persistent failure, but keep trying at the 

1384 # max backoff. restart_backoff_s is already clamped at 
1385 # _max_backoff_s by the min() above, so no further reset is needed. 

1386 logger.error( 

1387 f"[LLM-WATCHDOG] LLM has crashed {state.crash_count} " 

1388 f"times (> max_crash_restarts={self._max_crash_restarts})" 

1389 f" — continuing to retry every {self._max_backoff_s}s " 

1390 f"because the main engine cannot stay dead. " 

1391 f"Investigate llama-server log for repeated crash " 

1392 f"cause." 

1393 ) 

1394 else: 

1395 should_restart = state.crash_count <= self._max_crash_restarts 

1396 

1397 # Release VRAM allocation 

1398 try: 

1399 from .vram_manager import vram_manager 

1400 vram_manager.release(tool_name) 

1401 except Exception: 

1402 pass 

1403 

1404 # Release from RTM process table 

1405 try: 

1406 from .runtime_manager import runtime_tool_manager 

1407 runtime_tool_manager._processes.pop(tool_name, None) 

1408 runtime_tool_manager._ports.pop(tool_name, None) 

1409 except Exception: 

1410 pass 

1411 

1412 # Sync catalog state 

1413 try: 

1414 from integrations.service_tools.model_orchestrator import get_orchestrator 

1415 get_orchestrator().notify_unloaded( 

1416 self._guess_model_type(tool_name), tool_name) 

1417 except Exception: 

1418 pass 

1419 

1420 # Emit crash event 

1421 self._emit_event('model.crash', { 

1422 'model': tool_name, 

1423 'crash_type': crash_type, 

1424 'exit_code': exit_code, 

1425 'crash_count': state.crash_count if state else 0, 

1426 'will_restart': should_restart, 

1427 }) 

1428 

1429 # Queue restart with backoff (if under max retries) 

1430 if should_restart: 

1431 # IDEMPOTENCE GUARD (2026-05-09 starvation fix): 

1432 # If a restart is already queued or in progress, do NOT 

1433 # overwrite it. Re-queuing every dead-detection tick was 

1434 # the 2026-05-09 starvation bug — retry_after kept getting 

1435 # pushed forward by the latest tick's `now + backoff_s`, 

1436 # so _process_restart_queue's `now >= retry_after` check 

1437 # never succeeded. Live evidence: 220 `Queued restart for 

1438 # llm` lines, 0 `_restart_llm starting` lines over 7.5h. 

1439 # Keep the original retry_after so the spawn fires. 

1440 with self._lock: 

1441 existing = self._restart_pending.get(tool_name) 

1442 if existing and isinstance(existing, dict): 

1443 if existing.get('in_progress'): 

1444 logger.debug( 

1445 f"[LIFECYCLE] {tool_name} restart IN PROGRESS — " 

1446 f"suppressing re-queue from dead-detection tick" 

1447 ) 

1448 else: 

1449 eta = max(0, existing.get('retry_after', 0) - time.time()) 

1450 logger.debug( 

1451 f"[LIFECYCLE] {tool_name} restart already queued " 

1452 f"(fires in {eta:.0f}s) — keeping original schedule, " 

1453 f"not re-queuing" 

1454 ) 

1455 return 

1456 retry_after = time.time() + state.restart_backoff_s 

1457 downgrade = is_oom # OOM → restart on lower resource tier 

1458 self._restart_pending[tool_name] = { 

1459 'retry_after': retry_after, 

1460 'downgrade': downgrade, 

1461 'old_device': old_device.value if old_device else 'gpu', 

1462 'in_progress': False, 

1463 } 

1464 logger.info( 

1465 f"Queued restart for {tool_name} in {state.restart_backoff_s:.0f}s" 

1466 f" (attempt {state.crash_count}/{self._max_crash_restarts})" 

1467 f"{' [DOWNGRADE]' if downgrade else ''}") 

1468 else: 

1469 logger.error( 

1470 f"Model {tool_name} exceeded max restarts " 

1471 f"({self._max_crash_restarts}), giving up. " 

1472 f"Manual intervention required.") 

1473 self._emit_event('model.restart_exhausted', { 

1474 'model': tool_name, 

1475 'crash_count': state.crash_count if state else 0, 

1476 }) 

1477 

1478 def _process_restart_queue(self): 

1479 """Phase 11: Process pending crash restarts with exponential backoff.""" 

1480 now = time.time() 

1481 ready = [] 

1482 with self._lock: 

1483 for name, info in list(self._restart_pending.items()): 

1484 if not isinstance(info, dict): 

1485 continue 

1486 if info.get('in_progress'): 

1487 # Already being spawned by a prior tick — skip. Without 

1488 # this check, two ticks could pull the same entry and 

1489 # double-spawn (race that motivated the IDEMPOTENCE 

1490 # GUARD in _handle_dead_process — both halves needed). 

1491 continue 

1492 if now >= info.get('retry_after', 0): 

1493 # Mark in_progress instead of deleting. Retain the 

1494 # entry so concurrent ticks see in_progress=True and 

1495 # skip. We pop the entry only AFTER spawn completes 

1496 # (success or exception), in the finally block below. 

1497 info['in_progress'] = True 

1498 self._restart_pending[name] = info 

1499 ready.append((name, info)) 

1500 

1501 for name, info in ready: 

1502 # IDEMPOTENCE: pre-spawn HTTP probe. If the server came up 

1503 # via an external respawn (user restarted llama-server in a 

1504 # different terminal) between when this restart was queued 

1505 # and now, skip the spawn — don't kill the user's process 

1506 # via stop_server() and re-spawn our own. Same probe also 

1507 # protects against the race where an earlier _restart_llm 

1508 # already succeeded but a stale queue entry survived. 

1509 if name == 'llm': 

1510 try: 

1511 from llama.llama_config import LlamaConfig 

1512 if LlamaConfig().check_server_running(): 

1513 logger.info( 

1514 f"[LIFECYCLE] {name} already healthy on HTTP " 

1515 f"probe — skipping queued spawn (idempotent " 

1516 f"no-op; likely came up via external respawn " 

1517 f"or earlier _restart_llm)" 

1518 ) 

1519 # Detect ACTUAL device — Nunba defaults to GPU but 

1520 # an externally-respawned server (or a Nunba retry 

1521 # after a CUDA-context loss from sleep/hibernate) 

1522 # may be on CPU. Self-review caught the v1 of this 

1523 # code unconditionally stamping GPU, which would 

1524 # mislead the catalog/dispatcher. vram_manager has 

1525 # an internal cache so this is sub-millisecond after 

1526 # first call (no nvidia-smi cost per tick). 

1527 try: 

1528 from integrations.service_tools.vram_manager import ( 

1529 vram_manager as _vmm, 

1530 ) 

1531 _gpu_info = _vmm.detect_gpu() or {} 

1532 _device = (ModelDevice.GPU 

1533 if _gpu_info.get('cuda_available') 

1534 else ModelDevice.CPU) 

1535 except Exception: 

1536 # If vram_manager can't be reached, prefer GPU 

1537 # since that's the default Nunba launch mode — 

1538 # next full _check_llm_health tick will resync 

1539 # if wrong. 

1540 _device = ModelDevice.GPU 

1541 with self._lock: 

1542 self._restart_pending.pop(name, None) 

1543 # Sync watchdog state so STATELESS-PROBE doesn't 

1544 # immediately re-detect "dead" and re-queue. 

1545 st = self._models.get(name) 

1546 if st: 

1547 st.device = _device 

1548 st.priority = ModelPriority.ACTIVE 

1549 st.last_access_time = time.time() 

1550 continue 

1551 except Exception: 

1552 pass # Probe failed — proceed with spawn 

1553 downgrade = info.get('downgrade', False) 

1554 old_device = info.get('old_device', 'gpu') 

1555 

1556 # Decide restart mode 

1557 if downgrade and old_device == 'gpu': 

1558 restart_mode = 'cpu_offload' # OOM on GPU → try CPU offload 

1559 elif downgrade and old_device == 'cpu_offload': 

1560 restart_mode = 'cpu_only' # OOM on offload → pure CPU 

1561 else: 

1562 restart_mode = old_device # Same mode as before 

1563 

1564 logger.info(f"Restarting {name} in {restart_mode} mode" 

1565 f" (was {old_device}, downgrade={downgrade})") 

1566 

1567 success = False 

1568 try: 

1569 if name == 'llm': 

1570 success = self._restart_llm(restart_mode) 

1571 else: 

1572 success = self._restart_rtm_tool(name, restart_mode) 

1573 finally: 

1574 # ALWAYS clear the in_progress entry after spawn returns. 

1575 # If we succeeded, no further pending needed. If we failed 

1576 # and the LLM-EXEMPTION + dead-detection wants another 

1577 # attempt, the next _check_process_health tick re-queues 

1578 # via _handle_dead_process — and that path's IDEMPOTENCE 

1579 # GUARD already prevents starvation by NOT overwriting 

1580 # pending if one already exists. So pop here is safe. 

1581 # Without this pop, in_progress=True would persist and 

1582 # the entry would be skipped forever by future ticks. 

1583 with self._lock: 

1584 self._restart_pending.pop(name, None) 

1585 

1586 if success: 

1587 with self._lock: 

1588 state = self._models.get(name) 

1589 if state: 

1590 state.downgraded = downgrade 

1591 self._emit_event('model.restarted', { 

1592 'model': name, 'mode': restart_mode, 'downgraded': downgrade}) 

1593 else: 

1594 # Failure path: bump crash_count + backoff, then re-queue 

1595 # ONLY if no entry is currently pending (other ticks may 

1596 # have re-queued via _handle_dead_process while we were 

1597 # spawning). Same idempotence rule as _handle_dead_process. 

1598 with self._lock: 

1599 state = self._models.get(name) 

1600 already_pending = name in self._restart_pending 

1601 if (state and not already_pending 

1602 and state.crash_count <= self._max_crash_restarts): 

1603 state.crash_count += 1 

1604 state.restart_backoff_s = min( 

1605 state.restart_backoff_s * 2, self._max_backoff_s) 

1606 self._restart_pending[name] = { 

1607 'retry_after': now + state.restart_backoff_s, 

1608 'downgrade': downgrade, 

1609 'old_device': restart_mode, 

1610 'in_progress': False, 

1611 } 

1612 
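# Sketch of the backoff doubling and OOM downgrade ladder used by the failure
# path above; MAX_BACKOFF_S is an illustrative stand-in for self._max_backoff_s.
MAX_BACKOFF_S = 600.0

def next_backoff(current_s: float) -> float:
    """Exponential backoff, capped."""
    return min(current_s * 2, MAX_BACKOFF_S)

def next_mode(old_device: str, downgrade: bool) -> str:
    """OOM on GPU retries on CPU offload; OOM on offload falls back to pure CPU."""
    if downgrade and old_device == 'gpu':
        return 'cpu_offload'
    if downgrade and old_device == 'cpu_offload':
        return 'cpu_only'
    return old_device

assert [next_backoff(b) for b in (30, 120, 480)] == [60, 240, 600]
assert next_mode('gpu', True) == 'cpu_offload'
assert next_mode('cpu_offload', True) == 'cpu_only'
assert next_mode('gpu', False) == 'gpu'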

1613 def _restart_llm(self, mode: str) -> bool: 

1614 """Restart llama.cpp server in specified mode. 

1615 

1616 Primary path: Nunba's LlamaConfig (reads ~/.nunba/llama_config.json). 

1617 Fallback: direct subprocess launch (Docker/standalone without Nunba). 

1618 """ 

1619 logger.info( 

1620 f"[LLM-WATCHDOG] _restart_llm starting: mode={mode!r} " 

1621 f"(primary path: Nunba LlamaConfig.start_server)" 

1622 ) 

1623 # Primary: Nunba manages the server 

1624 try: 

1625 from llama.llama_config import LlamaConfig 

1626 config = LlamaConfig() 

1627 logger.info( 

1628 f"[LLM-WATCHDOG] Calling LlamaConfig.stop_server() to clean " 

1629 f"up any half-alive server state before restart" 

1630 ) 

1631 config.stop_server() 

1632 config.config['use_gpu'] = (mode == 'gpu') 

1633 config._save_config() 

1634 logger.info( 

1635 f"[LLM-WATCHDOG] Calling LlamaConfig.start_server() " 

1636 f"(use_gpu={config.config['use_gpu']}, " 

1637 f"port={config.config.get('server_port')})" 

1638 ) 

1639 ok = config.start_server() 

1640 if ok: 

1641 logger.info( 

1642 f"[LLM-WATCHDOG] Nunba LLM restart SUCCEEDED on port " 

1643 f"{config.config.get('server_port')}" 

1644 ) 

1645 else: 

1646 logger.error( 

1647 f"[LLM-WATCHDOG] Nunba LLM restart RETURNED FALSE " 

1648 f"from start_server() — caller will re-queue with " 

1649 f"backoff. Check llama_server_<port>.log for the " 

1650 f"underlying spawn failure." 

1651 ) 

1652 return ok 

1653 except ImportError: 

1654 logger.info( 

1655 "[LLM-WATCHDOG] Nunba LlamaConfig not importable — " 

1656 "falling through to direct llama-server launch (G3 path)" 

1657 ) 

1658 except Exception as e: 

1659 logger.exception( 

1660 f"[LLM-WATCHDOG] Nunba LLM restart raised " 

1661 f"{type(e).__name__}: {e!s} — caller will re-queue with " 

1662 f"backoff" 

1663 ) 

1664 return False 

1665 

1666 # Fallback: direct launch (Docker/standalone) 

1667 return self._launch_llama_server_direct(mode) 

1668 

1669 def _launch_llama_server_direct(self, mode: str) -> bool: 

1670 """Launch llama-server directly without Nunba. 

1671 

1672 Uses model_catalog for model selection, port_registry for port, 

1673 compute_config for parallel slots, vram_manager for GPU detection. 

1674 """ 

1675 import subprocess as _sp 

1676 

1677 # Find llama-server binary 

1678 server_bin = self._find_llama_server_binary() 

1679 if not server_bin: 

1680 logger.error("llama-server binary not found") 

1681 return False 

1682 

1683 # Find model + mmproj from catalog 

1684 model_path, mmproj_path = self._find_model_files() 

1685 if not model_path: 

1686 logger.error("No GGUF model file found") 

1687 return False 

1688 

1689 # Port from registry 

1690 try: 

1691 from core.port_registry import get_port 

1692 port = get_port('llm') 

1693 except Exception: 

1694 port = int(os.environ.get('HEVOLVE_LLM_PORT', 8080)) 

1695 

1696 # Build command 

1697 gpu_layers = 99 if mode == 'gpu' else 0 

1698 ctx_size = int(os.environ.get('HEVOLVE_LLM_CTX_SIZE', 8192)) 

1699 

1700 cmd = [ 

1701 server_bin, 

1702 '--model', model_path, 

1703 '--port', str(port), 

1704 '--ctx-size', str(ctx_size), 

1705 '--n-gpu-layers', str(gpu_layers), 

1706 '--threads', '4', 

1707 '--flash-attn', 'on', 

1708 ] 

1709 if mmproj_path: 

1710 cmd.extend(['--mmproj', mmproj_path]) 

1711 

1712 log_path = os.path.join( 

1713 os.environ.get('TEMP', '/tmp'), f'llama_{port}.log') 

1714 log_fh = None 

1715 try: 

1716 log_fh = open(log_path, 'w') 

1717 _popen_kwargs = dict(stdout=log_fh, stderr=_sp.STDOUT) 

1718 if os.name == 'nt': 

1719 _popen_kwargs['creationflags'] = _sp.CREATE_NO_WINDOW 

1720 proc = _sp.Popen(cmd, **_popen_kwargs) 

1721 # Store handle so it stays open for the child process 

1722 self._direct_log_fh = log_fh 

1723 log_fh = None # Prevent close in finally 

1724 # G3: Track the Popen so _check_llm_health can detect death 

1725 # and _process_restart_queue can resurrect it. 

1726 self._direct_llama_proc = proc 

1727 self._direct_llama_port = port 

1728 self._direct_llama_mode = mode 

1729 self._direct_llama_log_fh = self._direct_log_fh 

1730 self._direct_llama_last_restart = time.time() 

1731 logger.info(f"llama-server started: PID={proc.pid} port={port} " 

1732 f"gpu_layers={gpu_layers} log={log_path}") 

1733 return True 

1734 except Exception as e: 

1735 logger.error(f"Direct llama-server launch failed: {e}") 

1736 return False 

1737 finally: 

1738 if log_fh is not None: 

1739 log_fh.close() 

1740 
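# Hedged usage sketch: poll the freshly spawned server before declaring the
# model healthy. The '/health' route and the 30 s budget are assumptions about
# the local llama-server build, not something this module guarantees.
import time
import urllib.request

def wait_for_server(port: int, timeout_s: float = 30.0) -> bool:
    deadline = time.time() + timeout_s
    url = f"http://127.0.0.1:{port}/health"
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=2) as resp:
                if resp.status == 200:
                    return True
        except OSError:
            pass  # not listening yet; keep polling
        time.sleep(1.0)
    return False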

1741 @staticmethod 

1742 def _find_llama_server_binary() -> Optional[str]: 

1743 """Find llama-server binary across known locations.""" 

1744 home = str(Path.home()) 

1745 for base in ['.nunba', '.trueflow']: 

1746 for sub in ['llama.cpp/build/bin/Release', 'llama.cpp']: 

1747 for ext in ['', '.exe']: 

1748 p = os.path.join(home, base, sub, f'llama-server{ext}') 

1749 if os.path.isfile(p): 

1750 return p 

1751 # Check PATH 

1752 binary = shutil.which('llama-server') 

1753 if binary: 

1754 return binary 

1755 return None 

1756 

1757 @staticmethod 

1758 def _find_model_files() -> Tuple[Optional[str], Optional[str]]: 

1759 """Find GGUF model + mmproj from known locations. 

1760 

1761 Searches ~/.nunba/models/ and ~/.trueflow/models/ and returns the largest 

1762 non-mmproj GGUF file (so the 4B main model wins over the 0.8B draft), plus the first mmproj found. 

1763 """ 

1764 home = str(Path.home()) 

1765 model_dirs = [ 

1766 os.path.join(home, '.nunba', 'models'), 

1767 os.path.join(home, '.trueflow', 'models'), 

1768 ] 

1769 best_model = None 

1770 best_size = 0 

1771 mmproj = None 

1772 

1773 for d in model_dirs: 

1774 if not os.path.isdir(d): 

1775 continue 

1776 for f in os.listdir(d): 

1777 fp = os.path.join(d, f) 

1778 if not os.path.isfile(fp): 

1779 continue 

1780 if f.startswith('mmproj') and f.endswith('.gguf') and not mmproj: 

1781 mmproj = fp 

1782 elif f.endswith('.gguf') and not f.startswith('mmproj'): 

1783 size = os.path.getsize(fp) 

1784 if size > best_size: 

1785 best_size = size 

1786 best_model = fp 

1787 

1788 return best_model, mmproj 

1789 
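# Usage sketch with an illustrative directory layout (file names are assumptions):
#   ~/.nunba/models/qwen3-4b-instruct-q4_k_m.gguf    largest non-mmproj -> model
#   ~/.nunba/models/qwen3-0.8b-draft-q4_k_m.gguf     smaller, skipped
#   ~/.nunba/models/mmproj-vision-encoder.gguf       first mmproj-* file -> mmproj
model_path, mmproj_path = ModelLifecycleManager._find_model_files()
if model_path is None:
    logger.error("No GGUF model under ~/.nunba/models or ~/.trueflow/models")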

1790 def _restart_rtm_tool(self, tool_name: str, mode: str) -> bool: 

1791 """Restart a RuntimeToolManager-managed tool.""" 

1792 try: 

1793 from .runtime_manager import runtime_tool_manager, TOOL_CONFIGS 

1794 config = TOOL_CONFIGS.get(tool_name) 

1795 if not config: 

1796 return False 

1797 if config.get('is_inprocess'): 

1798 result = runtime_tool_manager._start_inprocess(tool_name, config) 

1799 else: 

1800 result = runtime_tool_manager._start_sidecar( 

1801 tool_name, config, mode) 

1802 return result.get('running', False) 

1803 except Exception as e: 

1804 logger.error(f"RTM restart failed for {tool_name}: {e}") 

1805 return False 

1806 

1807 # ── Swap Queue ─────────────────────────────────────────────── 

1808 

1809 def request_swap(self, needed_model: str, needed_type: str = 'gpu', 

1810 evict_target: Optional[str] = None) -> bool: 

1811 """Request a model swap: evict lowest-priority GPU model to make room. 

1812 

1813 Called by orchestrator when a model can't fit alongside current loads. 

1814 The evicted model is queued for restoration when the new model idles. 

1815 

1816 Returns True if swap was initiated, False if nothing can be evicted. 

1817 """ 

1818 with self._lock: 

1819 # Find eviction candidate: lowest priority, non-ACTIVE GPU model 

1820 if evict_target: 

1821 candidates = [evict_target] 

1822 else: 

1823 # LLMs are owned by llama-server (separate process); evicting 

1824 # them from the registry doesn't free VRAM (see the NEVER-evict-the-LLM note in ensure_inference_headroom). 

1825 # Skip them as swap candidates so we never burn down the 

1826 # active LLM in exchange for a TTS that still won't fit. 

1827 gpu_models = sorted( 

1828 [s for s in self._models.values() 

1829 if s.device == ModelDevice.GPU 

1830 and s.priority != ModelPriority.ACTIVE 

1831 and s.active_inference_count == 0 

1832 and s.name != needed_model 

1833 and not s.name.startswith('llm-') 

1834 and s.model_type != 'llm'], 

1835 key=lambda s: ( 

1836 _PRIORITY_RANK.get(s.priority, 99), 

1837 -s.last_access_time,  # negate so candidates[-1] is the LEAST recently used within a priority 

1838 ) 

1839 ) 

1840 candidates = [s.name for s in gpu_models] 

1841 

1842 if not candidates: 

1843 logger.warning(f"Swap failed for {needed_model}: no evictable GPU model") 

1844 return False 

1845 

1846 evicted = candidates[-1] # Lowest priority, oldest access 

1847 logger.info(f"Swap: evicting '{evicted}' to make room for '{needed_model}'") 

1848 

1849 # Record in swap queue BEFORE eviction 

1850 self._swap_queue.append({ 

1851 'name': evicted, 

1852 'device': 'gpu', 

1853 'evicted_for': needed_model, 

1854 'timestamp': time.time(), 

1855 }) 

1856 

1857 # Evict 

1858 self._do_unload(evicted) 

1859 

1860 self._emit_event('model.swapped', { 

1861 'evicted': evicted, 

1862 'loaded': needed_model, 

1863 'swap_queue_depth': len(self._swap_queue), 

1864 }) 

1865 return True 

1866 
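# Worked example of the eviction ordering above, with illustrative states.
# Sorting ascending by (priority rank, -last_access) puts the most evictable,
# least-recently-used GPU model at the tail, which is what gets evicted.
from dataclasses import dataclass

@dataclass
class _Candidate:
    name: str
    rank: int            # stands in for _PRIORITY_RANK[priority]
    last_access: float

_cands = [
    _Candidate('whisper', rank=2, last_access=1000.0),  # IDLE, touched recently
    _Candidate('tts-en',  rank=3, last_access=200.0),   # EVICTABLE, stale
    _Candidate('tts-ta',  rank=3, last_access=900.0),   # EVICTABLE, fresher
]
_ordered = sorted(_cands, key=lambda c: (c.rank, -c.last_access))
assert _ordered[-1].name == 'tts-en'   # the stale EVICTABLE model goes first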

1867 def ensure_inference_headroom(self, needed_gb: float = 1.0, 

1868 requester: str = 'tts') -> bool: 

1869 """Ensure enough free VRAM for inference buffers. 

1870 

1871 Called BEFORE synthesis/inference — not model loading. 

1872 Offloads models to CPU even if they're ACTIVE, as long as the 

1873 requester has higher priority (e.g., TTS > idle STT). 

1874 

1875 Whisper STT runs continuously but is mostly filtering silence — 

1876 safe to offload for a few seconds during TTS synthesis. 

1877 

1878 Returns True if headroom was created (or already existed). 

1879 """ 

1880 try: 

1881 from .vram_manager import vram_manager 

1882 free_gb = vram_manager.get_free_vram() 

1883 if free_gb >= needed_gb: 

1884 return True 

1885 

1886 logger.info(f"Inference headroom: {free_gb:.1f}GB free, need {needed_gb}GB " 

1887 f"for {requester}") 

1888 

1889 # NEVER evict the LLM — it runs as a separate process (llama-server) 

1890 # that the lifecycle manager can't actually unload. "Evicting" it just 

1891 # removes the registry entry while llama-server keeps all its VRAM. 

1892 # Instead, skip headroom check — the model will use whatever is free. 

1893 if free_gb < needed_gb:  # always true here given the early return above; the swap/offload fallbacks below are currently unreachable 

1894 logger.info(f"Inference headroom: proceeding anyway — model will " 

1895 f"use available {free_gb:.1f}GB (llama-server owns the rest)") 

1896 return True 

1897 

1898 # Standard swap first (evicts IDLE/EVICTABLE models) 

1899 if self.request_swap(needed_model=requester, needed_type='gpu'): 

1900 return True 

1901 

1902 # Force-offload any GPU model that isn't the requester. 

1903 # Whisper registers itself on load (whisper_tool.py). 

1904 with self._lock: 

1905 gpu_models = [ 

1906 s for s in self._models.values() 

1907 if s.device == ModelDevice.GPU 

1908 and s.name != requester 

1909 ] 

1910 for model in gpu_models: 

1911 logger.info(f"Inference headroom: offloading {model.name} to CPU " 

1912 f"for {requester}") 

1913 success = self._do_offload_to_cpu(model.name) 

1914 if success: 

1915 self._swap_queue.append({ 

1916 'name': model.name, 

1917 'device': 'gpu', 

1918 'evicted_for': requester, 

1919 'timestamp': time.time(), 

1920 }) 

1921 with self._lock: 

1922 model.device = ModelDevice.CPU 

1923 return True 

1924 else: 

1925 logger.warning(f"Inference headroom: offload of {model.name} failed") 

1926 

1927 logger.warning(f"Inference headroom: no model to evict for {requester}") 

1928 return False 

1929 except Exception as e: 

1930 logger.debug(f"Inference headroom check failed: {e}") 

1931 return False 

1932 
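# Hedged caller-side sketch: a TTS engine reserves inference headroom right
# before synthesis (the 1.2 GB figure and the requester name are illustrative).
mgr = get_model_lifecycle_manager()
if not mgr.ensure_inference_headroom(needed_gb=1.2, requester='tts'):
    logger.warning("TTS: could not free VRAM headroom; synthesis may be slower")
# Proceed either way: the model falls back to whatever VRAM is actually free.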

1933 def _process_swap_queue(self): 

1934 """Phase 12: Restore evicted models when the model that displaced them idles. 

1935 

1936 If model B evicted model A, and B is now IDLE/EVICTABLE, restore A if VRAM allows. 

1937 """ 

1938 if not self._swap_queue: 

1939 return 

1940 

1941 restored = [] 

1942 for entry in list(self._swap_queue): 

1943 evicted_for = entry.get('evicted_for') 

1944 evicted_name = entry.get('name') 

1945 

1946 with self._lock: 

1947 # Check if the model that caused the eviction has become idle 

1948 displacer = self._models.get(evicted_for) 

1949 if displacer and displacer.priority in ( 

1950 ModelPriority.IDLE, ModelPriority.EVICTABLE): 

1951 pass # Displacer is idle — consider restoring 

1952 elif displacer and displacer.device == ModelDevice.UNLOADED: 

1953 pass # Displacer already gone — restore 

1954 else: 

1955 continue # Displacer still active — skip 

1956 

1957 # Check if VRAM has room 

1958 try: 

1959 from .vram_manager import vram_manager, VRAM_BUDGETS 

1960 budget = VRAM_BUDGETS.get(evicted_name, (0, 0)) 

1961 if vram_manager.get_free_vram() < budget[1]: 

1962 continue # Still no room 

1963 except Exception: 

1964 continue 

1965 

1966 # Restore the evicted model 

1967 logger.info(f"Swap queue: restoring '{evicted_name}' " 

1968 f"(displaced by '{evicted_for}' which is now idle)") 

1969 success = self._restart_rtm_tool(evicted_name, entry.get('device', 'gpu')) 

1970 if success: 

1971 restored.append(entry) 

1972 

1973 for entry in restored: 

1974 try: 

1975 self._swap_queue.remove(entry) 

1976 except ValueError: 

1977 pass 

1978 
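# Sketch of the restore predicate applied to each swap-queue entry, assuming
# flattened inputs (the real loop re-reads state objects and vram_manager).
def should_restore(displacer_priority: str, displacer_device: str,
                   free_vram_gb: float, needed_gb: float) -> bool:
    displacer_idle = displacer_priority in ('idle', 'evictable')
    displacer_gone = displacer_device == 'unloaded'
    return (displacer_idle or displacer_gone) and free_vram_gb >= needed_gb

assert should_restore('idle', 'gpu', free_vram_gb=3.0, needed_gb=2.0)
assert not should_restore('active', 'gpu', free_vram_gb=3.0, needed_gb=2.0)
assert not should_restore('idle', 'gpu', free_vram_gb=1.0, needed_gb=2.0)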

1979 # ── Pressure Alerts ────────────────────────────────────────── 

1980 

1981 def _emit_pressure_alerts(self): 

1982 """Phase 13: Emit debounced pressure events to EventBus for frontend.""" 

1983 now = time.time() 

1984 alerts = [] 

1985 

1986 if self._detect_vram_pressure(): 

1987 alerts.append(('vram', 'GPU memory is under pressure — models may be evicted')) 

1988 if self._detect_ram_pressure(): 

1989 alerts.append(('ram', 'System memory is under pressure — performance may degrade')) 

1990 if self._detect_cpu_pressure(): 

1991 alerts.append(('cpu', 'CPU is under heavy load — inference may be slower')) 

1992 if self._detect_disk_pressure(): 

1993 alerts.append(('disk', 'Low disk space — downloads and caching disabled')) 

1994 

1995 for ptype, message in alerts: 

1996 last = self._last_pressure_alert.get(ptype, 0) 

1997 if now - last >= self._pressure_alert_cooldown: 

1998 self._last_pressure_alert[ptype] = now 

1999 self._emit_event('system.pressure', { 

2000 'type': ptype, 

2001 'message': message, 

2002 'timestamp': now, 

2003 'throttle_factor': self._calculate_throttle_factor(), 

2004 }) 

2005 
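# Debounce sketch: at most one alert per pressure type per cooldown window.
# The 60 s value is illustrative; the real window is self._pressure_alert_cooldown.
import time

_last_alert: dict = {}

def maybe_alert(ptype: str, cooldown_s: float = 60.0) -> bool:
    now = time.time()
    if now - _last_alert.get(ptype, 0.0) < cooldown_s:
        return False   # still inside the cooldown window; suppress
    _last_alert[ptype] = now
    return True        # caller emits the 'system.pressure' event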

2006 # ── Event emission helper ──────────────────────────────────── 

2007 

2008 def _get_model_capabilities(self, tool_name: str) -> dict: 

2009 """Read capabilities from model_catalog (single source of truth).""" 

2010 try: 

2011 from .model_catalog import get_catalog 

2012 catalog = get_catalog() 

2013 # Try direct lookup, then prefix search 

2014 entry = catalog.get(tool_name) 

2015 if not entry: 

2016 for eid in catalog.list_ids(): 

2017 if tool_name in eid: 

2018 entry = catalog.get(eid) 

2019 break 

2020 if entry: 

2021 return entry.capabilities 

2022 except Exception: 

2023 pass 

2024 return {} 

2025 

2026 def _emit_event(self, event_type: str, data: dict): 

2027 """Emit an event to the EventBus (non-blocking, safe to fail).""" 

2028 try: 

2029 from core.platform.events import emit_event 

2030 emit_event(event_type, data) 

2031 except Exception: 

2032 pass # EventBus not bootstrapped — silent 

2033 

2034 def _guess_model_type(self, tool_name: str) -> str: 

2035 """Map tool name to model_type for catalog sync.""" 

2036 from integrations.service_tools.model_catalog import ModelType 

2037 if tool_name == 'llm' or 'llama' in tool_name: 

2038 return ModelType.LLM 

2039 if 'tts' in tool_name or 'voice' in tool_name: 

2040 return ModelType.TTS 

2041 if 'whisper' in tool_name or 'stt' in tool_name: 

2042 return ModelType.STT 

2043 if 'minicpm' in tool_name or 'vlm' in tool_name or 'vision' in tool_name: 

2044 return ModelType.VLM 

2045 return tool_name 

2046 

2047 # ── Hive intelligence ───────────────────────────────────── 

2048 

2049 def _apply_hive_hints(self): 

2050 """Apply hive-learned placement hints to local priorities.""" 

2051 try: 

2052 from integrations.agent_engine.federated_aggregator import ( 

2053 get_federated_aggregator) 

2054 fed = get_federated_aggregator() 

2055 aggregated = fed.aggregate_lifecycle() 

2056 if not aggregated: 

2057 return 

2058 

2059 popularity = aggregated.get('popularity', {}) 

2060 with self._lock: 

2061 for name, score in popularity.items(): 

2062 state = self._models.get(name) 

2063 if state: 

2064 state.hive_popularity = score 

2065 state.hive_boost = score > 0.6 

2066 else: 

2067 self._hive_hints[name] = score 

2068 except Exception as e: 

2069 logger.debug(f"Hive hints error: {e}") 

2070 
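# Illustrative shape of the aggregated hive payload consumed above; everything
# beyond the 'popularity' key is an assumption about FederatedAggregator output.
aggregated = {
    'popularity': {
        'whisper': 0.82,   # > 0.6 -> hive_boost=True on the tracked state
        'tts-ta': 0.31,    # stored in _hive_hints until the model is tracked
    },
}
boosted = {n for n, score in aggregated['popularity'].items() if score > 0.6}
assert boosted == {'whisper'}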

2071 def _report_to_federation(self): 

2072 """Send local model usage stats to FederatedAggregator.""" 

2073 try: 

2074 from integrations.agent_engine.federated_aggregator import ( 

2075 get_federated_aggregator) 

2076 

2077 node_id = '' 

2078 try: 

2079 from security.node_integrity import get_node_identity 

2080 node_id = get_node_identity().get('node_id', '') 

2081 except Exception: 

2082 pass 

2083 

2084 models_stats = {} 

2085 with self._lock: 

2086 for name, state in self._models.items(): 

2087 if state.device != ModelDevice.UNLOADED: 

2088 rate = (state.access_count_session / 

2089 max(1, self._interval * 6)) 

2090 idle_s = (time.time() - state.last_access_time 

2091 if state.last_access_time else 0) 

2092 models_stats[name] = { 

2093 'device': state.device.value, 

2094 'access_rate': round(rate, 4), 

2095 'idle_s': round(idle_s, 1), 

2096 } 

2097 state.access_count_session = 0 

2098 

2099 if models_stats: 

2100 delta = { 

2101 'models': models_stats, 

2102 'node_id': node_id, 

2103 'timestamp': time.time(), 

2104 } 

2105 fed = get_federated_aggregator() 

2106 fed.receive_lifecycle_delta(node_id, delta) 

2107 except Exception as e: 

2108 logger.debug(f"Lifecycle federation report error: {e}") 

2109 

2110 # ── Tier awareness ──────────────────────────────────────── 

2111 

2112 def _detect_tier(self): 

2113 """Cache node capability tier.""" 

2114 try: 

2115 from security.system_requirements import get_tier 

2116 self._node_tier = get_tier() 

2117 except Exception: 

2118 self._node_tier = None 

2119 

2120 def _is_tier_appropriate(self, model_name: str) -> bool: 

2121 """Check if this model is appropriate for our capability tier.""" 

2122 if self._node_tier is None: 

2123 return True 

2124 try: 

2125 from security.system_requirements import NodeTierLevel, _TIER_RANK 

2126 min_tier_str = MODEL_MIN_TIER.get(model_name, 'standard') 

2127 min_tier = NodeTierLevel(min_tier_str) 

2128 return _TIER_RANK[self._node_tier] >= _TIER_RANK[min_tier] 

2129 except Exception: 

2130 return True 

2131 
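# Sketch of the tier gate with assumed ranks; the real ordering comes from
# security.system_requirements._TIER_RANK, so treat these values as illustrative.
_RANKS = {'embedded': 0, 'lite': 1, 'standard': 2, 'full': 3, 'compute_host': 4}

def tier_allows(node_tier: str, model_min_tier: str = 'standard') -> bool:
    return _RANKS[node_tier] >= _RANKS[model_min_tier]

assert tier_allows('full')       # full >= standard
assert not tier_allows('lite')   # lite < standard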

2132 # ── Helpers ─────────────────────────────────────────────── 

2133 

2134 def _sync_from_rtm(self): 

2135 """Initialize model states from RuntimeToolManager's current state.""" 

2136 try: 

2137 from .runtime_manager import runtime_tool_manager, TOOL_CONFIGS 

2138 for tool_name in TOOL_CONFIGS: 

2139 if runtime_tool_manager._is_server_alive(tool_name): 

2140 self._on_tool_started(tool_name, device='gpu') 

2141 except Exception: 

2142 pass 

2143 

2144 def manual_offload(self, model_name: str) -> dict: 

2145 """Manual GPU→CPU offload (admin API).""" 

2146 with self._lock: 

2147 state = self._models.get(model_name) 

2148 if not state: 

2149 return {'error': f'Model {model_name} not tracked'} 

2150 if state.device == ModelDevice.UNLOADED: 

2151 return {'error': f'Model {model_name} is not loaded'} 

2152 if state.device == ModelDevice.CPU: 

2153 return {'message': f'{model_name} already on CPU'} 

2154 

2155 success = self._do_offload_to_cpu(model_name) 

2156 return {'success': success, 'model': model_name, 

2157 'device': 'cpu' if success else state.device.value} 

2158 

2159 def set_priority(self, model_name: str, priority_str: str) -> dict: 

2160 """Manual priority override (admin API).""" 

2161 try: 

2162 priority = ModelPriority(priority_str) 

2163 except ValueError: 

2164 return {'error': f'Invalid priority: {priority_str}'} 

2165 

2166 with self._lock: 

2167 state = self._models.get(model_name) 

2168 if not state: 

2169 return {'error': f'Model {model_name} not tracked'} 

2170 state.priority = priority 

2171 return {'model': model_name, 'priority': priority_str} 

2172 

2173 def set_pressure_evict_only(self, model_name: str, value: bool) -> dict: 

2174 """Toggle the ``pressure_evict_only`` flag for a tracked model. 

2175 

2176 When True, the model is evicted ONLY on VRAM pressure (phase 3 

2177 of ``_tick``) and never by the idle-timeout sweep (phase 7). 

2178 Used by ``TTSEngine.set_language`` to pin the ACTIVE TTS 

2179 backend so a background model load can't idle-out the engine 

2180 the user is actively speaking to. 

2181 

2182 If the model isn't tracked yet (first call can precede the 

2183 RTM ``on_tool_started`` hook when a language is chosen before 

2184 the first synth), the call no-ops with ``tracked=False`` so 

2185 the caller can decide whether to retry on next tick. This is 

2186 additive: the flag lands when the RTM hook fires, via the 

2187 ``_persisted_hints`` path. 

2188 

2189 Returns a small dict for admin-API / journey-test inspection; 

2190 the caller typically discards it. 

2191 """ 

2192 with self._lock: 

2193 state = self._models.get(model_name) 

2194 if state is None: 

2195 # Stage the pin so it's applied the moment the tool 

2196 # starts — treating this as a pre-start hint matches 

2197 # the pattern we already use for persisted hints. 

2198 self._persisted_hints.setdefault(model_name, {}).update({ 

2199 'pressure_evict_only': bool(value), 

2200 }) 

2201 return { 

2202 'model': model_name, 

2203 'pressure_evict_only': bool(value), 

2204 'tracked': False, 

2205 'staged_as_hint': True, 

2206 } 

2207 state.pressure_evict_only = bool(value) 

2208 return { 

2209 'model': model_name, 

2210 'pressure_evict_only': bool(value), 

2211 'tracked': True, 

2212 } 

2213 
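# Hedged usage sketch (backend names are illustrative): pin the newly selected
# TTS backend so the idle sweep can't unload it mid-conversation, and release
# the one being switched away from.
mgr = get_model_lifecycle_manager()
mgr.set_pressure_evict_only('tts-indic-parler', True)   # pin the active backend
mgr.set_pressure_evict_only('tts-english', False)       # old backend may idle out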

2214 def get_system_pressure(self) -> dict: 

2215 """Return current pressure state for dispatch throttling. 

2216 

2217 Called by AgentDaemon to decide whether to reduce concurrency. 

2218 Detects each pressure source once and passes to throttle calculation. 

2219 """ 

2220 vram = self._detect_vram_pressure() 

2221 ram = self._detect_ram_pressure() 

2222 cpu = self._detect_cpu_pressure() 

2223 disk = self._detect_disk_pressure() 

2224 return { 

2225 'vram_pressure': vram, 

2226 'ram_pressure': ram, 

2227 'cpu_pressure': cpu, 

2228 'disk_pressure': disk, 

2229 'throttle_factor': self._calculate_throttle_factor( 

2230 cpu_on=cpu, ram_on=ram, vram_on=vram, disk_on=disk), 

2231 } 

2232 
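# Dispatcher-side sketch (base concurrency of 4 is an assumption): scale
# parallel work by the reported throttle factor, never dropping below 1 worker.
pressure = get_model_lifecycle_manager().get_system_pressure()
max_workers = max(1, int(4 * pressure['throttle_factor']))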

2233 def _calculate_throttle_factor(self, *, cpu_on: Optional[bool] = None, 

2234 ram_on: Optional[bool] = None, vram_on: Optional[bool] = None, 

2235 disk_on: Optional[bool] = None) -> float: 

2236 """Compute dispatch throttle: 0.0 = fully throttled, 1.0 = no throttling. 

2237 

2238 Accepts pre-computed pressure booleans to avoid re-detecting. 

2239 Falls back to live detection if not provided (standalone calls). 

2240 """ 

2241 factor = 1.0 

2242 

2243 # CPU pressure: re-detect the granular percentage for proportional throttling (cpu_on is not used here; the percentage gives finer control) 

2244 try: 

2245 import psutil 

2246 cpu_pct = psutil.cpu_percent(interval=None) 

2247 if cpu_pct >= 95: 

2248 factor *= 0.1 

2249 elif cpu_pct >= 90: 

2250 factor *= 0.3 

2251 elif cpu_pct >= self._cpu_pressure_pct: 

2252 factor *= 0.5 

2253 except ImportError: 

2254 pass 

2255 

2256 # RAM pressure 

2257 if (ram_on if ram_on is not None else self._detect_ram_pressure()): 

2258 factor *= 0.5 

2259 

2260 # VRAM pressure 

2261 if (vram_on if vram_on is not None else self._detect_vram_pressure()): 

2262 factor *= 0.7 

2263 

2264 # Disk pressure (critical — stop all heavy work) 

2265 if (disk_on if disk_on is not None else self._detect_disk_pressure()): 

2266 factor *= 0.2 

2267 

2268 return max(0.0, min(1.0, factor)) 

2269 
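# Worked example of the multiplicative throttle (CPU below its threshold):
#   RAM pressure only           -> 1.0 * 0.5             = 0.50
#   RAM + VRAM pressure         -> 1.0 * 0.5 * 0.7       = 0.35
#   RAM + VRAM + disk pressure  -> 1.0 * 0.5 * 0.7 * 0.2 = 0.07
# Disk pressure dominates because downloads and caching must stop first.
assert round(1.0 * 0.5 * 0.7 * 0.2, 2) == 0.07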

2270 def get_status(self) -> dict: 

2271 """Full lifecycle dashboard.""" 

2272 with self._lock: 

2273 models = {name: s.to_dict() for name, s in self._models.items()} 

2274 

2275 vram_status = {} 

2276 try: 

2277 from .vram_manager import vram_manager 

2278 vram_status = vram_manager.get_status() 

2279 except Exception: 

2280 pass 

2281 

2282 # Crash recovery state 

2283 restart_queue = {} 

2284 for name, info in self._restart_pending.items(): 

2285 if isinstance(info, dict): 

2286 restart_queue[name] = { 

2287 'retry_in_s': max(0, round(info.get('retry_after', 0) - time.time(), 1)), 

2288 'downgrade': info.get('downgrade', False), 

2289 } 

2290 

2291 swap_q = [dict(e) for e in self._swap_queue] 

2292 

2293 return { 

2294 'running': self._running, 

2295 'tick_count': self._tick_count, 

2296 'interval_s': self._interval, 

2297 'models': models, 

2298 'vram': vram_status, 

2299 'vram_pressure': self._detect_vram_pressure(), 

2300 'ram_pressure': self._detect_ram_pressure(), 

2301 'cpu_pressure': self._detect_cpu_pressure(), 

2302 'disk_pressure': self._detect_disk_pressure(), 

2303 'throttle_factor': self._calculate_throttle_factor(), 

2304 'hive_hints': dict(self._hive_hints), 

2305 'node_tier': (self._node_tier.value 

2306 if self._node_tier else 'unknown'), 

2307 'restart_pending': restart_queue, 

2308 'swap_queue': swap_q, 

2309 } 

2310 

2311 

2312# ═══════════════════════════════════════════════════════════════ 

2313# Singleton 

2314# ═══════════════════════════════════════════════════════════════ 

2315 

2316_lifecycle_manager = None 

2317_lifecycle_lock = threading.Lock() 

2318 

2319 

2320def get_model_lifecycle_manager() -> ModelLifecycleManager: 

2321 global _lifecycle_manager 

2322 if _lifecycle_manager is None: 

2323 with _lifecycle_lock: 

2324 if _lifecycle_manager is None: 

2325 _lifecycle_manager = ModelLifecycleManager() 

2326 return _lifecycle_manager 

2327 

2328 

2329# ═══════════════════════════════════════════════════════════════ 

2330# Draft eviction on user-language change 

2331# ═══════════════════════════════════════════════════════════════ 

2332# When the user switches TO a script the 0.8B draft can't handle 

2333# (NON_LATIN_SCRIPT_LANGS), the running draft's ~1.2GB VRAM is dead 

2334# weight — speculative_dispatcher's language guard will never route 

2335# to it. Evict so Indic Parler (~2GB) can fit alongside main 4B on 

2336# 8GB GPUs. Subscribes to core.user_lang.on_lang_change — runs in a 

2337# daemon thread (fired by the subscriber bus), /chat hot path not 

2338# stalled. 

2339 

2340def _evict_draft_on_non_latin_switch(old_lang, new_lang) -> None: 

2341 """on_lang_change subscriber — evicts 0.8B draft when the user 

2342 moves to a non-Latin script. No-op otherwise.""" 

2343 try: 

2344 from core.constants import NON_LATIN_SCRIPT_LANGS 

2345 except ImportError: 

2346 return 

2347 _new_key = (new_lang or '').split('-')[0].lower()[:2] 

2348 if not _new_key or _new_key not in NON_LATIN_SCRIPT_LANGS: 

2349 return 

2350 try: 

2351 _mgr = get_model_lifecycle_manager() 

2352 if _mgr and hasattr(_mgr, 'request_swap'): 

2353 _mgr.request_swap( 

2354 needed_model=f'language_switch_to_{_new_key}',  # must match request_swap(needed_model=..., evict_target=...); the string doubles as the log label 

2355 evict_target='draft', 

2356 ) 

2357 except Exception: 

2358 pass 

2359 

2360 

2361try: 

2362 from core.user_lang import on_lang_change as _on_lang_change 

2363 _on_lang_change(_evict_draft_on_non_latin_switch) 

2364except ImportError: 

2365 # user_lang unavailable standalone (e.g., isolated test env); 

2366 # callers may still invoke request_swap directly. 

2367 pass