Coverage for core / resource_governor.py: 57.4%
610 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Resource Governor — HARTOS never slows down the host OS.
4Three modes:
5 ACTIVE: User is busy -> HARTOS uses <5% CPU, no GPU, minimal RAM
6 IDLE: User is idle -> HARTOS uses up to 50% CPU, available GPU, moderate RAM
7 SLEEP: System on battery/low resources -> HARTOS suspends all non-essential work
9Enforcement layer (ResourceEnforcer):
10 Hard OS-level caps so Nunba CANNOT exceed its budget, regardless of code bugs:
11 - Windows: Job Object with CPU rate limit + memory limit on the process tree
12 - Linux: cgroup v2 cpu.max + memory.max, fallback to RLIMIT_AS
13 - macOS: RLIMIT_AS (soft cap) + process priority
14 - GPU: CUDA_MPS_ACTIVE_THREAD_PERCENTAGE (NVIDIA MPS) or VRAM budget via lifecycle
15 - Process priority: BELOW_NORMAL on Windows, nice +10 on POSIX
17 System buffer: always reserves 25% CPU, 2 GB RAM, 1 GB VRAM for the rest of the OS.
18 Caps tighten/relax on mode transitions (ACTIVE → tight, IDLE → relaxed, SLEEP → minimal).
20Proactive Action Stream:
21 When IDLE, periodically samples hive intelligence:
22 - Check if any hive tasks match this node's capabilities
23 - Pre-download popular models that users nearby are requesting
24 - Run background benchmarks to optimize inference settings
25 - Explore community signal feed for actionable insights
27All sampling is randomized (jitter) to avoid thundering herd across hive nodes.
29Integration:
30 - Uses VRAMManager for GPU detection (vram_manager.detect_gpu())
31 - Reads model_lifecycle get_system_pressure() for existing throttle awareness
32 - Emits EventBus events: 'resource.mode_changed', 'resource.proactive_action'
33 - AgentDaemon and HiveTaskDispatcher check get_throttle() / should_proceed()
34 - Lazy imports for all hive modules (zero startup cost)
36Singleton: get_governor() returns the module-level instance.
37Convenience: should_proceed(resource) for quick checks from any module.
38"""
40import ctypes
41import logging
42import os
43import random
44import struct
45import sys
46import threading
47import time
48from typing import Optional
50logger = logging.getLogger(__name__)
# ═══════════════════════════════════════════════════════════════════════
# Constants
# ═══════════════════════════════════════════════════════════════════════

# Mode thresholds
IDLE_THRESHOLD_SECONDS = 120       # 2 min of no input -> idle
ACTIVE_CPU_LIMIT = 0.05            # 5% CPU max when user is active
IDLE_CPU_LIMIT = 0.50              # 50% CPU max when idle
SLEEP_CPU_LIMIT = 0.0              # Nothing when sleeping

# ── System buffer — always reserved for the rest of the OS ──
SYSTEM_BUFFER_CPU_FRACTION = 0.25  # Reserve 25% CPU cores for OS
SYSTEM_BUFFER_RAM_GB = 2.0         # Reserve 2 GB RAM for OS
SYSTEM_BUFFER_VRAM_GB = 1.0        # Reserve 1 GB VRAM for other apps

# Proactive action intervals (seconds) — randomized +/-50% via _jitter()
SIGNAL_CHECK_INTERVAL = 60         # Check hive signals
TASK_CHECK_INTERVAL = 600          # Check pending tasks
MODEL_PREFETCH_INTERVAL = 1800     # Pre-download models
BENCHMARK_INTERVAL = 3600          # Run benchmarks

# Battery thresholds (fractions 0.0-1.0 of full charge)
BATTERY_SLEEP_THRESHOLD = 0.20     # Force sleep below 20%
BATTERY_THROTTLE_THRESHOLD = 0.40  # Reduce activity below 40%

# Monitor loop interval (seconds between governor samples)
_MONITOR_INTERVAL_SECONDS = 5

# Mode constants (string values appear in EventBus payloads and logs)
MODE_ACTIVE = 'active'
MODE_IDLE = 'idle'
MODE_SLEEP = 'sleep'
86# ═══════════════════════════════════════════════════════════════════════
87# ResourceEnforcer — hard OS-level process caps
88# ═══════════════════════════════════════════════════════════════════════
class ResourceEnforcer:
    """Enforce hard resource caps at the OS level.

    The governor ADVISES. The enforcer CONSTRAINS.
    Even buggy code cannot exceed the caps set here.

    Platform-specific mechanisms:
        Windows: Job Object (CPU rate limit + memory limit on process tree)
        Linux: cgroup v2 (cpu.max + memory.max), fallback RLIMIT_AS
        macOS: RLIMIT_AS (soft cap) + process priority

    Process priority: BELOW_NORMAL on Windows, nice +10 on POSIX.
    This alone prevents Nunba from competing with foreground apps.
    """

    def __init__(self):
        self._enforced = False       # True once enforce() has run
        self._job_handle = None      # Windows Job Object handle
        self._cgroup_path = None     # Linux cgroup path
        self._original_nice = None   # POSIX nice value before lowering

    def enforce(self, cpu_fraction: float = 0.75, ram_fraction: float = 0.75,
                gpu_fraction: float = 0.75):
        """Apply hard caps. Call once at startup (no-op if already enforced).

        Args:
            cpu_fraction: Max fraction of total CPU Nunba can use (0.75 = 75%)
            ram_fraction: Max fraction of total RAM Nunba can use
            gpu_fraction: Max fraction of GPU Nunba can use (advisory for CUDA)
        """
        if self._enforced:
            return

        # Always subtract system buffer from the requested fraction
        total_ram_gb = self._get_total_ram_gb()
        usable_ram_gb = max(0.5, total_ram_gb * ram_fraction - SYSTEM_BUFFER_RAM_GB)

        total_cores = os.cpu_count() or 4
        usable_cores = max(1, int(total_cores * (cpu_fraction - SYSTEM_BUFFER_CPU_FRACTION)))

        logger.info("ResourceEnforcer: total_ram=%.1fGB, cap=%.1fGB, "
                    "total_cores=%d, cap_cores=%d",
                    total_ram_gb, usable_ram_gb, total_cores, usable_cores)

        self._set_process_priority()
        self._enforce_cpu(cpu_fraction, usable_cores, total_cores)
        self._enforce_ram(usable_ram_gb)
        self._enforce_gpu(gpu_fraction)
        self._enforced = True
        logger.info("ResourceEnforcer: hard caps applied")

    def update_caps(self, mode: str):
        """Tighten or relax caps based on governor mode.

        ACTIVE: tight (25% CPU, 50% RAM, no GPU)
        IDLE: relaxed (75% CPU, 75% RAM, GPU allowed)
        SLEEP: minimal (5% CPU, 25% RAM, no GPU)

        No-op until enforce() (or the deferred-start path) has marked
        the enforcer active.
        """
        if not self._enforced:
            return

        caps = {
            MODE_ACTIVE: (0.25, 0.50, 0.0),
            MODE_IDLE: (0.75, 0.75, 0.75),
            MODE_SLEEP: (0.05, 0.25, 0.0),
        }
        # Unknown mode strings fall back to a conservative middle ground.
        cpu_frac, ram_frac, gpu_frac = caps.get(mode, (0.50, 0.50, 0.0))

        total_ram_gb = self._get_total_ram_gb()
        usable_ram_gb = max(0.5, total_ram_gb * ram_frac - SYSTEM_BUFFER_RAM_GB)
        total_cores = os.cpu_count() or 4

        self._enforce_cpu(cpu_frac, max(1, int(total_cores * cpu_frac)), total_cores)
        self._enforce_ram(usable_ram_gb)
        self._enforce_gpu(gpu_frac)

    # ── Process priority ──────────────────────────────────────────────

    def _set_process_priority(self):
        """Set process to below-normal priority so foreground apps always win."""
        try:
            if sys.platform == 'win32':
                # BELOW_NORMAL_PRIORITY_CLASS = 0x4000
                kernel32 = ctypes.windll.kernel32
                handle = kernel32.GetCurrentProcess()
                kernel32.SetPriorityClass(handle, 0x4000)
                logger.info("ResourceEnforcer: set BELOW_NORMAL priority (Windows)")
            else:
                self._original_nice = os.nice(0)
                os.nice(10)  # Lower priority (higher nice = lower priority)
                logger.info("ResourceEnforcer: set nice +10 (POSIX)")
        except Exception as e:
            logger.debug("ResourceEnforcer: priority set failed: %s", e)

    # ── CPU enforcement ───────────────────────────────────────────────

    def _enforce_cpu(self, cpu_fraction: float, usable_cores: int,
                     total_cores: int):
        """Enforce CPU cap via Job Object (Windows) or cgroup (Linux)."""
        if sys.platform == 'win32':
            self._enforce_cpu_windows(cpu_fraction)
        elif sys.platform == 'linux':
            self._enforce_cpu_linux(cpu_fraction, total_cores)
        # macOS: no hard CPU cap, rely on nice priority + psutil affinity

        # CPU affinity: restrict to subset of cores
        psutil = _try_import_psutil()
        if psutil is not None:
            try:
                p = psutil.Process()
                available = list(range(total_cores))
                # Use last N cores (leave first cores for OS/foreground)
                target = available[-usable_cores:] if usable_cores < total_cores else available
                p.cpu_affinity(target)
                logger.info("ResourceEnforcer: CPU affinity set to cores %s", target)
            except Exception as e:
                logger.debug("ResourceEnforcer: CPU affinity failed: %s", e)

    def _enforce_cpu_windows(self, cpu_fraction: float):
        """Windows: use Job Object CPU rate limit."""
        try:
            kernel32 = ctypes.windll.kernel32

            # CreateJobObjectW
            job = kernel32.CreateJobObjectW(None, None)
            if not job:
                logger.debug("ResourceEnforcer: CreateJobObject failed")
                return

            # JOBOBJECT_CPU_RATE_CONTROL_INFORMATION
            # Enable CPU rate control with hard cap
            class JOBOBJECT_CPU_RATE_CONTROL_INFORMATION(ctypes.Structure):
                _fields_ = [
                    ('ControlFlags', ctypes.c_ulong),
                    ('Value', ctypes.c_ulong),  # Union, using CpuRate
                ]

            # JOB_OBJECT_CPU_RATE_CONTROL_ENABLE = 0x1
            # JOB_OBJECT_CPU_RATE_CONTROL_HARD_CAP = 0x4
            rate_info = JOBOBJECT_CPU_RATE_CONTROL_INFORMATION()
            rate_info.ControlFlags = 0x1 | 0x4  # Enable + Hard cap
            # CpuRate is in units of 1/100th of a percent (100 = 1%)
            rate_info.Value = max(100, int(cpu_fraction * 10000))

            # SetInformationJobObject with JobObjectCpuRateControlInformation (15)
            kernel32.SetInformationJobObject(
                job, 15, ctypes.byref(rate_info), ctypes.sizeof(rate_info))

            # Assign current process to the Job Object
            kernel32.AssignProcessToJobObject(
                job, kernel32.GetCurrentProcess())

            self._job_handle = job
            logger.info("ResourceEnforcer: Windows Job Object CPU rate = %.0f%%",
                        cpu_fraction * 100)
        except Exception as e:
            logger.debug("ResourceEnforcer: Windows Job Object failed: %s", e)

    def _enforce_cpu_linux(self, cpu_fraction: float, total_cores: int):
        """Linux: use cgroup v2 cpu.max."""
        try:
            # Try cgroup v2
            cg_path = f'/sys/fs/cgroup/nunba_{os.getpid()}'
            # BUG FIX: cgroup.controllers is a FILE, not a directory — the
            # previous os.path.isdir() check was always False, so the cgroup
            # CPU/RAM caps were silently never applied on Linux.
            if os.path.isfile('/sys/fs/cgroup/cgroup.controllers'):
                os.makedirs(cg_path, exist_ok=True)
                period = 100000  # 100ms
                quota = int(period * cpu_fraction * total_cores)
                with open(os.path.join(cg_path, 'cpu.max'), 'w') as f:
                    f.write(f'{quota} {period}')
                # Move current process into the cgroup
                with open(os.path.join(cg_path, 'cgroup.procs'), 'w') as f:
                    f.write(str(os.getpid()))
                self._cgroup_path = cg_path
                logger.info("ResourceEnforcer: cgroup v2 cpu.max = %d/%d (%.0f%%)",
                            quota, period, cpu_fraction * 100)
        except PermissionError:
            logger.debug("ResourceEnforcer: cgroup v2 requires root, skipping")
        except Exception as e:
            logger.debug("ResourceEnforcer: cgroup cpu failed: %s", e)

    # ── RAM enforcement ───────────────────────────────────────────────

    def _enforce_ram(self, max_ram_gb: float):
        """Enforce RAM cap via Job Object (Windows), cgroup (Linux), or RLIMIT."""
        max_bytes = int(max_ram_gb * 1024 * 1024 * 1024)

        if sys.platform == 'win32':
            self._enforce_ram_windows(max_bytes)
        elif sys.platform == 'linux':
            self._enforce_ram_linux(max_bytes)
        else:
            self._enforce_ram_rlimit(max_bytes)

    def _enforce_ram_windows(self, max_bytes: int):
        """Windows: Job Object memory limit (requires the Job from _enforce_cpu_windows)."""
        if not self._job_handle:
            return
        try:
            kernel32 = ctypes.windll.kernel32

            # JOBOBJECT_EXTENDED_LIMIT_INFORMATION
            class IO_COUNTERS(ctypes.Structure):
                _fields_ = [('_' + str(i), ctypes.c_ulonglong) for i in range(6)]

            class JOBOBJECT_BASIC_LIMIT_INFORMATION(ctypes.Structure):
                _fields_ = [
                    ('PerProcessUserTimeLimit', ctypes.c_longlong),
                    ('PerJobUserTimeLimit', ctypes.c_longlong),
                    ('LimitFlags', ctypes.c_ulong),
                    ('MinimumWorkingSetSize', ctypes.c_size_t),
                    ('MaximumWorkingSetSize', ctypes.c_size_t),
                    ('ActiveProcessLimit', ctypes.c_ulong),
                    ('Affinity', ctypes.c_size_t),
                    ('PriorityClass', ctypes.c_ulong),
                    ('SchedulingClass', ctypes.c_ulong),
                ]

            class JOBOBJECT_EXTENDED_LIMIT_INFORMATION(ctypes.Structure):
                _fields_ = [
                    ('BasicLimitInformation', JOBOBJECT_BASIC_LIMIT_INFORMATION),
                    ('IoInfo', IO_COUNTERS),
                    ('ProcessMemoryLimit', ctypes.c_size_t),
                    ('JobMemoryLimit', ctypes.c_size_t),
                    ('PeakProcessMemoryUsed', ctypes.c_size_t),
                    ('PeakJobMemoryUsed', ctypes.c_size_t),
                ]

            info = JOBOBJECT_EXTENDED_LIMIT_INFORMATION()
            # JOB_OBJECT_LIMIT_JOB_MEMORY = 0x200
            info.BasicLimitInformation.LimitFlags = 0x200
            info.JobMemoryLimit = max_bytes

            # SetInformationJobObject with JobObjectExtendedLimitInformation (9)
            kernel32.SetInformationJobObject(
                self._job_handle, 9, ctypes.byref(info), ctypes.sizeof(info))

            logger.info("ResourceEnforcer: Windows Job memory limit = %.1f GB",
                        max_bytes / (1024**3))
        except Exception as e:
            logger.debug("ResourceEnforcer: Windows memory limit failed: %s", e)

    def _enforce_ram_linux(self, max_bytes: int):
        """Linux: cgroup v2 memory.max, falling back to RLIMIT_AS."""
        if self._cgroup_path:
            try:
                with open(os.path.join(self._cgroup_path, 'memory.max'), 'w') as f:
                    f.write(str(max_bytes))
                logger.info("ResourceEnforcer: cgroup memory.max = %.1f GB",
                            max_bytes / (1024**3))
                return
            except Exception as e:
                logger.debug("ResourceEnforcer: cgroup memory failed: %s", e)

        self._enforce_ram_rlimit(max_bytes)

    def _enforce_ram_rlimit(self, max_bytes: int):
        """POSIX: RLIMIT_AS (soft cap — process gets MemoryError on exceed)."""
        try:
            import resource
            soft, hard = resource.getrlimit(resource.RLIMIT_AS)
            resource.setrlimit(resource.RLIMIT_AS, (max_bytes, hard))
            logger.info("ResourceEnforcer: RLIMIT_AS = %.1f GB",
                        max_bytes / (1024**3))
        except Exception as e:
            logger.debug("ResourceEnforcer: RLIMIT_AS failed: %s", e)

    # ── GPU enforcement ───────────────────────────────────────────────

    def _enforce_gpu(self, gpu_fraction: float):
        """Enforce GPU cap via CUDA environment variables.

        CUDA_MPS_ACTIVE_THREAD_PERCENTAGE limits GPU SM utilization.
        CUDA_VISIBLE_DEVICES can restrict which GPUs are used.
        Also reserves SYSTEM_BUFFER_VRAM_GB via VRAMManager budget.
        """
        if gpu_fraction <= 0:
            # No GPU allowed in this mode
            os.environ['CUDA_VISIBLE_DEVICES'] = ''
            return

        pct = max(10, int(gpu_fraction * 100))
        os.environ['CUDA_MPS_ACTIVE_THREAD_PERCENTAGE'] = str(pct)

        # Restore GPU visibility if previously hidden
        if os.environ.get('CUDA_VISIBLE_DEVICES') == '':
            os.environ.pop('CUDA_VISIBLE_DEVICES', None)

        # Set VRAM budget via lifecycle (leaves buffer for other apps)
        try:
            from integrations.service_tools.vram_manager import vram_manager
            info = vram_manager.detect_gpu()
            total_vram = info.get('total_gb', 0)
            if total_vram > 0:
                budget = max(0.5, total_vram * gpu_fraction - SYSTEM_BUFFER_VRAM_GB)
                os.environ['HARTOS_VRAM_BUDGET_GB'] = str(round(budget, 1))
                logger.info("ResourceEnforcer: GPU %d%% SM, VRAM budget %.1fGB "
                            "(total %.1fGB, buffer %.1fGB)",
                            pct, budget, total_vram, SYSTEM_BUFFER_VRAM_GB)
        except Exception:
            pass

    # ── Helpers ────────────────────────────────────────────────────────

    @staticmethod
    def _get_total_ram_gb() -> float:
        """Return total physical RAM in GB (psutil, then WinAPI, then 8.0)."""
        psutil = _try_import_psutil()
        if psutil:
            try:
                return psutil.virtual_memory().total / (1024**3)
            except Exception:
                pass
        # Windows fallback
        if sys.platform == 'win32':
            try:
                class MEMORYSTATUSEX(ctypes.Structure):
                    _fields_ = [
                        ('dwLength', ctypes.c_ulong),
                        ('dwMemoryLoad', ctypes.c_ulong),
                        ('ullTotalPhys', ctypes.c_ulonglong),
                        ('ullAvailPhys', ctypes.c_ulonglong),
                        ('ullTotalPageFile', ctypes.c_ulonglong),
                        ('ullAvailPageFile', ctypes.c_ulonglong),
                        ('ullTotalVirtual', ctypes.c_ulonglong),
                        ('ullAvailVirtual', ctypes.c_ulonglong),
                        ('ullAvailExtendedVirtual', ctypes.c_ulonglong),
                    ]
                ms = MEMORYSTATUSEX()
                ms.dwLength = ctypes.sizeof(MEMORYSTATUSEX)
                ctypes.windll.kernel32.GlobalMemoryStatusEx(ctypes.byref(ms))
                return ms.ullTotalPhys / (1024**3)
            except Exception:
                pass
        return 8.0  # Safe default
# Module-level enforcer singleton — lazily created by get_enforcer().
_enforcer: Optional[ResourceEnforcer] = None

# Guards singleton construction (double-checked locking in get_enforcer()).
_enforcer_lock = threading.Lock()
def get_enforcer() -> ResourceEnforcer:
    """Get or create the singleton ResourceEnforcer."""
    global _enforcer
    # Fast path: already constructed, no lock needed.
    if _enforcer is not None:
        return _enforcer
    with _enforcer_lock:
        # Double-checked locking: re-test under the lock so exactly one
        # thread ever constructs the enforcer.
        if _enforcer is None:
            _enforcer = ResourceEnforcer()
    return _enforcer
442# ═══════════════════════════════════════════════════════════════════════
443# Helpers — platform-agnostic resource detection (psutil optional)
444# ═══════════════════════════════════════════════════════════════════════
446def _try_import_psutil():
447 """Return psutil module or None if unavailable."""
448 try:
449 import psutil
450 return psutil
451 except ImportError:
452 return None
455def _jitter(base_seconds: float, spread: float = 0.5) -> float:
456 """Return base_seconds +/- spread*base_seconds (uniform random).
458 Prevents thundering herd when many hive nodes use the same interval.
459 """
460 low = base_seconds * (1.0 - spread)
461 high = base_seconds * (1.0 + spread)
462 return random.uniform(low, high)
465# ═══════════════════════════════════════════════════════════════════════
466# ResourceGovernor
467# ═══════════════════════════════════════════════════════════════════════
469class ResourceGovernor:
470 """Central resource controller for HARTOS.
472 Monitors CPU, memory, battery, and user activity to transition between
473 ACTIVE / IDLE / SLEEP modes. Exposes a throttle factor that all HARTOS
474 subsystems should check before doing heavy work.
476 When the user is idle, runs a proactive action stream that samples hive
477 intelligence at randomized intervals.
478 """
480 def __init__(self, idle_threshold_seconds: float = IDLE_THRESHOLD_SECONDS):
481 # Mode state
482 self._mode: str = MODE_ACTIVE
483 self._cpu_limit: float = ACTIVE_CPU_LIMIT
484 self._gpu_allowed: bool = False
485 self._last_user_activity: float = time.monotonic()
486 self._idle_threshold_seconds: float = idle_threshold_seconds
488 # Threading
489 self._proactive_thread: Optional[threading.Thread] = None
490 self._monitor_thread: Optional[threading.Thread] = None
491 self._running: bool = False
492 self._lock = threading.Lock()
493 self._cancel_event = threading.Event() # instant wake/cancel for proactive
495 # Prime psutil CPU counter (first call always returns 0.0)
496 try:
497 import psutil
498 psutil.cpu_percent(interval=None)
499 except Exception:
500 pass
502 # Stats (exposed for dashboards)
503 self._stats: dict = {
504 'mode_changes': 0,
505 'proactive_actions': 0,
506 'signals_checked': 0,
507 'tasks_dispatched': 0,
508 'models_prefetched': 0,
509 'benchmarks_run': 0,
510 'last_mode_change': 0.0,
511 'uptime_start': 0.0,
512 }
514 # ── Lifecycle ─────────────────────────────────────────────────
    def start(self, defer_memory_limit: bool = False) -> None:
        """Start the governor background monitor and proactive stream.

        Idempotent: a second call while already running returns immediately.
        Also applies the hard OS-level caps via the singleton enforcer
        (best-effort — a failure here only logs a warning).

        Args:
            defer_memory_limit: If True, apply priority + CPU caps now but
                skip the Job Object / cgroup memory limit. The caller is
                responsible for calling apply_memory_limit() later (after
                the heavy import spike is over). This prevents Windows
                from terminating the process during the boot-time memory
                peak (autogen + flaml + llmlingua + transformers + 96
                expert agents all imported before webview.start).
        """
        # Flip the running flag under the lock; everything after runs unlocked
        # so thread startup cannot deadlock against stop()/get_stats().
        with self._lock:
            if self._running:
                return
            self._running = True
            self._cancel_event.clear()
            self._stats['uptime_start'] = time.time()

        # Apply hard OS-level resource caps at startup
        try:
            enforcer = get_enforcer()
            if defer_memory_limit:
                # Priority + CPU only — memory cap deferred to avoid
                # SIGKILL on boot-time spike (see app.py comment block).
                # Calls the enforcer's privates directly so enforce() is
                # bypassed, then marks it enforced so mode transitions
                # (update_caps) still work.
                enforcer._set_process_priority()
                enforcer._enforce_cpu(0.75, max(1, int((os.cpu_count() or 4) * 0.75)), os.cpu_count() or 4)
                enforcer._enforced = True  # mark so update_caps doesn't re-enforce
                logger.info("ResourceEnforcer: priority + CPU applied (memory deferred)")
            else:
                enforcer.enforce(cpu_fraction=0.75, ram_fraction=0.75, gpu_fraction=0.75)
        except Exception as e:
            logger.warning("ResourceEnforcer failed at startup: %s", e)

        # Both workers are daemon threads so they never block interpreter exit.
        self._monitor_thread = threading.Thread(
            target=self._monitor_loop,
            name='ResourceGovernor-Monitor',
            daemon=True,
        )
        self._monitor_thread.start()

        self._proactive_thread = threading.Thread(
            target=self._proactive_action_stream,
            name='ResourceGovernor-Proactive',
            daemon=True,
        )
        self._proactive_thread.start()

        logger.info("ResourceGovernor started (idle threshold=%.0fs)",
                    self._idle_threshold_seconds)
567 def apply_memory_limit(self) -> None:
568 """Apply the deferred RAM limit — call AFTER boot-time spike is over.
570 Pairs with start(defer_memory_limit=True). Safe to call multiple
571 times (idempotent via enforcer._enforce_ram's Job Object update).
572 """
573 try:
574 enforcer = get_enforcer()
575 total_ram_gb = enforcer._get_total_ram_gb()
576 usable_ram_gb = max(0.5, total_ram_gb * 0.75 - SYSTEM_BUFFER_RAM_GB)
577 enforcer._enforce_ram(usable_ram_gb)
578 logger.info("ResourceEnforcer: memory cap now active (%.1fGB)", usable_ram_gb)
579 except Exception as e:
580 logger.warning("ResourceEnforcer: deferred memory cap failed: %s", e)
582 def stop(self) -> None:
583 """Stop the governor, release all throttles."""
584 with self._lock:
585 if not self._running:
586 return
587 self._running = False
589 self._cancel_event.set()
591 # Wait for threads to exit (bounded timeout)
592 for t in (self._monitor_thread, self._proactive_thread):
593 if t is not None and t.is_alive():
594 t.join(timeout=3.0)
596 self._monitor_thread = None
597 self._proactive_thread = None
598 logger.info("ResourceGovernor stopped")
600 # ── Public API ────────────────────────────────────────────────
602 def get_mode(self) -> str:
603 """Current mode: 'active', 'idle', or 'sleep'."""
604 return self._mode
606 def get_throttle(self) -> float:
607 """Current throttle factor 0.0 (full stop) to 1.0 (unlimited).
609 Other HARTOS subsystems should multiply their resource usage by
610 this value before proceeding with heavy work.
611 """
612 return self._calculate_throttle()
614 def should_allow(self, resource: str) -> bool:
615 """Quick check: should this resource usage be allowed right now?
617 Args:
618 resource: One of 'cpu_heavy', 'gpu', 'network_heavy', 'disk_heavy'
620 Returns:
621 True if the governor permits the resource usage.
622 """
623 mode = self._mode
624 if mode == MODE_SLEEP:
625 return False
627 if mode == MODE_ACTIVE:
628 # Active mode: only permit lightweight work
629 if resource in ('gpu', 'cpu_heavy', 'disk_heavy'):
630 return False
631 if resource == 'network_heavy':
632 return False
633 return True
635 # IDLE mode: allow most things
636 if resource == 'gpu':
637 return self._gpu_allowed
638 return True
640 def report_user_activity(self) -> None:
641 """Signal that the user is active. Immediately switches to ACTIVE mode.
643 Called by UI/input handlers, Flask endpoints, or any user-facing code.
644 """
645 self._last_user_activity = time.monotonic()
646 if self._mode != MODE_ACTIVE:
647 self._transition_to(MODE_ACTIVE)
649 def get_stats(self) -> dict:
650 """Return a copy of governor statistics for dashboards."""
651 with self._lock:
652 stats = dict(self._stats)
653 stats['mode'] = self._mode
654 stats['throttle'] = self._calculate_throttle()
655 stats['cpu_limit'] = self._cpu_limit
656 stats['gpu_allowed'] = self._gpu_allowed
657 return stats
659 # ── Mode Transitions ──────────────────────────────────────────
    def _transition_to(self, new_mode: str) -> None:
        """Switch modes, update limits, emit event.

        No-op when new_mode equals the current mode. Otherwise:
        updates _cpu_limit/_gpu_allowed, gates the proactive stream via
        _cancel_event (set for ACTIVE/SLEEP, cleared for IDLE), bumps
        stats, pushes the new mode to the OS-level enforcer, and emits a
        best-effort 'resource.mode_changed' EventBus event.
        """
        old_mode = self._mode

        if new_mode == old_mode:
            return

        # NOTE(review): _mode is written outside self._lock — assumes the
        # GIL-atomic attribute store is sufficient for readers; confirm no
        # reader requires _mode/_cpu_limit to change atomically together.
        self._mode = new_mode

        if new_mode == MODE_ACTIVE:
            self._cpu_limit = ACTIVE_CPU_LIMIT
            self._gpu_allowed = False
            # Wake the cancel event so proactive stream backs off instantly
            self._cancel_event.set()
        elif new_mode == MODE_IDLE:
            self._cpu_limit = IDLE_CPU_LIMIT
            self._gpu_allowed = True
            # Clear cancel so proactive stream can run
            self._cancel_event.clear()
        elif new_mode == MODE_SLEEP:
            self._cpu_limit = SLEEP_CPU_LIMIT
            self._gpu_allowed = False
            self._cancel_event.set()

        # Stats are the only state mutated under the lock here.
        with self._lock:
            self._stats['mode_changes'] += 1
            self._stats['last_mode_change'] = time.time()

        logger.info("ResourceGovernor: %s -> %s (cpu_limit=%.2f, gpu=%s)",
                    old_mode, new_mode, self._cpu_limit, self._gpu_allowed)

        # Update hard OS-level caps to match new mode
        try:
            get_enforcer().update_caps(new_mode)
        except Exception:
            pass

        # Emit EventBus event (best-effort, lazy import)
        try:
            from core.platform.events import emit_event
            emit_event('resource.mode_changed', {
                'old_mode': old_mode,
                'new_mode': new_mode,
                'cpu_limit': self._cpu_limit,
                'gpu_allowed': self._gpu_allowed,
                'timestamp': time.time(),
            })
        except Exception:
            pass
711 # ── Monitor Loop ──────────────────────────────────────────────
713 def _monitor_loop(self) -> None:
714 """Background thread: monitor system state every 5 seconds.
716 Three cheap checks per tick: CPU, user idle, battery.
717 Transitions modes based on combined signals.
718 """
719 while self._running:
720 # Heartbeat to watchdog — prevents false-positive FROZEN alerts
721 try:
722 from security.node_watchdog import get_watchdog
723 _wd = get_watchdog()
724 if _wd:
725 _wd.heartbeat('resource_governor_monitor')
726 except Exception:
727 pass
728 try:
729 cpu = self._get_cpu_usage()
730 mem = self._get_memory_pressure()
731 user_idle = self._detect_user_idle()
732 battery_level, on_battery = self._get_battery_status()
734 # Decision tree
735 if on_battery and battery_level < BATTERY_SLEEP_THRESHOLD:
736 # Critical battery: force sleep
737 self._transition_to(MODE_SLEEP)
738 elif not user_idle:
739 # User is active
740 self._transition_to(MODE_ACTIVE)
741 elif cpu > 0.85 or mem > 0.90:
742 # System is heavily loaded even though user is idle
743 # (e.g., background renders, compiles) — stay conservative
744 self._transition_to(MODE_ACTIVE)
745 elif on_battery and battery_level < BATTERY_THROTTLE_THRESHOLD:
746 # Low battery: allow idle work but keep it light
747 self._transition_to(MODE_IDLE)
748 self._cpu_limit = IDLE_CPU_LIMIT * 0.5 # half the idle budget
749 else:
750 # User idle, system not overloaded, power is fine
751 self._transition_to(MODE_IDLE)
753 # Update GPU allowance based on VRAM availability
754 if self._mode == MODE_IDLE:
755 self._gpu_allowed = self._check_gpu_available()
757 except Exception as e:
758 logger.debug("ResourceGovernor monitor error: %s", e)
760 # Sleep for the monitor interval, but wake early if stopping
761 self._cancel_event.wait(timeout=_MONITOR_INTERVAL_SECONDS)
762 if not self._running:
763 break
764 # If cancel_event was set by mode transition, clear it for proactive
765 # (only if we're not in a mode that should keep it set)
766 if self._mode == MODE_IDLE:
767 self._cancel_event.clear()
769 # ── Platform-Specific Detection ───────────────────────────────
771 def _detect_user_idle(self) -> bool:
772 """Detect whether the user is idle (no input for threshold seconds).
774 Platform-specific:
775 Windows: GetLastInputInfo via ctypes
776 Linux: /proc/interrupts delta or fallback
777 macOS: ioreg idle time
778 Fallback: report_user_activity() timestamp
779 """
780 idle_ms = self._get_os_idle_ms()
781 if idle_ms is not None:
782 return idle_ms >= (self._idle_threshold_seconds * 1000)
784 # Fallback: use last reported user activity
785 elapsed = time.monotonic() - self._last_user_activity
786 return elapsed >= self._idle_threshold_seconds
788 def _get_os_idle_ms(self) -> Optional[float]:
789 """Get OS-level user idle time in milliseconds, or None if unavailable."""
790 if sys.platform == 'win32':
791 return self._get_idle_ms_windows()
792 elif sys.platform == 'linux':
793 return self._get_idle_ms_linux()
794 elif sys.platform == 'darwin':
795 return self._get_idle_ms_macos()
796 return None
798 def _get_idle_ms_windows(self) -> Optional[float]:
799 """Windows: GetLastInputInfo returns tick count of last input event."""
800 try:
801 # LASTINPUTINFO struct: cbSize (UINT), dwTime (DWORD)
802 class LASTINPUTINFO(ctypes.Structure):
803 _fields_ = [
804 ('cbSize', ctypes.c_uint),
805 ('dwTime', ctypes.c_uint),
806 ]
808 lii = LASTINPUTINFO()
809 lii.cbSize = ctypes.sizeof(LASTINPUTINFO)
810 if ctypes.windll.user32.GetLastInputInfo(ctypes.byref(lii)):
811 current_tick = ctypes.windll.kernel32.GetTickCount()
812 idle_ms = current_tick - lii.dwTime
813 # Handle tick count wraparound (every ~49.7 days)
814 if idle_ms < 0:
815 idle_ms += 0xFFFFFFFF + 1
816 return float(idle_ms)
817 except Exception:
818 pass
819 return None
821 def _get_idle_ms_linux(self) -> Optional[float]:
822 """Linux: try xprintidle, then /proc/interrupts delta estimation."""
823 # Try xprintidle first (X11 desktops)
824 try:
825 import subprocess
826 result = subprocess.run(
827 ['xprintidle'], capture_output=True, text=True, timeout=2,
828 )
829 if result.returncode == 0:
830 return float(result.stdout.strip())
831 except Exception:
832 pass
833 return None
835 def _get_idle_ms_macos(self) -> Optional[float]:
836 """macOS: ioreg HIDIdleTime (nanoseconds -> milliseconds)."""
837 try:
838 import subprocess
839 result = subprocess.run(
840 ['ioreg', '-c', 'IOHIDSystem'],
841 capture_output=True, text=True, timeout=2,
842 )
843 if result.returncode == 0:
844 for line in result.stdout.splitlines():
845 if 'HIDIdleTime' in line:
846 # Line format: "HIDIdleTime" = 1234567890
847 parts = line.split('=')
848 if len(parts) >= 2:
849 ns = int(parts[-1].strip())
850 return ns / 1_000_000.0 # ns -> ms
851 except Exception:
852 pass
853 return None
855 # ── CPU / Memory / Battery / GPU ──────────────────────────────
857 def _get_optimizer_snapshot(self):
858 """Read the latest cached SystemSnapshot from ComputeOptimizer.
860 Returns the snap or None when the optimizer module/singleton
861 isn't available (e.g. degraded boot, or a deployment that
862 explicitly disables it). ``include_per_process=False`` is
863 critical here — Governor only needs cpu/ram aggregates, and
864 forcing the optimizer's process_iter walk on every Governor
865 tick under sustained pressure would actually INCREASE GIL
866 contention (the opposite of why this consolidation exists).
868 Single source of truth for system pressure data — eliminates
869 the 2026-05-01 parallel-psutil-poll between Governor and
870 Optimizer that py-spy caught with BOTH monitors active+gil
871 simultaneously.
872 """
873 try:
874 from core.compute_optimizer import get_optimizer
875 return get_optimizer().get_latest_snapshot(
876 max_age_s=10.0, include_per_process=False,
877 )
878 except Exception:
879 return None
881 def _get_cpu_usage(self) -> float:
882 """Get current CPU usage as a float 0.0 to 1.0.
884 Read order:
885 1. ComputeOptimizer's cached snapshot (canonical psutil reader).
886 2. Direct psutil fallback when optimizer unavailable.
887 3. Linux ``os.getloadavg()`` when psutil itself is missing.
888 4. Constant 0.3 as a last-resort moderate-usage assumption.
890 The first path is the steady-state hot path post-2026-05-01
891 unification — Governor used to call psutil.cpu_percent itself
892 every 5s, duplicating ComputeOptimizer's 15s poll. The cached
893 snap gives us the same data with one writer.
894 """
895 snap = self._get_optimizer_snapshot()
896 if snap is not None:
897 return snap.cpu_percent / 100.0
899 psutil = _try_import_psutil()
900 if psutil is not None:
901 try:
902 return psutil.cpu_percent(interval=None) / 100.0
903 except Exception:
904 pass
906 # Linux fallback: os.getloadavg()
907 if hasattr(os, 'getloadavg'):
908 try:
909 load_1min = os.getloadavg()[0]
910 cpu_count = os.cpu_count() or 1
911 return min(1.0, load_1min / cpu_count)
912 except Exception:
913 pass
915 # Windows fallback without psutil: assume moderate usage
916 return 0.3
918 def _get_memory_pressure(self) -> float:
919 """Get memory pressure as a float 0.0 to 1.0.
921 Read order (mirrors ``_get_cpu_usage`` for symmetry):
922 1. ComputeOptimizer's cached snapshot (canonical psutil reader).
923 2. Direct psutil fallback when optimizer unavailable.
924 3. Linux ``/proc/meminfo`` when psutil itself is missing.
925 4. Constant 0.4 as a last-resort moderate-usage assumption.
926 """
927 snap = self._get_optimizer_snapshot()
928 if snap is not None:
929 return snap.ram_percent / 100.0
931 psutil = _try_import_psutil()
932 if psutil is not None:
933 try:
934 return psutil.virtual_memory().percent / 100.0
935 except Exception:
936 pass
938 # Linux fallback: /proc/meminfo
939 if sys.platform == 'linux':
940 try:
941 with open('/proc/meminfo', 'r') as f:
942 mem_info = {}
943 for line in f:
944 parts = line.split()
945 if len(parts) >= 2:
946 key = parts[0].rstrip(':')
947 mem_info[key] = int(parts[1])
948 total = mem_info.get('MemTotal', 1)
949 available = mem_info.get('MemAvailable',
950 mem_info.get('MemFree', total))
951 return 1.0 - (available / total)
952 except Exception:
953 pass
955 # Fallback: assume moderate
956 return 0.4
958 def _get_battery_status(self) -> tuple:
959 """Return (battery_level: float 0-1, on_battery: bool).
961 Returns (1.0, False) if no battery detected (desktop).
962 """
963 psutil = _try_import_psutil()
964 if psutil is not None:
965 try:
966 battery = psutil.sensors_battery()
967 if battery is not None:
968 return (battery.percent / 100.0, not battery.power_plugged)
969 except Exception:
970 pass
972 # Windows fallback: GetSystemPowerStatus via ctypes
973 if sys.platform == 'win32':
974 try:
975 class SYSTEM_POWER_STATUS(ctypes.Structure):
976 _fields_ = [
977 ('ACLineStatus', ctypes.c_byte),
978 ('BatteryFlag', ctypes.c_byte),
979 ('BatteryLifePercent', ctypes.c_byte),
980 ('SystemStatusFlag', ctypes.c_byte),
981 ('BatteryLifeTime', ctypes.c_ulong),
982 ('BatteryFullLifeTime', ctypes.c_ulong),
983 ]
985 status = SYSTEM_POWER_STATUS()
986 if ctypes.windll.kernel32.GetSystemPowerStatus(
987 ctypes.byref(status)):
988 pct = status.BatteryLifePercent
989 if pct == 255:
990 # Unknown / no battery
991 return (1.0, False)
992 on_battery = (status.ACLineStatus == 0)
993 return (pct / 100.0, on_battery)
994 except Exception:
995 pass
997 # Linux fallback: /sys/class/power_supply
998 if sys.platform == 'linux':
999 try:
1000 base = '/sys/class/power_supply'
1001 for entry in os.listdir(base):
1002 supply_path = os.path.join(base, entry)
1003 type_path = os.path.join(supply_path, 'type')
1004 if os.path.exists(type_path):
1005 with open(type_path) as f:
1006 if f.read().strip() != 'Battery':
1007 continue
1008 cap_path = os.path.join(supply_path, 'capacity')
1009 status_path = os.path.join(supply_path, 'status')
1010 if os.path.exists(cap_path):
1011 with open(cap_path) as f:
1012 pct = int(f.read().strip())
1013 on_battery = True
1014 if os.path.exists(status_path):
1015 with open(status_path) as f:
1016 on_battery = f.read().strip() == 'Discharging'
1017 return (pct / 100.0, on_battery)
1018 except Exception:
1019 pass
1021 # No battery detected (desktop)
1022 return (1.0, False)
1024 def _check_gpu_available(self) -> bool:
1025 """Check if GPU has usable free VRAM for hive work."""
1026 try:
1027 from integrations.service_tools.vram_manager import vram_manager
1028 info = vram_manager.detect_gpu()
1029 if not info.get('cuda_available'):
1030 return False
1031 free_gb = info.get('free_gb', 0.0)
1032 # Need at least 1 GB free to be useful for hive tasks
1033 return free_gb >= 1.0
1034 except Exception:
1035 return False
1037 # ── Throttle Calculation ──────────────────────────────────────
1039 def _calculate_throttle(self) -> float:
1040 """Combine all signals into a single throttle factor 0.0 - 1.0.
1042 ACTIVE mode: 0.05 — bare minimum for event processing
1043 IDLE + low CPU: 1.0 — full speed
1044 IDLE + moderate CPU: 0.5
1045 SLEEP: 0.0 — suspend everything
1046 Battery < 20%: force 0.0
1047 """
1048 mode = self._mode
1050 if mode == MODE_SLEEP:
1051 return 0.0
1053 if mode == MODE_ACTIVE:
1054 return ACTIVE_CPU_LIMIT # 0.05
1056 # IDLE mode — scale based on current resource usage
1057 cpu = self._get_cpu_usage()
1058 mem = self._get_memory_pressure()
1060 throttle = 1.0
1062 if cpu > 0.80:
1063 throttle *= 0.2
1064 elif cpu > 0.60:
1065 throttle *= 0.5
1066 elif cpu > 0.40:
1067 throttle *= 0.8
1069 if mem > 0.90:
1070 throttle *= 0.3
1071 elif mem > 0.80:
1072 throttle *= 0.6
1074 # Battery factor
1075 battery_level, on_battery = self._get_battery_status()
1076 if on_battery:
1077 if battery_level < BATTERY_SLEEP_THRESHOLD:
1078 return 0.0
1079 elif battery_level < BATTERY_THROTTLE_THRESHOLD:
1080 throttle *= 0.5
1082 return max(0.0, min(1.0, throttle))
1084 # ── Proactive Action Stream ───────────────────────────────────
1086 def _proactive_action_stream(self) -> None:
1087 """Run during IDLE mode. Randomized sampling of hive intelligence.
1089 Timers:
1090 - 30-120s: check hive signal feed for actionable items
1091 - 5-15min: check if pending hive tasks match this node
1092 - 30-60min: pre-download trending models if VRAM available
1093 - ~1h: run quick benchmark on active model
1095 All timers use random jitter to prevent thundering herd.
1096 Immediately stops when mode switches to ACTIVE (via _cancel_event).
1097 """
1098 # Initialize next-action timestamps with jitter from now
1099 now = time.monotonic()
1100 next_signal_check = now + _jitter(SIGNAL_CHECK_INTERVAL)
1101 next_task_check = now + _jitter(TASK_CHECK_INTERVAL)
1102 next_model_prefetch = now + _jitter(MODEL_PREFETCH_INTERVAL)
1103 next_benchmark = now + _jitter(BENCHMARK_INTERVAL)
1105 while self._running:
1106 # Heartbeat to watchdog each loop iteration
1107 try:
1108 from security.node_watchdog import get_watchdog
1109 _wd = get_watchdog()
1110 if _wd:
1111 _wd.heartbeat('resource_governor_proactive')
1112 except Exception:
1113 pass
1114 # Sleep in short increments, checking cancel event
1115 # Wait returns True if the event is set (cancel requested)
1116 cancelled = self._cancel_event.wait(timeout=5.0)
1118 if not self._running:
1119 break
1121 # Only do proactive work in IDLE mode
1122 if self._mode != MODE_IDLE:
1123 # Reset timers when re-entering idle (with fresh jitter)
1124 now = time.monotonic()
1125 next_signal_check = now + _jitter(SIGNAL_CHECK_INTERVAL)
1126 next_task_check = now + _jitter(TASK_CHECK_INTERVAL)
1127 next_model_prefetch = now + _jitter(MODEL_PREFETCH_INTERVAL)
1128 next_benchmark = now + _jitter(BENCHMARK_INTERVAL)
1129 continue
1131 now = time.monotonic()
1133 # Check hive signal feed
1134 if now >= next_signal_check:
1135 self._proactive_check_signals()
1136 next_signal_check = now + _jitter(SIGNAL_CHECK_INTERVAL)
1138 # Check pending hive tasks
1139 if now >= next_task_check:
1140 self._proactive_check_tasks()
1141 next_task_check = now + _jitter(TASK_CHECK_INTERVAL)
1143 # Pre-download trending models
1144 if now >= next_model_prefetch:
1145 self._proactive_prefetch_models()
1146 next_model_prefetch = now + _jitter(MODEL_PREFETCH_INTERVAL)
1148 # Run benchmark
1149 if now >= next_benchmark:
1150 self._proactive_run_benchmark()
1151 next_benchmark = now + _jitter(BENCHMARK_INTERVAL)
1153 def _proactive_check_signals(self) -> None:
1154 """Check hive signals AND run agentic service discovery."""
1155 if self._mode != MODE_IDLE:
1156 return
1158 # Hive signal feed
1159 try:
1160 from integrations.channels.hive_signal_bridge import get_signal_bridge
1161 bridge = get_signal_bridge()
1162 signals = bridge.get_signal_feed(limit=10)
1163 with self._lock:
1164 self._stats['signals_checked'] += 1
1165 self._stats['proactive_actions'] += 1
1166 if signals:
1167 logger.debug("ResourceGovernor: checked %d hive signals",
1168 len(signals))
1169 self._emit_proactive_event('signal_check', {
1170 'signal_count': len(signals),
1171 })
1172 except Exception as e:
1173 logger.debug("ResourceGovernor: signal check failed: %s", e)
1175 # Agentic service discovery — autonomously find new providers/models
1176 try:
1177 from integrations.providers.discovery_agent import get_discovery_agent
1178 discoveries = get_discovery_agent().run_discovery_cycle()
1179 if discoveries:
1180 with self._lock:
1181 self._stats['proactive_actions'] += 1
1182 self._emit_proactive_event('service_discovery', {
1183 'discoveries': len(discoveries),
1184 })
1185 except Exception as e:
1186 logger.debug("ResourceGovernor: discovery agent failed: %s", e)
1188 def _proactive_check_tasks(self) -> None:
1189 """Check if any pending hive tasks match this node's capabilities."""
1190 if self._mode != MODE_IDLE:
1191 return
1192 try:
1193 from integrations.coding_agent.hive_task_protocol import (
1194 get_dispatcher,
1195 )
1196 dispatcher = get_dispatcher()
1197 dispatched = dispatcher.dispatch_pending()
1198 with self._lock:
1199 self._stats['tasks_dispatched'] += dispatched
1200 self._stats['proactive_actions'] += 1
1201 if dispatched:
1202 logger.info("ResourceGovernor: dispatched %d hive tasks "
1203 "during idle", dispatched)
1204 self._emit_proactive_event('task_dispatch', {
1205 'dispatched': dispatched,
1206 })
1207 except Exception as e:
1208 logger.debug("ResourceGovernor: task check failed: %s", e)
1210 def _proactive_prefetch_models(self) -> None:
1211 """Pre-download trending models if GPU VRAM is available."""
1212 if self._mode != MODE_IDLE or not self._gpu_allowed:
1213 return
1214 try:
1215 # Check with model lifecycle for prefetch suggestions
1216 from integrations.service_tools.model_lifecycle import (
1217 get_model_lifecycle_manager,
1218 )
1219 mgr = get_model_lifecycle_manager()
1220 pressure = mgr.get_system_pressure()
1221 if pressure.get('throttle_factor', 0) < 0.3:
1222 # System too busy for prefetch
1223 return
1224 with self._lock:
1225 self._stats['models_prefetched'] += 1
1226 self._stats['proactive_actions'] += 1
1227 logger.debug("ResourceGovernor: model prefetch check complete")
1228 self._emit_proactive_event('model_prefetch', {
1229 'throttle_factor': pressure.get('throttle_factor', 0),
1230 })
1231 except Exception as e:
1232 logger.debug("ResourceGovernor: model prefetch failed: %s", e)
1234 def _proactive_run_benchmark(self) -> None:
1235 """Run efficiency benchmarks on cloud providers during idle time.
1237 Uses the EfficiencyMatrix to benchmark all configured API providers,
1238 building a continuous picture of speed/quality/cost for optimal routing.
1239 Also checks local model performance via the model_lifecycle manager.
1240 """
1241 if self._mode != MODE_IDLE:
1242 return
1244 # Check system pressure — don't benchmark if system is busy
1245 try:
1246 from integrations.service_tools.model_lifecycle import (
1247 get_model_lifecycle_manager,
1248 )
1249 mgr = get_model_lifecycle_manager()
1250 pressure = mgr.get_system_pressure()
1251 if pressure.get('throttle_factor', 0) < 0.5:
1252 return
1253 except Exception:
1254 pass
1256 # Run provider efficiency benchmarks
1257 try:
1258 from integrations.providers.efficiency_matrix import get_matrix
1259 matrix = get_matrix()
1260 matrix.run_benchmark(model_type='llm')
1261 with self._lock:
1262 self._stats['benchmarks_run'] += 1
1263 self._stats['proactive_actions'] += 1
1264 logger.info("ResourceGovernor: provider benchmarks complete")
1265 self._emit_proactive_event('benchmark', {
1266 'summary': matrix.get_matrix_summary(),
1267 })
1268 except Exception as e:
1269 logger.debug("ResourceGovernor: benchmark failed: %s", e)
1271 def _emit_proactive_event(self, action: str, data: dict) -> None:
1272 """Emit a 'resource.proactive_action' event (best-effort)."""
1273 try:
1274 from core.platform.events import emit_event
1275 emit_event('resource.proactive_action', {
1276 'action': action,
1277 'mode': self._mode,
1278 'timestamp': time.time(),
1279 **data,
1280 })
1281 except Exception:
1282 pass
# ═══════════════════════════════════════════════════════════════════════
# Singleton
# ═══════════════════════════════════════════════════════════════════════

# Module-level singleton; created lazily on first get_governor() call.
_governor: Optional[ResourceGovernor] = None
# Guards first-time construction in get_governor() (double-checked locking).
_governor_lock = threading.Lock()
def get_governor() -> ResourceGovernor:
    """Return the singleton ResourceGovernor, creating it on first use."""
    global _governor
    # Fast path: already constructed, no lock needed.
    if _governor is not None:
        return _governor
    with _governor_lock:
        # Double-checked: another thread may have won the race.
        if _governor is None:
            _governor = ResourceGovernor()
    return _governor
def should_proceed(resource: str = 'cpu_heavy') -> bool:
    """Module-level convenience: should HARTOS proceed with this resource usage?

    Returns True if the governor allows the requested resource, or if the
    governor has not been started (no throttling by default).

    Args:
        resource: One of 'cpu_heavy', 'gpu', 'network_heavy', 'disk_heavy'

    Usage:
        from core.resource_governor import should_proceed

        if should_proceed('gpu'):
            run_inference()
    """
    gov = _governor
    if gov is not None and gov._running:
        return gov.should_allow(resource)
    # Governor absent or stopped: never throttle by default.
    return True