Coverage for integrations / agent_engine / model_registry.py: 87.5%

168 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Unified Agent Goal Engine - Model Registry 

3 

4Central registry of available LLM backends with speed/accuracy/cost baselines. 

5Distinguishes local hive models (hardware-dependent latency) from API models 

6(fixed baseline). Every model call is energy-tracked and guardrail-gated. 

7 

8Adding a new backend = register a ModelBackend + set env var for its API key. 

9""" 

10import math 

11import os 

12import logging 

13import threading 

14import time 

15from collections import deque 

16from enum import Enum 

17from typing import Dict, List, Optional 

18 

logger = logging.getLogger('hevolve_social')  # project-wide logger name shared across modules (intentionally not __name__)

20 

21 

22# ─── Model Tier ─── 

23 

class ModelTier(Enum):
    """Speed/quality tiers used by the registry's selection helpers."""

    # Tiny (< 1B) models — first-responder / classifier only. Always
    # local. Used by the draft-first dispatcher to emit a standby reply
    # plus a routing signal before waking a heavier model.
    DRAFT = "draft"
    # Hive compute / local models / ultrafast.
    FAST = "fast"
    # Mid-tier API or learning models.
    BALANCED = "balanced"
    # GPT-4, Claude, DeepSeek — slower, higher quality.
    EXPERT = "expert"

32 

33 

34# ─── Model Backend ─── 

35 

class ModelBackend:
    """A single LLM backend together with its speed/accuracy/cost baselines.

    Instances are registered with the ModelRegistry, which selects among
    them by tier, latency, accuracy and cost.
    """

    __slots__ = (
        'model_id', 'display_name', 'tier', 'config_list_entry',
        'avg_latency_ms', 'accuracy_score', 'cost_per_1k_tokens',
        'is_local', 'hardware_dependent', 'gpu_tdp_watts',
        '_latency_samples', '_lock',
    )

    def __init__(self, model_id: str, display_name: str, tier: ModelTier,
                 config_list_entry: dict, avg_latency_ms: float = 1000.0,
                 accuracy_score: float = 0.5, cost_per_1k_tokens: float = 0.0,
                 is_local: bool = False, hardware_dependent: bool = False,
                 gpu_tdp_watts: float = 170.0):
        # Identity / routing metadata.
        self.model_id = model_id
        self.display_name = display_name
        self.tier = tier
        self.config_list_entry = config_list_entry
        # Selection baselines read by the registry's helpers.
        self.avg_latency_ms = avg_latency_ms
        self.accuracy_score = accuracy_score
        self.cost_per_1k_tokens = cost_per_1k_tokens
        self.is_local = is_local
        self.hardware_dependent = hardware_dependent
        self.gpu_tdp_watts = gpu_tdp_watts
        # Rolling window of observed latencies; guarded by _lock.
        self._latency_samples: deque = deque(maxlen=100)
        self._lock = threading.Lock()

    def to_config_list(self) -> list:
        """Return the config entry wrapped in the single-element list shape callers expect."""
        entry = self.config_list_entry
        return [entry]

    def to_dict(self) -> dict:
        """Serialize the public baselines (tier flattened to its string value)."""
        return dict(
            model_id=self.model_id,
            display_name=self.display_name,
            tier=self.tier.value,
            avg_latency_ms=self.avg_latency_ms,
            accuracy_score=self.accuracy_score,
            cost_per_1k_tokens=self.cost_per_1k_tokens,
            is_local=self.is_local,
            hardware_dependent=self.hardware_dependent,
            gpu_tdp_watts=self.gpu_tdp_watts,
        )

    def record_latency(self, latency_ms: float):
        """Record an observed latency and update the running average."""
        with self._lock:
            window = self._latency_samples
            window.append(latency_ms)
            # Mean over at most the last 100 samples (deque maxlen).
            self.avg_latency_ms = sum(window) / len(window)

85 

86 

87# ─── Model Registry (Singleton) ─── 

88 

class ModelRegistry:
    """Central registry of all available model backends.

    Every model call flows through here, so energy tracking and guardrail
    checks attach at this layer.

    Thread-safety: the model table (``_models``) and energy ledger
    (``_energy_log``) are shared mutable state; all reads and writes of
    them happen under ``_lock``. Per-backend latency stats are protected
    by each backend's own lock.
    """

    def __init__(self):
        self._models: Dict[str, ModelBackend] = {}
        self._lock = threading.Lock()
        # Rolling ledger of (timestamp, model_id, kwh) tuples.
        self._energy_log: deque = deque(maxlen=10000)

    def register(self, backend: ModelBackend):
        """Register (or replace) a backend, keyed by its model_id."""
        with self._lock:
            self._models[backend.model_id] = backend
        # Log outside the lock: logging handlers may block.
        logger.info(f"ModelRegistry: registered {backend.model_id} "
                    f"(tier={backend.tier.value}, latency={backend.avg_latency_ms}ms, "
                    f"accuracy={backend.accuracy_score})")

    def unregister(self, model_id: str) -> bool:
        """Remove a registered backend.

        Returns True if a backend was removed, False if no entry existed.
        Idempotent — callers that don't track whether a backend is
        currently registered (e.g. peer health-check loops) can call
        this safely on every disconnect.

        Used by HiveExpertDiscovery when a hive peer revokes its
        capability advertisement or fails health checks repeatedly.
        """
        with self._lock:
            # Single pop instead of a membership test + delete.
            existed = self._models.pop(model_id, None) is not None
        if existed:
            logger.info(f"ModelRegistry: unregistered {model_id}")
        return existed

    def get_model(self, model_id: str) -> Optional[ModelBackend]:
        """Look up a backend by id; None if not registered."""
        with self._lock:
            return self._models.get(model_id)

    def get_draft_model(self) -> Optional[ModelBackend]:
        """Get the lowest-latency DRAFT tier model — the first-responder
        that drives `SpeculativeDispatcher.dispatch_draft_first()`. DRAFT
        models are tiny (<1B), always local, and are expected to emit a
        standby reply + routing signal before a heavier model is woken.

        Returns None if no DRAFT tier model is registered, which makes the
        dispatcher gracefully fall through to the normal path."""
        with self._lock:
            candidates = [
                m for m in self._models.values()
                if m.tier == ModelTier.DRAFT
            ]
        if not candidates:
            return None
        return min(candidates, key=lambda m: m.avg_latency_ms)

    def get_fast_model(self, min_accuracy: float = 0.0) -> Optional[ModelBackend]:
        """Get the lowest-latency model meeting minimum accuracy.

        DRAFT tier is excluded from this selection — DRAFT models answer
        via the dedicated draft-first path, not the speculative fast
        path, because they can't produce final answers for complex tasks.
        """
        with self._lock:
            candidates = [
                m for m in self._models.values()
                if m.accuracy_score >= min_accuracy
                and m.tier != ModelTier.DRAFT
            ]
        if not candidates:
            return None
        return min(candidates, key=lambda m: m.avg_latency_ms)

    def get_expert_model(self, max_cost: float = float('inf')) -> Optional[ModelBackend]:
        """Get the highest-accuracy model within budget. DRAFT models are
        excluded — they answer via the draft-first path, never as an
        expert cross-check."""
        with self._lock:
            candidates = [
                m for m in self._models.values()
                if m.cost_per_1k_tokens <= max_cost
                and m.tier != ModelTier.DRAFT
            ]
        if not candidates:
            return None
        return max(candidates, key=lambda m: m.accuracy_score)

    def get_local_model(self, min_accuracy: float = 0.0) -> Optional[ModelBackend]:
        """Get the highest-accuracy local model (is_local=True, cost=0).

        Used by policy-aware routing to prefer local compute for hive/idle tasks.
        """
        with self._lock:
            candidates = [m for m in self._models.values()
                          if m.is_local and m.accuracy_score >= min_accuracy]
        if not candidates:
            return None
        return max(candidates, key=lambda m: m.accuracy_score)

    def get_model_by_policy(self, policy: str = 'local_preferred',
                            task_source: str = 'own',
                            min_accuracy: float = 0.0) -> Optional[ModelBackend]:
        """Policy-aware model selection.

        Policies:
            local_only      — Only local models (is_local=True). Returns None if none available.
            local_preferred — Try local first, fall through to metered if none available.
            any             — Fastest model regardless of locality (metered costs tracked).

        For hive/idle tasks, enforces at least local_preferred unless node opted into 'any'.
        """
        if task_source in ('hive', 'idle') and policy != 'any':
            policy = 'local_preferred'

        if policy == 'local_only':
            return self.get_local_model(min_accuracy)

        if policy == 'local_preferred':
            local = self.get_local_model(min_accuracy)
            if local:
                return local
            # Fall through to metered (will be tracked + compensated)

        return self.get_fast_model(min_accuracy)

    def list_models(self, tier: Optional[ModelTier] = None) -> List[ModelBackend]:
        """List all models sorted by latency, optionally filtered by tier."""
        with self._lock:
            models = list(self._models.values())
        if tier:
            models = [m for m in models if m.tier == tier]
        return sorted(models, key=lambda m: m.avg_latency_ms)

    def record_latency(self, model_id: str, latency_ms: float):
        """Record observed latency for a model (live running average)."""
        with self._lock:
            model = self._models.get(model_id)
        if model:
            # Outside the registry lock: the backend takes its own lock,
            # so this avoids holding two locks at once.
            model.record_latency(latency_ms)

    def record_energy(self, model_id: str, duration_ms: float):
        """Record energy consumption for every model call — guardrail requirement."""
        from security.hive_guardrails import EnergyAwareness
        with self._lock:
            model = self._models.get(model_id)
        if model:
            # Estimate outside the lock; only the ledger append is guarded.
            kwh = EnergyAwareness.estimate_energy_kwh(model.to_dict(), duration_ms)
            with self._lock:
                self._energy_log.append((time.time(), model_id, kwh))

    def get_total_energy_kwh(self, hours: float = 24) -> float:
        """Get total energy consumed in the last N hours."""
        cutoff = time.time() - (hours * 3600)
        # Snapshot under the lock: iterating a deque while another thread
        # appends to it raises RuntimeError.
        with self._lock:
            entries = list(self._energy_log)
        return sum(kwh for ts, _, kwh in entries if ts > cutoff)

    def get_hardware_adjusted_latency(self, model_id: str,
                                      peer_node: dict = None) -> float:
        """PeerNode-aware latency estimate for hive compute nodes.

        Uses PeerNode.compute_cpu_cores / compute_ram_gb / compute_gpu_count
        to scale the baseline latency. Returns +inf for unknown models so
        they always lose latency comparisons.
        """
        # Lock the table lookup — consistent with every other accessor.
        with self._lock:
            model = self._models.get(model_id)
        if not model:
            return float('inf')
        base = model.avg_latency_ms
        if not model.hardware_dependent or not peer_node:
            return base
        # Reference hardware: 8 cores, 16 GB RAM, 1 GPU.
        # `or 1` etc. guard against explicit None/0 values in peer_node.
        gpu_factor = 1.0 / max(peer_node.get('compute_gpu_count', 1) or 1, 1)
        cpu_factor = 8.0 / max(peer_node.get('compute_cpu_cores', 8) or 8, 1)
        ram_factor = 16.0 / max(peer_node.get('compute_ram_gb', 16) or 16, 1)
        scale = 0.40 * gpu_factor + 0.35 * cpu_factor + 0.25 * ram_factor
        return base * max(scale, 0.3)  # Floor at 30% of baseline

    def update_accuracy(self, model_id: str, new_score: float):
        """Update accuracy with guardrail-enforced cap (max 5%/day improvement)."""
        from security.hive_guardrails import WorldModelSafetyBounds
        # Lock the table lookup — consistent with every other accessor.
        with self._lock:
            model = self._models.get(model_id)
        if model:
            capped = WorldModelSafetyBounds.gate_accuracy_update(
                model_id, model.accuracy_score, new_score)
            model.accuracy_score = capped

275 

276 

# ─── Module-level singleton ───
# Process-wide shared registry; populated at import time by _register_defaults().
model_registry = ModelRegistry()

279 

280 

281# ─── Default backend registration ─── 

282 

def _register_defaults():
    """Register default model backends. Only available if API keys are set.

    Local backends (Qwen draft/4B, Pocket TTS, Whisper STT) are registered
    unconditionally; API-backed ones only when their key/URL environment
    variable is present. Registration order is preserved deliberately —
    it determines dict iteration order and therefore which backend wins
    min/max ties in the registry's selection helpers.
    """

    # 0. Local Qwen3.5 0.8B — DRAFT tier first-responder for the
    # draft-first dispatcher. Its job is to emit an immediate standby
    # reply plus a JSON routing signal saying whether the request can
    # be handled locally or needs delegation to the 4B or the hive.
    # ~200-400ms on consumer GPUs. Its interactions flow through
    # WorldModelBridge.record_interaction so HevolveAI's continual
    # learner can distill expert→draft over time (the draft gets
    # better at knowing when to delegate vs answer directly).
    #
    # The draft runs on port 8081 (vlm_caption) — NOT the main LLM
    # port 8080 where the 4B lives. Both servers stay resident so
    # draft-first pays real draft latency (~300ms) and normal chat
    # pays 4B latency (~700ms) without swapping. Nunba's
    # LlamaConfig.start_caption_server owns the 0.8B process and
    # main.py kicks it off during _deferred_platform_init.
    from core.port_registry import get_local_llm_url, get_local_draft_url
    _local_url = get_local_llm_url()
    _draft_url = get_local_draft_url()

    model_registry.register(ModelBackend(
        model_id='qwen3.5-0.8b-draft',
        display_name='Qwen3.5 0.8B (Draft)',
        tier=ModelTier.DRAFT,
        config_list_entry={
            'model': 'Qwen3.5-0.8B-Instruct',
            'api_key': 'dummy',  # local llama.cpp server ignores the key
            'base_url': _draft_url,
            'price': [0, 0],
        },
        avg_latency_ms=300.0,
        accuracy_score=0.45,
        cost_per_1k_tokens=0.0,
        is_local=True,
        hardware_dependent=True,
        gpu_tdp_watts=80.0,
    ))

    # 1. Local Qwen3.5-4B VL — default local model (256K context, vision+text, llama.cpp b8148+)
    model_registry.register(ModelBackend(
        model_id='qwen3.5-4b-local',
        display_name='Qwen3.5 4B VL (Local)',
        tier=ModelTier.FAST,
        config_list_entry={
            'model': 'Qwen3.5-4B',
            'api_key': 'dummy',
            'base_url': _local_url,
            'price': [0, 0],
        },
        avg_latency_ms=700.0,
        accuracy_score=0.60,
        cost_per_1k_tokens=0.0,
        is_local=True,
        hardware_dependent=True,
        gpu_tdp_watts=170.0,
    ))

    # 1b. Local Qwen3-VL-4B — fallback for older llama.cpp installs
    model_registry.register(ModelBackend(
        model_id='qwen3-vl-4b-local',
        display_name='Qwen3-VL 4B (Local)',
        tier=ModelTier.FAST,
        config_list_entry={
            'model': 'Qwen3-VL-4B-Instruct',
            'api_key': 'dummy',
            'base_url': _local_url,
            'price': [0, 0],
        },
        avg_latency_ms=800.0,
        accuracy_score=0.55,
        cost_per_1k_tokens=0.0,
        is_local=True,
        hardware_dependent=True,
        gpu_tdp_watts=170.0,
    ))

    # 2. Groq (fast API — if key set)
    if os.environ.get('GROQ_API_KEY'):
        model_registry.register(ModelBackend(
            model_id='groq-llama-3.1-8b',
            display_name='Groq LLaMA 3.1 8B',
            tier=ModelTier.FAST,
            config_list_entry={
                'model': 'llama-3.1-8b-instant',
                'api_key': os.environ['GROQ_API_KEY'],
                'base_url': 'https://api.groq.com/openai/v1',
                'price': [0.05, 0.08],
            },
            avg_latency_ms=300.0,
            accuracy_score=0.60,
            cost_per_1k_tokens=0.1,
        ))

    # 3. DeepSeek V3 (balanced — if key set)
    if os.environ.get('DEEPSEEK_API_KEY'):
        model_registry.register(ModelBackend(
            model_id='deepseek-v3',
            display_name='DeepSeek V3',
            tier=ModelTier.BALANCED,
            config_list_entry={
                'model': 'deepseek-chat',
                'api_key': os.environ['DEEPSEEK_API_KEY'],
                'base_url': 'https://api.deepseek.com/v1',
                'price': [0.14, 0.28],
            },
            avg_latency_ms=1500.0,
            accuracy_score=0.82,
            cost_per_1k_tokens=0.5,
        ))

    # 4. GPT-4.1 Azure (expert — if key set)
    if os.environ.get('AZURE_OPENAI_API_KEY'):
        model_registry.register(ModelBackend(
            model_id='gpt-4.1-azure',
            display_name='GPT-4.1 (Azure)',
            tier=ModelTier.EXPERT,
            config_list_entry={
                'model': 'gpt-4.1',
                'api_type': 'azure',
                'api_key': os.environ['AZURE_OPENAI_API_KEY'],
                'base_url': os.environ.get('AZURE_OPENAI_ENDPOINT', ''),
                'api_version': '2024-12-01-preview',
                'price': [0.0025, 0.01],
            },
            avg_latency_ms=3000.0,
            accuracy_score=0.92,
            cost_per_1k_tokens=2.5,
        ))

    # 5. Claude Sonnet (expert — if key set)
    if os.environ.get('ANTHROPIC_API_KEY'):
        model_registry.register(ModelBackend(
            model_id='claude-sonnet',
            display_name='Claude Sonnet 4.5',
            tier=ModelTier.EXPERT,
            config_list_entry={
                'model': 'claude-sonnet-4-5-20250929',
                'api_key': os.environ['ANTHROPIC_API_KEY'],
                'base_url': 'https://api.anthropic.com/v1',
                'price': [0.003, 0.015],
            },
            avg_latency_ms=2500.0,
            accuracy_score=0.93,
            cost_per_1k_tokens=1.5,
        ))

    # 6. HevolveAI-Core Learning LLM (balanced — local world model, improves over time)
    hevolveai_url = os.environ.get('HEVOLVEAI_API_URL')
    if hevolveai_url:
        model_registry.register(ModelBackend(
            model_id='hevolveai-learning',
            display_name='HevolveAI World Model (Learning)',
            tier=ModelTier.BALANCED,
            config_list_entry={
                'model': 'hevolveai-learning',
                'api_key': 'local',
                'base_url': hevolveai_url,
                'price': [0, 0],
            },
            avg_latency_ms=50.0,
            accuracy_score=0.70,
            cost_per_1k_tokens=0.0,
            is_local=True,
            hardware_dependent=True,
        ))

    # 7. MobileVLM ONNX (fast — lightweight CPU vision for embedded/lite tiers)
    if os.environ.get('HEVOLVE_VISION_LITE_ENABLED', '').lower() == 'true':
        model_registry.register(ModelBackend(
            model_id='mobilevlm-1.7b-onnx',
            display_name='MobileVLM 1.7B (ONNX CPU)',
            tier=ModelTier.FAST,
            config_list_entry={
                'model': 'mobilevlm-1.7b',
                'api_key': 'local',
                'base_url': 'local://onnxruntime',
                'price': [0, 0],
            },
            avg_latency_ms=500.0,
            accuracy_score=0.45,
            cost_per_1k_tokens=0.0,
            is_local=True,
            hardware_dependent=True,
            gpu_tdp_watts=0.0,  # CPU-only, no GPU power draw
        ))

    # 8. Pocket TTS — offline, CPU, 100M params, MIT (always available)
    model_registry.register(ModelBackend(
        model_id='pocket-tts-100m',
        display_name='Pocket TTS 100M (Offline)',
        tier=ModelTier.FAST,
        config_list_entry={
            'model': 'pocket-tts-100m',
            'api_key': 'local',
            'base_url': 'inprocess://pocket_tts',
            'price': [0, 0],
        },
        avg_latency_ms=200.0,
        accuracy_score=0.85,
        cost_per_1k_tokens=0.0,
        is_local=True,
        hardware_dependent=False,
        gpu_tdp_watts=0.0,
    ))

    # 9. Whisper STT — offline, sherpa-onnx or openai-whisper (always available)
    model_registry.register(ModelBackend(
        model_id='whisper-stt-local',
        display_name='Whisper STT (sherpa-onnx / Local)',
        tier=ModelTier.FAST,
        config_list_entry={
            'model': 'whisper-stt',
            'api_key': 'local',
            'base_url': 'inprocess://whisper',
            'price': [0, 0],
        },
        avg_latency_ms=500.0,
        accuracy_score=0.88,
        cost_per_1k_tokens=0.0,
        is_local=True,
        hardware_dependent=True,
        gpu_tdp_watts=0.0,
    ))

    # 10. LuxTTS — 48kHz voice cloning TTS (GPU-accelerated, Apache 2.0)
    #     150x realtime on GPU, >1x on CPU, <1GB VRAM
    #     ZipVoice-distilled, 4-step diffusion, voice cloning from 3s audio
    # Probe the optional dependency instead of registering blindly.
    _luxtts_available = False
    try:
        from zipvoice.luxvoice import LuxTTS as _LuxCheck  # noqa: F401
        _luxtts_available = True
    except ImportError:
        pass

    if _luxtts_available:
        # Detect GPU for latency estimate
        _luxtts_has_gpu = False
        try:
            import torch as _torch_check
            _luxtts_has_gpu = _torch_check.cuda.is_available()
        except ImportError:
            pass

        model_registry.register(ModelBackend(
            model_id='luxtts-48k',
            display_name='LuxTTS 48kHz (Voice Cloning)',
            tier=ModelTier.FAST,
            config_list_entry={
                'model': 'luxtts-48k',
                'api_key': 'local',
                'base_url': 'inprocess://luxtts',
                'price': [0, 0],
            },
            avg_latency_ms=50.0 if _luxtts_has_gpu else 800.0,
            accuracy_score=0.93,
            cost_per_1k_tokens=0.0,
            is_local=True,
            hardware_dependent=True,
            gpu_tdp_watts=170.0 if _luxtts_has_gpu else 0.0,
        ))

    # 11. MakeItTalk Cloud — TTS + video generation (if MAKEITTALK_API_URL set)
    #     Cloud service: Flask+Celery, 7 TTS backends, lip-sync animation
    #     POST /video-gen/ for full pipeline, audio_generation for TTS only
    makeittalk_url = os.environ.get('MAKEITTALK_API_URL')
    if makeittalk_url:
        model_registry.register(ModelBackend(
            model_id='makeittalk-cloud',
            display_name='MakeItTalk Cloud (TTS + Video)',
            tier=ModelTier.BALANCED,
            config_list_entry={
                'model': 'makeittalk',
                'api_key': 'cloud',
                'base_url': makeittalk_url,
                'price': [0, 0],  # internal service
            },
            avg_latency_ms=5000.0,
            accuracy_score=0.92,
            cost_per_1k_tokens=0.0,
            is_local=False,
            hardware_dependent=False,
            gpu_tdp_watts=0.0,
        ))

    logger.info(f"ModelRegistry: {len(model_registry._models)} backends registered")

571 

572 

# Auto-register on import — importing this module has the side effect of
# populating the shared model_registry with the default backends.
_register_defaults()