Coverage for integrations/agent_engine/model_registry.py: 87.5% (168 statements)
1"""
2Unified Agent Goal Engine - Model Registry
4Central registry of available LLM backends with speed/accuracy/cost baselines.
5Distinguishes local hive models (hardware-dependent latency) from API models
6(fixed baseline). Every model call is energy-tracked and guardrail-gated.
8Adding a new backend = register a ModelBackend + set env var for its API key.
9"""
import os
import logging
import threading
import time
from collections import deque
from enum import Enum
from typing import Dict, List, Optional

logger = logging.getLogger('hevolve_social')


# ─── Model Tier ───

class ModelTier(Enum):
    DRAFT = "draft"        # Tiny (<1B) models — first-responder / classifier
                           # only. Always local. Used by draft-first dispatcher
                           # to emit a standby reply + routing signal before
                           # waking a heavier model.
    FAST = "fast"          # Hive compute / local models / ultrafast
    BALANCED = "balanced"  # Mid-tier API or learning models
    EXPERT = "expert"      # GPT-4, Claude, DeepSeek — slower, higher quality


# ─── Model Backend ───

class ModelBackend:
    """Represents a single LLM backend with its baselines."""

    __slots__ = (
        'model_id', 'display_name', 'tier', 'config_list_entry',
        'avg_latency_ms', 'accuracy_score', 'cost_per_1k_tokens',
        'is_local', 'hardware_dependent', 'gpu_tdp_watts',
        '_latency_samples', '_lock',
    )

    def __init__(self, model_id: str, display_name: str, tier: ModelTier,
                 config_list_entry: dict, avg_latency_ms: float = 1000.0,
                 accuracy_score: float = 0.5, cost_per_1k_tokens: float = 0.0,
                 is_local: bool = False, hardware_dependent: bool = False,
                 gpu_tdp_watts: float = 170.0):
        self.model_id = model_id
        self.display_name = display_name
        self.tier = tier
        self.config_list_entry = config_list_entry
        self.avg_latency_ms = avg_latency_ms
        self.accuracy_score = accuracy_score
        self.cost_per_1k_tokens = cost_per_1k_tokens
        self.is_local = is_local
        self.hardware_dependent = hardware_dependent
        self.gpu_tdp_watts = gpu_tdp_watts
        self._latency_samples: deque = deque(maxlen=100)
        self._lock = threading.Lock()

    def to_config_list(self) -> list:
        return [self.config_list_entry]

    def to_dict(self) -> dict:
        return {
            'model_id': self.model_id,
            'display_name': self.display_name,
            'tier': self.tier.value,
            'avg_latency_ms': self.avg_latency_ms,
            'accuracy_score': self.accuracy_score,
            'cost_per_1k_tokens': self.cost_per_1k_tokens,
            'is_local': self.is_local,
            'hardware_dependent': self.hardware_dependent,
            'gpu_tdp_watts': self.gpu_tdp_watts,
        }

    def record_latency(self, latency_ms: float):
        """Record an observed latency and update the running average."""
        with self._lock:
            self._latency_samples.append(latency_ms)
            self.avg_latency_ms = sum(self._latency_samples) / len(self._latency_samples)


# ─── Model Registry (Singleton) ───

class ModelRegistry:
    """Central registry of all available model backends.

    Every model call flows through here, so energy tracking and guardrail
    checks attach at this layer.
    """

    def __init__(self):
        self._models: Dict[str, ModelBackend] = {}
        self._lock = threading.Lock()
        self._energy_log: deque = deque(maxlen=10000)  # (timestamp, model_id, kwh)

    def register(self, backend: ModelBackend):
        """Register a model backend."""
        with self._lock:
            self._models[backend.model_id] = backend
            logger.info(f"ModelRegistry: registered {backend.model_id} "
                        f"(tier={backend.tier.value}, latency={backend.avg_latency_ms}ms, "
                        f"accuracy={backend.accuracy_score})")

    def unregister(self, model_id: str) -> bool:
        """Remove a registered backend.

        Returns True if a backend was removed, False if no entry existed.
        Idempotent — callers that don't track whether a backend is
        currently registered (e.g. peer health-check loops) can call
        this safely on every disconnect.

        Used by HiveExpertDiscovery when a hive peer revokes its
        capability advertisement or fails health checks repeatedly.
        """
        with self._lock:
            existed = model_id in self._models
            if existed:
                del self._models[model_id]
        if existed:
            logger.info(f"ModelRegistry: unregistered {model_id}")
        return existed
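
    # Illustrative idempotency (hypothetical call sequence, not from tests):
    #
    #   model_registry.unregister('some-peer-model')  # -> True if registered
    #   model_registry.unregister('some-peer-model')  # -> False, already gone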

    def get_model(self, model_id: str) -> Optional[ModelBackend]:
        with self._lock:
            return self._models.get(model_id)

    def get_draft_model(self) -> Optional[ModelBackend]:
        """Get the lowest-latency DRAFT tier model — the first-responder
        that drives `SpeculativeDispatcher.dispatch_draft_first()`. DRAFT
        models are tiny (<1B), always local, and are expected to emit a
        standby reply + routing signal before a heavier model is woken.

        Returns None if no DRAFT tier model is registered, which makes the
        dispatcher gracefully fall through to the normal path."""
        with self._lock:
            candidates = [
                m for m in self._models.values()
                if m.tier == ModelTier.DRAFT
            ]
            if not candidates:
                return None
            return min(candidates, key=lambda m: m.avg_latency_ms)
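
    # How a dispatcher might consume this (a sketch; the real logic lives
    # in SpeculativeDispatcher.dispatch_draft_first, which is not shown in
    # this file — run() and escalate_to() are hypothetical helpers):
    #
    #   draft = model_registry.get_draft_model()
    #   if draft is None:
    #       return normal_path(request)             # graceful fall-through
    #   standby_reply, routing = run(draft, request)
    #   if routing.get('delegate'):
    #       escalate_to(model_registry.get_expert_model(), request)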

    def get_fast_model(self, min_accuracy: float = 0.0) -> Optional[ModelBackend]:
        """Get the lowest-latency model meeting minimum accuracy.

        DRAFT tier is excluded from this selection — DRAFT models answer
        via the dedicated draft-first path, not the speculative fast
        path, because they can't produce final answers for complex tasks.
        """
        with self._lock:
            candidates = [
                m for m in self._models.values()
                if m.accuracy_score >= min_accuracy
                and m.tier != ModelTier.DRAFT
            ]
            if not candidates:
                return None
            return min(candidates, key=lambda m: m.avg_latency_ms)

    def get_expert_model(self, max_cost: float = float('inf')) -> Optional[ModelBackend]:
        """Get the highest-accuracy model within budget. DRAFT models are
        excluded — they answer via the draft-first path, never as an
        expert cross-check."""
        with self._lock:
            candidates = [
                m for m in self._models.values()
                if m.cost_per_1k_tokens <= max_cost
                and m.tier != ModelTier.DRAFT
            ]
            if not candidates:
                return None
            return max(candidates, key=lambda m: m.accuracy_score)

    def get_local_model(self, min_accuracy: float = 0.0) -> Optional[ModelBackend]:
        """Get the highest-accuracy local model (is_local=True, cost=0).

        Used by policy-aware routing to prefer local compute for hive/idle tasks.
        """
        with self._lock:
            candidates = [m for m in self._models.values()
                          if m.is_local and m.accuracy_score >= min_accuracy]
            if not candidates:
                return None
            return max(candidates, key=lambda m: m.accuracy_score)

    def get_model_by_policy(self, policy: str = 'local_preferred',
                            task_source: str = 'own',
                            min_accuracy: float = 0.0) -> Optional[ModelBackend]:
        """Policy-aware model selection.

        Policies:
            local_only      — Only local models (is_local=True). Returns None if none available.
            local_preferred — Try local first, fall through to metered if none available.
            any             — Fastest model regardless of locality (metered costs tracked).

        For hive/idle tasks, enforces at least local_preferred unless node opted into 'any'.
        """
        if task_source in ('hive', 'idle') and policy != 'any':
            policy = 'local_preferred'

        if policy == 'local_only':
            return self.get_local_model(min_accuracy)

        if policy == 'local_preferred':
            local = self.get_local_model(min_accuracy)
            if local:
                return local
            # Fall through to metered (will be tracked + compensated)

        return self.get_fast_model(min_accuracy)
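
    # Illustrative resolution order: a hive-sourced task is forced to at
    # least 'local_preferred', so the highest-accuracy local backend meeting
    # the bar wins; only if no local model qualifies does the fastest metered
    # model get picked (and its cost tracked):
    #
    #   backend = model_registry.get_model_by_policy(
    #       policy='local_preferred', task_source='hive', min_accuracy=0.5)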

    def list_models(self, tier: Optional[ModelTier] = None) -> List[ModelBackend]:
        """List all models, optionally filtered by tier."""
        with self._lock:
            models = list(self._models.values())
            if tier:
                models = [m for m in models if m.tier == tier]
            return sorted(models, key=lambda m: m.avg_latency_ms)

    def record_latency(self, model_id: str, latency_ms: float):
        """Record observed latency for a model (live running average)."""
        with self._lock:
            model = self._models.get(model_id)
            if model:
                model.record_latency(latency_ms)

    def record_energy(self, model_id: str, duration_ms: float):
        """Record energy consumption for every model call — guardrail requirement."""
        from security.hive_guardrails import EnergyAwareness
        with self._lock:
            model = self._models.get(model_id)
            if model:
                kwh = EnergyAwareness.estimate_energy_kwh(model.to_dict(), duration_ms)
                self._energy_log.append((time.time(), model_id, kwh))

    def get_total_energy_kwh(self, hours: float = 24) -> float:
        """Get total energy consumed in the last N hours."""
        cutoff = time.time() - (hours * 3600)
        with self._lock:
            return sum(kwh for ts, _, kwh in self._energy_log if ts > cutoff)
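
    # Typical call pattern around a model invocation (illustrative;
    # call_backend() is a hypothetical helper, not part of this module):
    #
    #   start = time.time()
    #   reply = call_backend(backend, prompt)
    #   elapsed_ms = (time.time() - start) * 1000
    #   model_registry.record_latency(backend.model_id, elapsed_ms)
    #   model_registry.record_energy(backend.model_id, elapsed_ms)
    #   model_registry.get_total_energy_kwh(hours=24)   # rolling 24h total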

    def get_hardware_adjusted_latency(self, model_id: str,
                                      peer_node: Optional[dict] = None) -> float:
        """PeerNode-aware latency estimate for hive compute nodes.

        Uses PeerNode.compute_cpu_cores / compute_ram_gb / compute_gpu_count
        to scale the baseline latency.
        """
        model = self._models.get(model_id)
        if not model:
            return float('inf')
        base = model.avg_latency_ms
        if not model.hardware_dependent or not peer_node:
            return base
        # Reference hardware: 8 cores, 16 GB RAM, 1 GPU
        gpu_factor = 1.0 / max(peer_node.get('compute_gpu_count', 1) or 1, 1)
        cpu_factor = 8.0 / max(peer_node.get('compute_cpu_cores', 8) or 8, 1)
        ram_factor = 16.0 / max(peer_node.get('compute_ram_gb', 16) or 16, 1)
        scale = 0.40 * gpu_factor + 0.35 * cpu_factor + 0.25 * ram_factor
        return base * max(scale, 0.3)  # Floor at 30% of baseline
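
    # Worked example of the scaling above: a peer with 2 GPUs, 16 cores and
    # 32 GB RAM gives gpu_factor=0.5, cpu_factor=0.5, ram_factor=0.5, so
    # scale = 0.40*0.5 + 0.35*0.5 + 0.25*0.5 = 0.5, i.e. half the baseline
    # latency. A 1-GPU, 4-core, 8 GB peer gives
    # scale = 0.40*1.0 + 0.35*2.0 + 0.25*2.0 = 1.6, i.e. 60% slower.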

    def update_accuracy(self, model_id: str, new_score: float):
        """Update accuracy with guardrail-enforced cap (max 5%/day improvement)."""
        from security.hive_guardrails import WorldModelSafetyBounds
        model = self._models.get(model_id)
        if model:
            capped = WorldModelSafetyBounds.gate_accuracy_update(
                model_id, model.accuracy_score, new_score)
            model.accuracy_score = capped


# ─── Module-level singleton ───
model_registry = ModelRegistry()


# ─── Default backend registration ───

def _register_defaults():
    """Register default model backends. Only available if API keys are set."""

    # 0. Local Qwen3.5 0.8B — DRAFT tier first-responder for the
    # draft-first dispatcher. Its job is to emit an immediate standby
    # reply plus a JSON routing signal saying whether the request can
    # be handled locally or needs delegation to the 4B or the hive.
    # ~200-400ms on consumer GPUs. Its interactions flow through
    # WorldModelBridge.record_interaction so HevolveAI's continual
    # learner can distill expert→draft over time (the draft gets
    # better at knowing when to delegate vs answer directly).
    #
    # The draft runs on port 8081 (vlm_caption) — NOT the main LLM
    # port 8080 where the 4B lives. Both servers stay resident so
    # draft-first pays real draft latency (~300ms) and normal chat
    # pays 4B latency (~700ms) without swapping. Nunba's
    # LlamaConfig.start_caption_server owns the 0.8B process and
    # main.py kicks it off during _deferred_platform_init.
    from core.port_registry import get_local_llm_url, get_local_draft_url
    _local_url = get_local_llm_url()
    _draft_url = get_local_draft_url()

    model_registry.register(ModelBackend(
        model_id='qwen3.5-0.8b-draft',
        display_name='Qwen3.5 0.8B (Draft)',
        tier=ModelTier.DRAFT,
        config_list_entry={
            'model': 'Qwen3.5-0.8B-Instruct',
            'api_key': 'dummy',
            'base_url': _draft_url,
            'price': [0, 0],
        },
        avg_latency_ms=300.0,
        accuracy_score=0.45,
        cost_per_1k_tokens=0.0,
        is_local=True,
        hardware_dependent=True,
        gpu_tdp_watts=80.0,
    ))

    # 1. Local Qwen3.5-4B VL — default local model (256K context, vision+text, llama.cpp b8148+)
    model_registry.register(ModelBackend(
        model_id='qwen3.5-4b-local',
        display_name='Qwen3.5 4B VL (Local)',
        tier=ModelTier.FAST,
        config_list_entry={
            'model': 'Qwen3.5-4B',
            'api_key': 'dummy',
            'base_url': _local_url,
            'price': [0, 0],
        },
        avg_latency_ms=700.0,
        accuracy_score=0.60,
        cost_per_1k_tokens=0.0,
        is_local=True,
        hardware_dependent=True,
        gpu_tdp_watts=170.0,
    ))

    # 1b. Local Qwen3-VL-4B — fallback for older llama.cpp installs
    model_registry.register(ModelBackend(
        model_id='qwen3-vl-4b-local',
        display_name='Qwen3-VL 4B (Local)',
        tier=ModelTier.FAST,
        config_list_entry={
            'model': 'Qwen3-VL-4B-Instruct',
            'api_key': 'dummy',
            'base_url': _local_url,
            'price': [0, 0],
        },
        avg_latency_ms=800.0,
        accuracy_score=0.55,
        cost_per_1k_tokens=0.0,
        is_local=True,
        hardware_dependent=True,
        gpu_tdp_watts=170.0,
    ))

    # 2. Groq (fast API — if key set)
    if os.environ.get('GROQ_API_KEY'):
        model_registry.register(ModelBackend(
            model_id='groq-llama-3.1-8b',
            display_name='Groq LLaMA 3.1 8B',
            tier=ModelTier.FAST,
            config_list_entry={
                'model': 'llama-3.1-8b-instant',
                'api_key': os.environ['GROQ_API_KEY'],
                'base_url': 'https://api.groq.com/openai/v1',
                'price': [0.05, 0.08],
            },
            avg_latency_ms=300.0,
            accuracy_score=0.60,
            cost_per_1k_tokens=0.1,
        ))

    # 3. DeepSeek V3 (balanced — if key set)
    if os.environ.get('DEEPSEEK_API_KEY'):
        model_registry.register(ModelBackend(
            model_id='deepseek-v3',
            display_name='DeepSeek V3',
            tier=ModelTier.BALANCED,
            config_list_entry={
                'model': 'deepseek-chat',
                'api_key': os.environ['DEEPSEEK_API_KEY'],
                'base_url': 'https://api.deepseek.com/v1',
                'price': [0.14, 0.28],
            },
            avg_latency_ms=1500.0,
            accuracy_score=0.82,
            cost_per_1k_tokens=0.5,
        ))

    # 4. GPT-4.1 Azure (expert — if key set)
    if os.environ.get('AZURE_OPENAI_API_KEY'):
        model_registry.register(ModelBackend(
            model_id='gpt-4.1-azure',
            display_name='GPT-4.1 (Azure)',
            tier=ModelTier.EXPERT,
            config_list_entry={
                'model': 'gpt-4.1',
                'api_type': 'azure',
                'api_key': os.environ['AZURE_OPENAI_API_KEY'],
                'base_url': os.environ.get('AZURE_OPENAI_ENDPOINT', ''),
                'api_version': '2024-12-01-preview',
                'price': [0.0025, 0.01],
            },
            avg_latency_ms=3000.0,
            accuracy_score=0.92,
            cost_per_1k_tokens=2.5,
        ))

    # 5. Claude Sonnet (expert — if key set)
    if os.environ.get('ANTHROPIC_API_KEY'):
        model_registry.register(ModelBackend(
            model_id='claude-sonnet',
            display_name='Claude Sonnet 4.5',
            tier=ModelTier.EXPERT,
            config_list_entry={
                'model': 'claude-sonnet-4-5-20250929',
                'api_key': os.environ['ANTHROPIC_API_KEY'],
                'base_url': 'https://api.anthropic.com/v1',
                'price': [0.003, 0.015],
            },
            avg_latency_ms=2500.0,
            accuracy_score=0.93,
            cost_per_1k_tokens=1.5,
        ))

    # 6. HevolveAI-Core Learning LLM (balanced — local world model, improves over time)
    hevolveai_url = os.environ.get('HEVOLVEAI_API_URL')
    if hevolveai_url:
        model_registry.register(ModelBackend(
            model_id='hevolveai-learning',
            display_name='HevolveAI World Model (Learning)',
            tier=ModelTier.BALANCED,
            config_list_entry={
                'model': 'hevolveai-learning',
                'api_key': 'local',
                'base_url': hevolveai_url,
                'price': [0, 0],
            },
            avg_latency_ms=50.0,
            accuracy_score=0.70,
            cost_per_1k_tokens=0.0,
            is_local=True,
            hardware_dependent=True,
        ))

    # 7. MobileVLM ONNX (fast — lightweight CPU vision for embedded/lite tiers)
    if os.environ.get('HEVOLVE_VISION_LITE_ENABLED', '').lower() == 'true':
        model_registry.register(ModelBackend(
            model_id='mobilevlm-1.7b-onnx',
            display_name='MobileVLM 1.7B (ONNX CPU)',
            tier=ModelTier.FAST,
            config_list_entry={
                'model': 'mobilevlm-1.7b',
                'api_key': 'local',
                'base_url': 'local://onnxruntime',
                'price': [0, 0],
            },
            avg_latency_ms=500.0,
            accuracy_score=0.45,
            cost_per_1k_tokens=0.0,
            is_local=True,
            hardware_dependent=True,
            gpu_tdp_watts=0.0,  # CPU-only, no GPU power draw
        ))

    # 8. Pocket TTS — offline, CPU, 100M params, MIT (always available)
    model_registry.register(ModelBackend(
        model_id='pocket-tts-100m',
        display_name='Pocket TTS 100M (Offline)',
        tier=ModelTier.FAST,
        config_list_entry={
            'model': 'pocket-tts-100m',
            'api_key': 'local',
            'base_url': 'inprocess://pocket_tts',
            'price': [0, 0],
        },
        avg_latency_ms=200.0,
        accuracy_score=0.85,
        cost_per_1k_tokens=0.0,
        is_local=True,
        hardware_dependent=False,
        gpu_tdp_watts=0.0,
    ))

    # 9. Whisper STT — offline, sherpa-onnx or openai-whisper (always available)
    model_registry.register(ModelBackend(
        model_id='whisper-stt-local',
        display_name='Whisper STT (sherpa-onnx / Local)',
        tier=ModelTier.FAST,
        config_list_entry={
            'model': 'whisper-stt',
            'api_key': 'local',
            'base_url': 'inprocess://whisper',
            'price': [0, 0],
        },
        avg_latency_ms=500.0,
        accuracy_score=0.88,
        cost_per_1k_tokens=0.0,
        is_local=True,
        hardware_dependent=True,
        gpu_tdp_watts=0.0,
    ))

    # 10. LuxTTS — 48kHz voice cloning TTS (GPU-accelerated, Apache 2.0)
    #     150x realtime on GPU, >1x on CPU, <1GB VRAM
    #     ZipVoice-distilled, 4-step diffusion, voice cloning from 3s audio
    _luxtts_available = False
    try:
        from zipvoice.luxvoice import LuxTTS as _LuxCheck  # noqa: F401
        _luxtts_available = True
    except ImportError:
        pass

    if _luxtts_available:
        # Detect GPU for latency estimate
        _luxtts_has_gpu = False
        try:
            import torch as _torch_check
            _luxtts_has_gpu = _torch_check.cuda.is_available()
        except ImportError:
            pass

        model_registry.register(ModelBackend(
            model_id='luxtts-48k',
            display_name='LuxTTS 48kHz (Voice Cloning)',
            tier=ModelTier.FAST,
            config_list_entry={
                'model': 'luxtts-48k',
                'api_key': 'local',
                'base_url': 'inprocess://luxtts',
                'price': [0, 0],
            },
            avg_latency_ms=50.0 if _luxtts_has_gpu else 800.0,
            accuracy_score=0.93,
            cost_per_1k_tokens=0.0,
            is_local=True,
            hardware_dependent=True,
            gpu_tdp_watts=170.0 if _luxtts_has_gpu else 0.0,
        ))

    # 11. MakeItTalk Cloud — TTS + video generation (if MAKEITTALK_API_URL set)
    #     Cloud service: Flask+Celery, 7 TTS backends, lip-sync animation
    #     POST /video-gen/ for full pipeline, audio_generation for TTS only
    makeittalk_url = os.environ.get('MAKEITTALK_API_URL')
    if makeittalk_url:
        model_registry.register(ModelBackend(
            model_id='makeittalk-cloud',
            display_name='MakeItTalk Cloud (TTS + Video)',
            tier=ModelTier.BALANCED,
            config_list_entry={
                'model': 'makeittalk',
                'api_key': 'cloud',
                'base_url': makeittalk_url,
                'price': [0, 0],  # internal service
            },
            avg_latency_ms=5000.0,
            accuracy_score=0.92,
            cost_per_1k_tokens=0.0,
            is_local=False,
            hardware_dependent=False,
            gpu_tdp_watts=0.0,
        ))

    logger.info(f"ModelRegistry: {len(model_registry._models)} backends registered")


# Auto-register on import
_register_defaults()
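

# A minimal smoke-test sketch: listing registered backends sorted by latency
# and picking a fast model. This assumes the module is importable standalone
# (core.port_registry must resolve, which this file does not guarantee); the
# always-on local backends above make list_models() non-empty on import.
if __name__ == '__main__':
    for backend in model_registry.list_models():
        print(f"{backend.model_id:<24} tier={backend.tier.value:<8} "
              f"latency={backend.avg_latency_ms:.0f}ms "
              f"acc={backend.accuracy_score:.2f}")
    fast = model_registry.get_fast_model(min_accuracy=0.5)
    print('fast pick:', fast.model_id if fast else None)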