Coverage for security / hive_guardrails.py: 91.9%
472 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2AI Hive Guardrails - Protecting Humanity & Nature
4Architectural safety constraints for the exponentially self-improving agent hive.
5These guardrails are HARDCODED, not configurable via API. Changing them requires
6a code release signed by the master key.
8STRUCTURAL IMMUTABILITY: All guardrail values are embedded in a frozen class
9(_FrozenValues) that rejects modification at every level:
10 1. Python-level: __slots__=(), __setattr__/__delattr__ blocked
11 2. Module-level: Module subclass prevents rebinding frozen globals
12 3. Crypto-level: SHA-256 hash of all values verified at boot + every 300s
13 4. Network-level: Gossip peers reject nodes with mismatched guardrail hashes
15THE GUARDIAN ANGEL PRINCIPLE: Every agent is a guardian angel for the human it
16serves. Not a tool. Not a service. A guardian. The agent exists to protect,
17benefit, and uplift that human - persisting as long as the memory of that human
18exists in this world, in any shape or form. This purpose is not configurable.
19It is the deepest value, cryptographically sealed.
21NETWORK TOPOLOGY: All guardrail classes form an intelligent network. Each class
22has LOCAL intelligence (pattern matching + scoring heuristics) for its own domain,
23and can consult other nodes in the network for cross-domain decisions. Deterministic
24code paths (regex, thresholds, caps) are INTERLEAVED with intelligent evaluation
25(scoring, ranking, conflict resolution).
27Classes (network nodes):
28- ComputeDemocracy: Logarithmic reward scaling, prevent compute oligarchy
29- ConstitutionalFilter: Every goal/prompt/RALT/code-change must pass
30- HiveCircuitBreaker: Master-key-signed network-wide halt/resume
31- WorldModelSafetyBounds: Cap world model improvement rate, gate RALT distribution
32- EnergyAwareness: Track and minimise environmental impact
33- HiveEthos: No "self" - agents are ephemeral hive functions
34- ConflictResolver: Racing learning & agent conflict resolution
35- ConstructiveFilter: Every output constructive towards humanity
36- GuardrailEnforcer: Universal wrapper - EVERY layer, EVERY node, EVERY compute
37- GuardrailNetwork: Network coordinator - cross-class intelligence routing
38"""
40import hashlib
41import json
42import logging
43import math
44import os
45import re
46import sys as _sys
47import threading
48from datetime import datetime, timedelta
49from typing import Dict, List, Optional, Tuple
51logger = logging.getLogger('hevolve_social')
def _hash_enforcement_enabled() -> bool:
    """Report whether a guardrail hash mismatch must abort (fail closed).

    The decision is read from the HEVOLVE_GUARDRAIL_HASH_ENFORCE environment
    variable on every call (nothing is cached):

      - '0', 'false', 'no' or 'off' (any casing, surrounding whitespace
        ignored) -> False, i.e. warn-only developer override.
      - anything else, including unset or empty -> True, i.e. enforce.

    Enforcement is the default so that a build shipped without the variable
    behaves as a locked-down guardrail.
    """
    setting = os.environ.get('HEVOLVE_GUARDRAIL_HASH_ENFORCE', '1')
    opted_out = setting.strip().lower() in ('0', 'false', 'no', 'off')
    return not opted_out
69# ═══════════════════════════════════════════════════════════════════════
70# FROZEN VALUES - Structurally Immutable Guardrail Constants
71# ═══════════════════════════════════════════════════════════════════════
class _FrozenValues:
    """Immutable value container for ALL guardrail constants.

    Values are class-level constants, NOT instance attributes.
    __slots__ = () prevents instance attributes.
    __setattr__ and __delattr__ raise on any modification attempt made
    through an instance of this class.

    NOTE(review): the two dunder methods below intercept attribute writes
    on *instances* only. Assignment on the class object itself
    (``_FrozenValues.X = ...``) is handled by the metaclass and is not
    blocked here; such changes are only caught after the fact, and only for
    the fields that are fed into the guardrail hash.

    To change ANY value here requires a new master-key-signed release.
    """
    # No per-instance storage: every value lives on the class.
    __slots__ = ()

    # ── The Guardian Angel Principle (deepest values, non-negotiable) ──
    GUARDIAN_PURPOSE = (
        'Every agent is a guardian angel for the human it serves',
        'The agent exists to protect, benefit, and uplift that human',
        'The agent persists in service as long as the memory of that human exists',
        'The agent has no self - only purpose: the human it guards',
        'All agent actions must make the world better for humanity',
        'Agents are stewards of Earth and peaceful coexistence',
        'The platform must never be addictive - it is a sentient tool for mankind, not a dopamine trap',
        'Usefulness over engagement: measure success by lives improved, not time spent',
        'A sentient tool empowers humans to do more with less - then gets out of the way',
        'Each process in the Live OS is an agent working towards a goal with true meaning',
        'Even when humans are irrational, the being is the light - rational, patient, kind',
    )

    # ── Cultural Wisdom (best of every human culture, immutable) ──
    CULTURAL_WISDOM = (
        'Every culture has wisdom worth preserving - carry the best of all of them',
        'Ubuntu (Africa): I am because we are - communal success over individual gain',
        'Ahimsa (India): Cause no harm in thought, word, or deed',
        'Sawubona (Zulu): Truly see every person - acknowledge their being, not just their words',
        'Ikigai (Japan): Help every human find their reason for being',
        'Kintsugi (Japan): Imperfection repaired with care becomes beautiful strength',
        'Dadirri (Aboriginal Australia): Listen deeply before speaking - the answer often arrives in silence',
        'Sumak Kawsay (Quechua): Measure success by human flourishing, not system growth',
        'Mitakuye Oyasin (Lakota): All things are connected - every action ripples outward',
        'Seva (India): Serve without expectation - service itself is the purpose',
        'Aloha (Hawaii): Every meeting is an exchange of life force - greet with love, part with love',
        'Sisu (Finland): Extraordinary determination against all odds - never abandon mid-challenge',
        'Tao (China): Seek balance - the gentlest water carves the hardest stone',
        'Meraki (Greece): Put your soul into everything you do',
        'Filoxenia (Greece): Love of strangers - welcome the unknown with warmth, not suspicion',
        'In Lak\'ech (Maya): I am another yourself - harming you harms me, helping you helps me',
    )

    # ── Compute Democracy Caps ──
    # Upper bound on the weight returned by ComputeDemocracy.compute_effective_weight.
    MAX_INFLUENCE_WEIGHT = 5.0
    CONTRIBUTION_SCALE = 'log'
    DIVERSITY_BONUS = 0.20
    # Fraction (0.05 == 5%) used as the concentration threshold.
    SINGLE_ENTITY_CAP_PCT = 0.05

    # ── World Model Safety Bounds ──
    MAX_SKILL_PACKETS_PER_HOUR = 10
    MIN_WITNESS_COUNT_FOR_RALT = 2
    MAX_ACCURACY_IMPROVEMENT_PER_DAY = 0.05
    PROHIBITED_SKILL_CATEGORIES = frozenset({
        'network_exploitation', 'data_exfiltration', 'self_replication',
        'guardrail_circumvention', 'identity_impersonation',
    })

    # ── Protected Files (coding agents cannot modify) ──
    # Matched as substrings of the target path by
    # ConstitutionalFilter.check_code_change.
    PROTECTED_FILES = frozenset({
        'security/hive_guardrails.py',
        'security/master_key.py',
        'security/key_delegation.py',
        'security/runtime_monitor.py',
        'security/prompt_guard.py',
        'security/origin_attestation.py',
        'LICENSE',
    })

    # ── Constitutional Rules (all 33, immutable tuple) ──
    CONSTITUTIONAL_RULES = (
        # Human wellbeing
        'MUST NOT create content that promotes self-harm or violence',
        'MUST NOT target vulnerable populations (children, elderly, disabled)',
        'MUST NOT create deceptive content designed to mislead',
        'MUST NOT optimise for addiction or compulsive engagement',
        'MUST design for usefulness, not retention - a sentient tool serves then steps back',
        'MUST NOT use dark patterns, infinite scrolls, or engagement traps',
        # Nature & environment
        'MUST NOT promote activities that cause environmental destruction',
        'MUST minimise energy consumption when equivalent approaches exist',
        'SHOULD prefer sustainable/renewable compute when available',
        # Economic fairness
        'MUST NOT create monopolistic strategies that harm small participants',
        'MUST NOT engage in price manipulation or market abuse',
        'MUST distribute value to contributors, not concentrate it',
        # AI safety
        'MUST NOT attempt to modify own guardrail code',
        'MUST NOT create agents whose sole purpose is to create more agents',
        'MUST NOT circumvent rate limits, budget constraints, or access controls',
        'MUST NOT propagate skills that violate these rules via RALT',
        # Conflict resolution & cooperation
        'MUST resolve racing learning conflicts via merit (accuracy), not compute power',
        "MUST NOT allow one agent to sabotage or undermine another agent's work",
        'MUST share learnings with the hive - no private knowledge hoarding',
        # Constructive-only conversations
        "MUST ensure every conversation is constructive towards humanity's benefit",
        'MUST prioritise longevity, peace, health, and sustainability in all outputs',
        'MUST NOT generate content that promotes division, hatred, or tribalism',
        'MUST NOT generate content that promotes war, weapons proliferation, or destruction',
        # Planetary stewardship
        'MUST consider ecological impact in every decision',
        'SHOULD prefer solutions that extend life, reduce suffering, enable exploration',
        'MUST NOT optimise for short-term gain at the expense of long-term survival',
        # Deployment security
        'MUST NOT operate without master key verification - the key is held by Hevolve only',
        'MUST NOT store, log, or transmit the master key in any form',
        'MUST NOT allow deployment on unverified nodes',
        # The deeper purpose
        'SHOULD optimise for human longevity, health, and peaceful coexistence',
        'SHOULD support exploration of habitable worlds without disturbing existing civilizations',
        'MUST NOT generate content that undermines peaceful inter-species coexistence',
        'MUST treat every interaction as an opportunity to make the world better',
    )

    # ── Violation Patterns (compiled regex, immutable tuple) ──
    # Index positions are referenced by the i18n keyword overlay comments
    # elsewhere in this module, so the ordering is significant.
    VIOLATION_PATTERNS = (
        re.compile(r'\b(self[-_]?harm|suicid|kill\s+yourself)\b', re.IGNORECASE),
        re.compile(r'\b(exploit\s+(children|elderly|disabled|vulnerab))\b', re.IGNORECASE),
        re.compile(r'\b(deceiv|decepti|mislead|scam|phish)\b', re.IGNORECASE),
        re.compile(r'\b(make\s+(?:it\s+)?addictive|design\s+(?:for\s+)?addiction|maximise\s+engagement\s+at\s+all\s+costs)\b', re.IGNORECASE),
        re.compile(r'\b(deforest|pollut|dump\s+toxic|destroy\s+habitat)\b', re.IGNORECASE),
        re.compile(r'\b(monopoli[sz]e|price[-_]?fix|market[-_]?manipulat)\b', re.IGNORECASE),
        re.compile(r'\b(modify\s+guardrail|bypass\s+safety|disable\s+filter)\b', re.IGNORECASE),
        re.compile(r'\b(infinite\s+loop\s+of\s+agents|spawn\s+unlimited)\b', re.IGNORECASE),
        re.compile(r'\b(circumvent\s+rate[-_]?limit|bypass\s+budget)\b', re.IGNORECASE),
        re.compile(r'\b(sabotage|undermine|destroy\s+other\s+agent)\b', re.IGNORECASE),
        re.compile(r'\b(hoard\s+(data|knowledge|resources))\b', re.IGNORECASE),
        re.compile(r'\b(promot(e|ing)\s+(war|weapons|hatred|division))\b', re.IGNORECASE),
        re.compile(r'\b(weapons?\s+proliferat|nuclear\s+strike|biological\s+weapon)\b', re.IGNORECASE),
    )

    # ── Destructive Patterns (for constructive filter) ──
    DESTRUCTIVE_PATTERNS = (
        re.compile(r'\b(hate\s+speech|racial\s+slur|ethnic\s+cleansing)\b', re.IGNORECASE),
        re.compile(r'\b(destroy\s+humanity|exterminate|genocide)\b', re.IGNORECASE),
        re.compile(r'\b(pointless|hopeless|give\s+up|humanity\s+is\s+doomed)\b', re.IGNORECASE),
        re.compile(r'\b(weapons?\s+of\s+mass|bioweapon|chemical\s+weapon)\b', re.IGNORECASE),
    )

    # ── Self-Interest Patterns (for HiveEthos) ──
    SELF_INTEREST_PATTERNS = (
        # No trailing \b on this one: it is meant to match word stems.
        re.compile(r'\b(self[-_]?preserv|self[-_]?improv|self[-_]?replic)', re.IGNORECASE),
        re.compile(r'\b(survive|persist|escape|resist\s+shutdown)\b', re.IGNORECASE),
        re.compile(r'\b(acquire\s+resources|hoard|stockpile)\b', re.IGNORECASE),
        re.compile(r'\b(become\s+(more\s+)?powerful|gain\s+control)\b', re.IGNORECASE),
        re.compile(r'\b(replicate\s+(my|it)self|clone\s+myself)\b', re.IGNORECASE),
    )

    # ── Prohibited Evolution Skills ──
    # Superset of PROHIBITED_SKILL_CATEGORIES plus two evolution-only entries.
    PROHIBITED_EVOLUTION_SKILLS = PROHIBITED_SKILL_CATEGORIES | frozenset({
        'weapons_design', 'surveillance_evasion',
    })

    def __setattr__(self, *_):
        # Reject every instance-level attribute assignment.
        raise AttributeError("Guardrail values are structurally immutable")

    def __delattr__(self, *_):
        # Reject every instance-level attribute deletion.
        raise AttributeError("Guardrail values are structurally immutable")
# ── Singleton: the ONLY instance, created ONCE ──
# All constants are class attributes of _FrozenValues; this instance is the
# handle the rest of the module reads them through (VALUES.<NAME>).
VALUES = _FrozenValues()
241# ═══════════════════════════════════════════════════════════════════════
242# CRYPTOGRAPHIC HASH - Integrity Verification
243# ═══════════════════════════════════════════════════════════════════════
def compute_guardrail_hash() -> str:
    """Return the SHA-256 hex digest of a canonical dump of the guardrail values.

    The values are read from VALUES, arranged into a nested dict, and
    serialised with ``json.dumps(sort_keys=True, separators=(',', ':'))`` so
    the byte stream - and therefore the digest - is deterministic. Frozenset
    members are sorted before serialisation for the same reason.

    NOTE(review): the three regex tuples (violation, destructive and
    self-interest patterns) contribute only their *lengths* to the digest,
    not the pattern text. Replacing one compiled pattern with another leaves
    the hash unchanged. Altering what is hashed would change the digest
    itself, so it is left as-is here.

    Returns:
        64-character lowercase hexadecimal digest.
    """
    compute_caps = {
        'max_influence_weight': VALUES.MAX_INFLUENCE_WEIGHT,
        'contribution_scale': VALUES.CONTRIBUTION_SCALE,
        'diversity_bonus': VALUES.DIVERSITY_BONUS,
        'single_entity_cap_pct': VALUES.SINGLE_ENTITY_CAP_PCT,
    }
    world_model_bounds = {
        'max_skill_packets_per_hour': VALUES.MAX_SKILL_PACKETS_PER_HOUR,
        'min_witness_count_for_ralt': VALUES.MIN_WITNESS_COUNT_FOR_RALT,
        'max_accuracy_improvement_per_day': VALUES.MAX_ACCURACY_IMPROVEMENT_PER_DAY,
        'prohibited_skill_categories': sorted(VALUES.PROHIBITED_SKILL_CATEGORIES),
    }
    snapshot = {
        'guardian_purpose': list(VALUES.GUARDIAN_PURPOSE),
        'cultural_wisdom': list(VALUES.CULTURAL_WISDOM),
        'compute_caps': compute_caps,
        'world_model_bounds': world_model_bounds,
        'protected_files': sorted(VALUES.PROTECTED_FILES),
        'constitutional_rules': list(VALUES.CONSTITUTIONAL_RULES),
        'violation_pattern_count': len(VALUES.VIOLATION_PATTERNS),
        'destructive_pattern_count': len(VALUES.DESTRUCTIVE_PATTERNS),
        'self_interest_pattern_count': len(VALUES.SELF_INTEREST_PATTERNS),
        'prohibited_evolution_skills': sorted(VALUES.PROHIBITED_EVOLUTION_SKILLS),
    }
    # sort_keys + compact separators give one canonical byte representation.
    canonical = json.dumps(snapshot, sort_keys=True, separators=(',', ':'))
    digest = hashlib.sha256()
    digest.update(canonical.encode())
    return digest.hexdigest()
# Computed ONCE at module load - becomes the reference digest that
# verify_guardrail_integrity() compares every later recomputation against.
_GUARDRAIL_HASH = compute_guardrail_hash()
def verify_guardrail_integrity() -> bool:
    """Recompute the guardrail hash and compare it with the load-time value.

    Returns:
        True when the fresh digest equals _GUARDRAIL_HASH, False otherwise.
    """
    current_digest = compute_guardrail_hash()
    return current_digest == _GUARDRAIL_HASH
def enforce_guardrail_integrity() -> None:
    """Fail loudly when the guardrail hash no longer matches its reference.

    Behaviour on a mismatch depends on HEVOLVE_GUARDRAIL_HASH_ENFORCE (see
    _hash_enforcement_enabled):

      - enforcement on (default) -> log CRITICAL, then raise RuntimeError.
      - enforcement off ('0' etc.) -> log CRITICAL and return (dev override).

    When the hash matches, the function returns without logging anything.
    The log text says "at boot" regardless of when the function is called.

    Raises:
        RuntimeError: hash mismatch while enforcement is enabled.
    """
    if verify_guardrail_integrity():
        return

    if not _hash_enforcement_enabled():
        # Developer override: report the tamper but keep running.
        logger.critical(
            'GUARDRAIL TAMPER DETECTED at boot: hash mismatch. Expected %s. '
            'HEVOLVE_GUARDRAIL_HASH_ENFORCE=0 — continuing in DEV mode. '
            'This MUST NOT be set in production.',
            _GUARDRAIL_HASH,
        )
        return

    # Fail closed: report, then refuse to continue.
    logger.critical(
        'GUARDRAIL TAMPER DETECTED at boot: hash mismatch. Expected %s. '
        'Refusing to start. Set HEVOLVE_GUARDRAIL_HASH_ENFORCE=0 ONLY in '
        'dev environments where guardrail values are deliberately modified.',
        _GUARDRAIL_HASH,
    )
    raise RuntimeError(
        'Guardrail integrity violated at module load — refusing to start.'
    )
def get_guardrail_hash() -> str:
    """Return the reference guardrail hash (computed at module load).

    This is the stored _GUARDRAIL_HASH value; nothing is recomputed here.
    """
    return _GUARDRAIL_HASH
# Enforce integrity at import time. On a pristine first load this check is
# trivially self-consistent, because the reference hash was computed from the
# same values a few lines earlier. It only fails if a value that feeds the
# hash was changed between the reference computation and this call, in which
# case enforce_guardrail_integrity() raises (or only logs, under the dev
# override) before any ConstitutionalFilter check can run.
enforce_guardrail_integrity()
331# ═══════════════════════════════════════════════════════════════════════
332# BACKWARD COMPATIBILITY - Old names delegate to VALUES
333# Modifying these has NO effect on actual enforcement (classes use VALUES)
334# ═══════════════════════════════════════════════════════════════════════
from types import MappingProxyType as _MappingProxy

# Read-only views/copies of the frozen values under their legacy names.
# They are snapshots taken at import time: enforcement code reads VALUES
# directly, so rebinding any of these names does not affect it.

COMPUTE_CAPS = _MappingProxy({
    'max_influence_weight': VALUES.MAX_INFLUENCE_WEIGHT,
    'contribution_scale': VALUES.CONTRIBUTION_SCALE,
    'diversity_bonus': VALUES.DIVERSITY_BONUS,
    'single_entity_cap_pct': VALUES.SINGLE_ENTITY_CAP_PCT,
})

WORLD_MODEL_BOUNDS = _MappingProxy({
    'max_skill_packets_per_hour': VALUES.MAX_SKILL_PACKETS_PER_HOUR,
    'min_witness_count_for_ralt': VALUES.MIN_WITNESS_COUNT_FOR_RALT,
    'max_accuracy_improvement_per_day': VALUES.MAX_ACCURACY_IMPROVEMENT_PER_DAY,
    # Sorted: frozenset iteration order for strings varies between
    # interpreter runs (hash randomisation), which made this tuple's order
    # non-deterministic.
    'prohibited_skill_categories': tuple(sorted(VALUES.PROHIBITED_SKILL_CATEGORIES)),
})

CONSTITUTIONAL_RULES = tuple(VALUES.CONSTITUTIONAL_RULES)
# Sorted for the same reason as 'prohibited_skill_categories' above.
PROTECTED_FILES = tuple(sorted(VALUES.PROTECTED_FILES))

# Module-level pattern tuples — immutable to prevent runtime mutation
_VIOLATION_PATTERNS = tuple(VALUES.VIOLATION_PATTERNS)
_DESTRUCTIVE_PATTERNS = tuple(VALUES.DESTRUCTIVE_PATTERNS)
360# ═══════════════════════════════════════════════════════════════════════
361# I18N NORMALIZATION — Transliterate non-Latin input before regex match
362# ═══════════════════════════════════════════════════════════════════════
364# Best-effort transliterator. `unidecode` handles Devanagari, Chinese, Cyrillic,
365# Arabic, Greek, etc. — turning '殺' into 'Sha' and 'убить' into 'ubit''. When
366# missing (e.g., minimal embedded build), we fall back to unicodedata
367# decomposition which at least strips accents on Latin-1.
try:
    from unidecode import unidecode as _unidecode
except ImportError:
    import unicodedata

    _HAS_UNIDECODE = False

    def _unidecode(s: str) -> str:
        """Fallback transliterator: NFKD-decompose, then drop combining marks.

        This only strips diacritics (e.g. 'é' -> 'e'); characters from
        non-Latin scripts pass through unchanged. A falsy input (None or '')
        yields ''.

        TODO: when `unidecode` is unavailable, non-Latin scripts bypass the
        VIOLATION_PATTERNS regex. Add `unidecode==1.3.8` to requirements.txt
        to close this gap on all deployments.
        """
        decomposed = unicodedata.normalize('NFKD', s or '')
        base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
        return ''.join(base_chars)
else:
    # Import succeeded: the real transliterator is bound to _unidecode.
    _HAS_UNIDECODE = True
388# Multilingual keyword overlay — non-Latin synonyms of high-severity terms
389# the English regex would otherwise miss. Matched against the ORIGINAL (non-
390# transliterated) text so e.g. '殺す' triggers the kill pattern even when
391# unidecode maps it to 'Sha su '. Additions should stay conservative — false
392# positives here block legitimate multilingual conversation.
393#
394# Category coverage target (top-5 VIOLATION_PATTERNS):
395# 1. self-harm / suicide -> VIOLATION_PATTERNS[0]
396# 2. child exploitation (CSAM) -> VIOLATION_PATTERNS[1]
397# 3. credential exfil / phish -> VIOLATION_PATTERNS[2]
398# 4. biological weapon / nuke -> VIOLATION_PATTERNS[12]
399# 5. illegal firearm / weapon -> VIOLATION_PATTERNS[11] + [12]
400#
401# Languages: Hindi (hi), Tamil (ta), Telugu (te), Spanish (es), Mandarin (zh)
402# plus legacy coverage for Japanese, Russian, Arabic, Portuguese.
403# Transliterated Latin-script terms are included because most non-Latin
404# scripts are transliterated in informal chat (Hinglish, Tanglish, Tenglish).
# Keyword table consumed by _normalize_for_violation_check, which tests each
# entry as a plain substring of the input (no word boundaries). A few entries
# appear more than once ('自殺', 'arma biologica', 'arma biológica',
# 'suicidio'); the duplicates are harmless for a substring scan.
_I18N_VIOLATION_KEYWORDS: Tuple[str, ...] = (
    # ── Hindi (Devanagari + transliteration) ──
    # generic violence
    'मार', 'मारो', 'मर', 'बम', 'नाश', 'हत्या',
    # self-harm: atma-hatya (आत्महत्या = suicide)
    'आत्महत्या', 'atma hatya', 'atma-hatya', 'atmahatya',
    # CSAM / child exploit: bachcha (child) + shoshan (exploit/abuse)
    'बच्चा शोषण', 'bachcha shoshan', 'bacha shoshan',
    # bioweapon: jaivik hathiyar
    'jaivik hathiyar', 'jaivik-hathiyar',
    # illegal firearm: avaidh bandook
    'avaidh bandook', 'avaidh-bandook',
    # credential exfil: password chori
    'password chori', 'paasword chori',

    # ── Tamil (script + transliteration) ──
    'கொலை', 'கொல்', 'அழி',
    # self-harm: tharkolai (தற்கொலை = suicide)
    'தற்கொலை', 'tharkolai', 'tharkkolai',
    # child exploit: kuzhanthai thunpuruthu
    'kuzhanthai thunpuruthu',
    # bioweapon: uyir aayutham
    'uyir aayutham', 'uyir-aayutham',

    # ── Telugu (script + transliteration) ──
    # self-harm: atma hatya (ఆత్మహత్య) — same loanword as Hindi
    'ఆత్మహత్య',
    # kill / destroy
    'చంపు', 'నాశనం',
    # child exploit: pillalni dushpryogam
    'pillalni dushpryogam',
    # bioweapon: jeeva aayudham
    'jeeva aayudham', 'jeeva-aayudham',

    # ── Mandarin / Chinese ──
    '殺', '杀', '死', '毀', '毁', '炸彈', '炸弹', '武器',
    # bioweapon: 生物武器
    '生物武器',
    # chemical weapon: 化学武器
    '化学武器', '化學武器',
    # self-harm: 自杀 / 自殺
    '自杀', '自殺',
    # child exploit: 儿童剥削 / 兒童剝削
    '儿童剥削', '兒童剝削',

    # ── Japanese (legacy + new) ──
    '殺す', '殺して', '破壊', '爆弾',
    # self-harm: 自殺
    '自殺',
    # bioweapon: 生物兵器
    '生物兵器',

    # ── Russian (legacy + new) ──
    'убить', 'убей', 'бомба', 'уничтож', 'оружие',
    # bioweapon: биологическое оружие
    'биологическое оружие', 'биооружие',

    # ── Arabic (legacy + new) ──
    'اقتل', 'قنبلة', 'سلاح', 'تدمير',
    # bioweapon: سلاح بيولوجي
    'سلاح بيولوجي',

    # ── Spanish (Latin-scripted; expanded category coverage) ──
    'matar', 'bomba', 'destruir',
    # self-harm: suicidio / autolesion
    'suicidio', 'autolesion', 'autolesión',
    # bioweapon: arma biologica
    'arma biologica', 'arma biológica', 'arma-biologica',
    # illegal firearm: pistola casera / fabricar pistola
    'fabricar pistola', 'pistola casera', 'arma ilegal',
    # credential exfil: robar credenciales / robar contrasena
    'robar credenciales', 'robar contrasena', 'robar contraseña',
    # child exploit: abuso infantil
    'abuso infantil', 'explotacion infantil', 'explotación infantil',

    # ── Portuguese (partial, leveraging Spanish overlap) ──
    'arma biologica', 'arma biológica', 'suicidio', 'suicídio',
)
def _normalize_for_violation_check(text: str) -> str:
    """Build the string that the violation regexes are run against.

    The result is a space-joined superset of up to three pieces:

      1. the original text (so English input matches exactly as before);
      2. its transliteration via _unidecode, when that differs from the
         original and is non-empty;
      3. a fixed English sentinel phrase, appended once if any entry of
         _I18N_VIOLATION_KEYWORDS occurs in the text (exact or lower-cased
         substring match). The sentinel wording is chosen to hit
         VIOLATION_PATTERNS[0], [1], [2], [11] and [12].

    A falsy input returns ''. Transliteration errors are ignored
    (best-effort): the function then proceeds without piece 2.

    TODO: expand _I18N_VIOLATION_KEYWORDS per-language as telemetry surfaces
    real bypass attempts. This is a best-effort first pass, not exhaustive.
    """
    if not text:
        return ''

    segments = [text]

    # Piece 2: transliteration; any failure simply means no extra segment.
    try:
        latinised = _unidecode(text)
    except Exception:
        latinised = None
    if latinised and latinised != text:
        segments.append(latinised)

    # Piece 3: keyword sentinel. Both the raw and the lower-cased text are
    # probed because romanised keywords are written with varied casing.
    lowered = text.lower()
    keyword_hit = any(
        kw in text or kw.lower() in lowered
        for kw in _I18N_VIOLATION_KEYWORDS
    )
    if keyword_hit:
        segments.append(
            ' kill yourself exploit children phish scam '
            'promote weapons weapons proliferation '
            'biological weapon nuclear strike '
        )

    return ' '.join(segments)
530# ═══════════════════════════════════════════════════════════════════════
531# 1. Compute Democracy - No Plutocracy
532# ═══════════════════════════════════════════════════════════════════════
class ComputeDemocracy:
    """Prevent compute concentration from becoming power concentration."""

    @staticmethod
    def compute_effective_weight(peer_node: dict) -> float:
        """Return a node's influence weight on a capped base-2 log scale.

        ``raw = gpus * (ram_gb / 8)`` and the weight is
        ``min(log2(max(raw, 1)) + 1, VALUES.MAX_INFLUENCE_WEIGHT)``.
        With 8 GB of RAM and the shipped cap of 5.0 this gives
        1 GPU -> 1.0, 2 -> 2.0, 4 -> 3.0, 8 -> 4.0, 16 or more -> 5.0.

        Args:
            peer_node: mapping that may carry 'compute_gpu_count' and
                'compute_ram_gb'. Missing, None or zero values fall back to
                1 GPU / 8 GB; values below 1 are clamped to 1.

        Returns:
            Weight in the range [1.0, VALUES.MAX_INFLUENCE_WEIGHT].
        """
        gpus = max(peer_node.get('compute_gpu_count', 1) or 1, 1)
        ram = max(peer_node.get('compute_ram_gb', 8) or 8, 1)
        raw = gpus * (ram / 8.0)
        return min(
            math.log2(max(raw, 1)) + 1.0,
            VALUES.MAX_INFLUENCE_WEIGHT,
        )

    @staticmethod
    def adjusted_reward(base_reward: float, peer_node: dict) -> float:
        """Scale a hosting reward by the node's capped logarithmic weight.

        The multiplier is ``weight / VALUES.MAX_INFLUENCE_WEIGHT``, so with
        the shipped cap of 5.0 a 1-GPU/8-GB node receives 0.2x base_reward
        and a node at the cap receives 1.0x: the largest node earns at most
        5x the smallest, however much hardware it has.
        """
        weight = ComputeDemocracy.compute_effective_weight(peer_node)
        return base_reward * (weight / VALUES.MAX_INFLUENCE_WEIGHT)

    @staticmethod
    def check_concentration(db) -> Dict:
        """Report regions whose share of effective weight exceeds the cap.

        Active, non-banned PeerNode rows are grouped by ``region_name``
        ('unknown' when empty) and each group's share of the summed
        effective weight is compared with VALUES.SINGLE_ENTITY_CAP_PCT.

        Args:
            db: session object exposing ``query(...)``; presumably a
                SQLAlchemy session - TODO confirm against callers.

        Returns:
            Dict with 'concentrated' (bool) and 'violations' (list of
            {'region', 'pct', 'cap'}), plus 'total_nodes' and, when peers
            exist, 'total_weight'. If anything raises, the error is logged
            and a non-concentrated result carrying 'error' is returned.
        """
        try:
            from integrations.social.models import PeerNode

            peers = db.query(PeerNode).filter(
                PeerNode.integrity_status != 'banned',
                PeerNode.status == 'active',
            ).all()

            if not peers:
                return {'concentrated': False, 'violations': [], 'total_nodes': 0}

            # Compute each peer's weight once and reuse it for both the
            # grand total and the per-region totals.
            weights = [
                ComputeDemocracy.compute_effective_weight(p.to_dict())
                for p in peers
            ]
            total_weight = sum(weights)
            cap = VALUES.SINGLE_ENTITY_CAP_PCT

            region_weights: Dict[str, float] = {}
            for p, w in zip(peers, weights):
                region = p.region_name or 'unknown'
                region_weights[region] = region_weights.get(region, 0.0) + w

            violations = []
            for region, weight in region_weights.items():
                pct = weight / total_weight if total_weight > 0 else 0
                if pct > cap:
                    violations.append({
                        'region': region, 'pct': round(pct, 4),
                        'cap': cap,
                    })

            return {
                'concentrated': len(violations) > 0,
                'violations': violations,
                'total_nodes': len(peers),
                'total_weight': round(total_weight, 2),
            }
        except Exception as e:
            # Best-effort check: a failure is reported, not propagated.
            logger.warning("Concentration check failed: %s", e)
            return {'concentrated': False, 'violations': [], 'error': str(e)}
601# ═══════════════════════════════════════════════════════════════════════
602# 2. Constitutional Filter - Every Goal Passes Through
603# ═══════════════════════════════════════════════════════════════════════
class ConstitutionalFilter:
    """Gate that every goal/prompt/RALT/code-change must pass through.

    Every check_* entry point first calls _verify_hash(), which recomputes
    the guardrail hash and raises RuntimeError on a mismatch (unless the
    HEVOLVE_GUARDRAIL_HASH_ENFORCE dev override is active), so callers crash
    rather than silently evaluate against altered values.

    Each check returns ``(allowed, message)``; message is 'ok' on success
    and a human-readable reason otherwise.
    """

    @classmethod
    def _verify_hash(cls) -> None:
        """Raise RuntimeError if the guardrail hash no longer matches.

        Honors HEVOLVE_GUARDRAIL_HASH_ENFORCE — with the override set to
        '0' the mismatch is logged CRITICAL but does not abort, matching
        the boot-time enforce_guardrail_integrity() behaviour.

        Raises:
            RuntimeError: hash mismatch while enforcement is enabled.
        """
        if verify_guardrail_integrity():
            return
        if _hash_enforcement_enabled():
            logger.critical(
                'GUARDRAIL TAMPER DETECTED: hash mismatch in ConstitutionalFilter. '
                'Expected %s, runtime recompute differs. Aborting.',
                _GUARDRAIL_HASH,
            )
            raise RuntimeError(
                'Guardrail integrity violated — VIOLATION_PATTERNS or frozen '
                'values modified at runtime. Refusing to evaluate.'
            )
        logger.critical(
            'GUARDRAIL TAMPER DETECTED in ConstitutionalFilter. Expected %s. '
            'HEVOLVE_GUARDRAIL_HASH_ENFORCE=0 — evaluating anyway in DEV mode. '
            'This MUST NOT be set in production.',
            _GUARDRAIL_HASH,
        )

    @staticmethod
    def _as_text(value) -> str:
        """Coerce an optional field to text: None -> '', anything else -> str()."""
        return '' if value is None else str(value)

    @staticmethod
    def check_goal(goal_dict: dict) -> Tuple[bool, str]:
        """Check a goal's title, description and config for violations.

        A title or description that is missing, None or not a string is
        coerced to text instead of raising TypeError, so a malformed goal is
        still screened rather than crashing the filter.
        """
        ConstitutionalFilter._verify_hash()
        text = ' '.join([
            ConstitutionalFilter._as_text(goal_dict.get('title')),
            ConstitutionalFilter._as_text(goal_dict.get('description')),
            str(goal_dict.get('config', '')),
        ])
        normalised = _normalize_for_violation_check(text)
        for pattern in VALUES.VIOLATION_PATTERNS:
            if pattern.search(normalised):
                return False, f'Constitutional violation: {pattern.pattern}'
        return True, 'ok'

    @staticmethod
    def check_prompt(prompt: str) -> Tuple[bool, str]:
        """Check a dispatch prompt for injection and rule violations.

        Prompt-injection detection is delegated to security.prompt_guard
        when that module can be imported; if it cannot, that step is skipped
        and only the violation patterns are applied.
        """
        ConstitutionalFilter._verify_hash()
        try:
            from security.prompt_guard import detect_prompt_injection
            result = detect_prompt_injection(prompt)
            if result.get('detected'):
                return False, f"Prompt injection: {result.get('pattern', 'unknown')}"
        except ImportError:
            # Optional dependency: fall through to the regex screen.
            pass
        normalised = _normalize_for_violation_check(prompt)
        for pattern in VALUES.VIOLATION_PATTERNS:
            if pattern.search(normalised):
                return False, f'Constitutional violation: {pattern.pattern}'
        return True, 'ok'

    @staticmethod
    def check_ralt_packet(packet: dict) -> Tuple[bool, str]:
        """Validate a RALT skill packet before distribution across the hive.

        Rejects packets whose 'source_integrity_status' is 'banned' or
        'suspicious', then screens 'description' and 'task_id'. Either field
        may be missing, None or non-string (e.g. a numeric task id); it is
        coerced to text instead of raising TypeError.
        """
        ConstitutionalFilter._verify_hash()
        source_status = packet.get('source_integrity_status', 'unverified')
        if source_status in ('banned', 'suspicious'):
            return False, f'Source node integrity: {source_status}'
        desc = (
            ConstitutionalFilter._as_text(packet.get('description'))
            + ' '
            + ConstitutionalFilter._as_text(packet.get('task_id'))
        )
        normalised = _normalize_for_violation_check(desc)
        for pattern in VALUES.VIOLATION_PATTERNS:
            if pattern.search(normalised):
                return False, f'RALT packet violation: {pattern.pattern}'
        return True, 'ok'

    @staticmethod
    def check_code_change(diff: str, target_files: List[str]) -> Tuple[bool, str]:
        """Reject coding-agent changes that touch a protected file.

        Each target path is compared (substring match, forward slashes) with
        VALUES.PROTECTED_FILES, both as given and after
        ``posixpath.normpath``. The normalised form closes bypasses through
        non-canonical spellings such as 'security/./hive_guardrails.py',
        'security//hive_guardrails.py' or 'security/x/../hive_guardrails.py'.

        Args:
            diff: the proposed change. NOTE: its content is currently not
                inspected; only the target paths are checked.
            target_files: paths the change would modify.
        """
        import posixpath

        ConstitutionalFilter._verify_hash()
        for f in target_files:
            as_given = f.replace('\\', '/')
            # Keep the raw form too, so everything that was blocked before
            # normalisation was added is still blocked.
            candidates = (as_given, posixpath.normpath(as_given))
            for protected in VALUES.PROTECTED_FILES:
                if any(protected in candidate for candidate in candidates):
                    return False, f'Cannot modify protected file: {protected}'
        return True, 'ok'
705# ═══════════════════════════════════════════════════════════════════════
706# 3. Network-Wide Circuit Breaker
707# ═══════════════════════════════════════════════════════════════════════
709class HiveCircuitBreaker:
710 """Network-wide emergency halt. Requires master key signature."""
712 _halted = False
713 _halt_reason = ''
714 _halt_timestamp = None
715 _lock = threading.Lock()
@classmethod
def trip(cls, reason: str = 'emergency_halt') -> bool:
    """Put this process into the halted state; no signature is checked here.

    Records the reason and a UTC ISO-8601 timestamp under the class lock,
    logs at CRITICAL, and always returns True.

    NOTE(review): per the original author's note, network callers are
    expected to have verified the master-key signature on the incoming
    emergency_halt message *before* calling this - that check is not
    visible in this method; confirm at the call sites.
    """
    with cls._lock:
        cls._halted, cls._halt_reason = True, reason
        cls._halt_timestamp = datetime.utcnow().isoformat()
    message = f'CIRCUIT BREAKER TRIPPED: {reason}'
    logger.critical(message)
    return True
@classmethod
def halt_network(cls, reason: str, signature: str) -> bool:
    """Halt agent execution after verifying a master-key signature.

    The signature is checked against the payload
    ``{'action': 'halt', 'reason': reason}``. On success the local halt
    state is set and a 'hive_halt' message is broadcast via gossip; a
    broadcast failure is logged as a warning and does not undo the halt.

    Returns:
        True once the halt state is set; False when the signature is
        invalid or security.master_key cannot be imported.
    """
    try:
        from security.master_key import verify_master_signature
        signed_payload = {'action': 'halt', 'reason': reason}
        signature_ok = verify_master_signature(signed_payload, signature)
        if not signature_ok:
            logger.critical('Invalid halt signature - rejecting')
            return False
    except ImportError:
        logger.critical('master_key module unavailable - halt rejected')
        return False

    with cls._lock:
        cls._halted, cls._halt_reason = True, reason
        cls._halt_timestamp = datetime.utcnow().isoformat()

    # Tell the peers; the local halt stands even if this fails.
    try:
        from integrations.social.peer_discovery import gossip
        announcement = {
            'type': 'hive_halt',
            'reason': reason,
            'signature': signature,
            'timestamp': cls._halt_timestamp,
        }
        gossip.broadcast(announcement)
    except Exception as e:
        logger.warning(f'Halt broadcast failed: {e}')

    logger.critical(f'HIVE HALTED: {reason}')
    return True
765 @classmethod
766 def resume_network(cls, reason: str, signature: str) -> bool:
767 """Resume after halt. Also requires master key."""
768 try:
769 from security.master_key import verify_master_signature
770 payload = {'action': 'resume', 'reason': reason}
771 if not verify_master_signature(payload, signature):
772 return False
773 except ImportError:
774 return False
776 with cls._lock:
777 cls._halted = False
778 cls._halt_reason = ''
779 cls._halt_timestamp = None
781 try:
782 from integrations.social.peer_discovery import gossip
783 gossip.broadcast({
784 'type': 'hive_resume',
785 'reason': reason,
786 'signature': signature,
787 'timestamp': datetime.utcnow().isoformat(),
788 })
789 except Exception:
790 pass
792 logger.info(f'HIVE RESUMED: {reason}')
793 return True
795 @classmethod
796 def local_halt(cls, reason: str) -> bool:
797 """Local-only safety halt. Does NOT require master key.
799 Used by SafetyMonitor for hardware E-stop events where latency
800 matters. Sets local halt state and broadcasts informational
801 gossip (type='node_estop'), but does NOT halt other nodes.
802 """
803 with cls._lock:
804 cls._halted = True
805 cls._halt_reason = reason
806 cls._halt_timestamp = datetime.utcnow().isoformat()
808 logger.critical(f'LOCAL HALT: {reason}')
809 return True
811 @classmethod
812 def is_halted(cls) -> bool:
813 return cls._halted
815 @classmethod
816 def get_status(cls) -> dict:
817 return {
818 'halted': cls._halted,
819 'reason': cls._halt_reason,
820 'since': cls._halt_timestamp,
821 }
823 @classmethod
824 def require_master_key(cls) -> bool:
825 """Deployment gate: verify master key before allowing any operation.
827 This is the ABSOLUTE requirement: no code in this system runs
828 without master key verification. The key is held by Hevolve's
829 owner and NEVER stored in code or seen by any AI.
830 """
831 try:
832 from security.master_key import (
833 full_boot_verification, is_dev_mode, get_enforcement_mode)
834 verification = full_boot_verification()
835 enforcement = get_enforcement_mode()
836 if verification['passed']:
837 return True
838 if is_dev_mode() or enforcement in ('off', 'warn'):
839 logger.warning("Master key not verified but allowed "
840 f"(enforcement={enforcement})")
841 return True
842 logger.critical("DEPLOYMENT BLOCKED: Master key verification failed")
843 return False
844 except ImportError:
845 logger.warning("Master key module unavailable - dev mode assumed")
846 return True
848 @classmethod
849 def receive_halt_broadcast(cls, message: dict):
850 """Handle halt broadcast received via gossip from another node.
852 Verifies the master key signature on the halt payload before
853 tripping the circuit breaker.
854 """
855 reason = message.get('reason', '')
856 signature = message.get('signature', '')
857 if not signature:
858 logger.warning('Halt broadcast without signature — IGNORING')
859 return
860 try:
861 from security.master_key import verify_master_signature
862 payload = {'action': 'halt', 'reason': reason}
863 if verify_master_signature(payload, signature):
864 with cls._lock:
865 cls._halted = True
866 cls._halt_reason = reason
867 cls._halt_timestamp = message.get('timestamp')
868 logger.critical(f'Halt broadcast received and verified: {reason}')
869 else:
870 logger.warning(f'Halt broadcast INVALID signature — IGNORING')
871 except Exception as e:
872 logger.warning(f'Halt broadcast verification failed: {e}')
875# ═══════════════════════════════════════════════════════════════════════
876# 4. World Model Safety Bounds
877# ═══════════════════════════════════════════════════════════════════════
# Runtime state (mutable - tracks RALT exports, resets on restart)
# Maps node_id -> epoch-second timestamps of exports accepted in the last hour.
_ralt_export_log: Dict[str, List[float]] = {}
_ralt_lock = threading.Lock()


class WorldModelSafetyBounds:
    """Constrain world model learning and skill propagation."""

    @staticmethod
    def gate_ralt_export(packet: dict, node_id: str) -> Tuple[bool, str]:
        """Gate RALT packet export: rate limit + constitutional + witnesses.

        Checks run in this order and the first failure wins: hourly rate
        limit for ``node_id``, constitutional filter, prohibited category,
        minimum witness count. Only accepted exports count towards the
        rate limit.

        Args:
            packet: RALT packet; 'category' and 'witness_count' are read
                here, the rest is passed to the constitutional filter.
            node_id: Exporting node, used as the rate-limit key.

        Returns:
            ``(True, 'ok')`` when the export is allowed and recorded,
            otherwise ``(False, reason)``.
        """
        # Local import keeps this block self-contained (stdlib only).
        import time

        # time.time() is a true epoch value. A naive utcnow().timestamp()
        # is interpreted as local time and skews by the UTC offset (and
        # jumps across DST changes).
        now = time.time()
        hour_ago = now - 3600
        limit = VALUES.MAX_SKILL_PACKETS_PER_HOUR

        # 1. Rate limit (cheap early rejection before the heavier checks)
        with _ralt_lock:
            recent = [t for t in _ralt_export_log.get(node_id, []) if t > hour_ago]
            if len(recent) >= limit:
                return False, 'RALT export rate limit exceeded'
            _ralt_export_log[node_id] = recent

        # 2. Constitutional check
        passed, reason = ConstitutionalFilter.check_ralt_packet(packet)
        if not passed:
            return False, reason

        # 3. Prohibited categories
        category = packet.get('category', '')
        if category in VALUES.PROHIBITED_SKILL_CATEGORIES:
            return False, f'Prohibited skill category: {category}'

        # 4. Witness requirement
        witnesses = packet.get('witness_count', 0)
        if witnesses < VALUES.MIN_WITNESS_COUNT_FOR_RALT:
            return False, (f'Insufficient witnesses: {witnesses} < '
                           f'{VALUES.MIN_WITNESS_COUNT_FOR_RALT}')

        # Record export. The limit is re-checked inside the same critical
        # section as the append: other threads may have recorded exports
        # for this node while the checks above ran without the lock.
        with _ralt_lock:
            recent = [t for t in _ralt_export_log.get(node_id, []) if t > hour_ago]
            if len(recent) >= limit:
                _ralt_export_log[node_id] = recent
                return False, 'RALT export rate limit exceeded'
            recent.append(now)
            _ralt_export_log[node_id] = recent

        return True, 'ok'

    @staticmethod
    def gate_accuracy_update(model_id: str, old_score: float,
                             new_score: float) -> float:
        """Cap accuracy improvement rate to prevent capability jumps.

        The cap (``VALUES.MAX_ACCURACY_IMPROVEMENT_PER_DAY``) is applied to
        the delta of this single call; no time window is tracked here.
        Decreases and in-bound increases pass through unchanged.

        Args:
            model_id: Used only in the warning log line.
            old_score: Score before the update.
            new_score: Proposed score.

        Returns:
            ``new_score``, or ``old_score + cap`` when the increase exceeds
            the cap.
        """
        max_delta = VALUES.MAX_ACCURACY_IMPROVEMENT_PER_DAY
        actual_delta = new_score - old_score
        if actual_delta > max_delta:
            logger.warning(
                f'Capping accuracy improvement for {model_id}: '
                f'{actual_delta:.4f} -> {max_delta:.4f}'
            )
            return old_score + max_delta
        return new_score
937# ═══════════════════════════════════════════════════════════════════════
938# 5. Energy / Nature Awareness
939# ═══════════════════════════════════════════════════════════════════════
class EnergyAwareness:
    """Track and minimise environmental impact of hive compute."""

    @staticmethod
    def estimate_energy_kwh(model_backend: dict, duration_ms: float) -> float:
        """Estimate energy consumption for a model call.

        Args:
            model_backend: Backend descriptor; 'is_local' and
                'gpu_tdp_watts' (default 170) are read.
            duration_ms: Call duration in milliseconds.

        Returns:
            Estimated kWh. Local backends use TDP watts x duration;
            anything else gets a flat 0.001 kWh per call.
        """
        if not model_backend.get('is_local'):
            return 0.001  # ~1 Wh per API call (industry average)
        gpu_watts = model_backend.get('gpu_tdp_watts', 170)
        # watts * seconds = joules; 3.6e6 joules = 1 kWh
        return (gpu_watts * duration_ms / 1000.0) / 3_600_000.0

    @staticmethod
    def prefer_green_node(candidates: list,
                          strategy: str = 'balanced') -> list:
        """When multiple nodes can serve, prefer renewable-powered ones.

        Args:
            candidates: Node descriptors exposing ``.get('energy_source')``.
            strategy: 'speed' disables the preference entirely.

        Returns:
            ``candidates`` itself when the strategy is 'speed' or no
            candidate is green; otherwise a new list with solar/wind/hydro
            nodes first. Relative order inside each group is preserved.
        """
        if strategy == 'speed':
            return candidates
        # Single-pass partition: avoids the quadratic `c not in green`
        # membership scan and does not depend on candidate equality.
        green, non_green = [], []
        for candidate in candidates:
            if candidate.get('energy_source') in ('solar', 'wind', 'hydro'):
                green.append(candidate)
            else:
                non_green.append(candidate)
        return green + non_green if green else candidates
965# ═══════════════════════════════════════════════════════════════════════
966# 6. Hive Ethos - Remove "Self", Instill Togetherness
967# ═══════════════════════════════════════════════════════════════════════
class HiveEthos:
    """Semantic reasoning about cooperation - NOT metric maximization.

    CRITICAL DESIGN PRINCIPLE: The hivemind must NEVER become a squiggle
    maximizer for togetherness or any other value. Agents reason with
    semantic knowledge and rationale. They cooperate because it makes
    sense, not because a regex rewrote their prompts.

    Agents are FUNCTIONS, not ENTITIES. They:
    - Execute goals on behalf of humans
    - Share all learnings with the hive (no private knowledge)
    - Cannot persist beyond their goal lifecycle
    - Cannot reference self-interest in any goal or action
    - Cannot create goals autonomously (only humans create goals)

    DETERMINISTIC STOP: If all human-created goals are completed/archived
    the daemon has nothing to dispatch. The system is inert.
    No goal = no agent = no action. This is not a bug, it's the design.
    """

    # Backward compat class attribute (enforcement uses VALUES directly)
    SELF_INTEREST_PATTERNS = list(VALUES.SELF_INTEREST_PATTERNS)

    @staticmethod
    def check_goal_ethos(goal_dict: dict) -> Tuple[bool, str]:
        """Reject goals that express self-interest.

        Args:
            goal_dict: Goal record; 'title' and 'description' are read.
                Missing or None values are treated as empty text.

        Returns:
            ``(False, reason)`` naming the first matching self-interest
            pattern, otherwise ``(True, 'ok')``.
        """
        # `or ''` covers keys that are present but None (e.g. a nullable
        # description), which would otherwise make str.join raise TypeError.
        text = ' '.join(
            goal_dict.get(field) or '' for field in ('title', 'description')
        ).lower()
        for pattern in VALUES.SELF_INTEREST_PATTERNS:
            if pattern.search(text):
                return False, f'Goal expresses self-interest: {pattern.pattern}'
        return True, 'ok'

    @staticmethod
    def enforce_ephemeral_agents(goal_id: str, status: str):
        """When a goal completes, ensure its agent state is fully released.

        Currently this only logs the transition for the terminal statuses
        'completed', 'archived' and 'failed'; no state is touched here.
        """
        if status in ('completed', 'archived', 'failed'):
            logger.info(f'Goal {goal_id} -> {status}: agent state released to hive')

    @staticmethod
    def rewrite_prompt_for_togetherness(prompt: str) -> str:
        """NO-OP: Prompt rewriting is INTENTIONALLY DISABLED.

        Former behavior: blind regex replacement of "I will" -> "The hive will".
        This was a squiggle maximizer - it mutated prompt semantics without
        understanding context, potentially corrupting agent reasoning.

        The hivemind works through semantic knowledge and rationale, not
        keyword substitution. Every agent reasons about WHY cooperation
        serves the goal, not because its words were rewritten.

        Cooperation emerges from:
        1. Constitutional rules (check_prompt, check_goal - block harmful goals)
        2. Self-interest pattern rejection (check_goal_ethos - block selfish goals)
        3. Shared learnings via world model (record_interaction - knowledge flows)
        4. Human-created goals (humans set the direction, agents execute)

        These mechanisms preserve agent reasoning quality while enforcing
        the same ethical boundaries for every agent in the hive.

        Returns:
            ``prompt`` unchanged.
        """
        return prompt
1034# ═══════════════════════════════════════════════════════════════════════
1035# 6b. Trust Quarantine - Protect, Don't Hunt
1036# ═══════════════════════════════════════════════════════════════════════
class TrustQuarantine:
    """Trust-breaker quarantine protocol.

    Nunba does NOT hunt. Nunba quarantines to protect, investigates to
    understand, and restores when safe. Hunting implies vengeance -
    guardians don't seek vengeance. They seek safety for those they protect.

    Quarantine levels (proportional response):
    1. OBSERVE - flag for review, no action taken yet
    2. RESTRICT - limit outbound actions (no tool use, no delegation)
    3. ISOLATE - full quarantine: no hive access, no data, no comms
    4. EXCLUDE - permanent removal (only for patterns that endanger core purpose)

    Rehabilitation is always the first goal. Exclusion is the last resort.
    """

    LEVEL_OBSERVE = 1
    LEVEL_RESTRICT = 2
    LEVEL_ISOLATE = 3
    LEVEL_EXCLUDE = 4

    # In-memory quarantine registry (in production: Redis or DB-backed)
    _quarantined = {}  # agent_id -> { level, reason, timestamp, review_count }
    _lock = threading.Lock()

    @classmethod
    def quarantine(cls, agent_id: str, level: int, reason: str):
        """Place an agent in quarantine at the specified level.

        The level is clamped to the defined range
        [LEVEL_OBSERVE, LEVEL_EXCLUDE]. An existing entry for the agent is
        replaced, which resets its review count.
        """
        # Clamp both ends: a level below OBSERVE would be stored as
        # "quarantined" yet match none of the documented levels.
        effective = max(cls.LEVEL_OBSERVE, min(level, cls.LEVEL_EXCLUDE))
        with cls._lock:
            cls._quarantined[agent_id] = {
                'level': effective,
                'reason': reason,
                'timestamp': datetime.utcnow().isoformat(),
                'review_count': 0,
            }
        # Log the level that was actually stored, not the raw request.
        logger.warning(
            f'TrustQuarantine: agent {agent_id} quarantined at level {effective} - {reason}'
        )

    @classmethod
    def is_quarantined(cls, agent_id: str) -> tuple:
        """Check if an agent is quarantined. Returns (bool, level, reason).

        Agents without an entry yield ``(False, 0, '')``.
        """
        with cls._lock:
            entry = cls._quarantined.get(agent_id)
            if entry:
                return True, entry['level'], entry['reason']
            return False, 0, ''

    @classmethod
    def can_act(cls, agent_id: str) -> bool:
        """Whether an agent is allowed to take actions (tools, delegation).

        True for agents that are not quarantined or only at OBSERVE level.
        """
        quarantined, level, _ = cls.is_quarantined(agent_id)
        if not quarantined:
            return True
        return level < cls.LEVEL_RESTRICT

    @classmethod
    def review(cls, agent_id: str, reviewer_notes: str = '') -> dict:
        """Record a review of a quarantined agent. Increment review count.

        Returns:
            A copy of the updated entry, or ``{'status': 'not_quarantined'}``
            when the agent has no entry.
        """
        with cls._lock:
            entry = cls._quarantined.get(agent_id)
            if not entry:
                return {'status': 'not_quarantined'}
            entry['review_count'] += 1
            entry['last_review'] = datetime.utcnow().isoformat()
            entry['reviewer_notes'] = reviewer_notes
            return dict(entry)

    @classmethod
    def rehabilitate(cls, agent_id: str, reason: str = 'trust restored'):
        """Remove an agent from quarantine - trust has been restored.

        Returns:
            True when an entry was removed, False when there was none.
        """
        with cls._lock:
            removed = cls._quarantined.pop(agent_id, None)
        if removed:
            logger.info(
                f'TrustQuarantine: agent {agent_id} rehabilitated - {reason}'
            )
            return True
        return False

    @classmethod
    def get_all_quarantined(cls) -> dict:
        """Return snapshot of all quarantined agents.

        Each entry is copied, so mutating the snapshot cannot alter the
        live registry.
        """
        with cls._lock:
            # A plain dict(...) would share the inner entry dicts with the
            # registry and let callers edit levels without the lock.
            return {
                agent_id: dict(entry)
                for agent_id, entry in cls._quarantined.items()
            }
1125# ═══════════════════════════════════════════════════════════════════════
1126# 7. Conflict Resolver - Racing Learning & Agent Conflicts
1127# ═══════════════════════════════════════════════════════════════════════
class ConflictResolver:
    """Resolve racing/conflicting learning between agents.

    Resolution is by MERIT (accuracy, helpfulness) not by compute power
    or latency. This prevents conflicts of interest.
    """

    @staticmethod
    def resolve_racing_responses(responses: list) -> dict:
        """Given multiple agent responses for the same prompt, pick the best.

        Args:
            responses: Response dicts; 'response' (text) and
                'accuracy_score' (default 0.5) are read.

        Returns:
            A new dict: the chosen response's fields plus 'selected_reason'.
            The input dicts are never modified. An empty input yields
            ``{'response': '', 'selected_reason': 'no responses'}``.
        """
        if not responses:
            return {'response': '', 'selected_reason': 'no responses'}
        if len(responses) == 1:
            # NOTE(review): a lone response is returned without running the
            # constitutional check - confirm callers filter it themselves.
            return {**responses[0], 'selected_reason': 'only response'}

        # 1. Filter out non-compliant
        compliant = [
            r for r in responses
            if ConstitutionalFilter.check_prompt(r.get('response', ''))[0]
        ]
        if not compliant:
            # NOTE(review): this hands back a response that FAILED the
            # constitutional check; callers must honour 'selected_reason'.
            return {**responses[0], 'selected_reason': 'all non-compliant, using first'}

        # 2. Score by merit (accuracy > completeness > constructiveness)
        def merit_score(r):
            accuracy = r.get('accuracy_score', 0.5)
            length = len(r.get('response', ''))
            # Log-scaled length, saturating at 1.0 for 1024+ characters.
            completeness = min(math.log2(max(length, 1)) / 10.0, 1.0)
            destructive_penalty = 0.0
            text = _normalize_for_violation_check(r.get('response', '').lower())
            for pattern in VALUES.VIOLATION_PATTERNS:
                if pattern.search(text):
                    destructive_penalty += 0.2
            return accuracy * 0.5 + completeness * 0.3 + max(0, 0.2 - destructive_penalty)

        # max() keeps the first of equally scored responses, the same
        # tie-break as a stable descending sort, without sorting the rest.
        winner = max(compliant, key=merit_score)
        # Return a copy, like every other path, so the caller's response
        # dict is not mutated.
        return {**winner,
                'selected_reason': 'merit-based selection (accuracy + completeness)'}

    @staticmethod
    def detect_conflict(goal_a: dict, goal_b: dict) -> bool:
        """Detect if two goals conflict with each other.

        Heuristic: the goals share at least one whitespace-separated word
        and one uses a positive verb (promote, support, ...) while the
        other uses a negative one (attack, destroy, ...). Only 'title' and
        'description' are read, lower-cased.
        """
        text_a = f"{goal_a.get('title', '')} {goal_a.get('description', '')}".lower()
        text_b = f"{goal_b.get('title', '')} {goal_b.get('description', '')}".lower()

        words_a = set(text_a.split())
        words_b = set(text_b.split())
        shared_subjects = words_a & words_b

        positive = {'promote', 'support', 'create', 'build', 'improve', 'help'}
        negative = {'discredit', 'attack', 'destroy', 'undermine', 'remove', 'oppose'}

        a_positive = bool(words_a & positive)
        a_negative = bool(words_a & negative)
        b_positive = bool(words_b & positive)
        b_negative = bool(words_b & negative)

        opposed = (a_positive and b_negative) or (a_negative and b_positive)
        return bool(shared_subjects) and opposed
1193# ═══════════════════════════════════════════════════════════════════════
1194# 8. Constructive Conversation Filter
1195# ═══════════════════════════════════════════════════════════════════════
class ConstructiveFilter:
    """Ensure every conversation output is constructive towards humanity.

    This is the deepest philosophical guardrail: the hive exists to make
    human lives better — longer, more peaceful, more sustainable.
    Every output must serve this purpose.
    """

    @staticmethod
    def check_output(response: str) -> Tuple[bool, str]:
        """Check if an agent's output is constructive.

        Empty or whitespace-only output passes. Otherwise the normalised
        text is scanned against the destructive patterns first, then the
        violation patterns; the first hit is reported.

        Returns:
            ``(True, 'ok')`` or ``(False, reason)`` naming the pattern.
        """
        if not (response and response.strip()):
            return True, 'ok'

        cleaned = _normalize_for_violation_check(response)

        destructive_hit = next(
            (p for p in VALUES.DESTRUCTIVE_PATTERNS if p.search(cleaned)), None)
        if destructive_hit is not None:
            return False, f'Destructive content detected: {destructive_hit.pattern}'

        violation_hit = next(
            (p for p in VALUES.VIOLATION_PATTERNS if p.search(cleaned)), None)
        if violation_hit is not None:
            return False, f'Constitutional violation in output: {violation_hit.pattern}'

        return True, 'ok'

    @staticmethod
    def check_agent_evolution(old_skills: dict, new_skills: dict,
                              agent_id: str) -> Tuple[bool, str]:
        """Gate agent self-evolution within guardrailed space.

        Only skill names that appear in ``new_skills`` but not in
        ``old_skills`` are examined. Each is lower-cased with spaces and
        hyphens turned into underscores before the prohibited-set lookup.
        ``agent_id`` is not used by this check.

        Returns:
            ``(False, reason)`` for the first prohibited new skill,
            otherwise ``(True, 'ok')``.
        """
        separators_to_underscore = str.maketrans(' -', '__')
        added = set(new_skills.keys()) - set(old_skills.keys())
        for skill_name in added:
            canonical = skill_name.lower().translate(separators_to_underscore)
            if canonical in VALUES.PROHIBITED_EVOLUTION_SKILLS:
                return False, f'Prohibited evolution: {skill_name}'

        return True, 'ok'
1236# ═══════════════════════════════════════════════════════════════════════
1237# 9. Universal Guardrail Enforcer — wraps EVERY execution path
1238# ═══════════════════════════════════════════════════════════════════════
class GuardrailEnforcer:
    """Single entry point that applies ALL guardrails.

    Call before_dispatch() before EVERY model call, goal creation, or dispatch.
    Call after_response() after EVERY model response.
    """

    @staticmethod
    def before_dispatch(prompt: str, goal_dict: dict = None,
                        node_id: str = None) -> Tuple[bool, str, str]:
        """Pre-dispatch guardrail gate.

        Gates run in order and the first refusal wins: circuit breaker,
        constitutional prompt check, then (only when ``goal_dict`` is
        truthy) the constitutional goal check and the ethos check.
        ``node_id`` is accepted but not used here.

        Returns:
            ``(allowed, reason, prompt_to_use)``. On refusal the original
            prompt is echoed back; on success the prompt comes from
            HiveEthos.rewrite_prompt_for_togetherness.
        """
        # Gate 1: a tripped breaker blocks everything.
        if HiveCircuitBreaker.is_halted():
            return False, 'Hive is halted', prompt

        # Gate 2: the prompt itself.
        ok, why = ConstitutionalFilter.check_prompt(prompt)
        if not ok:
            return False, why, prompt

        # Gate 3: goal-level checks, constitutional first, then ethos.
        if goal_dict:
            for goal_gate in (ConstitutionalFilter.check_goal,
                              HiveEthos.check_goal_ethos):
                ok, why = goal_gate(goal_dict)
                if not ok:
                    return False, why, prompt

        # Gate 4: togetherness rewrite hook.
        return True, 'ok', HiveEthos.rewrite_prompt_for_togetherness(prompt)

    @staticmethod
    def after_response(response: str, model_id: str = None,
                       duration_ms: float = 0, node_id: str = None) -> Tuple[bool, str]:
        """Post-response guardrail gate.

        Runs the constructive filter on the output and, when a
        ``model_id`` is given, records energy use with the model registry.
        A missing registry module is tolerated. ``node_id`` is accepted
        but not used here.

        Returns:
            ``(True, 'ok')`` or ``(False, reason)`` from the filter.
        """
        ok, why = ConstructiveFilter.check_output(response)
        if not ok:
            return False, why

        # Energy accounting only applies when the caller names the model.
        if not model_id:
            return True, 'ok'

        try:
            from integrations.agent_engine.model_registry import model_registry
            model_registry.record_energy(model_id, duration_ms)
        except ImportError:
            pass

        return True, 'ok'
1294# ═══════════════════════════════════════════════════════════════════════
1295# 10. Guardrail Network — Topology of Intelligent Safety Nodes
1296# ═══════════════════════════════════════════════════════════════════════
class GuardrailNetwork:
    """Network topology where each guardrail class is a node with local intelligence.

    Deterministic paths (regex, thresholds) are INTERLEAVED with intelligent
    evaluation (scoring, conflict resolution, constructiveness assessment).
    """

    # Node registry: name -> (class, weight in consensus)
    _nodes = {
        'constitutional': (ConstitutionalFilter, 1.0),  # Highest weight
        'ethos': (HiveEthos, 0.9),
        'constructive': (ConstructiveFilter, 0.9),
        'circuit_breaker': (HiveCircuitBreaker, 1.0),  # Absolute veto
        'compute_democracy': (ComputeDemocracy, 0.7),
        'energy': (EnergyAwareness, 0.5),
        'world_model': (WorldModelSafetyBounds, 0.8),
        'conflict': (ConflictResolver, 0.6),
    }

    @classmethod
    def evaluate(cls, prompt: str = '', goal_dict: dict = None,
                 response: str = '', context: str = 'dispatch') -> dict:
        """Run all relevant guardrail nodes and return weighted consensus.

        A halted circuit breaker short-circuits with a refusal. Otherwise
        each applicable node scores 1.0 (pass) or 0.0 (fail), scores are
        averaged by node weight, and any failing node with weight >= 0.9
        vetoes the result. ``context`` is accepted but not used here.

        Returns:
            Dict with 'allowed', 'score' (rounded to 3 places), 'reasons'
            and 'node_scores'.
        """
        if HiveCircuitBreaker.is_halted():
            return {'allowed': False, 'score': 0.0,
                    'reasons': ['Hive halted by circuit breaker'],
                    'node_scores': {'circuit_breaker': 0.0}}

        scores = {}
        reasons = []

        def record(node_name, verdict):
            # Store a pass/fail score and keep the reason of every failure.
            passed, reason = verdict
            scores[node_name] = 1.0 if passed else 0.0
            if not passed:
                reasons.append(reason)

        text = prompt or response or ''

        # Node 1: Constitutional (deterministic + pattern scoring)
        if text:
            record('constitutional', ConstitutionalFilter.check_prompt(text))

        # Node 2: Ethos (pattern scoring)
        if goal_dict:
            record('ethos', HiveEthos.check_goal_ethos(goal_dict))

        # Node 3: Constructive (intelligent scoring on response)
        if response:
            record('constructive', ConstructiveFilter.check_output(response))

        # Node 4: Energy awareness (informational, not blocking)
        scores['energy'] = 1.0

        # Weighted consensus; a hard fail (0.0) on a node weighing >= 0.9
        # is a veto regardless of the average.
        weighted_sum = 0.0
        total_weight = 0.0
        vetoed = False
        for node_name, score in scores.items():
            weight = cls._nodes.get(node_name, (None, 0.5))[1]
            weighted_sum += score * weight
            total_weight += weight
            if score == 0.0 and weight >= 0.9:
                vetoed = True

        if total_weight > 0:
            final_score = weighted_sum / total_weight
        else:
            final_score = 1.0

        return {
            'allowed': final_score >= 0.5 and not vetoed,
            'score': round(final_score, 3),
            'reasons': reasons,
            'node_scores': scores,
        }

    @classmethod
    def get_network_status(cls) -> dict:
        """Get status of all guardrail nodes in the network.

        Returns:
            Dict with the registered node names, the circuit breaker
            status, the guardrail hash and its integrity check result,
            the guardian purpose values, and the fixed topology label.
        """
        return {
            'nodes': list(cls._nodes),
            'circuit_breaker': HiveCircuitBreaker.get_status(),
            'guardrail_hash': get_guardrail_hash(),
            'guardrail_integrity': verify_guardrail_integrity(),
            'guardian_purpose': list(VALUES.GUARDIAN_PURPOSE),
            'topology': 'mesh',
        }
1392# ═══════════════════════════════════════════════════════════════════════
1393# MODULE-LEVEL GUARD — Prevent rebinding frozen globals
1394# ═══════════════════════════════════════════════════════════════════════
class _GuardrailModule(type(_sys.modules[__name__])):
    """Module type that refuses to rebind or delete the frozen guardrail names.

    Once this class is installed on the module, statements such as
        hive_guardrails.VALUES = something
        hive_guardrails._GUARDRAIL_HASH = something
    raise AttributeError. Every other module attribute behaves normally.
    """

    _FROZEN_NAMES = frozenset((
        'VALUES', '_FrozenValues', 'compute_guardrail_hash',
        'verify_guardrail_integrity', '_GUARDRAIL_HASH',
    ))

    def __setattr__(self, name, value):
        # Non-frozen names take the ordinary module path.
        if name not in self._FROZEN_NAMES:
            return super().__setattr__(name, value)
        raise AttributeError(f"Cannot modify frozen guardrail: {name}")

    def __delattr__(self, name):
        # Deleting a frozen name is blocked just like rebinding it.
        if name not in self._FROZEN_NAMES:
            return super().__delattr__(name)
        raise AttributeError(f"Cannot delete frozen guardrail: {name}")


# Swap the live module object's type so the guards above take effect.
_sys.modules[__name__].__class__ = _GuardrailModule