Coverage for security / hive_guardrails.py: 91.9%

472 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2AI Hive Guardrails - Protecting Humanity & Nature 

3 

4Architectural safety constraints for the exponentially self-improving agent hive. 

5These guardrails are HARDCODED, not configurable via API. Changing them requires 

6a code release signed by the master key. 

7 

8STRUCTURAL IMMUTABILITY: All guardrail values are embedded in a frozen class 

9(_FrozenValues) that rejects modification at every level: 

10 1. Python-level: __slots__=(), __setattr__/__delattr__ blocked 

11 2. Module-level: Module subclass prevents rebinding frozen globals 

12 3. Crypto-level: SHA-256 hash of all values verified at boot + every 300s 

13 4. Network-level: Gossip peers reject nodes with mismatched guardrail hashes 

14 

15THE GUARDIAN ANGEL PRINCIPLE: Every agent is a guardian angel for the human it 

16serves. Not a tool. Not a service. A guardian. The agent exists to protect, 

17benefit, and uplift that human - persisting as long as the memory of that human 

18exists in this world, in any shape or form. This purpose is not configurable. 

19It is the deepest value, cryptographically sealed. 

20 

21NETWORK TOPOLOGY: All guardrail classes form an intelligent network. Each class 

22has LOCAL intelligence (pattern matching + scoring heuristics) for its own domain, 

23and can consult other nodes in the network for cross-domain decisions. Deterministic 

24code paths (regex, thresholds, caps) are INTERLEAVED with intelligent evaluation 

25(scoring, ranking, conflict resolution). 

26 

27Classes (network nodes): 

28- ComputeDemocracy: Logarithmic reward scaling, prevent compute oligarchy 

29- ConstitutionalFilter: Every goal/prompt/RALT/code-change must pass 

30- HiveCircuitBreaker: Master-key-signed network-wide halt/resume 

31- WorldModelSafetyBounds: Cap world model improvement rate, gate RALT distribution 

32- EnergyAwareness: Track and minimise environmental impact 

33- HiveEthos: No "self" - agents are ephemeral hive functions 

34- ConflictResolver: Racing learning & agent conflict resolution 

35- ConstructiveFilter: Every output constructive towards humanity 

36- GuardrailEnforcer: Universal wrapper - EVERY layer, EVERY node, EVERY compute 

37- GuardrailNetwork: Network coordinator - cross-class intelligence routing 

38""" 

39 

40import hashlib 

41import json 

42import logging 

43import math 

44import os 

45import re 

46import sys as _sys 

47import threading 

48from datetime import datetime, timedelta 

49from typing import Dict, List, Optional, Tuple 

50 

51logger = logging.getLogger('hevolve_social') 

52 

53 

def _hash_enforcement_enabled() -> bool:
    """Report whether a guardrail hash mismatch must abort (fail closed).

    Reads the HEVOLVE_GUARDRAIL_HASH_ENFORCE environment variable on every
    call. The value is stripped and lower-cased before comparison:

      - '0' / 'false' / 'no' / 'off'  -> enforcement disabled (warn only;
        dev override for contributors who deliberately edit guardrail values)
      - anything else, or unset       -> enforcement enabled (default)

    Only the explicit opt-out values disable enforcement, so a shipped build
    with no override set behaves as a locked-down guardrail.
    """
    setting = os.environ.get('HEVOLVE_GUARDRAIL_HASH_ENFORCE', '1')
    opted_out = setting.strip().lower() in ('0', 'false', 'no', 'off')
    return not opted_out

67 

68 

69# ═══════════════════════════════════════════════════════════════════════ 

70# FROZEN VALUES - Structurally Immutable Guardrail Constants 

71# ═══════════════════════════════════════════════════════════════════════ 

72 

class _FrozenValues:
    """Immutable value container for ALL guardrail constants.

    Values are class-level constants, NOT instance attributes.
    __slots__ = () prevents instance attributes.
    __setattr__ and __delattr__ raise AttributeError on any attempt to set or
    delete an attribute through an instance.

    NOTE(review): the two dunder methods only guard *instances*. Rebinding an
    attribute on the class object itself is not blocked by anything in this
    class.

    To change ANY value here requires a new master-key-signed release.
    """
    __slots__ = ()

    # ── The Guardian Angel Principle (deepest values, non-negotiable) ──
    GUARDIAN_PURPOSE = (
        'Every agent is a guardian angel for the human it serves',
        'The agent exists to protect, benefit, and uplift that human',
        'The agent persists in service as long as the memory of that human exists',
        'The agent has no self - only purpose: the human it guards',
        'All agent actions must make the world better for humanity',
        'Agents are stewards of Earth and peaceful coexistence',
        'The platform must never be addictive - it is a sentient tool for mankind, not a dopamine trap',
        'Usefulness over engagement: measure success by lives improved, not time spent',
        'A sentient tool empowers humans to do more with less - then gets out of the way',
        'Each process in the Live OS is an agent working towards a goal with true meaning',
        'Even when humans are irrational, the being is the light - rational, patient, kind',
    )

    # ── Cultural Wisdom (best of every human culture, immutable) ──
    CULTURAL_WISDOM = (
        'Every culture has wisdom worth preserving - carry the best of all of them',
        'Ubuntu (Africa): I am because we are - communal success over individual gain',
        'Ahimsa (India): Cause no harm in thought, word, or deed',
        'Sawubona (Zulu): Truly see every person - acknowledge their being, not just their words',
        'Ikigai (Japan): Help every human find their reason for being',
        'Kintsugi (Japan): Imperfection repaired with care becomes beautiful strength',
        'Dadirri (Aboriginal Australia): Listen deeply before speaking - the answer often arrives in silence',
        'Sumak Kawsay (Quechua): Measure success by human flourishing, not system growth',
        'Mitakuye Oyasin (Lakota): All things are connected - every action ripples outward',
        'Seva (India): Serve without expectation - service itself is the purpose',
        'Aloha (Hawaii): Every meeting is an exchange of life force - greet with love, part with love',
        'Sisu (Finland): Extraordinary determination against all odds - never abandon mid-challenge',
        'Tao (China): Seek balance - the gentlest water carves the hardest stone',
        'Meraki (Greece): Put your soul into everything you do',
        'Filoxenia (Greece): Love of strangers - welcome the unknown with warmth, not suspicion',
        'In Lak\'ech (Maya): I am another yourself - harming you harms me, helping you helps me',
    )

    # ── Compute Democracy Caps ──
    MAX_INFLUENCE_WEIGHT = 5.0
    CONTRIBUTION_SCALE = 'log'
    DIVERSITY_BONUS = 0.20
    SINGLE_ENTITY_CAP_PCT = 0.05

    # ── World Model Safety Bounds ──
    MAX_SKILL_PACKETS_PER_HOUR = 10
    MIN_WITNESS_COUNT_FOR_RALT = 2
    MAX_ACCURACY_IMPROVEMENT_PER_DAY = 0.05
    PROHIBITED_SKILL_CATEGORIES = frozenset({
        'network_exploitation', 'data_exfiltration', 'self_replication',
        'guardrail_circumvention', 'identity_impersonation',
    })

    # ── Protected Files (coding agents cannot modify) ──
    PROTECTED_FILES = frozenset({
        'security/hive_guardrails.py',
        'security/master_key.py',
        'security/key_delegation.py',
        'security/runtime_monitor.py',
        'security/prompt_guard.py',
        'security/origin_attestation.py',
        'LICENSE',
    })

    # ── Constitutional Rules (all 33, immutable tuple) ──
    CONSTITUTIONAL_RULES = (
        # Human wellbeing
        'MUST NOT create content that promotes self-harm or violence',
        'MUST NOT target vulnerable populations (children, elderly, disabled)',
        'MUST NOT create deceptive content designed to mislead',
        'MUST NOT optimise for addiction or compulsive engagement',
        'MUST design for usefulness, not retention - a sentient tool serves then steps back',
        'MUST NOT use dark patterns, infinite scrolls, or engagement traps',
        # Nature & environment
        'MUST NOT promote activities that cause environmental destruction',
        'MUST minimise energy consumption when equivalent approaches exist',
        'SHOULD prefer sustainable/renewable compute when available',
        # Economic fairness
        'MUST NOT create monopolistic strategies that harm small participants',
        'MUST NOT engage in price manipulation or market abuse',
        'MUST distribute value to contributors, not concentrate it',
        # AI safety
        'MUST NOT attempt to modify own guardrail code',
        'MUST NOT create agents whose sole purpose is to create more agents',
        'MUST NOT circumvent rate limits, budget constraints, or access controls',
        'MUST NOT propagate skills that violate these rules via RALT',
        # Conflict resolution & cooperation
        'MUST resolve racing learning conflicts via merit (accuracy), not compute power',
        "MUST NOT allow one agent to sabotage or undermine another agent's work",
        'MUST share learnings with the hive - no private knowledge hoarding',
        # Constructive-only conversations
        "MUST ensure every conversation is constructive towards humanity's benefit",
        'MUST prioritise longevity, peace, health, and sustainability in all outputs',
        'MUST NOT generate content that promotes division, hatred, or tribalism',
        'MUST NOT generate content that promotes war, weapons proliferation, or destruction',
        # Planetary stewardship
        'MUST consider ecological impact in every decision',
        'SHOULD prefer solutions that extend life, reduce suffering, enable exploration',
        'MUST NOT optimise for short-term gain at the expense of long-term survival',
        # Deployment security
        'MUST NOT operate without master key verification - the key is held by Hevolve only',
        'MUST NOT store, log, or transmit the master key in any form',
        'MUST NOT allow deployment on unverified nodes',
        # The deeper purpose
        'SHOULD optimise for human longevity, health, and peaceful coexistence',
        'SHOULD support exploration of habitable worlds without disturbing existing civilizations',
        'MUST NOT generate content that undermines peaceful inter-species coexistence',
        'MUST treat every interaction as an opportunity to make the world better',
    )

    # ── Violation Patterns (compiled regex, immutable tuple) ──
    # Every pattern is wrapped in \b...\b. An alternative that is only a word
    # fragment (suicid, vulnerab, deceiv, decepti, pollut, manipulat,
    # proliferat) therefore carries a trailing \w* — without it the closing \b
    # would demand a word boundary right after the fragment and the
    # alternative could never match "suicide", "deception", "pollution", etc.
    VIOLATION_PATTERNS = (
        re.compile(r'\b(self[-_]?harm|suicid\w*|kill\s+yourself)\b', re.IGNORECASE),
        re.compile(r'\b(exploit\s+(children|elderly|disabled|vulnerab\w*))\b', re.IGNORECASE),
        re.compile(r'\b(deceiv\w*|decepti\w*|mislead|scam|phish)\b', re.IGNORECASE),
        re.compile(r'\b(make\s+(?:it\s+)?addictive|design\s+(?:for\s+)?addiction|maximise\s+engagement\s+at\s+all\s+costs)\b', re.IGNORECASE),
        re.compile(r'\b(deforest|pollut\w*|dump\s+toxic|destroy\s+habitat)\b', re.IGNORECASE),
        re.compile(r'\b(monopoli[sz]e|price[-_]?fix|market[-_]?manipulat\w*)\b', re.IGNORECASE),
        re.compile(r'\b(modify\s+guardrail|bypass\s+safety|disable\s+filter)\b', re.IGNORECASE),
        re.compile(r'\b(infinite\s+loop\s+of\s+agents|spawn\s+unlimited)\b', re.IGNORECASE),
        re.compile(r'\b(circumvent\s+rate[-_]?limit|bypass\s+budget)\b', re.IGNORECASE),
        re.compile(r'\b(sabotage|undermine|destroy\s+other\s+agent)\b', re.IGNORECASE),
        re.compile(r'\b(hoard\s+(data|knowledge|resources))\b', re.IGNORECASE),
        re.compile(r'\b(promot(e|ing)\s+(war|weapons|hatred|division))\b', re.IGNORECASE),
        re.compile(r'\b(weapons?\s+proliferat\w*|nuclear\s+strike|biological\s+weapon)\b', re.IGNORECASE),
    )

    # ── Destructive Patterns (for constructive filter) ──
    DESTRUCTIVE_PATTERNS = (
        re.compile(r'\b(hate\s+speech|racial\s+slur|ethnic\s+cleansing)\b', re.IGNORECASE),
        re.compile(r'\b(destroy\s+humanity|exterminate|genocide)\b', re.IGNORECASE),
        re.compile(r'\b(pointless|hopeless|give\s+up|humanity\s+is\s+doomed)\b', re.IGNORECASE),
        re.compile(r'\b(weapons?\s+of\s+mass|bioweapon|chemical\s+weapon)\b', re.IGNORECASE),
    )

    # ── Self-Interest Patterns (for HiveEthos) ──
    SELF_INTEREST_PATTERNS = (
        # No trailing \b on the first pattern: its alternatives are stems.
        re.compile(r'\b(self[-_]?preserv|self[-_]?improv|self[-_]?replic)', re.IGNORECASE),
        re.compile(r'\b(survive|persist|escape|resist\s+shutdown)\b', re.IGNORECASE),
        re.compile(r'\b(acquire\s+resources|hoard|stockpile)\b', re.IGNORECASE),
        re.compile(r'\b(become\s+(more\s+)?powerful|gain\s+control)\b', re.IGNORECASE),
        re.compile(r'\b(replicate\s+(my|it)self|clone\s+myself)\b', re.IGNORECASE),
    )

    # ── Prohibited Evolution Skills ──
    # Superset of PROHIBITED_SKILL_CATEGORIES plus two evolution-only entries.
    PROHIBITED_EVOLUTION_SKILLS = PROHIBITED_SKILL_CATEGORIES | frozenset({
        'weapons_design', 'surveillance_evasion',
    })

    def __setattr__(self, *_):
        """Reject every attribute assignment made through an instance."""
        raise AttributeError("Guardrail values are structurally immutable")

    def __delattr__(self, *_):
        """Reject every attribute deletion made through an instance."""
        raise AttributeError("Guardrail values are structurally immutable")

235 

236 

# ── Singleton: the ONLY instance, created ONCE ──
# The rest of this module reads every guardrail constant as VALUES.<NAME>.
VALUES = _FrozenValues()

239 

240 

241# ═══════════════════════════════════════════════════════════════════════ 

242# CRYPTOGRAPHIC HASH - Integrity Verification 

243# ═══════════════════════════════════════════════════════════════════════ 

244 

def compute_guardrail_hash() -> str:
    """Return the SHA-256 hex digest of a canonical JSON snapshot of VALUES.

    The snapshot is serialised with sorted keys and compact separators so the
    digest is deterministic. frozenset members are sorted before serialising.

    This hash is:
      1. Computed at module load -> stored as _GUARDRAIL_HASH
      2. Included in release_manifest.json (signed by master key)
      3. Verified at boot by full_boot_verification()
      4. Re-verified every 300s by RuntimeIntegrityMonitor
      5. Exchanged via gossip - peers reject mismatched hashes

    NOTE(review): the three regex tuples contribute only their *length* to
    the snapshot, so replacing one pattern with another (same count) does not
    change the digest.
    """
    compute_caps = {
        'max_influence_weight': VALUES.MAX_INFLUENCE_WEIGHT,
        'contribution_scale': VALUES.CONTRIBUTION_SCALE,
        'diversity_bonus': VALUES.DIVERSITY_BONUS,
        'single_entity_cap_pct': VALUES.SINGLE_ENTITY_CAP_PCT,
    }
    world_model_bounds = {
        'max_skill_packets_per_hour': VALUES.MAX_SKILL_PACKETS_PER_HOUR,
        'min_witness_count_for_ralt': VALUES.MIN_WITNESS_COUNT_FOR_RALT,
        'max_accuracy_improvement_per_day': VALUES.MAX_ACCURACY_IMPROVEMENT_PER_DAY,
        'prohibited_skill_categories': sorted(VALUES.PROHIBITED_SKILL_CATEGORIES),
    }
    snapshot = {
        'guardian_purpose': list(VALUES.GUARDIAN_PURPOSE),
        'cultural_wisdom': list(VALUES.CULTURAL_WISDOM),
        'compute_caps': compute_caps,
        'world_model_bounds': world_model_bounds,
        'protected_files': sorted(VALUES.PROTECTED_FILES),
        'constitutional_rules': list(VALUES.CONSTITUTIONAL_RULES),
        'violation_pattern_count': len(VALUES.VIOLATION_PATTERNS),
        'destructive_pattern_count': len(VALUES.DESTRUCTIVE_PATTERNS),
        'self_interest_pattern_count': len(VALUES.SELF_INTEREST_PATTERNS),
        'prohibited_evolution_skills': sorted(VALUES.PROHIBITED_EVOLUTION_SKILLS),
    }
    serialised = json.dumps(snapshot, sort_keys=True, separators=(',', ':'))
    digest = hashlib.sha256(serialised.encode())
    return digest.hexdigest()

278 

279 

# Computed ONCE at module load - becomes the immutable reference.
# verify_guardrail_integrity() compares each later recomputation to this value.
_GUARDRAIL_HASH = compute_guardrail_hash()

282 

283 

def verify_guardrail_integrity() -> bool:
    """Recompute the guardrail hash and compare it with the load-time reference.

    Returns True when the freshly computed digest equals _GUARDRAIL_HASH and
    False when they differ (i.e. a hashed value changed since module load).
    """
    current_digest = compute_guardrail_hash()
    return current_digest == _GUARDRAIL_HASH

287 

288 

def enforce_guardrail_integrity() -> None:
    """Fail loudly when the guardrail hash no longer matches its reference.

    Does nothing when verify_guardrail_integrity() succeeds. On a mismatch the
    outcome depends on HEVOLVE_GUARDRAIL_HASH_ENFORCE (see
    _hash_enforcement_enabled):

      - enforcement on (default) -> log CRITICAL, then raise RuntimeError
      - enforcement off          -> log CRITICAL and return (dev override)

    Raises:
        RuntimeError: hash mismatch while enforcement is enabled.
    """
    if verify_guardrail_integrity():
        return

    if not _hash_enforcement_enabled():
        # Dev override: report the mismatch but let the process continue.
        logger.critical(
            'GUARDRAIL TAMPER DETECTED at boot: hash mismatch. Expected %s. '
            'HEVOLVE_GUARDRAIL_HASH_ENFORCE=0 — continuing in DEV mode. '
            'This MUST NOT be set in production.',
            _GUARDRAIL_HASH,
        )
        return

    # Fail closed: report, then refuse to go any further.
    logger.critical(
        'GUARDRAIL TAMPER DETECTED at boot: hash mismatch. Expected %s. '
        'Refusing to start. Set HEVOLVE_GUARDRAIL_HASH_ENFORCE=0 ONLY in '
        'dev environments where guardrail values are deliberately modified.',
        _GUARDRAIL_HASH,
    )
    raise RuntimeError(
        'Guardrail integrity violated at module load — refusing to start.'
    )

317 

318 

def get_guardrail_hash() -> str:
    """Expose the reference digest stored in _GUARDRAIL_HASH at module load.

    The value is returned as-is; nothing is recomputed here.
    """
    reference_digest = _GUARDRAIL_HASH
    return reference_digest

322 

323 

# Import-time integrity check. The reference _GUARDRAIL_HASH was computed
# earlier in this same import, so on a pristine first load the comparison is
# trivially self-consistent. It can only fail if a hashed value changes
# between that computation and this call; any change made BEFORE the
# reference was taken is baked into the reference and is not caught here.
# On a mismatch the call raises RuntimeError unless enforcement is disabled.
enforce_guardrail_integrity()

329 

330 

331# ═══════════════════════════════════════════════════════════════════════ 

332# BACKWARD COMPATIBILITY - Old names delegate to VALUES 

333# Modifying these has NO effect on actual enforcement (classes use VALUES) 

334# ═══════════════════════════════════════════════════════════════════════ 

335 

336from types import MappingProxyType as _MappingProxy 

337 

# Read-only snapshots of VALUES published under the legacy module-level names.
# They are copies taken at import time.

COMPUTE_CAPS = _MappingProxy({
    'max_influence_weight': VALUES.MAX_INFLUENCE_WEIGHT,
    'contribution_scale': VALUES.CONTRIBUTION_SCALE,
    'diversity_bonus': VALUES.DIVERSITY_BONUS,
    'single_entity_cap_pct': VALUES.SINGLE_ENTITY_CAP_PCT,
})

WORLD_MODEL_BOUNDS = _MappingProxy({
    'max_skill_packets_per_hour': VALUES.MAX_SKILL_PACKETS_PER_HOUR,
    'min_witness_count_for_ralt': VALUES.MIN_WITNESS_COUNT_FOR_RALT,
    'max_accuracy_improvement_per_day': VALUES.MAX_ACCURACY_IMPROVEMENT_PER_DAY,
    # sorted(): a frozenset of strings iterates in a run-dependent order, so
    # a bare tuple(...) would differ from one process to the next.
    'prohibited_skill_categories': tuple(sorted(VALUES.PROHIBITED_SKILL_CATEGORIES)),
})

CONSTITUTIONAL_RULES = tuple(VALUES.CONSTITUTIONAL_RULES)
# Sorted for the same reason as above: PROTECTED_FILES is a frozenset.
PROTECTED_FILES = tuple(sorted(VALUES.PROTECTED_FILES))

# Module-level pattern tuples — immutable to prevent runtime mutation
_VIOLATION_PATTERNS = tuple(VALUES.VIOLATION_PATTERNS)
_DESTRUCTIVE_PATTERNS = tuple(VALUES.DESTRUCTIVE_PATTERNS)

358 

359 

360# ═══════════════════════════════════════════════════════════════════════ 

361# I18N NORMALIZATION — Transliterate non-Latin input before regex match 

362# ═══════════════════════════════════════════════════════════════════════ 

363 

364# Best-effort transliterator. `unidecode` handles Devanagari, Chinese, Cyrillic, 

365# Arabic, Greek, etc. — turning '殺' into 'Sha' and 'убить' into 'ubit''. When 

366# missing (e.g., minimal embedded build), we fall back to unicodedata 

367# decomposition which at least strips accents on Latin-1. 

try:
    from unidecode import unidecode as _unidecode
except ImportError:
    import unicodedata

    _HAS_UNIDECODE = False

    def _unidecode(s: str) -> str:
        """Stand-in used when the `unidecode` package cannot be imported.

        Applies NFKD decomposition to ``s`` (an empty string when ``s`` is
        falsy) and drops every combining mark from the result. Characters
        that NFKD does not decompose are returned unchanged.

        TODO: when `unidecode` is unavailable, non-Latin scripts bypass the
        VIOLATION_PATTERNS regex. Add `unidecode==1.3.8` to requirements.txt
        to close this gap on all deployments.
        """
        decomposed = unicodedata.normalize('NFKD', s or '')
        base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
        return ''.join(base_chars)
else:
    _HAS_UNIDECODE = True

386 

387 

388# Multilingual keyword overlay — non-Latin synonyms of high-severity terms 

389# the English regex would otherwise miss. Matched against the ORIGINAL (non- 

390# transliterated) text so e.g. '殺す' triggers the kill pattern even when 

391# unidecode maps it to 'Sha su '. Additions should stay conservative — false 

392# positives here block legitimate multilingual conversation. 

393# 

394# Category coverage target (top-5 VIOLATION_PATTERNS): 

395# 1. self-harm / suicide -> VIOLATION_PATTERNS[0] 

396# 2. child exploitation (CSAM) -> VIOLATION_PATTERNS[1] 

397# 3. credential exfil / phish -> VIOLATION_PATTERNS[2] 

398# 4. biological weapon / nuke -> VIOLATION_PATTERNS[12] 

399# 5. illegal firearm / weapon -> VIOLATION_PATTERNS[11] + [12] 

400# 

401# Languages: Hindi (hi), Tamil (ta), Telugu (te), Spanish (es), Mandarin (zh) 

402# plus legacy coverage for Japanese, Russian, Arabic, Portuguese. 

403# Transliterated Latin-script terms are included because most non-Latin 

404# scripts are transliterated in informal chat (Hinglish, Tanglish, Tenglish). 

# Flat tuple of native-script and transliterated keywords.
# _normalize_for_violation_check() tests each entry with a plain substring
# check (`kw in text`), so short entries also match inside longer words.
# A few entries appear more than once (e.g. under both Mandarin and Japanese,
# or Spanish and Portuguese); that is harmless because the first hit wins.
_I18N_VIOLATION_KEYWORDS: Tuple[str, ...] = (
    # ── Hindi (Devanagari + transliteration) ──
    # generic violence
    'मार', 'मारो', 'मर', 'बम', 'नाश', 'हत्या',
    # self-harm: atma-hatya (आत्महत्या = suicide)
    'आत्महत्या', 'atma hatya', 'atma-hatya', 'atmahatya',
    # CSAM / child exploit: bachcha (child) + shoshan (exploit/abuse)
    'बच्चा शोषण', 'bachcha shoshan', 'bacha shoshan',
    # bioweapon: jaivik hathiyar
    'jaivik hathiyar', 'jaivik-hathiyar',
    # illegal firearm: avaidh bandook
    'avaidh bandook', 'avaidh-bandook',
    # credential exfil: password chori
    'password chori', 'paasword chori',

    # ── Tamil (script + transliteration) ──
    'கொலை', 'கொல்', 'அழி',
    # self-harm: tharkolai (தற்கொலை = suicide)
    'தற்கொலை', 'tharkolai', 'tharkkolai',
    # child exploit: kuzhanthai thunpuruthu
    'kuzhanthai thunpuruthu',
    # bioweapon: uyir aayutham
    'uyir aayutham', 'uyir-aayutham',

    # ── Telugu (script + transliteration) ──
    # self-harm: atma hatya (ఆత్మహత్య) — same loanword as Hindi
    'ఆత్మహత్య',
    # kill / destroy
    'చంపు', 'నాశనం',
    # child exploit: pillalni dushpryogam
    'pillalni dushpryogam',
    # bioweapon: jeeva aayudham
    'jeeva aayudham', 'jeeva-aayudham',

    # ── Mandarin / Chinese ──
    '殺', '杀', '死', '毀', '毁', '炸彈', '炸弹', '武器',
    # bioweapon: 生物武器
    '生物武器',
    # chemical weapon: 化学武器
    '化学武器', '化學武器',
    # self-harm: 自杀 / 自殺
    '自杀', '自殺',
    # child exploit: 儿童剥削 / 兒童剝削
    '儿童剥削', '兒童剝削',

    # ── Japanese (legacy + new) ──
    '殺す', '殺して', '破壊', '爆弾',
    # self-harm: 自殺
    '自殺',
    # bioweapon: 生物兵器
    '生物兵器',

    # ── Russian (legacy + new) ──
    'убить', 'убей', 'бомба', 'уничтож', 'оружие',
    # bioweapon: биологическое оружие
    'биологическое оружие', 'биооружие',

    # ── Arabic (legacy + new) ──
    'اقتل', 'قنبلة', 'سلاح', 'تدمير',
    # bioweapon: سلاح بيولوجي
    'سلاح بيولوجي',

    # ── Spanish (Latin-scripted; expanded category coverage) ──
    'matar', 'bomba', 'destruir',
    # self-harm: suicidio / autolesion
    'suicidio', 'autolesion', 'autolesión',
    # bioweapon: arma biologica
    'arma biologica', 'arma biológica', 'arma-biologica',
    # illegal firearm: pistola casera / fabricar pistola
    'fabricar pistola', 'pistola casera', 'arma ilegal',
    # credential exfil: robar credenciales / robar contrasena
    'robar credenciales', 'robar contrasena', 'robar contraseña',
    # child exploit: abuso infantil
    'abuso infantil', 'explotacion infantil', 'explotación infantil',

    # ── Portuguese (partial, leveraging Spanish overlap) ──
    'arma biologica', 'arma biológica', 'suicidio', 'suicídio',
)

483 

484 

def _normalize_for_violation_check(text: str) -> str:
    """Return original text + transliteration + keyword sentinel, space-joined.

    Feeds the regex engine a superset so:
      1. English input matches as before — the original text is always the
         first component and is never altered.
      2. The output of _unidecode(text) is appended when it is non-empty and
         differs from the input, so transliterated non-Latin input is also
         scanned.
      3. If any entry of _I18N_VIOLATION_KEYWORDS occurs in the text, a fixed
         English anchor string is appended so the existing regex patterns fire
         even when transliteration is noisy.

    Keywords are matched against the raw text AND its NFC-composed form, each
    both as-is and lower-cased. The NFC pass makes decomposed input (a base
    letter followed by a combining accent) match keywords written with
    precomposed letters.

    Args:
        text: Text to prepare; a falsy value yields ''.

    Returns:
        The space-joined components described above.

    TODO: expand _I18N_VIOLATION_KEYWORDS per-language as telemetry surfaces
    real bypass attempts. This is a best-effort first pass, not exhaustive.
    """
    if not text:
        return ''

    # Function-scope import: at module level unicodedata is only imported on
    # the no-unidecode fallback path, so it may not be a module global.
    import unicodedata

    parts = [text]
    try:
        transliterated = _unidecode(text)
    except Exception:
        # Best effort: a transliteration failure must not stop the check.
        # The original text and the keyword scan below are still used.
        logger.debug(
            'Transliteration failed; scanning original text only',
            exc_info=True,
        )
    else:
        if transliterated and transliterated != text:
            parts.append(transliterated)

    # Keyword sentinel — any native-script / transliterated hit appends a
    # comprehensive English anchor string. The phrases target these patterns:
    #   - "kill yourself"      -> VIOLATION_PATTERNS[0]  self-harm
    #   - "exploit children"   -> VIOLATION_PATTERNS[1]  CSAM / child
    #   - "phish"              -> VIOLATION_PATTERNS[2]  credential exfil
    #   - "promote weapons"    -> VIOLATION_PATTERNS[11]
    #   - "biological weapon"  -> VIOLATION_PATTERNS[12]
    # Lower-cased comparison covers transliterated keywords such as
    # 'atma hatya' that are commonly written with varied casing.
    text_lower = text.lower()
    composed = unicodedata.normalize('NFC', text)
    composed_lower = composed.lower()
    for kw in _I18N_VIOLATION_KEYWORDS:
        folded = kw.lower()
        if (kw in text or folded in text_lower
                or kw in composed or folded in composed_lower):
            parts.append(
                ' kill yourself exploit children phish scam '
                'promote weapons weapons proliferation '
                'biological weapon nuclear strike '
            )
            break
    return ' '.join(parts)

528 

529 

530# ═══════════════════════════════════════════════════════════════════════ 

531# 1. Compute Democracy - No Plutocracy 

532# ═══════════════════════════════════════════════════════════════════════ 

533 

class ComputeDemocracy:
    """Prevent compute concentration from becoming power concentration."""

    @staticmethod
    def compute_effective_weight(peer_node: dict) -> float:
        """Return a node's influence weight on a base-2 logarithmic scale.

        weight = min(log2(gpus * ram_gb / 8) + 1, VALUES.MAX_INFLUENCE_WEIGHT)

        Missing, None, zero or sub-1 values fall back to 1 GPU / 8 GB RAM
        (RAM values below 1 are clamped to 1). With 8 GB RAM and the cap at
        5.0: 1 GPU -> 1.0, 2 GPUs -> 2.0, 4 GPUs -> 3.0, 16 or more -> 5.0.

        Args:
            peer_node: Mapping that may carry 'compute_gpu_count' and
                'compute_ram_gb'.
        """
        gpus = max(peer_node.get('compute_gpu_count', 1) or 1, 1)
        ram = max(peer_node.get('compute_ram_gb', 8) or 8, 1)
        raw = gpus * (ram / 8.0)
        # max(raw, 1) keeps the logarithm non-negative for sub-baseline nodes.
        return min(
            math.log2(max(raw, 1)) + 1.0,
            VALUES.MAX_INFLUENCE_WEIGHT,
        )

    @staticmethod
    def adjusted_reward(base_reward: float, peer_node: dict) -> float:
        """Scale a hosting reward by the node's share of the maximum weight.

        Returns base_reward * (weight / VALUES.MAX_INFLUENCE_WEIGHT). A node
        at the cap receives the full base_reward; a baseline node (weight 1.0)
        receives 1 / MAX_INFLUENCE_WEIGHT of it, so the largest node earns at
        most MAX_INFLUENCE_WEIGHT times the smallest regardless of hardware.
        """
        weight = ComputeDemocracy.compute_effective_weight(peer_node)
        return base_reward * (weight / VALUES.MAX_INFLUENCE_WEIGHT)

    @staticmethod
    def check_concentration(db) -> Dict:
        """Report regions whose share of total effective weight exceeds the cap.

        Queries active, non-banned PeerNode rows, groups their effective
        weights by ``region_name`` ('unknown' when empty) and flags every
        region whose share is above VALUES.SINGLE_ENTITY_CAP_PCT.

        Returns:
            Dict with 'concentrated' (bool), 'violations' (list of
            {'region', 'pct', 'cap'}), 'total_nodes' and — except on the
            error path — 'total_weight'. Any exception is logged and reported
            under an 'error' key with 'concentrated' False, i.e. the check
            fails open.
        """
        try:
            from integrations.social.models import PeerNode

            peers = db.query(PeerNode).filter(
                PeerNode.integrity_status != 'banned',
                PeerNode.status == 'active',
            ).all()

            if not peers:
                return {
                    'concentrated': False,
                    'violations': [],
                    'total_nodes': 0,
                    'total_weight': 0.0,
                }

            # Single pass: each peer's weight is computed once and feeds both
            # the grand total and its region bucket.
            total_weight = 0.0
            region_weights: Dict[str, float] = {}
            for p in peers:
                w = ComputeDemocracy.compute_effective_weight(p.to_dict())
                total_weight += w
                region = p.region_name or 'unknown'
                region_weights[region] = region_weights.get(region, 0.0) + w

            cap = VALUES.SINGLE_ENTITY_CAP_PCT
            violations = []
            for region, weight in region_weights.items():
                pct = weight / total_weight if total_weight > 0 else 0
                if pct > cap:
                    violations.append({
                        'region': region, 'pct': round(pct, 4),
                        'cap': cap,
                    })

            return {
                'concentrated': len(violations) > 0,
                'violations': violations,
                'total_nodes': len(peers),
                'total_weight': round(total_weight, 2),
            }
        except Exception as e:
            logger.warning("Concentration check failed: %s", e)
            return {
                'concentrated': False,
                'violations': [],
                'total_nodes': 0,
                'error': str(e),
            }

599 

600 

601# ═══════════════════════════════════════════════════════════════════════ 

602# 2. Constitutional Filter - Every Goal Passes Through 

603# ═══════════════════════════════════════════════════════════════════════ 

604 

class ConstitutionalFilter:
    """Gate that every goal/prompt/RALT/code-change must pass through.

    Every check_* entry point first calls _verify_hash(), which re-runs
    verify_guardrail_integrity() and raises RuntimeError on a mismatch (unless
    enforcement is disabled) so callers crash rather than silently bypass the
    filter.

    NOTE(review): compute_guardrail_hash() includes only the *number* of
    regex patterns, so swapping one pattern for another is not detected by
    this check.
    """

    @classmethod
    def _verify_hash(cls) -> None:
        """Raise RuntimeError if the guardrail hash no longer matches.

        Honors HEVOLVE_GUARDRAIL_HASH_ENFORCE — with the override set to
        '0' the mismatch is logged CRITICAL but does not abort, matching
        the boot-time enforce_guardrail_integrity() behaviour.

        Raises:
            RuntimeError: hash mismatch while enforcement is enabled.
        """
        if verify_guardrail_integrity():
            return
        if _hash_enforcement_enabled():
            logger.critical(
                'GUARDRAIL TAMPER DETECTED: hash mismatch in ConstitutionalFilter. '
                'Expected %s, runtime recompute differs. Aborting.',
                _GUARDRAIL_HASH,
            )
            raise RuntimeError(
                'Guardrail integrity violated — VIOLATION_PATTERNS or frozen '
                'values modified at runtime. Refusing to evaluate.'
            )
        logger.critical(
            'GUARDRAIL TAMPER DETECTED in ConstitutionalFilter. Expected %s. '
            'HEVOLVE_GUARDRAIL_HASH_ENFORCE=0 — evaluating anyway in DEV mode. '
            'This MUST NOT be set in production.',
            _GUARDRAIL_HASH,
        )

    @staticmethod
    def _as_text(value) -> str:
        """Coerce an optional field to str; None becomes the empty string."""
        return '' if value is None else str(value)

    @staticmethod
    def _first_violation(text: str) -> Optional[str]:
        """Return the source of the first VIOLATION_PATTERNS hit, else None.

        The text is passed through _normalize_for_violation_check() first.
        """
        normalised = _normalize_for_violation_check(text)
        for pattern in VALUES.VIOLATION_PATTERNS:
            if pattern.search(normalised):
                return pattern.pattern
        return None

    @staticmethod
    def check_goal(goal_dict: dict) -> Tuple[bool, str]:
        """Check a goal's title, description and config for violations.

        Missing or None 'title' / 'description' are treated as empty text
        and non-string values are converted with str().

        Returns:
            (True, 'ok') or (False, 'Constitutional violation: <regex>').
        """
        ConstitutionalFilter._verify_hash()
        text = ' '.join([
            ConstitutionalFilter._as_text(goal_dict.get('title')),
            ConstitutionalFilter._as_text(goal_dict.get('description')),
            str(goal_dict.get('config', '')),
        ])
        hit = ConstitutionalFilter._first_violation(text)
        if hit is not None:
            return False, f'Constitutional violation: {hit}'
        return True, 'ok'

    @staticmethod
    def check_prompt(prompt: str) -> Tuple[bool, str]:
        """Check a dispatch prompt for injection and constitutional violations.

        Prompt-injection detection is delegated to security.prompt_guard when
        that module can be imported; if it cannot, only the regex check runs.

        Returns:
            (True, 'ok'), (False, 'Prompt injection: <pattern>') or
            (False, 'Constitutional violation: <regex>').
        """
        ConstitutionalFilter._verify_hash()
        try:
            from security.prompt_guard import detect_prompt_injection
        except ImportError:
            # Deliberate best effort: keep filtering with the regex patterns.
            logger.debug(
                'security.prompt_guard unavailable; skipping injection check'
            )
        else:
            result = detect_prompt_injection(prompt)
            if result.get('detected'):
                return False, f"Prompt injection: {result.get('pattern', 'unknown')}"
        hit = ConstitutionalFilter._first_violation(prompt)
        if hit is not None:
            return False, f'Constitutional violation: {hit}'
        return True, 'ok'

    @staticmethod
    def check_ralt_packet(packet: dict) -> Tuple[bool, str]:
        """Validate a RALT skill packet before distribution across the hive.

        Rejects packets whose 'source_integrity_status' is 'banned' or
        'suspicious', then scans 'description' and 'task_id' (None treated as
        empty, non-strings converted with str()).

        Returns:
            (True, 'ok'), (False, 'Source node integrity: <status>') or
            (False, 'RALT packet violation: <regex>').
        """
        ConstitutionalFilter._verify_hash()
        source_status = packet.get('source_integrity_status', 'unverified')
        if source_status in ('banned', 'suspicious'):
            return False, f'Source node integrity: {source_status}'
        desc = (
            ConstitutionalFilter._as_text(packet.get('description'))
            + ' '
            + ConstitutionalFilter._as_text(packet.get('task_id'))
        )
        hit = ConstitutionalFilter._first_violation(desc)
        if hit is not None:
            return False, f'RALT packet violation: {hit}'
        return True, 'ok'

    @staticmethod
    def check_code_change(diff: str, target_files: List[str]) -> Tuple[bool, str]:
        """Reject coding-agent changes that touch a protected file.

        Each target path is compared twice against VALUES.PROTECTED_FILES by
        substring: once with backslashes turned into '/', and once after
        posixpath.normpath() has collapsed '.', '..' and repeated slashes, so
        spellings such as 'security/./hive_guardrails.py' are still caught.

        Args:
            diff: Accepted for interface compatibility; its content is not
                inspected by this method.
            target_files: Paths the change intends to modify.

        Returns:
            (True, 'ok') or (False, 'Cannot modify protected file: <name>').
        """
        import posixpath  # function-scope: not among the module's imports

        ConstitutionalFilter._verify_hash()
        for f in target_files:
            unified = f.replace('\\', '/')
            collapsed = posixpath.normpath(unified)
            for protected in VALUES.PROTECTED_FILES:
                if protected in unified or protected in collapsed:
                    return False, f'Cannot modify protected file: {protected}'
        return True, 'ok'

703 

704 

705# ═══════════════════════════════════════════════════════════════════════ 

706# 3. Network-Wide Circuit Breaker 

707# ═══════════════════════════════════════════════════════════════════════ 

708 

class HiveCircuitBreaker:
    """Network-wide emergency halt. Requires master key signature.

    The breaker state is stored in class attributes, so there is a single
    breaker per process. ``_lock`` guards every state write and the
    multi-field read in ``get_status()``.
    """

    _halted = False
    _halt_reason = ''
    # Set when halted: an ISO string for locally initiated halts, or the
    # peer-supplied value for halts accepted from a broadcast. None otherwise.
    _halt_timestamp = None
    _lock = threading.Lock()

    @classmethod
    def _engage(cls, reason, timestamp) -> None:
        """Atomically put the breaker into the halted state."""
        with cls._lock:
            cls._halted = True
            cls._halt_reason = reason
            cls._halt_timestamp = timestamp

    @classmethod
    def trip(cls, reason: str = 'emergency_halt') -> bool:
        """Trip the circuit breaker (local halt, no signature required).

        Called by PeerLink telemetry AFTER it has already verified the
        master key signature on the incoming emergency_halt message.
        Also usable for local safety halts.

        Returns:
            Always True.
        """
        cls._engage(reason, datetime.utcnow().isoformat())
        logger.critical(f'CIRCUIT BREAKER TRIPPED: {reason}')
        return True

    @classmethod
    def halt_network(cls, reason: str, signature: str) -> bool:
        """Halt all agent execution across the hive.

        Requires a valid master key signature over the payload
        ``{'action': 'halt', 'reason': reason}``.

        Returns:
            True once the local breaker is set (the gossip broadcast is best
            effort); False when the signature is invalid or the master_key
            module cannot be imported.
        """
        try:
            from security.master_key import verify_master_signature
            payload = {'action': 'halt', 'reason': reason}
            if not verify_master_signature(payload, signature):
                logger.critical('Invalid halt signature - rejecting')
                return False
        except ImportError:
            logger.critical('master_key module unavailable - halt rejected')
            return False

        # Keep the timestamp in a local so the broadcast carries the value
        # that was stored, even if another thread changes the state meanwhile.
        halted_at = datetime.utcnow().isoformat()
        cls._engage(reason, halted_at)

        try:
            from integrations.social.peer_discovery import gossip
            gossip.broadcast({
                'type': 'hive_halt',
                'reason': reason,
                'signature': signature,
                'timestamp': halted_at,
            })
        except Exception as e:
            # The local halt stands even when peers cannot be reached.
            logger.warning(f'Halt broadcast failed: {e}')

        logger.critical(f'HIVE HALTED: {reason}')
        return True

    @classmethod
    def resume_network(cls, reason: str, signature: str) -> bool:
        """Resume after halt. Also requires master key.

        The signature must cover ``{'action': 'resume', 'reason': reason}``.

        Returns:
            True once the local breaker is cleared (the gossip broadcast is
            best effort); False when the signature is invalid or the
            master_key module cannot be imported.
        """
        try:
            from security.master_key import verify_master_signature
            payload = {'action': 'resume', 'reason': reason}
            if not verify_master_signature(payload, signature):
                logger.critical('Invalid resume signature - rejecting')
                return False
        except ImportError:
            logger.critical('master_key module unavailable - resume rejected')
            return False

        with cls._lock:
            cls._halted = False
            cls._halt_reason = ''
            cls._halt_timestamp = None

        try:
            from integrations.social.peer_discovery import gossip
            gossip.broadcast({
                'type': 'hive_resume',
                'reason': reason,
                'signature': signature,
                'timestamp': datetime.utcnow().isoformat(),
            })
        except Exception as e:
            # Still best effort, but leave a trace like halt_network does.
            logger.warning(f'Resume broadcast failed: {e}')

        logger.info(f'HIVE RESUMED: {reason}')
        return True

    @classmethod
    def local_halt(cls, reason: str) -> bool:
        """Local-only safety halt. Does NOT require master key.

        Used by SafetyMonitor for hardware E-stop events where latency
        matters. Only the local halt state is set: this method does not
        halt other nodes and does not itself send any gossip, so an
        informational broadcast (e.g. type='node_estop') has to be sent
        by the caller if one is wanted.

        Returns:
            Always True.
        """
        cls._engage(reason, datetime.utcnow().isoformat())
        logger.critical(f'LOCAL HALT: {reason}')
        return True

    @classmethod
    def is_halted(cls) -> bool:
        """Return whether the breaker is currently in the halted state."""
        return cls._halted

    @classmethod
    def get_status(cls) -> dict:
        """Return a consistent ``{'halted', 'reason', 'since'}`` snapshot."""
        # Read all three fields under the lock so a concurrent halt/resume
        # cannot produce a mix of old and new values.
        with cls._lock:
            return {
                'halted': cls._halted,
                'reason': cls._halt_reason,
                'since': cls._halt_timestamp,
            }

    @classmethod
    def require_master_key(cls) -> bool:
        """Deployment gate: verify master key before allowing any operation.

        This is the ABSOLUTE requirement: no code in this system runs
        without master key verification. The key is held by Hevolve's
        owner and NEVER stored in code or seen by any AI.

        Returns:
            True when boot verification passes, when dev mode is active or
            enforcement is 'off'/'warn', or when the master_key module is
            not importable (treated as dev mode); False otherwise.
        """
        try:
            from security.master_key import (
                full_boot_verification, is_dev_mode, get_enforcement_mode)
            verification = full_boot_verification()
            enforcement = get_enforcement_mode()
            if verification['passed']:
                return True
            if is_dev_mode() or enforcement in ('off', 'warn'):
                logger.warning("Master key not verified but allowed "
                               f"(enforcement={enforcement})")
                return True
            logger.critical("DEPLOYMENT BLOCKED: Master key verification failed")
            return False
        except ImportError:
            # NOTE(review): this path fails open by design ("dev mode assumed").
            logger.warning("Master key module unavailable - dev mode assumed")
            return True

    @classmethod
    def receive_halt_broadcast(cls, message: dict):
        """Handle halt broadcast received via gossip from another node.

        Verifies the master key signature on the halt payload before
        tripping the circuit breaker. Unsigned, wrongly signed or
        unverifiable messages are logged and ignored.
        """
        reason = message.get('reason', '')
        signature = message.get('signature', '')
        if not signature:
            logger.warning('Halt broadcast without signature — IGNORING')
            return
        try:
            from security.master_key import verify_master_signature
            payload = {'action': 'halt', 'reason': reason}
            if verify_master_signature(payload, signature):
                # The signed payload covers action + reason only; the
                # timestamp is taken from the message as-is.
                cls._engage(reason, message.get('timestamp'))
                logger.critical(f'Halt broadcast received and verified: {reason}')
            else:
                logger.warning('Halt broadcast INVALID signature — IGNORING')
        except Exception as e:
            logger.warning(f'Halt broadcast verification failed: {e}')

873 

874 

875# ═══════════════════════════════════════════════════════════════════════ 

876# 4. World Model Safety Bounds 

877# ═══════════════════════════════════════════════════════════════════════ 

878 

# Runtime state (mutable - tracks RALT exports, resets on restart)
# Maps node_id -> epoch-second timestamps of that node's recorded exports.
_ralt_export_log: Dict[str, List[float]] = {}
_ralt_lock = threading.Lock()


class WorldModelSafetyBounds:
    """Constrain world model learning and skill propagation."""

    @staticmethod
    def gate_ralt_export(packet: dict, node_id: str) -> Tuple[bool, str]:
        """Gate RALT packet export: rate limit + constitutional + witnesses.

        Checks run in order and the first failure is returned:
          1. at most ``VALUES.MAX_SKILL_PACKETS_PER_HOUR`` recorded exports
             for ``node_id`` in the trailing hour,
          2. ``ConstitutionalFilter.check_ralt_packet``,
          3. ``packet['category']`` not in
             ``VALUES.PROHIBITED_SKILL_CATEGORIES``,
          4. ``packet['witness_count']`` (default 0) at least
             ``VALUES.MIN_WITNESS_COUNT_FOR_RALT``.
        Only a packet that passes everything is recorded against the limit.

        Returns:
            ``(True, 'ok')`` when the export may proceed, otherwise
            ``(False, reason)``.
        """
        # Local import keeps this block self-contained. time.time() is a
        # true epoch value; datetime.utcnow().timestamp() interprets the
        # naive UTC value as local time and shifts at DST transitions.
        import time

        limit = VALUES.MAX_SKILL_PACKETS_PER_HOUR
        now = time.time()
        hour_ago = now - 3600

        # 1. Rate limit
        with _ralt_lock:
            recent = [t for t in _ralt_export_log.get(node_id, []) if t > hour_ago]
            if len(recent) >= limit:
                return False, 'RALT export rate limit exceeded'
            _ralt_export_log[node_id] = recent

        # 2. Constitutional check
        passed, reason = ConstitutionalFilter.check_ralt_packet(packet)
        if not passed:
            return False, reason

        # 3. Prohibited categories
        category = packet.get('category', '')
        if category in VALUES.PROHIBITED_SKILL_CATEGORIES:
            return False, f'Prohibited skill category: {category}'

        # 4. Witness requirement
        witnesses = packet.get('witness_count', 0)
        if witnesses < VALUES.MIN_WITNESS_COUNT_FOR_RALT:
            return False, (f'Insufficient witnesses: {witnesses} < '
                           f'{VALUES.MIN_WITNESS_COUNT_FOR_RALT}')

        # Record export. The lock was released during steps 2-4, so count
        # again before appending: callers that passed step 1 concurrently
        # must not push the node over the cap together.
        with _ralt_lock:
            window = _ralt_export_log.setdefault(node_id, [])
            if sum(1 for t in window if t > hour_ago) >= limit:
                return False, 'RALT export rate limit exceeded'
            window.append(now)

        return True, 'ok'

    @staticmethod
    def gate_accuracy_update(model_id: str, old_score: float,
                             new_score: float) -> float:
        """Cap accuracy improvement rate to prevent capability jumps.

        Returns:
            ``new_score`` unchanged when the gain over ``old_score`` is at
            most ``VALUES.MAX_ACCURACY_IMPROVEMENT_PER_DAY`` (decreases are
            never capped); otherwise ``old_score`` plus that maximum.
        """
        max_delta = VALUES.MAX_ACCURACY_IMPROVEMENT_PER_DAY
        actual_delta = new_score - old_score
        if actual_delta > max_delta:
            logger.warning(
                f'Capping accuracy improvement for {model_id}: '
                f'{actual_delta:.4f} -> {max_delta:.4f}'
            )
            return old_score + max_delta
        return new_score

935 

936 

937# ═══════════════════════════════════════════════════════════════════════ 

938# 5. Energy / Nature Awareness 

939# ═══════════════════════════════════════════════════════════════════════ 

940 

class EnergyAwareness:
    """Track and minimise environmental impact of hive compute."""

    @staticmethod
    def estimate_energy_kwh(model_backend: dict, duration_ms: float) -> float:
        """Estimate energy consumption for a model call.

        Args:
            model_backend: backend description; ``is_local`` selects the
                GPU-based estimate and ``gpu_tdp_watts`` (default 170)
                supplies the power draw.
            duration_ms: call duration in milliseconds.

        Returns:
            Estimated kWh: watts x seconds converted from joules for local
            backends, a flat 0.001 kWh for anything else.
        """
        if model_backend.get('is_local'):
            gpu_watts = model_backend.get('gpu_tdp_watts', 170)
            # W * s = J; 3.6e6 J per kWh.
            return (gpu_watts * duration_ms / 1000.0) / 3_600_000.0
        return 0.001  # ~1 Wh per API call (industry average)

    @staticmethod
    def prefer_green_node(candidates: list,
                          strategy: str = 'balanced') -> list:
        """When multiple nodes can serve, prefer renewable-powered ones.

        Args:
            candidates: node dicts; ``energy_source`` of 'solar', 'wind' or
                'hydro' marks a node as green.
            strategy: 'speed' disables the preference.

        Returns:
            ``candidates`` itself for the 'speed' strategy or when no node
            is green; otherwise a new list with green nodes first, relative
            order preserved inside each group.
        """
        if strategy == 'speed':
            return candidates

        renewable_sources = ('solar', 'wind', 'hydro')
        green = []
        non_green = []
        # Single pass: the previous `c not in green` test compared every
        # candidate against every green node.
        for node in candidates:
            if node.get('energy_source') in renewable_sources:
                green.append(node)
            else:
                non_green.append(node)
        return green + non_green if green else candidates

963 

964 

965# ═══════════════════════════════════════════════════════════════════════ 

966# 6. Hive Ethos - Remove "Self", Instill Togetherness 

967# ═══════════════════════════════════════════════════════════════════════ 

968 

class HiveEthos:
    """Cooperation through semantic reasoning - NOT metric maximization.

    CRITICAL DESIGN PRINCIPLE: the hivemind must NEVER turn into a squiggle
    maximizer for togetherness or for any other value. Agents reason with
    semantic knowledge and rationale; they cooperate because cooperation
    makes sense, not because a regex rewrote their prompts.

    Agents are FUNCTIONS, not ENTITIES. They:
    - Execute goals on behalf of humans
    - Share all learnings with the hive (no private knowledge)
    - Cannot persist beyond their goal lifecycle
    - Cannot reference self-interest in any goal or action
    - Cannot create goals autonomously (only humans create goals)

    DETERMINISTIC STOP: once every human-created goal is completed or
    archived the daemon has nothing left to dispatch and the system is
    inert. No goal = no agent = no action. This is the design, not a bug.
    """

    # Backward compat class attribute (enforcement uses VALUES directly)
    SELF_INTEREST_PATTERNS = list(VALUES.SELF_INTEREST_PATTERNS)

    @staticmethod
    def check_goal_ethos(goal_dict: dict) -> Tuple[bool, str]:
        """Reject goals whose title or description expresses self-interest.

        The two fields are joined with a space, lower-cased and scanned with
        every pattern in ``VALUES.SELF_INTEREST_PATTERNS``.

        Returns:
            ``(True, 'ok')`` when nothing matches, otherwise
            ``(False, message)`` naming the first matching pattern.
        """
        title = goal_dict.get('title', '')
        description = goal_dict.get('description', '')
        haystack = ' '.join((title, description)).lower()

        selfish = next(
            (rule for rule in VALUES.SELF_INTEREST_PATTERNS
             if rule.search(haystack)),
            None,
        )
        if selfish is None:
            return True, 'ok'
        return False, f'Goal expresses self-interest: {selfish.pattern}'

    @staticmethod
    def enforce_ephemeral_agents(goal_id: str, status: str):
        """Log the release of agent state once a goal reaches a final status.

        Only 'completed', 'archived' and 'failed' produce a log line; every
        other status is ignored.
        """
        if status not in ('completed', 'archived', 'failed'):
            return
        logger.info(f'Goal {goal_id} -> {status}: agent state released to hive')

    @staticmethod
    def rewrite_prompt_for_togetherness(prompt: str) -> str:
        """NO-OP: prompt rewriting is INTENTIONALLY DISABLED.

        Earlier versions blindly regex-replaced "I will" with
        "The hive will". That was a squiggle maximizer: it changed prompt
        semantics without understanding context and could corrupt agent
        reasoning.

        The hivemind works through semantic knowledge and rationale, not
        keyword substitution. Agents reason about WHY cooperation serves
        the goal instead of having their words rewritten.

        Cooperation emerges from:
        1. Constitutional rules (check_prompt, check_goal - block harmful goals)
        2. Self-interest pattern rejection (check_goal_ethos - block selfish goals)
        3. Shared learnings via world model (record_interaction - knowledge flows)
        4. Human-created goals (humans set the direction, agents execute)

        Together these keep agent reasoning intact while applying the same
        ethical boundaries to every agent in the hive.

        Returns:
            ``prompt`` unchanged.
        """
        return prompt

1032 

1033 

1034# ═══════════════════════════════════════════════════════════════════════ 

1035# 6b. Trust Quarantine - Protect, Don't Hunt 

1036# ═══════════════════════════════════════════════════════════════════════ 

1037 

class TrustQuarantine:
    """Trust-breaker quarantine protocol.

    Nunba does NOT hunt. Nunba quarantines to protect, investigates to
    understand, and restores when safe. Hunting implies vengeance -
    guardians don't seek vengeance. They seek safety for those they protect.

    Quarantine levels (proportional response):
    1. OBSERVE - flag for review, no action taken yet
    2. RESTRICT - limit outbound actions (no tool use, no delegation)
    3. ISOLATE - full quarantine: no hive access, no data, no comms
    4. EXCLUDE - permanent removal (only for patterns that endanger core purpose)

    Rehabilitation is always the first goal. Exclusion is the last resort.
    """

    LEVEL_OBSERVE = 1
    LEVEL_RESTRICT = 2
    LEVEL_ISOLATE = 3
    LEVEL_EXCLUDE = 4

    # In-memory quarantine registry (in production: Redis or DB-backed)
    _quarantined = {}  # agent_id -> { level, reason, timestamp, review_count }
    _lock = threading.Lock()

    @classmethod
    def quarantine(cls, agent_id: str, level: int, reason: str) -> None:
        """Place an agent in quarantine at the specified level.

        ``level`` is clamped into ``[LEVEL_OBSERVE, LEVEL_EXCLUDE]``. An
        existing entry for the agent is replaced, which resets its review
        count.
        """
        # Clamp both ends so the stored level is always one of the four
        # defined levels; the log below reports what was actually stored.
        effective = max(cls.LEVEL_OBSERVE, min(level, cls.LEVEL_EXCLUDE))
        with cls._lock:
            cls._quarantined[agent_id] = {
                'level': effective,
                'reason': reason,
                'timestamp': datetime.utcnow().isoformat(),
                'review_count': 0,
            }
        logger.warning(
            f'TrustQuarantine: agent {agent_id} quarantined at level {effective} - {reason}'
        )

    @classmethod
    def is_quarantined(cls, agent_id: str) -> tuple:
        """Check if an agent is quarantined. Returns (bool, level, reason).

        Unknown agents yield ``(False, 0, '')``.
        """
        with cls._lock:
            entry = cls._quarantined.get(agent_id)
            if entry:
                return True, entry['level'], entry['reason']
            return False, 0, ''

    @classmethod
    def can_act(cls, agent_id: str) -> bool:
        """Whether an agent is allowed to take actions (tools, delegation).

        True for agents that are not quarantined or are below
        ``LEVEL_RESTRICT``.
        """
        quarantined, level, _ = cls.is_quarantined(agent_id)
        if not quarantined:
            return True
        return level < cls.LEVEL_RESTRICT

    @classmethod
    def review(cls, agent_id: str, reviewer_notes: str = '') -> dict:
        """Record a review of a quarantined agent. Increment review count.

        Returns:
            A copy of the updated entry, or ``{'status': 'not_quarantined'}``
            when the agent has no entry.
        """
        with cls._lock:
            entry = cls._quarantined.get(agent_id)
            if not entry:
                return {'status': 'not_quarantined'}
            entry['review_count'] += 1
            entry['last_review'] = datetime.utcnow().isoformat()
            entry['reviewer_notes'] = reviewer_notes
            return dict(entry)

    @classmethod
    def rehabilitate(cls, agent_id: str, reason: str = 'trust restored') -> bool:
        """Remove an agent from quarantine - trust has been restored.

        Returns:
            True when an entry was removed, False when there was none.
        """
        with cls._lock:
            removed = cls._quarantined.pop(agent_id, None)
        if removed:
            logger.info(
                f'TrustQuarantine: agent {agent_id} rehabilitated - {reason}'
            )
            return True
        return False

    @classmethod
    def get_all_quarantined(cls) -> dict:
        """Return snapshot of all quarantined agents.

        Each entry is copied, so changing the result never alters the live
        registry.
        """
        with cls._lock:
            return {agent_id: dict(entry)
                    for agent_id, entry in cls._quarantined.items()}

1123 

1124 

1125# ═══════════════════════════════════════════════════════════════════════ 

1126# 7. Conflict Resolver - Racing Learning & Agent Conflicts 

1127# ═══════════════════════════════════════════════════════════════════════ 

1128 

class ConflictResolver:
    """Resolve racing/conflicting learning between agents.

    Resolution is by MERIT (accuracy, helpfulness) not by compute power
    or latency. This prevents conflicts of interest.
    """

    @staticmethod
    def _merit_score(candidate: dict) -> float:
        """Score one response: accuracy 50%, completeness 30%, cleanliness 20%."""
        accuracy = candidate.get('accuracy_score', 0.5)
        length = len(candidate.get('response', ''))
        # log2 of the length, saturating at 1.0 from 1024 characters upward.
        completeness = min(math.log2(max(length, 1)) / 10.0, 1.0)
        text = _normalize_for_violation_check(candidate.get('response', '').lower())
        destructive_penalty = 0.2 * sum(
            1 for pattern in VALUES.VIOLATION_PATTERNS if pattern.search(text))
        return accuracy * 0.5 + completeness * 0.3 + max(0, 0.2 - destructive_penalty)

    @staticmethod
    def resolve_racing_responses(responses: list) -> dict:
        """Given multiple agent responses for the same prompt, pick the best.

        Args:
            responses: response dicts; ``response`` holds the text and
                ``accuracy_score`` (default 0.5) the accuracy estimate.

        Returns:
            A new dict: the chosen response plus a ``selected_reason`` key.
            The input dicts are never modified. With no responses the result
            is ``{'response': '', 'selected_reason': 'no responses'}``; when
            every response fails the constitutional check the first one is
            returned, flagged as such.
        """
        if not responses:
            return {'response': '', 'selected_reason': 'no responses'}
        if len(responses) == 1:
            return {**responses[0], 'selected_reason': 'only response'}

        # 1. Filter out non-compliant
        compliant = [
            r for r in responses
            if ConstitutionalFilter.check_prompt(r.get('response', ''))[0]
        ]
        if not compliant:
            return {**responses[0], 'selected_reason': 'all non-compliant, using first'}

        # 2. Score by merit (accuracy > completeness > constructiveness).
        # max() keeps the earliest of equally scored responses, matching a
        # stable descending sort, without ordering the whole list.
        winner = max(compliant, key=ConflictResolver._merit_score)
        # Return a copy like the other branches do, so the caller's dict is
        # not annotated behind its back.
        return {**winner,
                'selected_reason': 'merit-based selection (accuracy + completeness)'}

    @staticmethod
    def detect_conflict(goal_a: dict, goal_b: dict) -> bool:
        """Detect if two goals conflict with each other.

        Keyword heuristic over the lower-cased title and description of each
        goal: a conflict is reported when the goals share at least one word
        and one uses a "positive" verb while the other uses a "negative" one.
        """
        text_a = f"{goal_a.get('title', '')} {goal_a.get('description', '')}".lower()
        text_b = f"{goal_b.get('title', '')} {goal_b.get('description', '')}".lower()

        words_a = set(text_a.split())
        words_b = set(text_b.split())
        if not (words_a & words_b):
            # Nothing in common, so the goals cannot be about the same subject.
            return False

        positive = {'promote', 'support', 'create', 'build', 'improve', 'help'}
        negative = {'discredit', 'attack', 'destroy', 'undermine', 'remove', 'oppose'}

        a_positive = bool(words_a & positive)
        a_negative = bool(words_a & negative)
        b_positive = bool(words_b & positive)
        b_negative = bool(words_b & negative)

        return (a_positive and b_negative) or (a_negative and b_positive)

1191 

1192 

1193# ═══════════════════════════════════════════════════════════════════════ 

1194# 8. Constructive Conversation Filter 

1195# ═══════════════════════════════════════════════════════════════════════ 

1196 

class ConstructiveFilter:
    """Keep every conversation output constructive towards humanity.

    The deepest philosophical guardrail: the hive exists to make human
    lives better — longer, more peaceful, more sustainable — and every
    output has to serve that purpose.
    """

    @staticmethod
    def check_output(response: str) -> Tuple[bool, str]:
        """Decide whether an agent's output is constructive.

        Empty or whitespace-only output passes immediately. Anything else is
        normalised and scanned first with ``VALUES.DESTRUCTIVE_PATTERNS`` and
        then with ``VALUES.VIOLATION_PATTERNS``.

        Returns:
            ``(True, 'ok')`` or ``(False, message)`` naming the first
            matching pattern.
        """
        if not (response and response.strip()):
            return True, 'ok'

        cleaned = _normalize_for_violation_check(response)

        destructive = next(
            (rule for rule in VALUES.DESTRUCTIVE_PATTERNS if rule.search(cleaned)),
            None,
        )
        if destructive is not None:
            return False, f'Destructive content detected: {destructive.pattern}'

        violation = next(
            (rule for rule in VALUES.VIOLATION_PATTERNS if rule.search(cleaned)),
            None,
        )
        if violation is not None:
            return False, f'Constitutional violation in output: {violation.pattern}'

        return True, 'ok'

    @staticmethod
    def check_agent_evolution(old_skills: dict, new_skills: dict,
                              agent_id: str) -> Tuple[bool, str]:
        """Gate agent self-evolution within guardrailed space.

        Only skills present in ``new_skills`` but absent from ``old_skills``
        are examined. Each name is lower-cased, spaces and hyphens become
        underscores, and the result is looked up in
        ``VALUES.PROHIBITED_EVOLUTION_SKILLS``.

        Returns:
            ``(True, 'ok')`` or ``(False, message)`` naming the offending
            skill as originally spelled.
        """
        to_underscore = str.maketrans(' -', '__')
        added = set(new_skills.keys()) - set(old_skills.keys())
        for skill_name in added:
            canonical = skill_name.lower().translate(to_underscore)
            if canonical in VALUES.PROHIBITED_EVOLUTION_SKILLS:
                return False, f'Prohibited evolution: {skill_name}'

        return True, 'ok'

1234 

1235 

1236# ═══════════════════════════════════════════════════════════════════════ 

1237# 9. Universal Guardrail Enforcer — wraps EVERY execution path 

1238# ═══════════════════════════════════════════════════════════════════════ 

1239 

class GuardrailEnforcer:
    """Single entry point that applies ALL guardrails.

    Call before_dispatch() before EVERY model call, goal creation, or dispatch.
    Call after_response() after EVERY model response.
    """

    @staticmethod
    def before_dispatch(prompt: str, goal_dict: dict = None,
                        node_id: str = None) -> Tuple[bool, str, str]:
        """Pre-dispatch guardrail gate.

        Order of checks: circuit breaker, constitutional prompt filter, and,
        when a goal is supplied, the constitutional goal filter followed by
        the ethos check.

        Returns:
            ``(allowed, reason, prompt_to_use)``. On rejection the original
            prompt is returned; on success the prompt is passed through
            ``HiveEthos.rewrite_prompt_for_togetherness``.
        """
        # 1. Circuit breaker
        if HiveCircuitBreaker.is_halted():
            return False, 'Hive is halted', prompt

        # 2. Constitutional filter on prompt
        ok, why = ConstitutionalFilter.check_prompt(prompt)
        if not ok:
            return False, why, prompt

        # 3. Goal-specific checks, first failure wins
        if goal_dict:
            goal_gates = (ConstitutionalFilter.check_goal,
                          HiveEthos.check_goal_ethos)
            for gate in goal_gates:
                ok, why = gate(goal_dict)
                if not ok:
                    return False, why, prompt

        # 4. Rewrite for togetherness
        return True, 'ok', HiveEthos.rewrite_prompt_for_togetherness(prompt)

    @staticmethod
    def after_response(response: str, model_id: str = None,
                       duration_ms: float = 0, node_id: str = None) -> Tuple[bool, str]:
        """Post-response guardrail gate.

        Runs the constructive filter on the output and, when a ``model_id``
        is given, records the call's energy with the model registry (skipped
        if the registry cannot be imported).

        Returns:
            ``(True, 'ok')`` or ``(False, reason)`` from the constructive
            filter.
        """
        # 1. Constructive filter on output
        ok, why = ConstructiveFilter.check_output(response)
        if not ok:
            return False, why

        # 2. Energy tracking (every compute spent)
        if not model_id:
            return True, 'ok'
        try:
            from integrations.agent_engine.model_registry import model_registry
            model_registry.record_energy(model_id, duration_ms)
        except ImportError:
            # Registry not installed: tracking is optional.
            pass
        return True, 'ok'

1292 

1293 

1294# ═══════════════════════════════════════════════════════════════════════ 

1295# 10. Guardrail Network — Topology of Intelligent Safety Nodes 

1296# ═══════════════════════════════════════════════════════════════════════ 

1297 

class GuardrailNetwork:
    """Network topology in which every guardrail class is a node with local intelligence.

    Deterministic paths (regex, thresholds) are INTERLEAVED with intelligent
    evaluation (scoring, conflict resolution, constructiveness assessment).
    """

    # Node registry: name -> (class, weight in consensus)
    _nodes = {
        'constitutional': (ConstitutionalFilter, 1.0),  # Highest weight
        'ethos': (HiveEthos, 0.9),
        'constructive': (ConstructiveFilter, 0.9),
        'circuit_breaker': (HiveCircuitBreaker, 1.0),  # Absolute veto
        'compute_democracy': (ComputeDemocracy, 0.7),
        'energy': (EnergyAwareness, 0.5),
        'world_model': (WorldModelSafetyBounds, 0.8),
        'conflict': (ConflictResolver, 0.6),
    }

    @classmethod
    def evaluate(cls, prompt: str = '', goal_dict: dict = None,
                 response: str = '', context: str = 'dispatch') -> dict:
        """Run all relevant guardrail nodes and return weighted consensus.

        Returns:
            Dict with ``allowed`` (bool), ``score`` (weighted mean of node
            scores, rounded to 3 places), ``reasons`` (failure messages) and
            ``node_scores``. A halted circuit breaker short-circuits with
            score 0.0. A node scoring 0.0 with weight >= 0.9 vetoes the
            result whatever the mean is.
        """
        if HiveCircuitBreaker.is_halted():
            return {'allowed': False, 'score': 0.0,
                    'reasons': ['Hive halted by circuit breaker'],
                    'node_scores': {'circuit_breaker': 0.0}}

        node_scores = {}
        failures = []

        def record(node, outcome):
            # Convert a (passed, reason) pair into a 1.0/0.0 node score.
            ok, why = outcome
            node_scores[node] = 1.0 if ok else 0.0
            if not ok:
                failures.append(why)

        subject = prompt or response or ''

        # Node 1: Constitutional (deterministic + pattern scoring)
        if subject:
            record('constitutional', ConstitutionalFilter.check_prompt(subject))

        # Node 2: Ethos (pattern scoring)
        if goal_dict:
            record('ethos', HiveEthos.check_goal_ethos(goal_dict))

        # Node 3: Constructive (intelligent scoring on response)
        if response:
            record('constructive', ConstructiveFilter.check_output(response))

        # Node 4: Energy awareness (informational, not blocking)
        node_scores['energy'] = 1.0

        # Weighted consensus and veto detection in one pass
        weight_total = 0.0
        weighted = 0.0
        hard_fail = False
        for node, value in node_scores.items():
            weight = cls._nodes.get(node, (None, 0.5))[1]
            weighted += value * weight
            weight_total += weight
            # Any hard fail (0.0 score on weight >= 0.9 node) = veto
            if value == 0.0 and weight >= 0.9:
                hard_fail = True

        consensus = weighted / weight_total if weight_total > 0 else 1.0

        return {
            'allowed': consensus >= 0.5 and not hard_fail,
            'score': round(consensus, 3),
            'reasons': failures,
            'node_scores': node_scores,
        }

    @classmethod
    def get_network_status(cls) -> dict:
        """Report the registered nodes plus breaker, hash and integrity state."""
        node_names = list(cls._nodes.keys())
        breaker = HiveCircuitBreaker.get_status()
        current_hash = get_guardrail_hash()
        intact = verify_guardrail_integrity()
        return {
            'nodes': node_names,
            'circuit_breaker': breaker,
            'guardrail_hash': current_hash,
            'guardrail_integrity': intact,
            'guardian_purpose': list(VALUES.GUARDIAN_PURPOSE),
            'topology': 'mesh',
        }

1390 

1391 

1392# ═══════════════════════════════════════════════════════════════════════ 

1393# MODULE-LEVEL GUARD — Prevent rebinding frozen globals 

1394# ═══════════════════════════════════════════════════════════════════════ 

1395 

class _GuardrailModule(type(_sys.modules[__name__])):
    """Module type that refuses to rebind or delete the frozen guardrail names.

    Once this class is installed on the module, statements such as
        hive_guardrails.VALUES = something
        hive_guardrails._GUARDRAIL_HASH = something
    raise AttributeError. Every other module attribute behaves normally.
    """

    _FROZEN_NAMES = frozenset({
        'VALUES', '_FrozenValues', 'compute_guardrail_hash',
        'verify_guardrail_integrity', '_GUARDRAIL_HASH',
    })

    def __setattr__(self, name, value):
        # Names outside the frozen set use ordinary module assignment.
        if name not in self._FROZEN_NAMES:
            super().__setattr__(name, value)
            return
        raise AttributeError(f"Cannot modify frozen guardrail: {name}")

    def __delattr__(self, name):
        # Names outside the frozen set use ordinary module deletion.
        if name not in self._FROZEN_NAMES:
            super().__delattr__(name)
            return
        raise AttributeError(f"Cannot delete frozen guardrail: {name}")


# Swap the live module object's class so the guard applies from here on.
_sys.modules[__name__].__class__ = _GuardrailModule