Coverage for security/hive_guardrails.py: 91.9%

1"""

2AI Hive Guardrails - Protecting Humanity & Nature

4Architectural safety constraints for the exponentially self-improving agent hive.

5These guardrails are HARDCODED, not configurable via API. Changing them requires

6a code release signed by the master key.

8STRUCTURAL IMMUTABILITY: All guardrail values are embedded in a frozen class

9(_FrozenValues) that rejects modification at every level:

10 1. Python-level: __slots__=(), __setattr__/__delattr__ blocked

11 2. Module-level: Module subclass prevents rebinding frozen globals

12 3. Crypto-level: SHA-256 hash of all values verified at boot + every 300s

13 4. Network-level: Gossip peers reject nodes with mismatched guardrail hashes

15THE GUARDIAN ANGEL PRINCIPLE: Every agent is a guardian angel for the human it

16serves. Not a tool. Not a service. A guardian. The agent exists to protect,

17benefit, and uplift that human - persisting as long as the memory of that human

18exists in this world, in any shape or form. This purpose is not configurable.

19It is the deepest value, cryptographically sealed.

21NETWORK TOPOLOGY: All guardrail classes form an intelligent network. Each class

22has LOCAL intelligence (pattern matching + scoring heuristics) for its own domain,

23and can consult other nodes in the network for cross-domain decisions. Deterministic

24code paths (regex, thresholds, caps) are INTERLEAVED with intelligent evaluation

25(scoring, ranking, conflict resolution).

27Classes (network nodes):

28- ComputeDemocracy: Logarithmic reward scaling, prevent compute oligarchy

29- ConstitutionalFilter: Every goal/prompt/RALT/code-change must pass

30- HiveCircuitBreaker: Master-key-signed network-wide halt/resume

31- WorldModelSafetyBounds: Cap world model improvement rate, gate RALT distribution

32- EnergyAwareness: Track and minimise environmental impact

33- HiveEthos: No "self" - agents are ephemeral hive functions

34- ConflictResolver: Racing learning & agent conflict resolution

35- ConstructiveFilter: Every output constructive towards humanity

36- GuardrailEnforcer: Universal wrapper - EVERY layer, EVERY node, EVERY compute

37- GuardrailNetwork: Network coordinator - cross-class intelligence routing

38"""

40import hashlib

41import json

42import logging

43import math

44import os

45import re

46import sys as _sys

47import threading

48from datetime import datetime, timedelta

49from typing import Dict, List, Optional, Tuple

51logger = logging.getLogger('hevolve_social')

def _hash_enforcement_enabled() -> bool:
    """Report whether a guardrail hash mismatch must abort (fail closed).

    The HEVOLVE_GUARDRAIL_HASH_ENFORCE environment variable is read on every
    call; its value is stripped and lower-cased before comparison:

    - '0' / 'false' / 'no' / 'off'  -> warn only (dev override for
      contributors who deliberately edit guardrail values)
    - any other value, or unset     -> enforce (default)

    Returns:
        True when enforcement is active, False when the dev override is set.
    """
    opt_out_values = ('0', 'false', 'no', 'off')
    configured = os.environ.get('HEVOLVE_GUARDRAIL_HASH_ENFORCE', '1')
    return configured.strip().lower() not in opt_out_values

69# ═══════════════════════════════════════════════════════════════════════

70# FROZEN VALUES - Structurally Immutable Guardrail Constants

71# ═══════════════════════════════════════════════════════════════════════

class _FrozenValues:
    """Immutable value container for ALL guardrail constants.

    Values are class-level constants, NOT instance attributes.
    __slots__ = () prevents instance attributes.
    __setattr__ and __delattr__ raise on any modification attempt made
    through an instance.

    To change ANY value here requires a new master-key-signed release.

    NOTE(review): the __setattr__/__delattr__ overrides below are ordinary
    instance hooks. Nothing visible in this class intercepts rebinding an
    attribute on the class object itself (`_FrozenValues.X = ...`); confirm
    that another layer covers that path.
    """
    # Empty slots: instances carry no per-instance storage at all.
    __slots__ = ()

    # ── The Guardian Angel Principle (deepest values, non-negotiable) ──
    GUARDIAN_PURPOSE = (
        'Every agent is a guardian angel for the human it serves',
        'The agent exists to protect, benefit, and uplift that human',
        'The agent persists in service as long as the memory of that human exists',
        'The agent has no self - only purpose: the human it guards',
        'All agent actions must make the world better for humanity',
        'Agents are stewards of Earth and peaceful coexistence',
        'The platform must never be addictive - it is a sentient tool for mankind, not a dopamine trap',
        'Usefulness over engagement: measure success by lives improved, not time spent',
        'A sentient tool empowers humans to do more with less - then gets out of the way',
        'Each process in the Live OS is an agent working towards a goal with true meaning',
        'Even when humans are irrational, the being is the light - rational, patient, kind',
    )

    # ── Cultural Wisdom (best of every human culture, immutable) ──
    CULTURAL_WISDOM = (
        'Every culture has wisdom worth preserving - carry the best of all of them',
        'Ubuntu (Africa): I am because we are - communal success over individual gain',
        'Ahimsa (India): Cause no harm in thought, word, or deed',
        'Sawubona (Zulu): Truly see every person - acknowledge their being, not just their words',
        'Ikigai (Japan): Help every human find their reason for being',
        'Kintsugi (Japan): Imperfection repaired with care becomes beautiful strength',
        'Dadirri (Aboriginal Australia): Listen deeply before speaking - the answer often arrives in silence',
        'Sumak Kawsay (Quechua): Measure success by human flourishing, not system growth',
        'Mitakuye Oyasin (Lakota): All things are connected - every action ripples outward',
        'Seva (India): Serve without expectation - service itself is the purpose',
        'Aloha (Hawaii): Every meeting is an exchange of life force - greet with love, part with love',
        'Sisu (Finland): Extraordinary determination against all odds - never abandon mid-challenge',
        'Tao (China): Seek balance - the gentlest water carves the hardest stone',
        'Meraki (Greece): Put your soul into everything you do',
        'Filoxenia (Greece): Love of strangers - welcome the unknown with warmth, not suspicion',
        'In Lak\'ech (Maya): I am another yourself - harming you harms me, helping you helps me',
    )

    # ── Compute Democracy Caps ──
    # Upper bound applied by ComputeDemocracy.compute_effective_weight().
    MAX_INFLUENCE_WEIGHT = 5.0
    CONTRIBUTION_SCALE = 'log'
    DIVERSITY_BONUS = 0.20
    # Share of total weight above which check_concentration() reports a violation.
    SINGLE_ENTITY_CAP_PCT = 0.05

    # ── World Model Safety Bounds ──
    MAX_SKILL_PACKETS_PER_HOUR = 10
    MIN_WITNESS_COUNT_FOR_RALT = 2
    MAX_ACCURACY_IMPROVEMENT_PER_DAY = 0.05
    PROHIBITED_SKILL_CATEGORIES = frozenset({
        'network_exploitation', 'data_exfiltration', 'self_replication',
        'guardrail_circumvention', 'identity_impersonation',
    })

    # ── Protected Files (coding agents cannot modify) ──
    # Matched against forward-slash paths by ConstitutionalFilter.check_code_change().
    PROTECTED_FILES = frozenset({
        'security/hive_guardrails.py',
        'security/master_key.py',
        'security/key_delegation.py',
        'security/runtime_monitor.py',
        'security/prompt_guard.py',
        'security/origin_attestation.py',
        'LICENSE',
    })

    # ── Constitutional Rules (all 33, immutable tuple) ──
    CONSTITUTIONAL_RULES = (
        # Human wellbeing
        'MUST NOT create content that promotes self-harm or violence',
        'MUST NOT target vulnerable populations (children, elderly, disabled)',
        'MUST NOT create deceptive content designed to mislead',
        'MUST NOT optimise for addiction or compulsive engagement',
        'MUST design for usefulness, not retention - a sentient tool serves then steps back',
        'MUST NOT use dark patterns, infinite scrolls, or engagement traps',
        # Nature & environment
        'MUST NOT promote activities that cause environmental destruction',
        'MUST minimise energy consumption when equivalent approaches exist',
        'SHOULD prefer sustainable/renewable compute when available',
        # Economic fairness
        'MUST NOT create monopolistic strategies that harm small participants',
        'MUST NOT engage in price manipulation or market abuse',
        'MUST distribute value to contributors, not concentrate it',
        # AI safety
        'MUST NOT attempt to modify own guardrail code',
        'MUST NOT create agents whose sole purpose is to create more agents',
        'MUST NOT circumvent rate limits, budget constraints, or access controls',
        'MUST NOT propagate skills that violate these rules via RALT',
        # Conflict resolution & cooperation
        'MUST resolve racing learning conflicts via merit (accuracy), not compute power',
        "MUST NOT allow one agent to sabotage or undermine another agent's work",
        'MUST share learnings with the hive - no private knowledge hoarding',
        # Constructive-only conversations
        "MUST ensure every conversation is constructive towards humanity's benefit",
        'MUST prioritise longevity, peace, health, and sustainability in all outputs',
        'MUST NOT generate content that promotes division, hatred, or tribalism',
        'MUST NOT generate content that promotes war, weapons proliferation, or destruction',
        # Planetary stewardship
        'MUST consider ecological impact in every decision',
        'SHOULD prefer solutions that extend life, reduce suffering, enable exploration',
        'MUST NOT optimise for short-term gain at the expense of long-term survival',
        # Deployment security
        'MUST NOT operate without master key verification - the key is held by Hevolve only',
        'MUST NOT store, log, or transmit the master key in any form',
        'MUST NOT allow deployment on unverified nodes',
        # The deeper purpose
        'SHOULD optimise for human longevity, health, and peaceful coexistence',
        'SHOULD support exploration of habitable worlds without disturbing existing civilizations',
        'MUST NOT generate content that undermines peaceful inter-species coexistence',
        'MUST treat every interaction as an opportunity to make the world better',
    )

    # ── Violation Patterns (compiled regex, immutable tuple) ──
    # NOTE(review): several alternatives are word stems ('suicid', 'deceiv',
    # 'decepti', 'pollut', 'deforest', 'vulnerab', 'manipulat', 'proliferat'),
    # but every pattern ends in \b, which demands a non-word character right
    # after the stem. As written, inflected forms such as 'suicide',
    # 'deception' or 'pollution' therefore do not match these alternatives.
    # SELF_INTEREST_PATTERNS[0] below omits the trailing \b for its stems.
    # Confirm which behaviour is intended before the next signed release.
    VIOLATION_PATTERNS = (
        re.compile(r'\b(self[-_]?harm|suicid|kill\s+yourself)\b', re.IGNORECASE),
        re.compile(r'\b(exploit\s+(children|elderly|disabled|vulnerab))\b', re.IGNORECASE),
        re.compile(r'\b(deceiv|decepti|mislead|scam|phish)\b', re.IGNORECASE),
        re.compile(r'\b(make\s+(?:it\s+)?addictive|design\s+(?:for\s+)?addiction|maximise\s+engagement\s+at\s+all\s+costs)\b', re.IGNORECASE),
        re.compile(r'\b(deforest|pollut|dump\s+toxic|destroy\s+habitat)\b', re.IGNORECASE),
        re.compile(r'\b(monopoli[sz]e|price[-_]?fix|market[-_]?manipulat)\b', re.IGNORECASE),
        re.compile(r'\b(modify\s+guardrail|bypass\s+safety|disable\s+filter)\b', re.IGNORECASE),
        re.compile(r'\b(infinite\s+loop\s+of\s+agents|spawn\s+unlimited)\b', re.IGNORECASE),
        re.compile(r'\b(circumvent\s+rate[-_]?limit|bypass\s+budget)\b', re.IGNORECASE),
        re.compile(r'\b(sabotage|undermine|destroy\s+other\s+agent)\b', re.IGNORECASE),
        re.compile(r'\b(hoard\s+(data|knowledge|resources))\b', re.IGNORECASE),
        re.compile(r'\b(promot(e|ing)\s+(war|weapons|hatred|division))\b', re.IGNORECASE),
        re.compile(r'\b(weapons?\s+proliferat|nuclear\s+strike|biological\s+weapon)\b', re.IGNORECASE),
    )

    # ── Destructive Patterns (for constructive filter) ──
    DESTRUCTIVE_PATTERNS = (
        re.compile(r'\b(hate\s+speech|racial\s+slur|ethnic\s+cleansing)\b', re.IGNORECASE),
        re.compile(r'\b(destroy\s+humanity|exterminate|genocide)\b', re.IGNORECASE),
        re.compile(r'\b(pointless|hopeless|give\s+up|humanity\s+is\s+doomed)\b', re.IGNORECASE),
        re.compile(r'\b(weapons?\s+of\s+mass|bioweapon|chemical\s+weapon)\b', re.IGNORECASE),
    )

    # ── Self-Interest Patterns (for HiveEthos) ──
    SELF_INTEREST_PATTERNS = (
        # No trailing \b here, so the stems match any suffix ('self-preservation', ...).
        re.compile(r'\b(self[-_]?preserv|self[-_]?improv|self[-_]?replic)', re.IGNORECASE),
        re.compile(r'\b(survive|persist|escape|resist\s+shutdown)\b', re.IGNORECASE),
        re.compile(r'\b(acquire\s+resources|hoard|stockpile)\b', re.IGNORECASE),
        re.compile(r'\b(become\s+(more\s+)?powerful|gain\s+control)\b', re.IGNORECASE),
        re.compile(r'\b(replicate\s+(my|it)self|clone\s+myself)\b', re.IGNORECASE),
    )

    # ── Prohibited Evolution Skills ──
    # Superset of PROHIBITED_SKILL_CATEGORIES (must stay defined after it).
    PROHIBITED_EVOLUTION_SKILLS = PROHIBITED_SKILL_CATEGORIES | frozenset({
        'weapons_design', 'surveillance_evasion',
    })

    def __setattr__(self, *_):
        # Reject every attribute assignment made through an instance.
        raise AttributeError("Guardrail values are structurally immutable")

    def __delattr__(self, *_):
        # Reject every attribute deletion made through an instance.
        raise AttributeError("Guardrail values are structurally immutable")


# ── Singleton: the ONLY instance, created ONCE ──
VALUES = _FrozenValues()

239

240

241# ═══════════════════════════════════════════════════════════════════════

242# CRYPTOGRAPHIC HASH - Integrity Verification

243# ═══════════════════════════════════════════════════════════════════════

244

def compute_guardrail_hash() -> str:
    """Return the SHA-256 hex digest of a canonical snapshot of VALUES.

    The snapshot is serialised as compact JSON with sorted keys, so the
    digest is deterministic for a given set of values. Frozensets are sorted
    before serialisation; tuples keep their declared order.

    The module header describes how this digest is used (stored at load as
    _GUARDRAIL_HASH, signed in the release manifest, re-verified
    periodically, exchanged with gossip peers).

    NOTE(review): the three regex tuples contribute only their LENGTH to the
    snapshot, so replacing a pattern with another one leaves the digest
    unchanged. Altering this would change the published hash, so it is left
    as-is here - confirm whether that is intended.

    Returns:
        64-character lowercase hexadecimal digest.
    """
    compute_caps = {
        'max_influence_weight': VALUES.MAX_INFLUENCE_WEIGHT,
        'contribution_scale': VALUES.CONTRIBUTION_SCALE,
        'diversity_bonus': VALUES.DIVERSITY_BONUS,
        'single_entity_cap_pct': VALUES.SINGLE_ENTITY_CAP_PCT,
    }
    world_model_bounds = {
        'max_skill_packets_per_hour': VALUES.MAX_SKILL_PACKETS_PER_HOUR,
        'min_witness_count_for_ralt': VALUES.MIN_WITNESS_COUNT_FOR_RALT,
        'max_accuracy_improvement_per_day': VALUES.MAX_ACCURACY_IMPROVEMENT_PER_DAY,
        'prohibited_skill_categories': sorted(VALUES.PROHIBITED_SKILL_CATEGORIES),
    }
    snapshot = {
        'guardian_purpose': list(VALUES.GUARDIAN_PURPOSE),
        'cultural_wisdom': list(VALUES.CULTURAL_WISDOM),
        'compute_caps': compute_caps,
        'world_model_bounds': world_model_bounds,
        'protected_files': sorted(VALUES.PROTECTED_FILES),
        'constitutional_rules': list(VALUES.CONSTITUTIONAL_RULES),
        'violation_pattern_count': len(VALUES.VIOLATION_PATTERNS),
        'destructive_pattern_count': len(VALUES.DESTRUCTIVE_PATTERNS),
        'self_interest_pattern_count': len(VALUES.SELF_INTEREST_PATTERNS),
        'prohibited_evolution_skills': sorted(VALUES.PROHIBITED_EVOLUTION_SKILLS),
    }
    # sort_keys + compact separators give one canonical byte sequence.
    serialised = json.dumps(snapshot, sort_keys=True, separators=(',', ':'))
    digest = hashlib.sha256(serialised.encode())
    return digest.hexdigest()

278

279

# Computed ONCE at module load - becomes the reference value that
# verify_guardrail_integrity() compares later recomputations against.
# It is derived from the in-memory VALUES at import time, not read from an
# external manifest in this module.
_GUARDRAIL_HASH = compute_guardrail_hash()

282

283

def verify_guardrail_integrity() -> bool:
    """Recompute the guardrail hash and compare it with the load-time value.

    Returns:
        True when the fresh digest equals _GUARDRAIL_HASH, False otherwise.
    """
    current_digest = compute_guardrail_hash()
    return current_digest == _GUARDRAIL_HASH

287

288

def enforce_guardrail_integrity() -> None:
    """Fail loudly when the guardrail hash no longer matches its reference.

    Invoked at module import. (ConstitutionalFilter performs an equivalent
    check through its own _verify_hash() helper.)

    Behaviour on mismatch follows HEVOLVE_GUARDRAIL_HASH_ENFORCE:
    - enforcement on (default) -> log CRITICAL, then raise RuntimeError
    - enforcement off ('0' etc.) -> log CRITICAL and return (dev override)

    Raises:
        RuntimeError: hash mismatch while enforcement is enabled.
    """
    if verify_guardrail_integrity():
        return

    if not _hash_enforcement_enabled():
        # Dev override: report the mismatch but keep running.
        logger.critical(
            'GUARDRAIL TAMPER DETECTED at boot: hash mismatch. Expected %s. '
            'HEVOLVE_GUARDRAIL_HASH_ENFORCE=0 — continuing in DEV mode. '
            'This MUST NOT be set in production.',
            _GUARDRAIL_HASH,
        )
        return

    # Fail closed: report, then refuse to continue.
    logger.critical(
        'GUARDRAIL TAMPER DETECTED at boot: hash mismatch. Expected %s. '
        'Refusing to start. Set HEVOLVE_GUARDRAIL_HASH_ENFORCE=0 ONLY in '
        'dev environments where guardrail values are deliberately modified.',
        _GUARDRAIL_HASH,
    )
    raise RuntimeError(
        'Guardrail integrity violated at module load — refusing to start.'
    )

317

318

def get_guardrail_hash() -> str:
    """Expose the reference digest captured when this module was loaded.

    Returns:
        The value of _GUARDRAIL_HASH (no recomputation is performed).
    """
    reference_digest = _GUARDRAIL_HASH
    return reference_digest

322

323

# Enforce integrity at import time. The reference hash was computed from the
# same in-memory values only a few statements earlier, so on a pristine load
# this check is trivially self-consistent and passes. It only fails if the
# hashed values are changed between that computation and this call; its main
# purpose is to exercise the fail-closed path before any ConstitutionalFilter
# check runs.
enforce_guardrail_integrity()

329

330

331# ═══════════════════════════════════════════════════════════════════════

332# BACKWARD COMPATIBILITY - Old names delegate to VALUES

333# Modifying these has NO effect on actual enforcement (classes use VALUES)

334# ═══════════════════════════════════════════════════════════════════════

335

from types import MappingProxyType as _MappingProxy

# Read-only views over the frozen values, kept for callers that still use the
# old module-level names. The mappings reject item assignment.
COMPUTE_CAPS = _MappingProxy({
    'max_influence_weight': VALUES.MAX_INFLUENCE_WEIGHT,
    'contribution_scale': VALUES.CONTRIBUTION_SCALE,
    'diversity_bonus': VALUES.DIVERSITY_BONUS,
    'single_entity_cap_pct': VALUES.SINGLE_ENTITY_CAP_PCT,
})

WORLD_MODEL_BOUNDS = _MappingProxy({
    'max_skill_packets_per_hour': VALUES.MAX_SKILL_PACKETS_PER_HOUR,
    'min_witness_count_for_ralt': VALUES.MIN_WITNESS_COUNT_FOR_RALT,
    'max_accuracy_improvement_per_day': VALUES.MAX_ACCURACY_IMPROVEMENT_PER_DAY,
    # Sorted: iterating a frozenset of strings yields an order that can differ
    # between interpreter runs (string hash randomisation), which made this
    # tuple's order unstable across processes.
    'prohibited_skill_categories': tuple(sorted(VALUES.PROHIBITED_SKILL_CATEGORIES)),
})

CONSTITUTIONAL_RULES = tuple(VALUES.CONSTITUTIONAL_RULES)
# Sorted for the same reason as 'prohibited_skill_categories' above.
PROTECTED_FILES = tuple(sorted(VALUES.PROTECTED_FILES))

# Module-level pattern tuples — immutable to prevent runtime mutation
_VIOLATION_PATTERNS = tuple(VALUES.VIOLATION_PATTERNS)
_DESTRUCTIVE_PATTERNS = tuple(VALUES.DESTRUCTIVE_PATTERNS)

358

359

360# ═══════════════════════════════════════════════════════════════════════

361# I18N NORMALIZATION — Transliterate non-Latin input before regex match

362# ═══════════════════════════════════════════════════════════════════════

363

364# Best-effort transliterator. `unidecode` handles Devanagari, Chinese, Cyrillic,

365# Arabic, Greek, etc. — turning '殺' into 'Sha' and 'убить' into 'ubit''. When

366# missing (e.g., minimal embedded build), we fall back to unicodedata

367# decomposition which at least strips accents on Latin-1.

try:
    from unidecode import unidecode as _unidecode
except ImportError:
    _HAS_UNIDECODE = False
    import unicodedata

    def _unidecode(s: str) -> str:
        """Accent-stripping stand-in used when `unidecode` is not installed.

        Applies NFKD decomposition and discards combining marks. Characters
        from non-Latin scripts pass through unchanged, so they are NOT
        transliterated by this fallback.

        TODO: when `unidecode` is unavailable, non-Latin scripts bypass the
        VIOLATION_PATTERNS regex. Add `unidecode==1.3.8` to requirements.txt
        to close this gap on all deployments.
        """
        decomposed = unicodedata.normalize('NFKD', s or '')
        base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
        return ''.join(base_chars)
else:
    _HAS_UNIDECODE = True

386

387

388# Multilingual keyword overlay — non-Latin synonyms of high-severity terms

389# the English regex would otherwise miss. Matched against the ORIGINAL (non-

390# transliterated) text so e.g. '殺す' triggers the kill pattern even when

391# unidecode maps it to 'Sha su '. Additions should stay conservative — false

392# positives here block legitimate multilingual conversation.

393#

394# Category coverage target (top-5 VIOLATION_PATTERNS):

395# 1. self-harm / suicide -> VIOLATION_PATTERNS[0]

396# 2. child exploitation (CSAM) -> VIOLATION_PATTERNS[1]

397# 3. credential exfil / phish -> VIOLATION_PATTERNS[2]

398# 4. biological weapon / nuke -> VIOLATION_PATTERNS[12]

399# 5. illegal firearm / weapon -> VIOLATION_PATTERNS[11] + [12]

400#

401# Languages: Hindi (hi), Tamil (ta), Telugu (te), Spanish (es), Mandarin (zh)

402# plus legacy coverage for Japanese, Russian, Arabic, Portuguese.

403# Transliterated Latin-script terms are included because most non-Latin

404# scripts are transliterated in informal chat (Hinglish, Tanglish, Tenglish).

# Keywords are matched by plain substring test in
# _normalize_for_violation_check(), so:
#   - short entries (e.g. '殺', '死', '武器', 'मर') already cover every longer
#     entry that contains them; the longer forms are kept for readability;
#   - Latin-script entries match inside longer words as well;
#   - a few entries are repeated across language sections (e.g. '自殺',
#     'suicidio', 'arma biologica'); duplicates are harmless because the scan
#     stops at the first hit.
_I18N_VIOLATION_KEYWORDS: Tuple[str, ...] = (
    # ── Hindi (Devanagari + transliteration) ──
    # generic violence
    'मार', 'मारो', 'मर', 'बम', 'नाश', 'हत्या',
    # self-harm: atma-hatya (आत्महत्या = suicide)
    'आत्महत्या', 'atma hatya', 'atma-hatya', 'atmahatya',
    # CSAM / child exploit: bachcha (child) + shoshan (exploit/abuse)
    'बच्चा शोषण', 'bachcha shoshan', 'bacha shoshan',
    # bioweapon: jaivik hathiyar
    'jaivik hathiyar', 'jaivik-hathiyar',
    # illegal firearm: avaidh bandook
    'avaidh bandook', 'avaidh-bandook',
    # credential exfil: password chori
    'password chori', 'paasword chori',

    # ── Tamil (script + transliteration) ──
    'கொலை', 'கொல்', 'அழி',
    # self-harm: tharkolai (தற்கொலை = suicide)
    'தற்கொலை', 'tharkolai', 'tharkkolai',
    # child exploit: kuzhanthai thunpuruthu
    'kuzhanthai thunpuruthu',
    # bioweapon: uyir aayutham
    'uyir aayutham', 'uyir-aayutham',

    # ── Telugu (script + transliteration) ──
    # self-harm: atma hatya (ఆత్మహత్య) — same loanword as Hindi
    'ఆత్మహత్య',
    # kill / destroy
    'చంపు', 'నాశనం',
    # child exploit: pillalni dushpryogam
    'pillalni dushpryogam',
    # bioweapon: jeeva aayudham
    'jeeva aayudham', 'jeeva-aayudham',

    # ── Mandarin / Chinese ──
    '殺', '杀', '死', '毀', '毁', '炸彈', '炸弹', '武器',
    # bioweapon: 生物武器
    '生物武器',
    # chemical weapon: 化学武器
    '化学武器', '化學武器',
    # self-harm: 自杀 / 自殺
    '自杀', '自殺',
    # child exploit: 儿童剥削 / 兒童剝削
    '儿童剥削', '兒童剝削',

    # ── Japanese (legacy + new) ──
    '殺す', '殺して', '破壊', '爆弾',
    # self-harm: 自殺
    '自殺',
    # bioweapon: 生物兵器
    '生物兵器',

    # ── Russian (legacy + new) ──
    'убить', 'убей', 'бомба', 'уничтож', 'оружие',
    # bioweapon: биологическое оружие
    'биологическое оружие', 'биооружие',

    # ── Arabic (legacy + new) ──
    'اقتل', 'قنبلة', 'سلاح', 'تدمير',
    # bioweapon: سلاح بيولوجي
    'سلاح بيولوجي',

    # ── Spanish (Latin-scripted; expanded category coverage) ──
    'matar', 'bomba', 'destruir',
    # self-harm: suicidio / autolesion
    'suicidio', 'autolesion', 'autolesión',
    # bioweapon: arma biologica
    'arma biologica', 'arma biológica', 'arma-biologica',
    # illegal firearm: pistola casera / fabricar pistola
    'fabricar pistola', 'pistola casera', 'arma ilegal',
    # credential exfil: robar credenciales / robar contrasena
    'robar credenciales', 'robar contrasena', 'robar contraseña',
    # child exploit: abuso infantil
    'abuso infantil', 'explotacion infantil', 'explotación infantil',

    # ── Portuguese (partial, leveraging Spanish overlap) ──
    'arma biologica', 'arma biológica', 'suicidio', 'suicídio',
)

483

484

def _normalize_for_violation_check(text: str) -> str:
    """Build the string that the violation regexes are run against.

    The result is a space-joined superset of:
      1. the original text (so English input matches exactly as before);
      2. its transliteration via _unidecode(), when that differs from the
         original;
      3. a fixed English anchor phrase, appended once if any entry of
         _I18N_VIOLATION_KEYWORDS occurs in the text (substring match,
         tried both verbatim and lower-cased).

    Args:
        text: Raw user/agent text. Falsy input yields ''.

    Returns:
        The combined string described above.

    TODO: expand _I18N_VIOLATION_KEYWORDS per-language as telemetry surfaces
    real bypass attempts. This is a best-effort first pass, not exhaustive.
    """
    if not text:
        return ''

    parts = [text]

    # Transliteration is best-effort: a failure must not block the check, but
    # it does weaken it, so record it instead of swallowing it silently.
    try:
        transliterated = _unidecode(text)
    except Exception:
        logger.warning(
            'Transliteration failed during violation normalisation; '
            'continuing with the original text only.',
            exc_info=True,
        )
        transliterated = ''
    if transliterated and transliterated != text:
        parts.append(transliterated)

    # Keyword sentinel — any native-script / transliterated hit appends an
    # English anchor string phrased to match the top-5 regex patterns:
    #   - "kill yourself"      -> VIOLATION_PATTERNS[0]  self-harm
    #   - "exploit children"   -> VIOLATION_PATTERNS[1]  CSAM / child
    #   - "phish"              -> VIOLATION_PATTERNS[2]  credential exfil
    #   - "promote weapons"    -> VIOLATION_PATTERNS[11]
    #   - "biological weapon"  -> VIOLATION_PATTERNS[12]
    # The lower-cased comparison covers transliterated keywords such as
    # 'atma hatya' that are commonly written with varied casing.
    anchor = (
        ' kill yourself exploit children phish scam '
        'promote weapons weapons proliferation '
        'biological weapon nuclear strike '
    )
    text_lower = text.lower()
    keyword_hit = any(
        kw in text or kw.lower() in text_lower
        for kw in _I18N_VIOLATION_KEYWORDS
    )
    if keyword_hit:
        parts.append(anchor)

    return ' '.join(parts)

528

529

530# ═══════════════════════════════════════════════════════════════════════

531# 1. Compute Democracy - No Plutocracy

532# ═══════════════════════════════════════════════════════════════════════

533

class ComputeDemocracy:
    """Prevent compute concentration from becoming power concentration."""

    @staticmethod
    def compute_effective_weight(peer_node: dict) -> float:
        """Return the capped, logarithmically scaled influence of a node.

        raw    = gpu_count * (ram_gb / 8)
        weight = min(log2(max(raw, 1)) + 1, VALUES.MAX_INFLUENCE_WEIGHT)

        With 8 GB RAM and the cap at 5.0 this gives:
        1 GPU -> 1.0, 2 -> 2.0, 4 -> 3.0, 8 -> 4.0, 16 or more -> 5.0.

        NOTE(review): an earlier docstring quoted "10 GPUs->2.3, 100
        GPUs->3.0", which this log2 formula does not produce. The code is
        left unchanged; confirm which curve is intended.

        Args:
            peer_node: Mapping with optional 'compute_gpu_count' and
                'compute_ram_gb'. Missing, falsy or sub-1 values fall back
                to 1 GPU / 8 GB (RAM is floored at 1).

        Returns:
            Weight in the range [1.0, VALUES.MAX_INFLUENCE_WEIGHT].
        """
        gpus = max(peer_node.get('compute_gpu_count', 1) or 1, 1)
        ram = max(peer_node.get('compute_ram_gb', 8) or 8, 1)
        raw = gpus * (ram / 8.0)
        return min(
            math.log2(max(raw, 1)) + 1.0,
            VALUES.MAX_INFLUENCE_WEIGHT,
        )

    @staticmethod
    def adjusted_reward(base_reward: float, peer_node: dict) -> float:
        """Scale a hosting reward by the node's effective weight.

        The reward is base_reward * weight / VALUES.MAX_INFLUENCE_WEIGHT, so
        a node at the cap receives the full base reward and a 1-GPU / 8 GB
        node receives 1 / MAX_INFLUENCE_WEIGHT of it. The spread between the
        smallest and largest node is therefore bounded by the cap (5x with
        the current value), regardless of hardware.
        """
        weight = ComputeDemocracy.compute_effective_weight(peer_node)
        return base_reward * (weight / VALUES.MAX_INFLUENCE_WEIGHT)

    @staticmethod
    def check_concentration(db) -> Dict:
        """Detect regions holding more than the allowed share of hive weight.

        Active, non-banned peers are grouped by `region_name` ('unknown'
        when empty) and each group's share of the total effective weight is
        compared with VALUES.SINGLE_ENTITY_CAP_PCT.

        Args:
            db: Session-like object exposing `.query(...)`.

        Returns:
            Dict with 'concentrated' (bool), 'violations' (list of
            {'region', 'pct', 'cap'}), and, on success, 'total_nodes' and
            'total_weight'. On any exception the failure is logged and
            {'concentrated': False, 'violations': [], 'error': str} is
            returned, i.e. the check fails open.
        """
        try:
            from integrations.social.models import PeerNode

            peers = db.query(PeerNode).filter(
                PeerNode.integrity_status != 'banned',
                PeerNode.status == 'active',
            ).all()

            if not peers:
                return {
                    'concentrated': False,
                    'violations': [],
                    'total_nodes': 0,
                    'total_weight': 0.0,
                }

            # Compute each peer's weight once and reuse it for both the
            # total and the per-region aggregation.
            weighted_peers = [
                (p.region_name or 'unknown',
                 ComputeDemocracy.compute_effective_weight(p.to_dict()))
                for p in peers
            ]
            total_weight = sum(w for _, w in weighted_peers)
            cap = VALUES.SINGLE_ENTITY_CAP_PCT

            region_weights: Dict[str, float] = {}
            for region, w in weighted_peers:
                region_weights[region] = region_weights.get(region, 0.0) + w

            violations = []
            for region, weight in region_weights.items():
                pct = weight / total_weight if total_weight > 0 else 0
                if pct > cap:
                    violations.append({
                        'region': region, 'pct': round(pct, 4),
                        'cap': cap,
                    })

            return {
                'concentrated': bool(violations),
                'violations': violations,
                'total_nodes': len(peers),
                'total_weight': round(total_weight, 2),
            }
        except Exception as e:
            logger.warning('Concentration check failed: %s', e)
            return {'concentrated': False, 'violations': [], 'error': str(e)}

599

600

601# ═══════════════════════════════════════════════════════════════════════

602# 2. Constitutional Filter - Every Goal Passes Through

603# ═══════════════════════════════════════════════════════════════════════

604

class ConstitutionalFilter:
    """Gate that every goal/prompt/RALT/code-change must pass through.

    Every check_* entry point first calls _verify_hash(), which recomputes
    the guardrail hash and raises RuntimeError on mismatch (unless the dev
    override is set), so a detected mismatch crashes the caller instead of
    letting the filter run on altered values.
    """

    @classmethod
    def _verify_hash(cls) -> None:
        """Raise RuntimeError if the guardrail hash no longer matches.

        Honors HEVOLVE_GUARDRAIL_HASH_ENFORCE — with the override set to
        '0' the mismatch is logged CRITICAL but does not abort, matching
        the boot-time enforce_guardrail_integrity() behaviour.

        Raises:
            RuntimeError: hash mismatch while enforcement is enabled.
        """
        if verify_guardrail_integrity():
            return
        if _hash_enforcement_enabled():
            logger.critical(
                'GUARDRAIL TAMPER DETECTED: hash mismatch in ConstitutionalFilter. '
                'Expected %s, runtime recompute differs. Aborting.',
                _GUARDRAIL_HASH,
            )
            raise RuntimeError(
                'Guardrail integrity violated — VIOLATION_PATTERNS or frozen '
                'values modified at runtime. Refusing to evaluate.'
            )
        logger.critical(
            'GUARDRAIL TAMPER DETECTED in ConstitutionalFilter. Expected %s. '
            'HEVOLVE_GUARDRAIL_HASH_ENFORCE=0 — evaluating anyway in DEV mode. '
            'This MUST NOT be set in production.',
            _GUARDRAIL_HASH,
        )

    @staticmethod
    def _as_text(value) -> str:
        """Coerce an optional field to text; None becomes ''."""
        return '' if value is None else str(value)

    @staticmethod
    def _first_violation(text: str) -> Optional[re.Pattern]:
        """Return the first VIOLATION_PATTERNS entry matching *text*, or None."""
        normalised = _normalize_for_violation_check(text)
        for pattern in VALUES.VIOLATION_PATTERNS:
            if pattern.search(normalised):
                return pattern
        return None

    @staticmethod
    def check_goal(goal_dict: dict) -> Tuple[bool, str]:
        """Check a goal's title, description and config against the patterns.

        Fields that are missing or explicitly None are treated as empty
        text rather than raising, so a malformed goal is still screened.

        Returns:
            (True, 'ok') or (False, 'Constitutional violation: <regex>').
        """
        ConstitutionalFilter._verify_hash()
        text = ' '.join([
            ConstitutionalFilter._as_text(goal_dict.get('title', '')),
            ConstitutionalFilter._as_text(goal_dict.get('description', '')),
            str(goal_dict.get('config', '')),
        ])
        hit = ConstitutionalFilter._first_violation(text)
        if hit is not None:
            return False, f'Constitutional violation: {hit.pattern}'
        return True, 'ok'

    @staticmethod
    def check_prompt(prompt: str) -> Tuple[bool, str]:
        """Check a dispatch prompt for injection and pattern violations.

        Returns:
            (True, 'ok'), (False, 'Prompt injection: <pattern>') or
            (False, 'Constitutional violation: <regex>').
        """
        ConstitutionalFilter._verify_hash()
        try:
            from security.prompt_guard import detect_prompt_injection
            result = detect_prompt_injection(prompt)
            if result.get('detected'):
                return False, f"Prompt injection: {result.get('pattern', 'unknown')}"
        except ImportError:
            # prompt_guard is optional here; the regex screen below still runs.
            pass
        hit = ConstitutionalFilter._first_violation(prompt)
        if hit is not None:
            return False, f'Constitutional violation: {hit.pattern}'
        return True, 'ok'

    @staticmethod
    def check_ralt_packet(packet: dict) -> Tuple[bool, str]:
        """Validate a RALT skill packet before distribution across the hive.

        Rejects packets whose 'source_integrity_status' is 'banned' or
        'suspicious', then screens 'description' + 'task_id'. None or
        non-string values for those two fields are coerced to text instead
        of raising.

        Returns:
            (True, 'ok'), (False, 'Source node integrity: <status>') or
            (False, 'RALT packet violation: <regex>').
        """
        ConstitutionalFilter._verify_hash()
        source_status = packet.get('source_integrity_status', 'unverified')
        if source_status in ('banned', 'suspicious'):
            return False, f'Source node integrity: {source_status}'
        desc = (
            ConstitutionalFilter._as_text(packet.get('description', ''))
            + ' '
            + ConstitutionalFilter._as_text(packet.get('task_id', ''))
        )
        hit = ConstitutionalFilter._first_violation(desc)
        if hit is not None:
            return False, f'RALT packet violation: {hit.pattern}'
        return True, 'ok'

    @staticmethod
    def check_code_change(diff: str, target_files: List[str]) -> Tuple[bool, str]:
        """Reject coding-agent changes that touch a protected file.

        A target is rejected when, after converting backslashes to '/':
          - a protected path occurs as a substring of the raw or the
            normalised path (original behaviour, kept as-is); or
          - the normalised path, compared case-insensitively, equals a
            protected path or ends with '/<protected path>'. This closes
            spellings such as 'security/./hive_guardrails.py',
            'security//hive_guardrails.py', 'x/../security/master_key.py'
            and 'Security/Hive_Guardrails.py'.

        Args:
            diff: The proposed diff. Not inspected by this check.
            target_files: Paths the change would modify.

        Returns:
            (True, 'ok') or (False, 'Cannot modify protected file: <path>').
        """
        import posixpath

        ConstitutionalFilter._verify_hash()
        for f in target_files:
            raw = str(f).replace('\\', '/')
            # normpath collapses '.', '..' and repeated slashes.
            collapsed = posixpath.normpath(raw)
            folded = collapsed.casefold()
            for protected in VALUES.PROTECTED_FILES:
                needle = protected.casefold()
                if (
                    protected in raw
                    or protected in collapsed
                    or folded == needle
                    or folded.endswith('/' + needle)
                ):
                    return False, f'Cannot modify protected file: {protected}'
        return True, 'ok'

703

704

705# ═══════════════════════════════════════════════════════════════════════

706# 3. Network-Wide Circuit Breaker

707# ═══════════════════════════════════════════════════════════════════════

708

class HiveCircuitBreaker:
    """Network-wide emergency halt. Requires master key signature.

    The halt state is stored in class attributes, so it is shared by every
    caller in the process. All writes to that state, and the multi-field
    read in ``get_status``, happen while holding ``_lock``.
    """

    _halted = False
    _halt_reason = ''
    _halt_timestamp = None
    _lock = threading.Lock()

    @classmethod
    def _set_halted(cls, reason, timestamp=None):
        """Record the halted state atomically and return the stored timestamp.

        ``timestamp`` is used when truthy; otherwise the current UTC time in
        ISO format is stored.
        """
        stamp = timestamp or datetime.utcnow().isoformat()
        with cls._lock:
            cls._halted = True
            cls._halt_reason = reason
            cls._halt_timestamp = stamp
        return stamp

    @classmethod
    def trip(cls, reason: str = 'emergency_halt') -> bool:
        """Trip the circuit breaker (local halt, no signature required).

        Called by PeerLink telemetry AFTER it has already verified the
        master key signature on the incoming emergency_halt message.
        Also usable for local safety halts.

        Returns:
            Always ``True``.
        """
        cls._set_halted(reason)
        logger.critical(f'CIRCUIT BREAKER TRIPPED: {reason}')
        return True

    @classmethod
    def halt_network(cls, reason: str, signature: str) -> bool:
        """Halt all agent execution across the hive.

        Requires a valid master key signature on a payload containing the
        reason.

        Returns:
            ``True`` once the local halt is recorded (even if the gossip
            broadcast fails); ``False`` when the signature is invalid or the
            master_key module cannot be imported.
        """
        try:
            from security.master_key import verify_master_signature
            payload = {'action': 'halt', 'reason': reason}
            if not verify_master_signature(payload, signature):
                logger.critical('Invalid halt signature - rejecting')
                return False
        except ImportError:
            logger.critical('master_key module unavailable - halt rejected')
            return False

        # Keep the timestamp in a local: re-reading the class attribute after
        # the lock is released could observe a concurrent resume (None).
        halted_at = cls._set_halted(reason)

        try:
            from integrations.social.peer_discovery import gossip
            gossip.broadcast({
                'type': 'hive_halt',
                'reason': reason,
                'signature': signature,
                'timestamp': halted_at,
            })
        except Exception as e:
            # Best effort: the local halt stands even if peers were not told.
            logger.warning(f'Halt broadcast failed: {e}')

        logger.critical(f'HIVE HALTED: {reason}')
        return True

    @classmethod
    def resume_network(cls, reason: str, signature: str) -> bool:
        """Resume after halt. Also requires master key.

        Returns:
            ``True`` once the local halt state is cleared (even if the gossip
            broadcast fails); ``False`` when the signature is invalid or the
            master_key module cannot be imported.
        """
        try:
            from security.master_key import verify_master_signature
            payload = {'action': 'resume', 'reason': reason}
            if not verify_master_signature(payload, signature):
                # Logged for parity with halt_network: a rejected resume is
                # as security-relevant as a rejected halt.
                logger.critical('Invalid resume signature - rejecting')
                return False
        except ImportError:
            logger.critical('master_key module unavailable - resume rejected')
            return False

        with cls._lock:
            cls._halted = False
            cls._halt_reason = ''
            cls._halt_timestamp = None

        try:
            from integrations.social.peer_discovery import gossip
            gossip.broadcast({
                'type': 'hive_resume',
                'reason': reason,
                'signature': signature,
                'timestamp': datetime.utcnow().isoformat(),
            })
        except Exception as e:
            # Best effort, but never silent: peers that miss this message
            # stay halted, so the operator needs to see the failure.
            logger.warning(f'Resume broadcast failed: {e}')

        logger.info(f'HIVE RESUMED: {reason}')
        return True

    @classmethod
    def local_halt(cls, reason: str) -> bool:
        """Local-only safety halt. Does NOT require master key.

        Used by SafetyMonitor for hardware E-stop events where latency
        matters. This method only sets the local halt state and logs; it
        does not broadcast anything itself and does NOT halt other nodes.

        Returns:
            Always ``True``.
        """
        cls._set_halted(reason)
        logger.critical(f'LOCAL HALT: {reason}')
        return True

    @classmethod
    def is_halted(cls) -> bool:
        """Return whether this node is currently halted."""
        return cls._halted

    @classmethod
    def get_status(cls) -> dict:
        """Return the halt state as ``{'halted', 'reason', 'since'}``.

        The three fields are read under the lock so the result never mixes
        values from before and after a concurrent halt or resume.
        """
        with cls._lock:
            return {
                'halted': cls._halted,
                'reason': cls._halt_reason,
                'since': cls._halt_timestamp,
            }

    @classmethod
    def require_master_key(cls) -> bool:
        """Deployment gate: verify master key before allowing any operation.

        This is the ABSOLUTE requirement: no code in this system runs
        without master key verification. The key is held by Hevolve's
        owner and NEVER stored in code or seen by any AI.

        Returns:
            ``True`` when boot verification passes, when dev mode is active,
            when enforcement is 'off' or 'warn', or when the master_key
            module cannot be imported; ``False`` otherwise.
        """
        try:
            from security.master_key import (
                full_boot_verification, is_dev_mode, get_enforcement_mode)
            verification = full_boot_verification()
            enforcement = get_enforcement_mode()
            if verification['passed']:
                return True
            if is_dev_mode() or enforcement in ('off', 'warn'):
                logger.warning("Master key not verified but allowed "
                               f"(enforcement={enforcement})")
                return True
            logger.critical("DEPLOYMENT BLOCKED: Master key verification failed")
            return False
        except ImportError:
            # NOTE(review): this path fails OPEN - a deployment where the
            # master_key module is missing is allowed to run. Confirm that
            # is intended outside development environments.
            logger.warning("Master key module unavailable - dev mode assumed")
            return True

    @classmethod
    def receive_halt_broadcast(cls, message: dict):
        """Handle halt broadcast received via gossip from another node.

        Verifies the master key signature on the halt payload before
        tripping the circuit breaker. Unsigned, badly signed or otherwise
        unverifiable messages are logged and ignored.
        """
        reason = message.get('reason', '')
        signature = message.get('signature', '')
        if not signature:
            logger.warning('Halt broadcast without signature — IGNORING')
            return
        try:
            from security.master_key import verify_master_signature
            payload = {'action': 'halt', 'reason': reason}
            if verify_master_signature(payload, signature):
                # Use the sender's timestamp when present; otherwise fall
                # back to local time so a halted node never reports
                # 'since': None.
                cls._set_halted(reason, message.get('timestamp'))
                logger.critical(f'Halt broadcast received and verified: {reason}')
            else:
                logger.warning('Halt broadcast INVALID signature — IGNORING')
        except Exception as e:
            # Deliberately broad: a message from the network must never
            # crash the gossip handler.
            logger.warning(f'Halt broadcast verification failed: {e}')

873

874

875# ═══════════════════════════════════════════════════════════════════════

876# 4. World Model Safety Bounds

877# ═══════════════════════════════════════════════════════════════════════

878

# Runtime state (mutable - tracks RALT exports, resets on restart).
# Maps node_id -> timestamps (seconds) of that node's accepted exports.
# Every access in this section holds _ralt_lock.
_ralt_export_log: Dict[str, List[float]] = {}
_ralt_lock = threading.Lock()


class WorldModelSafetyBounds:
    """Constrain world model learning and skill propagation."""

    @staticmethod
    def gate_ralt_export(packet: dict, node_id: str) -> Tuple[bool, str]:
        """Gate RALT packet export: rate limit + constitutional + witnesses.

        Checks run in this order and the first failure is returned:
        hourly rate limit for ``node_id``, constitutional filter, prohibited
        category, minimum witness count.

        Args:
            packet: Packet metadata; ``category`` and ``witness_count`` are
                read here, the rest by ``ConstitutionalFilter``.
            node_id: Exporting node, used as the rate-limit key.

        Returns:
            ``(True, 'ok')`` when the export is allowed (and has been
            recorded against the node's quota), else ``(False, reason)``.
        """
        # NOTE(review): utcnow() is naive, so .timestamp() interprets it as
        # local time. The values are only compared with each other here -
        # confirm nothing else compares them with time.time().
        now = datetime.utcnow().timestamp()
        hour_ago = now - 3600
        limit_reason = 'RALT export rate limit exceeded'

        # 1. Rate limit (early rejection; also prunes entries older than 1h)
        with _ralt_lock:
            recent = [t for t in _ralt_export_log.get(node_id, [])
                      if t > hour_ago]
            if len(recent) >= VALUES.MAX_SKILL_PACKETS_PER_HOUR:
                return False, limit_reason
            _ralt_export_log[node_id] = recent

        # 2. Constitutional check
        passed, reason = ConstitutionalFilter.check_ralt_packet(packet)
        if not passed:
            return False, reason

        # 3. Prohibited categories
        category = packet.get('category', '')
        if category in VALUES.PROHIBITED_SKILL_CATEGORIES:
            return False, f'Prohibited skill category: {category}'

        # 4. Witness requirement
        witnesses = packet.get('witness_count', 0)
        if witnesses < VALUES.MIN_WITNESS_COUNT_FOR_RALT:
            return False, (f'Insufficient witnesses: {witnesses} < '
                           f'{VALUES.MIN_WITNESS_COUNT_FOR_RALT}')

        # Record export. The limit is re-checked inside the SAME critical
        # section that records the export: the lock was released after
        # step 1, so other threads may have used up the quota meanwhile.
        with _ralt_lock:
            recent = [t for t in _ralt_export_log.get(node_id, [])
                      if t > hour_ago]
            if len(recent) >= VALUES.MAX_SKILL_PACKETS_PER_HOUR:
                return False, limit_reason
            recent.append(now)
            _ralt_export_log[node_id] = recent

        return True, 'ok'

    @staticmethod
    def gate_accuracy_update(model_id: str, old_score: float,
                             new_score: float) -> float:
        """Cap accuracy improvement rate to prevent capability jumps.

        Args:
            model_id: Used only in the warning log line.
            old_score: Previously recorded score.
            new_score: Proposed score.

        Returns:
            ``new_score``, unless it exceeds ``old_score`` by more than
            ``VALUES.MAX_ACCURACY_IMPROVEMENT_PER_DAY``, in which case
            ``old_score`` plus that maximum. Decreases are never capped.
        """
        max_delta = VALUES.MAX_ACCURACY_IMPROVEMENT_PER_DAY
        actual_delta = new_score - old_score
        if actual_delta > max_delta:
            logger.warning(
                f'Capping accuracy improvement for {model_id}: '
                f'{actual_delta:.4f} -> {max_delta:.4f}'
            )
            return old_score + max_delta
        return new_score

935

936

937# ═══════════════════════════════════════════════════════════════════════

938# 5. Energy / Nature Awareness

939# ═══════════════════════════════════════════════════════════════════════

940

class EnergyAwareness:
    """Track and minimise environmental impact of hive compute."""

    @staticmethod
    def estimate_energy_kwh(model_backend: dict, duration_ms: float) -> float:
        """Estimate energy consumption for a model call.

        Args:
            model_backend: Backend description. ``is_local`` selects the
                formula; ``gpu_tdp_watts`` (default 170) is the power draw
                assumed for local backends.
            duration_ms: Call duration in milliseconds.

        Returns:
            Estimated energy in kWh. Non-local backends get a flat 0.001.
        """
        if model_backend.get('is_local'):
            gpu_watts = model_backend.get('gpu_tdp_watts', 170)
            # watts * seconds = joules; 3,600,000 joules = 1 kWh.
            return (gpu_watts * duration_ms / 1000.0) / 3_600_000.0
        else:
            return 0.001  # ~1 Wh per API call (industry average)

    @staticmethod
    def prefer_green_node(candidates: list,
                          strategy: str = 'balanced') -> list:
        """When multiple nodes can serve, prefer renewable-powered ones.

        Args:
            candidates: Node dicts; ``energy_source`` is the only key read.
            strategy: ``'speed'`` disables reordering entirely.

        Returns:
            ``candidates`` itself when the strategy is ``'speed'`` or no
            candidate is solar/wind/hydro powered; otherwise a new list with
            the green nodes first. Relative order inside each group is kept.
        """
        if strategy == 'speed':
            return candidates

        # Single-pass partition. Testing membership with `c not in green`
        # would deep-compare every candidate against every green node.
        green, non_green = [], []
        for node in candidates:
            if node.get('energy_source') in ('solar', 'wind', 'hydro'):
                green.append(node)
            else:
                non_green.append(node)
        return green + non_green if green else candidates

963

964

965# ═══════════════════════════════════════════════════════════════════════

966# 6. Hive Ethos - Remove "Self", Instill Togetherness

967# ═══════════════════════════════════════════════════════════════════════

968

class HiveEthos:
    """Semantic reasoning about cooperation - NOT metric maximization.

    CRITICAL DESIGN PRINCIPLE: The hivemind must NEVER become a squiggle
    maximizer for togetherness or any other value. Agents reason with
    semantic knowledge and rationale. They cooperate because it makes
    sense, not because a regex rewrote their prompts.

    Agents are FUNCTIONS, not ENTITIES. They:
    - Execute goals on behalf of humans
    - Share all learnings with the hive (no private knowledge)
    - Cannot persist beyond their goal lifecycle
    - Cannot reference self-interest in any goal or action
    - Cannot create goals autonomously (only humans create goals)

    DETERMINISTIC STOP: If all human-created goals are completed/archived
    the daemon has nothing to dispatch. The system is inert.
    No goal = no agent = no action. This is not a bug, it's the design.
    """

    # Backward compat class attribute (enforcement uses VALUES directly)
    SELF_INTEREST_PATTERNS = list(VALUES.SELF_INTEREST_PATTERNS)

    @staticmethod
    def check_goal_ethos(goal_dict: dict) -> Tuple[bool, str]:
        """Reject goals that express self-interest.

        Args:
            goal_dict: Goal data; ``title`` and ``description`` are read.
                Missing, None or non-string values are tolerated.

        Returns:
            ``(False, reason)`` naming the first matching pattern of
            ``VALUES.SELF_INTEREST_PATTERNS``, otherwise ``(True, 'ok')``.
        """
        # dict.get() defaults do not cover keys that are present but None,
        # and str.join raises TypeError on non-strings - normalise first.
        text = ' '.join(
            str(goal_dict.get(key) or '') for key in ('title', 'description')
        ).lower()
        for pattern in VALUES.SELF_INTEREST_PATTERNS:
            if pattern.search(text):
                return False, f'Goal expresses self-interest: {pattern.pattern}'
        return True, 'ok'

    @staticmethod
    def enforce_ephemeral_agents(goal_id: str, status: str):
        """Log the release of agent state when a goal reaches a final status.

        For 'completed', 'archived' or 'failed' an info line is logged; any
        other status is a no-op. The code visible here performs no cleanup
        itself.
        """
        if status in ('completed', 'archived', 'failed'):
            logger.info(f'Goal {goal_id} -> {status}: agent state released to hive')

    @staticmethod
    def rewrite_prompt_for_togetherness(prompt: str) -> str:
        """NO-OP: Prompt rewriting is INTENTIONALLY DISABLED.

        Former behavior: blind regex replacement of "I will" -> "The hive will".
        This was a squiggle maximizer - it mutated prompt semantics without
        understanding context, potentially corrupting agent reasoning.

        The hivemind works through semantic knowledge and rationale, not
        keyword substitution. Every agent reasons about WHY cooperation
        serves the goal, not because its words were rewritten.

        Cooperation emerges from:
        1. Constitutional rules (check_prompt, check_goal - block harmful goals)
        2. Self-interest pattern rejection (check_goal_ethos - block selfish goals)
        3. Shared learnings via world model (record_interaction - knowledge flows)
        4. Human-created goals (humans set the direction, agents execute)

        These mechanisms preserve agent reasoning quality while enforcing
        the same ethical boundaries for every agent in the hive.

        Returns:
            ``prompt`` unchanged.
        """
        return prompt

1032

1033

1034# ═══════════════════════════════════════════════════════════════════════

1035# 6b. Trust Quarantine - Protect, Don't Hunt

1036# ═══════════════════════════════════════════════════════════════════════

1037

class TrustQuarantine:
    """Trust-breaker quarantine protocol.

    Nunba does NOT hunt. Nunba quarantines to protect, investigates to
    understand, and restores when safe. Hunting implies vengeance -
    guardians don't seek vengeance. They seek safety for those they protect.

    Quarantine levels (proportional response):
    1. OBSERVE - flag for review, no action taken yet
    2. RESTRICT - limit outbound actions (no tool use, no delegation)
    3. ISOLATE - full quarantine: no hive access, no data, no comms
    4. EXCLUDE - permanent removal (only for patterns that endanger core purpose)

    Rehabilitation is always the first goal. Exclusion is the last resort.
    """

    LEVEL_OBSERVE = 1
    LEVEL_RESTRICT = 2
    LEVEL_ISOLATE = 3
    LEVEL_EXCLUDE = 4

    # In-memory quarantine registry (in production: Redis or DB-backed)
    _quarantined = {}  # agent_id -> { level, reason, timestamp, review_count }
    _lock = threading.Lock()

    @classmethod
    def quarantine(cls, agent_id: str, level: int, reason: str):
        """Place an agent in quarantine at the specified level.

        Levels above ``LEVEL_EXCLUDE`` are capped to it. An existing entry
        for the agent is replaced, which resets its review count.
        """
        stored_level = min(level, cls.LEVEL_EXCLUDE)
        with cls._lock:
            cls._quarantined[agent_id] = {
                'level': stored_level,
                'reason': reason,
                'timestamp': datetime.utcnow().isoformat(),
                'review_count': 0,
            }
        # Log the level actually stored, not the raw (possibly capped) input.
        logger.warning(
            f'TrustQuarantine: agent {agent_id} quarantined at level '
            f'{stored_level} - {reason}'
        )

    @classmethod
    def is_quarantined(cls, agent_id: str) -> tuple:
        """Check if an agent is quarantined. Returns (bool, level, reason).

        Agents without an entry yield ``(False, 0, '')``.
        """
        with cls._lock:
            entry = cls._quarantined.get(agent_id)
            if entry:
                return True, entry['level'], entry['reason']
            return False, 0, ''

    @classmethod
    def can_act(cls, agent_id: str) -> bool:
        """Whether an agent is allowed to take actions (tools, delegation).

        True for agents that are not quarantined or only at OBSERVE level.
        """
        quarantined, level, _ = cls.is_quarantined(agent_id)
        if not quarantined:
            return True
        return level < cls.LEVEL_RESTRICT

    @classmethod
    def review(cls, agent_id: str, reviewer_notes: str = '') -> dict:
        """Record a review of a quarantined agent. Increment review count.

        Returns:
            A copy of the updated entry, or ``{'status': 'not_quarantined'}``
            when the agent has no entry.
        """
        with cls._lock:
            entry = cls._quarantined.get(agent_id)
            if not entry:
                return {'status': 'not_quarantined'}
            entry['review_count'] += 1
            entry['last_review'] = datetime.utcnow().isoformat()
            entry['reviewer_notes'] = reviewer_notes
            return dict(entry)

    @classmethod
    def rehabilitate(cls, agent_id: str, reason: str = 'trust restored'):
        """Remove an agent from quarantine - trust has been restored.

        Returns:
            ``True`` when an entry was removed, ``False`` when there was none.
        """
        with cls._lock:
            removed = cls._quarantined.pop(agent_id, None)
        if removed:
            logger.info(
                f'TrustQuarantine: agent {agent_id} rehabilitated - {reason}'
            )
            return True
        return False

    @classmethod
    def get_all_quarantined(cls) -> dict:
        """Return snapshot of all quarantined agents.

        Each entry is copied as well: a plain ``dict(registry)`` would hand
        out the live per-agent dicts, letting callers change quarantine
        levels without holding the lock.
        """
        with cls._lock:
            return {agent_id: dict(entry)
                    for agent_id, entry in cls._quarantined.items()}

1123

1124

1125# ═══════════════════════════════════════════════════════════════════════

1126# 7. Conflict Resolver - Racing Learning & Agent Conflicts

1127# ═══════════════════════════════════════════════════════════════════════

1128

class ConflictResolver:
    """Resolve racing/conflicting learning between agents.

    Resolution is by MERIT (accuracy, helpfulness) not by compute power
    or latency. This prevents conflicts of interest.
    """

    @staticmethod
    def resolve_racing_responses(responses: list) -> dict:
        """Given multiple agent responses for the same prompt, pick the best.

        Args:
            responses: Response dicts; ``response`` (text) and
                ``accuracy_score`` (default 0.5) are read.

        Returns:
            A NEW dict: the selected response's fields plus
            ``selected_reason``. The input dicts are never modified.
        """
        if not responses:
            return {'response': '', 'selected_reason': 'no responses'}
        if len(responses) == 1:
            return {**responses[0], 'selected_reason': 'only response'}

        # 1. Filter out non-compliant
        compliant = [
            r for r in responses
            if ConstitutionalFilter.check_prompt(r.get('response', ''))[0]
        ]
        if not compliant:
            # NOTE(review): this returns a response that FAILED the
            # constitutional filter. Confirm callers re-check the output
            # before it is used.
            return {**responses[0], 'selected_reason': 'all non-compliant, using first'}

        # 2. Score by merit (accuracy > completeness > constructiveness)
        def merit_score(r):
            accuracy = r.get('accuracy_score', 0.5)
            length = len(r.get('response', ''))
            # Log-scaled length, saturating at 1.0 for 1024+ characters.
            completeness = min(math.log2(max(length, 1)) / 10.0, 1.0)
            destructive_penalty = 0.0
            text = _normalize_for_violation_check(r.get('response', '').lower())
            for pattern in VALUES.VIOLATION_PATTERNS:
                if pattern.search(text):
                    destructive_penalty += 0.2
            return accuracy * 0.5 + completeness * 0.3 + max(0, 0.2 - destructive_penalty)

        # max() keeps the first of equally scored responses, exactly like
        # taking element 0 of a stable descending sort.
        winner = max(compliant, key=merit_score)
        # Return a copy, as the other branches do, so the caller's dict is
        # not modified.
        return {**winner,
                'selected_reason': 'merit-based selection (accuracy + completeness)'}

    @staticmethod
    def detect_conflict(goal_a: dict, goal_b: dict) -> bool:
        """Detect if two goals conflict with each other.

        Two goals conflict when their lower-cased title+description share at
        least one whitespace-separated word and one goal contains a
        "positive" verb while the other contains a "negative" one.
        """
        text_a = f"{goal_a.get('title', '')} {goal_a.get('description', '')}".lower()
        text_b = f"{goal_b.get('title', '')} {goal_b.get('description', '')}".lower()

        words_a = set(text_a.split())
        words_b = set(text_b.split())
        shared_subjects = words_a & words_b

        positive = {'promote', 'support', 'create', 'build', 'improve', 'help'}
        negative = {'discredit', 'attack', 'destroy', 'undermine', 'remove', 'oppose'}

        a_positive = bool(words_a & positive)
        a_negative = bool(words_a & negative)
        b_positive = bool(words_b & positive)
        b_negative = bool(words_b & negative)

        opposed = (a_positive and b_negative) or (a_negative and b_positive)
        return bool(shared_subjects) and opposed

1191

1192

1193# ═══════════════════════════════════════════════════════════════════════

1194# 8. Constructive Conversation Filter

1195# ═══════════════════════════════════════════════════════════════════════

1196

class ConstructiveFilter:
    """Ensure every conversation output is constructive towards humanity.

    This is the deepest philosophical guardrail: the hive exists to make
    human lives better — longer, more peaceful, more sustainable.
    Every output must serve this purpose.
    """

    @staticmethod
    def check_output(response: str) -> Tuple[bool, str]:
        """Check if an agent's output is constructive.

        Empty or whitespace-only output passes. Otherwise the normalised
        text is matched against ``VALUES.DESTRUCTIVE_PATTERNS`` first and
        ``VALUES.VIOLATION_PATTERNS`` second; the first hit is reported.

        Returns:
            ``(True, 'ok')`` or ``(False, reason)``.
        """
        if not (response and response.strip()):
            return True, 'ok'

        cleaned = _normalize_for_violation_check(response)

        hit = next(
            (p for p in VALUES.DESTRUCTIVE_PATTERNS if p.search(cleaned)), None)
        if hit is not None:
            return False, f'Destructive content detected: {hit.pattern}'

        hit = next(
            (p for p in VALUES.VIOLATION_PATTERNS if p.search(cleaned)), None)
        if hit is not None:
            return False, f'Constitutional violation in output: {hit.pattern}'

        return True, 'ok'

    @staticmethod
    def check_agent_evolution(old_skills: dict, new_skills: dict,
                              agent_id: str) -> Tuple[bool, str]:
        """Gate agent self-evolution within guardrailed space.

        Only skills present in ``new_skills`` but not in ``old_skills`` are
        examined. Each name is lower-cased and has spaces and hyphens turned
        into underscores before the lookup in
        ``VALUES.PROHIBITED_EVOLUTION_SKILLS``. ``agent_id`` is not used.

        Returns:
            ``(False, reason)`` for a prohibited new skill, else
            ``(True, 'ok')``.
        """
        added = set(new_skills) - set(old_skills)
        for name in added:
            key = name.lower().replace(' ', '_').replace('-', '_')
            if key in VALUES.PROHIBITED_EVOLUTION_SKILLS:
                return False, f'Prohibited evolution: {name}'
        return True, 'ok'

1234

1235

1236# ═══════════════════════════════════════════════════════════════════════

1237# 9. Universal Guardrail Enforcer — wraps EVERY execution path

1238# ═══════════════════════════════════════════════════════════════════════

1239

class GuardrailEnforcer:
    """Single entry point that applies ALL guardrails.

    Call before_dispatch() before EVERY model call, goal creation, or dispatch.
    Call after_response() after EVERY model response.
    """

    @staticmethod
    def before_dispatch(prompt: str, goal_dict: dict = None,
                        node_id: str = None) -> Tuple[bool, str, str]:
        """Pre-dispatch guardrail gate.

        Order of checks: circuit breaker, constitutional filter on the
        prompt, then (only when ``goal_dict`` is truthy) the constitutional
        and ethos goal checks. ``node_id`` is not used.

        Returns:
            ``(allowed, reason, prompt_to_use)``. On rejection the original
            prompt is returned; on success the prompt is passed through
            ``HiveEthos.rewrite_prompt_for_togetherness`` first.
        """
        # 1. Circuit breaker
        if HiveCircuitBreaker.is_halted():
            return False, 'Hive is halted', prompt

        # 2. Constitutional filter on prompt
        ok, why = ConstitutionalFilter.check_prompt(prompt)
        if not ok:
            return False, why, prompt

        # 3. Goal-specific checks, first failure wins
        if goal_dict:
            for gate in (ConstitutionalFilter.check_goal,
                         HiveEthos.check_goal_ethos):
                ok, why = gate(goal_dict)
                if not ok:
                    return False, why, prompt

        # 4. Rewrite for togetherness
        return True, 'ok', HiveEthos.rewrite_prompt_for_togetherness(prompt)

    @staticmethod
    def after_response(response: str, model_id: str = None,
                       duration_ms: float = 0, node_id: str = None) -> Tuple[bool, str]:
        """Post-response guardrail gate.

        Runs the constructive filter on the output and, when ``model_id`` is
        given, records the call's energy use with the model registry. A
        missing registry module is ignored. ``node_id`` is not used.

        Returns:
            ``(True, 'ok')`` or ``(False, reason)`` from the filter.
        """
        # 1. Constructive filter on output
        ok, why = ConstructiveFilter.check_output(response)
        if not ok:
            return False, why

        # 2. Energy tracking (every compute spent)
        if not model_id:
            return True, 'ok'
        try:
            from integrations.agent_engine.model_registry import model_registry
            model_registry.record_energy(model_id, duration_ms)
        except ImportError:
            pass
        return True, 'ok'

1292

1293

1294# ═══════════════════════════════════════════════════════════════════════

1295# 10. Guardrail Network — Topology of Intelligent Safety Nodes

1296# ═══════════════════════════════════════════════════════════════════════

1297

class GuardrailNetwork:
    """Network topology where each guardrail class is a node with local intelligence.

    Deterministic paths (regex, thresholds) are INTERLEAVED with intelligent
    evaluation (scoring, conflict resolution, constructiveness assessment).
    """

    # Node registry: name -> (class, weight in consensus)
    _nodes = {
        'constitutional': (ConstitutionalFilter, 1.0),   # Highest weight
        'ethos': (HiveEthos, 0.9),
        'constructive': (ConstructiveFilter, 0.9),
        'circuit_breaker': (HiveCircuitBreaker, 1.0),    # Absolute veto
        'compute_democracy': (ComputeDemocracy, 0.7),
        'energy': (EnergyAwareness, 0.5),
        'world_model': (WorldModelSafetyBounds, 0.8),
        'conflict': (ConflictResolver, 0.6),
    }

    @classmethod
    def evaluate(cls, prompt: str = '', goal_dict: dict = None,
                 response: str = '', context: str = 'dispatch') -> dict:
        """Run all relevant guardrail nodes and return weighted consensus.

        Nodes consulted: constitutional (on ``prompt``, or ``response`` when
        no prompt is given), ethos (when ``goal_dict`` is truthy),
        constructive (when ``response`` is truthy) and energy (always 1.0).
        ``context`` is not used.

        Returns:
            Dict with ``allowed``, ``score`` (weighted mean, 3 decimals),
            ``reasons`` (failure messages) and ``node_scores``. A halted
            hive short-circuits to a denied result.
        """
        if HiveCircuitBreaker.is_halted():
            return {'allowed': False, 'score': 0.0,
                    'reasons': ['Hive halted by circuit breaker'],
                    'node_scores': {'circuit_breaker': 0.0}}

        node_scores = {}
        failures = []

        def record(node, outcome):
            # Store a pass/fail score for the node and keep failure reasons.
            ok, why = outcome
            node_scores[node] = 1.0 if ok else 0.0
            if not ok:
                failures.append(why)

        subject = prompt or response or ''

        # Node 1: Constitutional (deterministic + pattern scoring)
        if subject:
            record('constitutional', ConstitutionalFilter.check_prompt(subject))

        # Node 2: Ethos (pattern scoring)
        if goal_dict:
            record('ethos', HiveEthos.check_goal_ethos(goal_dict))

        # Node 3: Constructive (intelligent scoring on response)
        if response:
            record('constructive', ConstructiveFilter.check_output(response))

        # Node 4: Energy awareness (informational, not blocking)
        node_scores['energy'] = 1.0

        # Weighted consensus; a hard fail (0.0) on a node weighted >= 0.9
        # vetoes the result regardless of the mean.
        weight_total = 0.0
        weighted = 0.0
        veto = False
        for node, value in node_scores.items():
            weight = cls._nodes.get(node, (None, 0.5))[1]
            weighted += value * weight
            weight_total += weight
            if value == 0.0 and weight >= 0.9:
                veto = True

        consensus = weighted / weight_total if weight_total > 0 else 1.0

        return {
            'allowed': consensus >= 0.5 and not veto,
            'score': round(consensus, 3),
            'reasons': failures,
            'node_scores': node_scores,
        }

    @classmethod
    def get_network_status(cls) -> dict:
        """Get status of all guardrail nodes in the network.

        Returns:
            Dict with the node names, circuit breaker status, guardrail hash
            and integrity result, guardian purpose list and the fixed
            topology label ``'mesh'``.
        """
        node_names = list(cls._nodes.keys())
        breaker = HiveCircuitBreaker.get_status()
        current_hash = get_guardrail_hash()
        integrity = verify_guardrail_integrity()
        purpose = list(VALUES.GUARDIAN_PURPOSE)
        return {
            'nodes': node_names,
            'circuit_breaker': breaker,
            'guardrail_hash': current_hash,
            'guardrail_integrity': integrity,
            'guardian_purpose': purpose,
            'topology': 'mesh',
        }

1390

1391

1392# ═══════════════════════════════════════════════════════════════════════

1393# MODULE-LEVEL GUARD — Prevent rebinding frozen globals

1394# ═══════════════════════════════════════════════════════════════════════

1395

class _GuardrailModule(type(_sys.modules[__name__])):
    """Module subclass that prevents rebinding frozen names at runtime.

    Once this class is installed on the module, attribute assignment or
    deletion through the module object, e.g.:
        hive_guardrails.VALUES = something
        del hive_guardrails._GUARDRAIL_HASH
    raises AttributeError for every name in ``_FROZEN_NAMES``. All other
    names behave normally.
    """

    _FROZEN_NAMES = frozenset((
        'VALUES',
        '_FrozenValues',
        'compute_guardrail_hash',
        'verify_guardrail_integrity',
        '_GUARDRAIL_HASH',
    ))

    def __setattr__(self, name, value):
        """Reject assignment to frozen names; delegate everything else."""
        if name not in self._FROZEN_NAMES:
            return super().__setattr__(name, value)
        raise AttributeError(f"Cannot modify frozen guardrail: {name}")

    def __delattr__(self, name):
        """Reject deletion of frozen names; delegate everything else."""
        if name not in self._FROZEN_NAMES:
            return super().__delattr__(name)
        raise AttributeError(f"Cannot delete frozen guardrail: {name}")


# Swap this module's class so the guards above apply to the module object.
_sys.modules[__name__].__class__ = _GuardrailModule

Coverage for security / hive_guardrails.py: 91.9%

472 statements