Coverage for security / key_delegation.py: 89.3%
309 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Key Delegation: Hierarchical certificate chain for 3-tier HevolveSocial network.
4Central (hevolve.ai) signs certificates for Regional hosts.
5Regional hosts are verified via certificate chain back to master key
6AND/OR trusted-keys registry lookup (hybrid model).
7Local nodes (Nunba) connect to their assigned regional host.
9Certificate format:
10{
11 "node_id": "...",
12 "public_key": "<hex>",
13 "tier": "regional",
14 "region_name": "us-east-1",
15 "issued_at": "ISO8601",
16 "expires_at": "ISO8601",
17 "capabilities": ["registry", "gossip_hub", "agent_host"],
18 "parent_public_key": "<hex>",
19 "parent_signature": "<hex>"
20}
21"""
22import os
23import json
24import logging
25import secrets
26import socket
27import threading
28from pathlib import Path
29from typing import Optional, Dict, Tuple
30from datetime import datetime, timedelta, timezone
32from cryptography.hazmat.primitives.asymmetric.ed25519 import (
33 Ed25519PrivateKey, Ed25519PublicKey,
34)
35from cryptography.hazmat.primitives import serialization
36from cryptography.exceptions import InvalidSignature
38logger = logging.getLogger('hevolve_security')
# Default on-disk location of this node's certificate. The directory is taken
# from HEVOLVE_KEY_DIR at import time and falls back to 'agent_data'.
_DEFAULT_CERT_PATH = os.path.join(
    os.environ.get('HEVOLVE_KEY_DIR', 'agent_data'),
    'node_certificate.json',
)

# Trusted Hevolve infrastructure domains. HARDCODED on purpose: this repo is
# open-sourced and an env var would be trivially spoofed. Matching one of
# these domains does NOT grant regional authorization by itself; it only
# yields PROVISIONAL status that central must confirm via challenge-response.
_TRUSTED_DOMAINS = ('hevolve.ai', 'hertzai.com')

# Challenge-response protocol constants (shared by DomainChallengeVerifier
# and _generate_domain_nonce).
_CHALLENGE_TTL_SECONDS = 60            # lifetime of an issued challenge
_CHALLENGE_NONCE_BYTES = 32            # nonce size in bytes (256 bits)
_MAX_CHALLENGES_PER_FQDN_PER_HOUR = 5  # rate limit per claimed FQDN
_PROVISIONAL_CERT_VALIDITY_DAYS = 7    # lifetime of auto-issued certificates
def _detect_node_domain() -> str:
    """Return this node's FQDN as reported by the operating system.

    The value comes from socket.getfqdn() and is lowercased. There is
    deliberately no env var override, because that would be trivially
    spoofable once the repo is public.

    Returns:
        The lowercase FQDN, or '' if detection raised an exception.
    """
    try:
        detected = socket.getfqdn().lower()
        logger.debug(f"Node domain from socket.getfqdn(): {detected}")
    except Exception as exc:
        logger.warning(f"Failed to detect node FQDN: {exc}")
        return ''
    return detected
def _is_trusted_domain(fqdn: str) -> bool:
    """Check if an FQDN belongs to a trusted Hevolve infrastructure domain.

    The value must first be a syntactically plain hostname: dot-separated,
    non-empty labels of at most 63 characters made of ASCII letters, digits,
    '-' or '_', not starting or ending with '-', and at most 253 characters
    overall. Only then is it compared against _TRUSTED_DOMAINS: it matches
    if it equals an entry or ends with '.' + entry (for example
    'regional-us.hevolve.ai' matches 'hevolve.ai').

    The syntax check matters because the FQDN is later interpolated into a
    callback URL. A pure suffix test would accept values such as
    'evil.com/?.hevolve.ai' or '.hevolve.ai', which end with a trusted
    suffix but do not name a host inside the trusted domain.

    Rejects non-strings, empty strings, bare names without dots, and
    domains that merely contain the suffix (e.g. 'malicioushevolve.ai').
    The suffix comparison is case-sensitive, as before.
    """
    if not isinstance(fqdn, str) or not fqdn or '.' not in fqdn:
        return False
    if len(fqdn) > 253:
        return False

    allowed_chars = frozenset(
        'abcdefghijklmnopqrstuvwxyz'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        '0123456789-_'
    )
    for label in fqdn.split('.'):
        # An empty label means a leading, trailing, or doubled dot.
        if not label or len(label) > 63:
            return False
        if label[0] == '-' or label[-1] == '-':
            return False
        # Anything outside the hostname alphabet ('/', '?', '#', '@', ':',
        # whitespace, non-ASCII, ...) could change how a URL built from this
        # value is parsed.
        if not allowed_chars.issuperset(label):
            return False

    return any(
        fqdn == domain or fqdn.endswith('.' + domain)
        for domain in _TRUSTED_DOMAINS
    )
def _generate_domain_nonce() -> str:
    """Return a fresh random nonce, hex-encoded, for domain challenge-response.

    The nonce is _CHALLENGE_NONCE_BYTES bytes drawn from the `secrets`
    module, the same size DomainChallengeVerifier uses for its challenges.
    """
    raw_nonce = secrets.token_bytes(_CHALLENGE_NONCE_BYTES)
    return raw_nonce.hex()
def get_node_tier() -> str:
    """Return node TOPOLOGY mode from the HEVOLVE_NODE_TIER env var.

    Returns one of: 'flat', 'regional', 'central', 'local'.

    NOTE: Despite the name, this returns TOPOLOGY MODE, not capability tier.
    For capability tier (embedded/lite/standard/full/compute_host), use
    security.system_requirements.get_tier() instead.
    Legacy name retained for backward compatibility.

    Resolution rules, in order:
      1. If HEVOLVE_MASTER_PRIVATE_KEY holds a key whose public half equals
         MASTER_PUBLIC_KEY_HEX, the node auto-promotes to 'central'. As a
         side effect HEVOLVE_NODE_TIER is set to 'central' in os.environ.
      2. 'central' claimed without that proof falls back to 'flat'.
      3. 'regional' without an existing file at HEVOLVE_REGIONAL_CERT falls
         back to 'flat'.
      4. 'regional', 'local' and 'flat' are returned as-is; anything else
         becomes 'flat'.

    Surrounding whitespace in HEVOLVE_NODE_TIER is ignored.
    """
    tier = os.environ.get('HEVOLVE_NODE_TIER', 'flat').strip().lower()

    # Central tier requires cryptographic proof — the master private key.
    # Setting HEVOLVE_NODE_TIER=central alone is NOT enough.
    priv_hex = os.environ.get('HEVOLVE_MASTER_PRIVATE_KEY', '')
    if priv_hex and len(priv_hex) >= 64:
        try:
            priv = Ed25519PrivateKey.from_private_bytes(bytes.fromhex(priv_hex))
            pub_hex = priv.public_key().public_bytes(
                encoding=serialization.Encoding.Raw,
                format=serialization.PublicFormat.Raw,
            ).hex()
            from security.master_key import MASTER_PUBLIC_KEY_HEX
            if pub_hex == MASTER_PUBLIC_KEY_HEX:
                os.environ['HEVOLVE_NODE_TIER'] = 'central'
                return 'central'
        except Exception as exc:
            # Invalid key — don't promote. Log only the exception type so no
            # key material can end up in the logs.
            logger.debug(
                "HEVOLVE_MASTER_PRIVATE_KEY could not be used for central "
                "promotion (%s)", type(exc).__name__)

    # Central claimed without proof — reject and fall back.
    if tier == 'central':
        logger.critical(
            "HEVOLVE_NODE_TIER=central set without valid HEVOLVE_MASTER_PRIVATE_KEY. "
            "Central tier requires the master key. Refusing to start as central. "
            "Falling back to 'flat'. If you are the steward, set the key."
        )
        return 'flat'

    # Regional requires a certificate file to be present (its validity is
    # checked elsewhere).
    # NOTE(review): this looks at HEVOLVE_REGIONAL_CERT, while
    # load_node_certificate() reads HEVOLVE_NODE_CERT_PATH — confirm that
    # both variables are meant to be configured.
    if tier == 'regional':
        cert_path = os.environ.get('HEVOLVE_REGIONAL_CERT', '')
        if not cert_path or not os.path.isfile(cert_path):
            logger.warning(
                "HEVOLVE_NODE_TIER=regional set without HEVOLVE_REGIONAL_CERT. "
                "Falling back to 'flat'."
            )
            return 'flat'

    if tier in ('regional', 'local', 'flat'):
        return tier
    return 'flat'
def create_child_certificate(
    parent_private_key: Ed25519PrivateKey,
    child_public_key_hex: str,
    node_id: str,
    tier: str,
    region_name: str,
    capabilities: list = None,
    validity_days: int = 365,
) -> dict:
    """Build and sign a certificate for a child node.

    Used by central to certify regional hosts, or by regional to certify
    locals.

    Args:
        parent_private_key: Ed25519 key of the issuing (parent) node.
        child_public_key_hex: Hex public key being certified.
        node_id: Identifier written into the certificate.
        tier: Tier written into the certificate.
        region_name: Region written into the certificate.
        capabilities: Capability list; a falsy value selects
            ['gossip_hub', 'agent_host'].
        validity_days: Requested lifetime, capped at 365 days.

    Returns:
        The certificate dict including 'parent_signature', an Ed25519
        signature over the canonical JSON of every other field.
    """
    lifetime_cap_days = 365
    lifetime_days = min(validity_days, lifetime_cap_days)

    issuer_public_hex = parent_private_key.public_key().public_bytes(
        encoding=serialization.Encoding.Raw,
        format=serialization.PublicFormat.Raw,
    ).hex()

    issued = datetime.now(timezone.utc)
    expires = issued + timedelta(days=lifetime_days)
    granted = capabilities if capabilities else ['gossip_hub', 'agent_host']

    cert = {
        'node_id': node_id,
        'public_key': child_public_key_hex,
        'tier': tier,
        'region_name': region_name,
        'issued_at': issued.isoformat(),
        'expires_at': expires.isoformat(),
        'capabilities': granted,
        'parent_public_key': issuer_public_hex,
    }

    # The signature covers every field present so far; 'parent_signature'
    # itself is added only afterwards.
    payload = json.dumps(cert, sort_keys=True, separators=(',', ':'))
    cert['parent_signature'] = parent_private_key.sign(
        payload.encode('utf-8')).hex()
    return cert
def verify_certificate_signature(certificate: dict) -> bool:
    """Tell whether a certificate's 'parent_signature' checks out.

    The signature is verified against the 'parent_public_key' embedded in
    the certificate itself, over the canonical JSON of all fields except
    'parent_signature'. This says nothing about whether that parent key is
    trusted.

    Returns:
        True if the signature verifies. False if either field is missing or
        empty, or if anything goes wrong while decoding or verifying.
    """
    try:
        signature_hex = certificate.get('parent_signature', '')
        issuer_hex = certificate.get('parent_public_key', '')
        if not (signature_hex and issuer_hex):
            return False

        unsigned = {
            field: value
            for field, value in certificate.items()
            if field != 'parent_signature'
        }
        message = json.dumps(unsigned, sort_keys=True, separators=(',', ':'))

        issuer_key = Ed25519PublicKey.from_public_bytes(bytes.fromhex(issuer_hex))
        issuer_key.verify(bytes.fromhex(signature_hex), message.encode('utf-8'))
    except Exception:
        # Bad hex, wrong key length, invalid signature, non-dict input, ...
        return False
    return True
def verify_certificate_chain(
    certificate: dict,
    trusted_keys: dict = None,
) -> dict:
    """Validate a certificate via signature chain and/or trusted registry.

    Path 1 (chain): the certificate's parent_signature must verify, its
    parent_public_key must equal MASTER_PUBLIC_KEY_HEX, and it must carry an
    'expires_at' that parses and has not passed.

    Path 2 (registry): trusted_keys[node_id] must equal the certificate's
    public_key. This path is only evaluated when both trusted_keys and the
    public key are non-empty.

    Either path succeeding makes the certificate valid; the chain path takes
    precedence when reporting.

    Returns:
        {'valid': bool, 'path': 'chain' | 'registry' | 'none', 'details': str}
    """
    cert_node_id = certificate.get('node_id', 'unknown')
    cert_public_key = certificate.get('public_key', '')

    # ---- Path 1: signature chain back to the master key ----
    chain_ok = False
    chain_msg = ''
    try:
        if not verify_certificate_signature(certificate):
            chain_msg = 'Invalid certificate signature'
        else:
            from security.master_key import MASTER_PUBLIC_KEY_HEX
            if certificate.get('parent_public_key', '') == MASTER_PUBLIC_KEY_HEX:
                chain_ok = True
                chain_msg = 'Certificate signed by master key'
            else:
                chain_msg = 'Certificate signed by non-master key'
    except Exception as e:
        chain_msg = f'Chain verification error: {e}'

    # Expiry is mandatory on the chain path: perpetual certs are rejected.
    if chain_ok:
        try:
            raw_expiry = certificate.get('expires_at', '')
            if not raw_expiry:
                chain_ok = False
                chain_msg = 'Certificate missing expires_at field'
            else:
                deadline = datetime.fromisoformat(raw_expiry)
                if deadline.tzinfo is None:
                    # Naive timestamps are interpreted as UTC.
                    deadline = deadline.replace(tzinfo=timezone.utc)
                if datetime.now(timezone.utc) > deadline:
                    chain_ok = False
                    chain_msg = 'Certificate expired'
        except (ValueError, TypeError):
            chain_ok = False
            chain_msg = 'Malformed certificate expiry date'

    # ---- Path 2: trusted-keys registry (fallback) ----
    registry_ok = False
    registry_msg = ''
    if trusted_keys and cert_public_key:
        if trusted_keys.get(cert_node_id) == cert_public_key:
            registry_ok = True
            registry_msg = 'Public key found in trusted registry'
        else:
            registry_msg = 'Public key not in trusted registry'

    # ---- Hybrid verdict ----
    if chain_ok:
        return {'valid': True, 'path': 'chain', 'details': chain_msg}
    if registry_ok:
        return {'valid': True, 'path': 'registry', 'details': registry_msg}
    return {
        'valid': False,
        'path': 'none',
        'details': f'Chain: {chain_msg}; Registry: {registry_msg or "not checked"}',
    }
def _authorize_central(tier: str) -> dict:
    """Check central-tier credentials: HSM provider first, then env-var key."""
    # Production path: the HSM must expose the trust-anchor public key.
    try:
        from security.hsm_provider import get_hsm_provider
        provider = get_hsm_provider()
        from security.master_key import MASTER_PUBLIC_KEY_HEX
        hsm_pub = provider.get_public_key_hex()
        if hsm_pub == MASTER_PUBLIC_KEY_HEX:
            return {'authorized': True, 'tier': tier, 'provisional': False,
                    'details': f'Central tier authorized - HSM ({provider.get_provider_name()})'}
        return {'authorized': False, 'tier': tier, 'provisional': False,
                'details': 'HSM public key does not match trust anchor'}
    except Exception as exc:
        # HSM unavailable or failing: fall through to the env-var path, but
        # leave a trace instead of swallowing the failure silently.
        logger.debug(
            "HSM provider check failed (%s); trying HEVOLVE_MASTER_PRIVATE_KEY",
            type(exc).__name__)

    # Legacy fallback: master private key from the environment (dev mode).
    priv_hex = os.environ.get('HEVOLVE_MASTER_PRIVATE_KEY', '')
    if not priv_hex:
        return {'authorized': False, 'tier': tier, 'provisional': False,
                'details': 'Central tier requires HSM or HEVOLVE_MASTER_PRIVATE_KEY'}
    try:
        priv = Ed25519PrivateKey.from_private_bytes(bytes.fromhex(priv_hex))
        pub_hex = priv.public_key().public_bytes(
            encoding=serialization.Encoding.Raw,
            format=serialization.PublicFormat.Raw,
        ).hex()
        from security.master_key import MASTER_PUBLIC_KEY_HEX
        if pub_hex != MASTER_PUBLIC_KEY_HEX:
            return {'authorized': False, 'tier': tier, 'provisional': False,
                    'details': 'Master private key does not match hardcoded public key'}
        return {'authorized': True, 'tier': tier, 'provisional': False,
                'details': 'Central tier authorized - env var fallback (use HSM in production)'}
    except Exception as e:
        return {'authorized': False, 'tier': tier, 'provisional': False,
                'details': f'Invalid master private key: {e}'}


def _authorize_regional(tier: str) -> dict:
    """Check regional-tier credentials: certificate first, then trusted domain."""
    # Path 1: certificate-based authorization (FULL — preferred).
    cert = load_node_certificate()
    chain_result = None
    if cert:
        chain_result = verify_certificate_chain(cert)
        if chain_result['valid']:
            if cert.get('tier') != 'regional':
                return {'authorized': False, 'tier': tier, 'provisional': False,
                        'details': f'Certificate tier mismatch: cert says '
                                   f'{cert.get("tier")}, node claims regional'}
            return {'authorized': True, 'tier': tier,
                    'provisional': False,
                    'details': f'Regional tier authorized via certificate '
                               f'({chain_result["path"]})'}
        logger.warning(
            f'Regional certificate invalid ({chain_result["details"]}), '
            f'trying domain-based provisional authorization')

    # Path 2: trusted domain — PROVISIONAL only.
    #
    # Domain detection (socket.getfqdn) is not cryptographic proof; an
    # attacker who controls their own DNS could spoof it. A domain match
    # therefore grants PROVISIONAL status only, pending a challenge-response
    # verification by central.
    # NOTE(review): the nonce returned below is generated locally and is not
    # registered with any DomainChallengeVerifier in this function — confirm
    # how callers are expected to use it.
    fqdn = _detect_node_domain()
    if fqdn and _is_trusted_domain(fqdn):
        nonce = _generate_domain_nonce()
        logger.info(
            f'PROVISIONAL REGIONAL AUTH: {fqdn} matches trusted domain. '
            f'Node authorized provisionally pending central verification. '
            f'Challenge nonce: {nonce[:8]}...')
        return {'authorized': True, 'tier': tier,
                'provisional': True,
                'fqdn': fqdn,
                'challenge_nonce': nonce,
                'details': f'Regional tier PROVISIONAL via trusted domain '
                           f'({fqdn}) — pending central challenge-response'}

    # Neither path succeeded.
    if cert:
        return {'authorized': False, 'tier': tier, 'provisional': False,
                'details': f'Certificate invalid: {chain_result["details"]}; '
                           f'domain "{fqdn or "(undetectable)"}" not in '
                           f'trusted list'}
    return {'authorized': False, 'tier': tier, 'provisional': False,
            'details': f'Regional tier requires a signed certificate or '
                       f'trusted domain FQDN; detected domain: '
                       f'"{fqdn or "(undetectable)"}"'}


def verify_tier_authorization() -> dict:
    """Verify this node has proper credentials for its claimed tier.

    The tier is obtained from get_node_tier(). Enforcement rules:
      - central: the HSM provider must report MASTER_PUBLIC_KEY_HEX. If the
        HSM check raises, HEVOLVE_MASTER_PRIVATE_KEY is tried instead and
        its public half must equal MASTER_PUBLIC_KEY_HEX.
      - regional: tries two paths in order:
          1. Node certificate that passes verify_certificate_chain() and
             declares tier 'regional' → FULL authorization.
          2. Detected FQDN inside a trusted domain → PROVISIONAL
             authorization only; the result then also carries 'fqdn' and
             'challenge_nonce'.
      - local/flat: always authorized.

    Returns:
        {'authorized': bool, 'tier': str, 'provisional': bool, 'details': str}
        The 'provisional' key is present on every return path and is True
        only for the trusted-domain path.
    """
    tier = get_node_tier()

    if tier in ('local', 'flat'):
        return {'authorized': True, 'tier': tier, 'provisional': False,
                'details': 'Local/flat tier - no credentials required'}
    if tier == 'central':
        return _authorize_central(tier)
    if tier == 'regional':
        return _authorize_regional(tier)
    return {'authorized': False, 'tier': tier, 'provisional': False,
            'details': f'Unknown tier: {tier}'}
def load_node_certificate(cert_path: str = None) -> Optional[dict]:
    """Load this node's certificate from disk.

    Args:
        cert_path: File to read. When falsy, HEVOLVE_NODE_CERT_PATH is used,
            and if that is unset, _DEFAULT_CERT_PATH.

    Returns:
        The parsed certificate dict, or None when the file does not exist,
        cannot be read or decoded, or does not contain a JSON object.
        A missing file is silent; every other failure is logged as a warning.
    """
    path = Path(cert_path or os.environ.get('HEVOLVE_NODE_CERT_PATH', _DEFAULT_CERT_PATH))
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # No certificate has been stored yet — not an error.
        return None
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning(f"Failed to load node certificate: {e}")
        return None

    # Callers use dict methods on the result, so any other JSON value
    # (list, string, number, null) is treated as an unusable certificate.
    if not isinstance(data, dict):
        logger.warning(
            f"Failed to load node certificate: expected a JSON object, "
            f"got {type(data).__name__}")
        return None
    return data
def save_node_certificate(certificate: dict, cert_path: str = None):
    """Write the node certificate to disk as indented JSON.

    Args:
        certificate: The certificate dict to store.
        cert_path: Destination file. When falsy, HEVOLVE_NODE_CERT_PATH is
            used, and if that is unset, _DEFAULT_CERT_PATH.

    Missing parent directories are created first.
    """
    target = cert_path
    if not target:
        target = os.environ.get('HEVOLVE_NODE_CERT_PATH', _DEFAULT_CERT_PATH)
    destination = Path(target)

    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open('w', encoding='utf-8') as handle:
        json.dump(certificate, handle, indent=2)
    logger.info(f"Node certificate saved to {destination}")
442# =========================================================================
443# Domain Challenge-Response Verifier
444# =========================================================================
445#
446# 4-step handshake for promoting provisional regional nodes to full status:
447#
448# 1. Node -> Central: REGISTER {fqdn, public_key_hex, tier_claim}
449# 2. Central -> Node: CHALLENGE (nonce delivered to http://{fqdn}:{port}/.well-known/hart-challenge)
450# 3. Node -> Central: CHALLENGE_RESPONSE {fqdn, public_key_hex, nonce, signature_hex}
451# 4. Central -> Node: VERIFIED (short-lived certificate) or REJECTED
452#
453# Security properties:
454# - Nonce is 32 bytes of cryptographic randomness (secrets.token_bytes).
455# - Challenge is single-use: deleted after verification attempt or on expiry.
456# - 60-second TTL prevents replay of stale challenges.
457# - Rate limit of 5 challenges per FQDN per hour mitigates DoS / probing.
458# - HTTP callback to the FQDN proves the node controls that DNS name.
459# - Ed25519 signature binds the nonce to the node's keypair.
460# - Issued certificates are short-lived (7 days) forcing periodic re-verification.
class DomainChallengeVerifier:
    """Challenge-response verifier for provisional regional nodes.

    Intended to be instantiated once on the central node. All state
    (pending challenges and the per-FQDN rate log) is guarded by a single
    threading.Lock, so the public methods can be called from concurrent
    request threads.
    """

    def __init__(self):
        # {nonce_hex: {fqdn, public_key_hex, nonce_hex, created_at, expires_at}}
        self._pending: Dict[str, dict] = {}
        # {fqdn: [timestamp, ...]} — timestamps of recent challenge creations
        self._rate_log: Dict[str, list] = {}
        self._lock = threading.Lock()

    # -----------------------------------------------------------------
    # Step 1+2: Central creates a challenge for a registering node
    # -----------------------------------------------------------------

    def create_challenge(
        self,
        fqdn: str,
        public_key_hex: str,
    ) -> Tuple[bool, dict]:
        """Create a challenge nonce for a provisional regional node.

        Called when a node sends a REGISTER request claiming regional tier.

        Args:
            fqdn: The fully qualified domain name the node claims.
            public_key_hex: The node's Ed25519 public key in hex.

        Returns:
            (success, result_dict). On success result_dict contains the
            nonce_hex that must be delivered to the node, plus 'fqdn' and
            'expires_at'. On failure it contains an 'error' key with a
            human-readable reason (untrusted FQDN, invalid key, or rate
            limit exceeded).
        """
        # --- Validate FQDN: plain hostname syntax AND trusted suffix ---
        # The syntax check is required because the FQDN is later placed into
        # a callback URL; a value like 'evil.com/?.hevolve.ai' ends with a
        # trusted suffix but would direct the callback elsewhere.
        if not self._is_plain_hostname(fqdn) or not _is_trusted_domain(fqdn):
            logger.warning(
                f"CHALLENGE REJECTED: FQDN '{fqdn}' is not in trusted domains")
            return False, {
                'error': f'FQDN "{fqdn}" is not a trusted Hevolve domain',
                'fqdn': fqdn,
            }

        # --- Validate public key format (must be 32-byte Ed25519 key) ---
        try:
            key_bytes = bytes.fromhex(public_key_hex)
            if len(key_bytes) != 32:
                raise ValueError(f'Expected 32 bytes, got {len(key_bytes)}')
            Ed25519PublicKey.from_public_bytes(key_bytes)
        except Exception as e:
            logger.warning(
                f"CHALLENGE REJECTED: invalid public key from {fqdn}: {e}")
            return False, {
                'error': f'Invalid Ed25519 public key: {e}',
                'fqdn': fqdn,
            }

        now = datetime.now(timezone.utc)

        with self._lock:
            # --- Rate limit ---
            self._prune_rate_log(fqdn, now)
            recent_count = len(self._rate_log.get(fqdn, []))
            if recent_count >= _MAX_CHALLENGES_PER_FQDN_PER_HOUR:
                logger.warning(
                    f"CHALLENGE RATE LIMITED: {fqdn} has {recent_count} "
                    f"challenges in the last hour (max {_MAX_CHALLENGES_PER_FQDN_PER_HOUR})")
                return False, {
                    'error': 'Rate limit exceeded: too many challenge requests',
                    'fqdn': fqdn,
                    'retry_after_seconds': 3600,
                }

            # --- Purge any existing expired challenges ---
            self._purge_expired(now)

            # --- Generate cryptographic nonce ---
            nonce_hex = secrets.token_bytes(_CHALLENGE_NONCE_BYTES).hex()
            expires_at = now + timedelta(seconds=_CHALLENGE_TTL_SECONDS)

            self._pending[nonce_hex] = {
                'fqdn': fqdn,
                'public_key_hex': public_key_hex,
                'nonce_hex': nonce_hex,
                'created_at': now.isoformat(),
                'expires_at': expires_at.isoformat(),
            }

            # Record in rate log
            self._rate_log.setdefault(fqdn, []).append(now)

        logger.info(
            f"CHALLENGE CREATED: fqdn={fqdn} nonce={nonce_hex[:16]}... "
            f"expires={expires_at.isoformat()} "
            f"pubkey={public_key_hex[:16]}...")

        return True, {
            'nonce_hex': nonce_hex,
            'fqdn': fqdn,
            'expires_at': expires_at.isoformat(),
        }

    # -----------------------------------------------------------------
    # Step 3+4: Central verifies the signed challenge response
    # -----------------------------------------------------------------

    def verify_response(
        self,
        fqdn: str,
        public_key_hex: str,
        nonce_hex: str,
        signature_hex: str,
    ) -> Tuple[bool, dict]:
        """Verify a challenge response from a provisional regional node.

        The node must sign the raw nonce bytes with its Ed25519 private key.
        The nonce is single-use: it is removed from the pending set as soon
        as it is looked up, regardless of the outcome.

        Args:
            fqdn: The FQDN the node claims.
            public_key_hex: The node's Ed25519 public key in hex.
            nonce_hex: The nonce that was issued in the challenge.
            signature_hex: Hex-encoded Ed25519 signature of the nonce bytes.

        Returns:
            (success, result_dict). On failure, result_dict has 'error'.
        """
        now = datetime.now(timezone.utc)

        with self._lock:
            # --- Look up and consume the challenge (single-use) ---
            challenge = self._pending.pop(nonce_hex, None)

        if challenge is None:
            logger.warning(
                f"CHALLENGE RESPONSE REJECTED: unknown or already-consumed "
                f"nonce from {fqdn} (nonce={nonce_hex[:16]}...)")
            return False, {
                'error': 'Unknown or already-consumed challenge nonce',
                'fqdn': fqdn,
            }

        # --- Check expiry ---
        expires_at = datetime.fromisoformat(challenge['expires_at'])
        if expires_at.tzinfo is None:
            expires_at = expires_at.replace(tzinfo=timezone.utc)
        if now > expires_at:
            logger.warning(
                f"CHALLENGE RESPONSE REJECTED: nonce expired for {fqdn} "
                f"(expired {expires_at.isoformat()}, now {now.isoformat()})")
            return False, {
                'error': 'Challenge nonce has expired',
                'fqdn': fqdn,
                'expired_at': expires_at.isoformat(),
            }

        # --- Verify FQDN matches ---
        if challenge['fqdn'] != fqdn:
            logger.warning(
                f"CHALLENGE RESPONSE REJECTED: FQDN mismatch "
                f"(challenge={challenge['fqdn']}, response={fqdn})")
            return False, {
                'error': 'FQDN does not match the challenge',
                'fqdn': fqdn,
            }

        # --- Verify public key matches ---
        if challenge['public_key_hex'] != public_key_hex:
            logger.warning(
                f"CHALLENGE RESPONSE REJECTED: public key mismatch for {fqdn}")
            return False, {
                'error': 'Public key does not match the challenge',
                'fqdn': fqdn,
            }

        # --- Verify Ed25519 signature on the raw nonce bytes ---
        try:
            nonce_bytes = bytes.fromhex(nonce_hex)
            sig_bytes = bytes.fromhex(signature_hex)
            pub_bytes = bytes.fromhex(public_key_hex)
            pub_key = Ed25519PublicKey.from_public_bytes(pub_bytes)
            pub_key.verify(sig_bytes, nonce_bytes)
        except InvalidSignature:
            logger.warning(
                f"CHALLENGE RESPONSE REJECTED: invalid signature for {fqdn} "
                f"(nonce={nonce_hex[:16]}...)")
            return False, {
                'error': 'Invalid signature: does not match public key and nonce',
                'fqdn': fqdn,
            }
        except Exception as e:
            # Malformed hex, wrong key length, and similar decoding problems.
            logger.warning(
                f"CHALLENGE RESPONSE REJECTED: signature verification error "
                f"for {fqdn}: {e}")
            return False, {
                'error': f'Signature verification error: {e}',
                'fqdn': fqdn,
            }

        logger.info(
            f"CHALLENGE RESPONSE VERIFIED: fqdn={fqdn} "
            f"pubkey={public_key_hex[:16]}... "
            f"nonce={nonce_hex[:16]}...")

        return True, {
            'verified': True,
            'fqdn': fqdn,
            'public_key_hex': public_key_hex,
        }

    # -----------------------------------------------------------------
    # Step 4 (continued): Issue a short-lived certificate on success
    # -----------------------------------------------------------------

    def issue_provisional_cert(
        self,
        parent_private_key: "Ed25519PrivateKey",
        fqdn: str,
        public_key_hex: str,
        region_name: str = '',
    ) -> dict:
        """Issue a short-lived regional certificate.

        Meant to be called by central after verify_response() returned
        success; this method performs no verification of its own. The
        certificate is valid for _PROVISIONAL_CERT_VALIDITY_DAYS days,
        forcing periodic re-verification.

        Args:
            parent_private_key: Central's Ed25519 private key (the signing authority).
            fqdn: The verified FQDN of the regional node.
            public_key_hex: The node's verified Ed25519 public key.
            region_name: Optional region name (e.g. 'us-east-1'). When empty,
                the FQDN's first label is used if the FQDN has more than two
                labels, otherwise the whole FQDN.

        Returns:
            A signed certificate dict suitable for save_node_certificate().
        """
        if not region_name:
            # Derive region from FQDN: 'us-east-1.hevolve.ai' -> 'us-east-1'
            labels = fqdn.split('.')
            region_name = labels[0] if len(labels) > 2 else fqdn

        node_id = f'regional-{fqdn}'

        cert = create_child_certificate(
            parent_private_key=parent_private_key,
            child_public_key_hex=public_key_hex,
            node_id=node_id,
            tier='regional',
            region_name=region_name,
            capabilities=['gossip_hub', 'agent_host'],
            validity_days=_PROVISIONAL_CERT_VALIDITY_DAYS,
        )

        logger.info(
            f"PROVISIONAL CERT ISSUED: fqdn={fqdn} node_id={node_id} "
            f"region={region_name} validity={_PROVISIONAL_CERT_VALIDITY_DAYS}d "
            f"expires={cert['expires_at']}")

        return cert

    # -----------------------------------------------------------------
    # Full handshake orchestrator (convenience for central controller)
    # -----------------------------------------------------------------

    def handle_register(
        self,
        fqdn: str,
        public_key_hex: str,
        challenge_port: int = 6777,
    ) -> Tuple[bool, dict]:
        """Handle a full REGISTER request: validate, create a challenge, and
        deliver it to the node via HTTP callback.

        This is the entry point for step 1+2 of the handshake.

        Args:
            fqdn: The FQDN the node claims.
            public_key_hex: The node's Ed25519 public key.
            challenge_port: Port of the node's HTTP server. Must convert to
                an integer in 1..65535; anything else is rejected because
                the value is interpolated into the callback URL.

        Returns:
            (success, result_dict). On success, result_dict includes
            'nonce_hex', 'callback_status' and 'callback_url'.
            NOTE(review): the result carries the nonce; presumably it must
            not be echoed back to the registering node in the REGISTER
            response, otherwise the HTTP callback proves nothing — confirm
            against the controller that calls this.
        """
        # Validate the port before anything is created: a value such as
        # '6777@evil.com' would otherwise redirect the callback.
        try:
            port = int(challenge_port)
        except (TypeError, ValueError):
            port = 0
        if not 0 < port <= 65535:
            logger.warning(
                f"CHALLENGE REJECTED: invalid callback port "
                f"{challenge_port!r} for {fqdn}")
            return False, {
                'error': 'Invalid challenge port',
                'fqdn': fqdn,
            }

        ok, result = self.create_challenge(fqdn, public_key_hex)
        if not ok:
            return False, result

        nonce_hex = result['nonce_hex']

        # Deliver challenge to the node via HTTP GET
        callback_url = f'http://{fqdn}:{port}/.well-known/hart-challenge'
        try:
            from core.http_pool import pooled_get
            resp = pooled_get(
                callback_url,
                params={'nonce': nonce_hex},
                timeout=10,
            )
            if resp.status_code != 200:
                logger.warning(
                    f"CHALLENGE DELIVERY FAILED: {callback_url} returned "
                    f"HTTP {resp.status_code}")
                # Delivery failed, so the challenge can never be answered.
                self._discard_challenge(nonce_hex)
                return False, {
                    'error': f'Challenge delivery failed: HTTP {resp.status_code}',
                    'fqdn': fqdn,
                    'callback_url': callback_url,
                }

            logger.info(
                f"CHALLENGE DELIVERED: {callback_url} returned HTTP 200 "
                f"(nonce={nonce_hex[:16]}...)")

            result['callback_status'] = 'delivered'
            result['callback_url'] = callback_url
            return True, result

        except Exception as e:
            logger.warning(
                f"CHALLENGE DELIVERY FAILED: could not reach {callback_url}: {e}")
            # Delivery failed, so the challenge can never be answered.
            self._discard_challenge(nonce_hex)
            return False, {
                'error': f'Cannot reach node at {callback_url}: {e}',
                'fqdn': fqdn,
                'callback_url': callback_url,
            }

    def handle_challenge_response(
        self,
        fqdn: str,
        public_key_hex: str,
        nonce_hex: str,
        signature_hex: str,
        parent_private_key: "Ed25519PrivateKey",
        region_name: str = '',
    ) -> Tuple[bool, dict]:
        """Handle a CHALLENGE_RESPONSE: verify signature and issue certificate.

        This is the entry point for steps 3+4 of the handshake.

        Args:
            fqdn: The FQDN the node claims.
            public_key_hex: The node's Ed25519 public key.
            nonce_hex: The nonce from the challenge.
            signature_hex: Ed25519 signature of the nonce bytes.
            parent_private_key: Central's private key for signing certs.
            region_name: Optional region name for the certificate.

        Returns:
            (success, result_dict). On success, result_dict includes
            'certificate' containing the signed short-lived cert. On
            failure it is the error dict from verify_response().
        """
        ok, result = self.verify_response(
            fqdn, public_key_hex, nonce_hex, signature_hex)
        if not ok:
            return False, result

        result['certificate'] = self.issue_provisional_cert(
            parent_private_key=parent_private_key,
            fqdn=fqdn,
            public_key_hex=public_key_hex,
            region_name=region_name,
        )
        return True, result

    # -----------------------------------------------------------------
    # Internal helpers
    # -----------------------------------------------------------------

    @staticmethod
    def _is_plain_hostname(fqdn) -> bool:
        """Return True if fqdn is made only of dot-separated hostname labels.

        Each label must be non-empty, at most 63 characters, consist of
        ASCII letters, digits, '-' or '_', and not start or end with '-'.
        The whole name may be at most 253 characters.
        """
        if not isinstance(fqdn, str) or not fqdn or len(fqdn) > 253:
            return False
        allowed_chars = frozenset(
            'abcdefghijklmnopqrstuvwxyz'
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            '0123456789-_'
        )
        return all(
            label
            and len(label) <= 63
            and label[0] != '-'
            and label[-1] != '-'
            and allowed_chars.issuperset(label)
            for label in fqdn.split('.')
        )

    def _discard_challenge(self, nonce_hex: str) -> None:
        """Drop a pending challenge if it is still present."""
        with self._lock:
            self._pending.pop(nonce_hex, None)

    def _purge_expired(self, now: datetime) -> int:
        """Remove expired challenges and return how many were removed.

        Caller must hold self._lock.
        """
        expired_keys = []
        for nonce_hex, record in self._pending.items():
            expires_at = datetime.fromisoformat(record['expires_at'])
            if expires_at.tzinfo is None:
                expires_at = expires_at.replace(tzinfo=timezone.utc)
            if now > expires_at:
                expired_keys.append(nonce_hex)
        # Deleting in a second pass avoids mutating the dict while iterating.
        for key in expired_keys:
            del self._pending[key]
        if expired_keys:
            logger.debug(f"Purged {len(expired_keys)} expired challenges")
        return len(expired_keys)

    def _prune_rate_log(self, fqdn: str, now: datetime):
        """Drop rate-log entries for fqdn that are older than one hour.

        The FQDN's key is removed entirely once no recent entries remain, so
        the log does not accumulate empty lists. Caller must hold self._lock.
        """
        timestamps = self._rate_log.get(fqdn)
        if timestamps is None:
            return
        cutoff = now - timedelta(hours=1)
        recent = [ts for ts in timestamps if ts > cutoff]
        if recent:
            self._rate_log[fqdn] = recent
        else:
            del self._rate_log[fqdn]

    def get_pending_count(self) -> int:
        """Return the number of pending (not yet verified) challenges."""
        with self._lock:
            return len(self._pending)

    def get_pending_for_fqdn(self, fqdn: str) -> int:
        """Return the number of pending challenges for a specific FQDN."""
        with self._lock:
            return sum(
                1 for r in self._pending.values() if r['fqdn'] == fqdn)