Coverage for integrations / agent_engine / auto_evolve.py: 62.3%

247 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Auto Evolve Orchestrator — Democratic thought experiment → autonomous iteration. 

3 

4Single entry point for the full evolution loop: 

51. GATHER — collect eligible thought experiments 

62. FILTER — constitutional gate (ConstitutionalFilter) 

73. VOTE — tally democratic votes (human + agent, weighted) 

84. SELECT — top-N experiments by approval score 

95. DISPATCH — route each winner to its type-aware iteration loop 

106. TRACK — monitor progress, feed results back to evolution stack 

11 

12Triggered by: 

13- Admin "Auto Evolve" button 

14- Agent tool `start_auto_evolve` 

15- Scheduled cron (optional) 

16 

17All iteration is agent-native — the AutoEvolveOrchestrator doesn't run 

18experiments itself. It selects which experiments DESERVE to run, then 

19dispatches them through the existing agent goal system. 

20""" 

21import json 

22import logging 

23import threading 

24import time 

25import uuid 

26from concurrent.futures import ThreadPoolExecutor, as_completed 

27from dataclasses import dataclass, field 

28from typing import Dict, List, Optional 

29 

# Module logger — child of the 'hevolve' hierarchy so app-level handlers apply.
logger = logging.getLogger('hevolve.auto_evolve')

# PRODUCT_MAP §10: DISPATCH uses parallel_dispatch, bounded by this constant.
# Keep conservative — each dispatched experiment spawns an agent goal which may
# fan out further downstream. 4 keeps the blast radius small on flat-tier
# desktops while still honouring the "parallel" contract from the spec.
AUTO_EVOLVE_MAX_PARALLEL_DISPATCH = 4

# PRODUCT_MAP §10: super-majority threshold for VOTE stage — candidates must
# clear 2/3 of the weighted tally, not a simple majority. Expressed as a
# fraction of the maximum possible score so callers can still tune per-session
# via min_approval_score (which is applied as an absolute-score floor in
# addition to this ratio).
AUTO_EVOLVE_SUPERMAJORITY_RATIO = 2.0 / 3.0

# Active-learning bias for the VOTE stage. When the world model
# (HevolveAI side, queried via world_model_bridge.get_learning_feedback)
# reports HIGH epistemic_uncertainty, the system has the most to learn
# from running new experiments — so we boost approval scores slightly
# before sort, biasing the SELECT phase toward the experiments that
# would reduce uncertainty fastest. Classic AL acquisition.
#
# Bounded multiplier: the bias never re-orders candidates that the
# super-majority gate already rejected, and the boost is small enough
# (≤ 1 + AL_MAX_BOOST) that a high-quality consensus pick still beats
# a marginally-passing high-uncertainty pick. The point is a NUDGE on
# tie-breaking, not a coup.
AUTO_EVOLVE_AL_MAX_BOOST = 0.25  # +25% upper bound on the AL multiplier

# Trust gate for the AL signal itself. Prediction-error estimates
# from a half-trained world model are themselves unreliable — the
# model can be over-confident on things it has never actually seen,
# or under-confident on things it has seen but not yet generalized.
# Using such an uncalibrated signal to bias selection would amplify
# the model's own blind spots into the experiment queue.
#
# Maturity signal: HevolveAI's EmbodiedLearner exposes a
# `learning_steps` counter via get_stats() — the actual count of
# learning updates the provider has executed. We require at least
# AL_TRUST_MIN_STEPS before trusting avg_prediction_error as an AL
# acquisition signal; below that, the multiplier is forced to 1.0
# (pure democratic vote decides the rank). Above the floor, the
# boost ramps in linearly to its full AL_MAX_BOOST value at
# AL_TRUST_FULL_STEPS.
#
# Step threshold tuning: 100 = "model has done enough updates to
# have a non-trivial loss surface but isn't yet generalized";
# 10_000 = "fully matured", at which point the prediction-error
# signal is trusted at full weight. Both tunable once we have
# real-world calibration data — kept conservative so the bias
# doesn't fire prematurely.
AUTO_EVOLVE_AL_TRUST_MIN_STEPS = 100
AUTO_EVOLVE_AL_TRUST_FULL_STEPS = 10_000

83 

84 

85def _is_sqlite_backend() -> bool: 

86 """Return True when the active DB engine is SQLite (flat tier). 

87 

88 FIX-1.4a support: SQLite serializes writes at the file level, so 

89 parallel dispatch that commits concurrently produces ``database is 

90 locked`` errors. Callers use this to fall back to a single worker 

91 and preserve the DISPATCH contract without the race. 

92 

93 Detection order: 

94 1. Ask SQLAlchemy directly via the shared ``engine`` (authoritative). 

95 2. Sniff the ``HEVOLVE_DB_URL`` env var (covers first-boot path 

96 where the engine hasn't been created yet). 

97 3. Default to ``True`` — SQLite is the flat-tier default, so the 

98 safer assumption on failure is "serialize." 

99 """ 

100 try: 

101 from integrations.social.models import engine as _engine 

102 dialect = getattr(getattr(_engine, 'dialect', None), 'name', '') or '' 

103 if dialect: 

104 return dialect.lower() == 'sqlite' 

105 except Exception: 

106 pass 

107 try: 

108 import os 

109 url = os.environ.get('HEVOLVE_DB_URL', '') or '' 

110 if url: 

111 return url.lower().startswith('sqlite') 

112 except Exception: 

113 pass 

114 return True # flat-tier default = SQLite 

115 

116 

@dataclass
class EvolveSession:
    """Mutable progress record for one auto-evolve cycle.

    Counters are updated by the orchestrator as the cycle moves through
    its phases; to_dict() is the wire shape served to status queries and
    event payloads.
    """
    # Short random ID — used in thread names, log prefixes, and events.
    session_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    # pending | selecting | dispatching | running | completed | failed
    status: str = 'pending'
    started_at: float = 0.0
    candidates: int = 0
    filtered: int = 0
    selected: int = 0
    dispatched: int = 0
    completed: int = 0
    failed: int = 0
    experiments: List[Dict] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """Serialize the session for status endpoints and event payloads."""
        # elapsed_s reads 0 until start() stamps started_at.
        elapsed = round(time.time() - self.started_at, 1) if self.started_at else 0
        snapshot = {
            'session_id': self.session_id,
            'status': self.status,
            'elapsed_s': elapsed,
        }
        for counter in ('candidates', 'filtered', 'selected',
                        'dispatched', 'completed', 'failed'):
            snapshot[counter] = getattr(self, counter)
        snapshot['experiments'] = self.experiments
        snapshot['errors'] = self.errors
        return snapshot

146 

147 

class AutoEvolveOrchestrator:
    """Democratic selection + autonomous iteration dispatch.

    The orchestrator doesn't run experiments — it selects which ones
    to run based on constitutional + democratic criteria, then dispatches
    them through the agent goal system for type-aware iteration.
    """

    def __init__(self):
        # Most recent session (active or terminal); kept for status queries.
        # Forward-ref annotation: EvolveSession is defined earlier in this file.
        self._active_session: Optional['EvolveSession'] = None
        # Guards _active_session and all per-session counters mutated by
        # the parallel dispatch workers.
        self._lock = threading.Lock()

    def start(self, max_experiments: int = 5,
              min_approval_score: float = 0.3,
              statuses: Optional[List[str]] = None,
              user_id: str = 'system') -> Dict:
        """Start an auto-evolve cycle in a background thread.

        Args:
            max_experiments: Max experiments to dispatch in this cycle
            min_approval_score: Minimum weighted approval score to qualify
            statuses: Which experiment statuses to consider
                (default: ['voting', 'evaluating'])
            user_id: Who triggered the evolve cycle

        Returns:
            {'success': True, 'session_id', 'status'} when launched, or
            {'success': False, 'reason', 'session'} when a cycle is
            already in flight.
        """
        session = EvolveSession()
        session.started_at = time.time()
        session.status = 'selecting'

        # FIX: the previous code checked the active session and installed
        # the new one under two SEPARATE lock acquisitions, so concurrent
        # start() calls could both pass the check (TOCTOU) and clobber each
        # other's session. Check-and-install is now atomic. The guard also
        # rejects 'selecting' and 'dispatching' — those phases are equally
        # "a cycle is in flight"; the old `== 'running'` test let a second
        # start() replace a session mid-selection.
        with self._lock:
            active = self._active_session
            if active and active.status in ('selecting', 'dispatching', 'running'):
                return {
                    'success': False,
                    'reason': 'Auto-evolve cycle already running',
                    'session': active.to_dict(),
                }
            self._active_session = session

        # Run the cycle in a daemon thread; start() returns immediately.
        def _run():
            try:
                self._execute_cycle(session, max_experiments,
                                    min_approval_score,
                                    statuses or ['voting', 'evaluating'],
                                    user_id)
            except Exception as e:
                session.status = 'failed'
                session.errors.append(str(e))
                logger.exception(f"[{session.session_id}] Auto-evolve failed: {e}")

        t = threading.Thread(target=_run, daemon=True,
                             name=f'auto-evolve-{session.session_id}')
        t.start()

        return {
            'success': True,
            'session_id': session.session_id,
            'status': 'selecting',
        }

    def get_status(self) -> Dict:
        """Get current auto-evolve session status (idle marker when none)."""
        with self._lock:
            if self._active_session:
                return self._active_session.to_dict()
        return {'status': 'idle', 'message': 'No active auto-evolve session'}

    def _execute_cycle(self, session: 'EvolveSession',
                       max_experiments: int,
                       min_approval_score: float,
                       statuses: List[str],
                       user_id: str):
        """Run GATHER → FILTER → VOTE → SELECT → DISPATCH for one session.

        Runs on the background thread started by start(); any uncaught
        exception is converted to status='failed' there.
        """

        # Phase 1: GATHER candidates
        candidates = self._gather_candidates(session, statuses)
        session.candidates = len(candidates)
        if not candidates:
            session.status = 'completed'
            session.errors.append('No eligible experiments found')
            self._emit_event('auto_evolve.no_candidates', session.to_dict())
            return

        # Phase 2: FILTER through constitutional gate
        approved = self._constitutional_filter(session, candidates)
        session.filtered = len(approved)

        # Phase 3: VOTE tally + rank
        ranked = self._rank_by_votes(session, approved, min_approval_score)
        session.selected = len(ranked)

        if not ranked:
            session.status = 'completed'
            session.errors.append(
                f'No experiments met approval threshold ({min_approval_score})')
            self._emit_event('auto_evolve.none_approved', session.to_dict())
            return

        # Phase 4: SELECT top-N
        winners = ranked[:max_experiments]

        # Phase 5: DISPATCH to type-aware iteration (parallel per PRODUCT_MAP §10)
        session.status = 'dispatching'
        self._emit_event('auto_evolve.dispatching', {
            'count': len(winners),
            'experiments': [w['id'] for w in winners],
        })

        self._dispatch_winners_parallel(session, winners, user_id)

        # Terminal-state rule: 'running' only if at least one experiment
        # actually entered the agent goal system.
        session.status = 'running' if session.dispatched > 0 else 'failed'
        self._emit_event('auto_evolve.started', session.to_dict())

        logger.info(f"[{session.session_id}] Auto-evolve dispatched "
                    f"{session.dispatched}/{len(winners)} experiments")

    def _gather_candidates(self, session: 'EvolveSession',
                           statuses: List[str]) -> List[Dict]:
        """Gather eligible thought experiments from DB.

        Best-effort: any DB/import failure is logged and yields an empty
        list, which _execute_cycle turns into a clean 'completed' session.
        """
        try:
            from integrations.social.models import db_session
            from integrations.social.thought_experiment_service import (
                ThoughtExperimentService)

            with db_session(commit=False) as db:
                all_experiments = []
                for status in statuses:
                    exps = ThoughtExperimentService.get_active_experiments(
                        db, status=status, limit=50)
                    all_experiments.extend(exps)
                return all_experiments
        except Exception as e:
            logger.warning(f"[{session.session_id}] Gather failed: {e}")
            return []

    def _constitutional_filter(self, session: 'EvolveSession',
                               candidates: List[Dict]) -> List[Dict]:
        """Filter candidates through ConstitutionalFilter.

        When the filter module isn't installed, every candidate passes
        through (same pass-through behavior as before; the import is now
        attempted once instead of once per candidate). Exceptions raised
        by the filter itself still propagate to _execute_cycle, matching
        the original error surface.
        """
        try:
            from security.hive_guardrails import ConstitutionalFilter
        except ImportError:
            # No filter available — pass all candidates through.
            return list(candidates)

        approved = []
        for exp in candidates:
            text = f"{exp.get('title', '')}: {exp.get('hypothesis', '')}"
            check = ConstitutionalFilter.check_prompt(text)
            # check_prompt returns either an (ok, ...) tuple or a dict with
            # an 'approved' key — accept both shapes, defaulting to approve.
            ok = check[0] if isinstance(check, tuple) else check.get('approved', True)
            if ok:
                approved.append(exp)
            else:
                logger.debug(f"[{session.session_id}] Filtered out: {exp.get('id')}")
        return approved

    def _rank_by_votes(self, session: 'EvolveSession',
                       candidates: List[Dict],
                       min_score: float) -> List[Dict]:
        """Tally votes and rank by approval score.

        Two gates apply per PRODUCT_MAP §10:
        1. weighted_score >= min_score (caller-provided absolute floor)
        2. total_for / (total_for + total_against) >= 2/3 super-majority

        Both must hold for an experiment to qualify for dispatch. The
        super-majority gate protects against a small but highly-weighted
        vocal minority flipping a low-participation tally into approval.

        Once the gates pass, the rank is biased by an active-learning
        signal pulled from the world model (HevolveAI) — see
        _active_learning_multiplier docstring. The bias only nudges
        order; it never re-admits a candidate the gates rejected.

        On tally failure the input list is returned unranked (best-effort
        fallback preserved from the original implementation).
        """
        scored = []
        try:
            from integrations.social.models import db_session
            from integrations.social.thought_experiment_service import (
                ThoughtExperimentService)

            with db_session(commit=False) as db:
                for exp in candidates:
                    tally = ThoughtExperimentService.tally_votes(
                        db, exp['id'])
                    score = tally.get('weighted_score', 0)
                    total_for = tally.get('total_for', 0) or 0
                    total_against = tally.get('total_against', 0) or 0
                    decisive = total_for + total_against
                    # Super-majority: ≥ 2/3 of DECISIVE (non-abstain) weight
                    # must be FOR. Abstains are excluded from denominator.
                    super_ratio = (total_for / decisive) if decisive > 0 else 0.0
                    exp['_approval_score'] = score
                    exp['_super_majority'] = round(super_ratio, 4)
                    exp['_tally'] = tally
                    if (score >= min_score
                            and super_ratio >= AUTO_EVOLVE_SUPERMAJORITY_RATIO):
                        scored.append(exp)
                    else:
                        logger.debug(
                            f"[{session.session_id}] Rejected {exp.get('id')}: "
                            f"score={score} super_ratio={super_ratio:.3f} "
                            f"(need score>={min_score} and "
                            f"ratio>={AUTO_EVOLVE_SUPERMAJORITY_RATIO:.3f})"
                        )
        except Exception as e:
            logger.warning(f"[{session.session_id}] Vote tally failed: {e}")
            return candidates  # Fall through unranked

        # Active-learning bias: pull a global epistemic-uncertainty
        # multiplier from the world model and apply it as a small
        # boost to each gated-in candidate's approval score before
        # sort. No-op when the world model isn't reachable.
        al_mult = self._active_learning_multiplier(session)
        for exp in scored:
            base = exp.get('_approval_score', 0) or 0
            exp['_active_learning_multiplier'] = round(al_mult, 4)
            exp['_biased_score'] = round(base * al_mult, 4)

        # Sort by biased score (falls back to unbiased when al_mult=1.0)
        scored.sort(
            key=lambda e: e.get('_biased_score',
                                e.get('_approval_score', 0)),
            reverse=True,
        )
        return scored

    def _active_learning_multiplier(self, session: 'EvolveSession') -> float:
        """Pull the world model's current avg_prediction_error +
        learning_steps via agent_baseline_service._collect_world_model_metrics()
        — the same normalized signals persisted into every baseline
        snapshot — and project them into a [1.0, 1+AL_MAX_BOOST]
        multiplier, gated on model maturity.

        Why colocated with _rank_by_votes (not in baseline_service):
        AL acquisition is a SELECTION concern (which candidates does
        the system want to run NEXT), not a SNAPSHOT concern (what did
        the system look like at this moment). baseline_service owns
        the COLLECTOR (single source for HevolveAI feedback shape);
        this method owns the SELECTION USE of that signal.

        Two-stage trust gate:
          stage 1 — does the AL signal exist? No: return 1.0 (no bias).
          stage 2 — has the world model done enough learning_steps to
              produce a CALIBRATED prediction-error estimate?
              < AL_TRUST_MIN_STEPS: return 1.0 (uncertainty about
                  uncertainty — using the signal would amplify the
                  model's own blind spots into the experiment queue).
              >= AL_TRUST_FULL_STEPS: full AL_MAX_BOOST.
              in between: linear ramp.

        Returns 1.0 in any failure path so a missing / immature world
        model falls back to pure democratic vote — preserves the
        system's pre-wiring behavior.

        HevolveAI field names verified against
        embodied_learner.py::EmbodiedLearner.get_stats — do not
        invent (`epistemic_uncertainty`, `learning_progress`, etc.
        do NOT exist on get_stats; the real keys are
        avg_prediction_error and learning_steps).
        """
        try:
            from integrations.agent_engine.agent_baseline_service import (
                AgentBaselineService)
            wm = AgentBaselineService._collect_world_model_metrics()
        except Exception:
            return 1.0
        if not wm:
            return 1.0

        # Stage 1: AL signal present? Higher prediction error =
        # more to learn from this domain = stronger acquisition
        # value. Already clamped to [0,1] by the collector.
        err = wm.get('al_signal')
        if not isinstance(err, (int, float)):
            return 1.0

        # Stage 2: has the model trained enough that its prediction-
        # error estimate is calibrated?
        steps = wm.get('learning_steps')
        if not isinstance(steps, int) or steps < AUTO_EVOLVE_AL_TRUST_MIN_STEPS:
            logger.debug(
                f"[{session.session_id}] World model still in bootstrap "
                f"(learning_steps={steps} < {AUTO_EVOLVE_AL_TRUST_MIN_STEPS}) "
                f"— skipping AL bias (al_signal={err:.4f} considered uncalibrated)"
            )
            return 1.0

        # Linear ramp from 0 at MIN_STEPS to 1 at FULL_STEPS
        ramp_span = AUTO_EVOLVE_AL_TRUST_FULL_STEPS - AUTO_EVOLVE_AL_TRUST_MIN_STEPS
        trust_factor = (steps - AUTO_EVOLVE_AL_TRUST_MIN_STEPS) / ramp_span
        trust_factor = max(0.0, min(1.0, trust_factor))

        mult = 1.0 + (err * AUTO_EVOLVE_AL_MAX_BOOST * trust_factor)
        logger.debug(
            f"[{session.session_id}] AL bias active: "
            f"al_signal={err:.4f}, learning_steps={steps}, "
            f"trust_factor={trust_factor:.4f} → mult={mult:.4f}"
        )
        return mult

    def _dispatch_winners_parallel(self, session: 'EvolveSession',
                                   winners: List[Dict], user_id: str) -> None:
        """Fan-out winning experiments to type-aware iteration in parallel.

        PRODUCT_MAP §10 DISPATCH stage. Uses a bounded ThreadPoolExecutor
        (cap = AUTO_EVOLVE_MAX_PARALLEL_DISPATCH) so a large approved set
        can't stampede the agent goal system. Each experiment still goes
        through _dispatch_experiment (unchanged contract) — this method
        only adds the concurrency primitive.

        Mutations on `session` (dispatched/failed/experiments/errors) are
        serialized by acquiring self._lock around each bookkeeping update
        (see _record_dispatch_outcome).

        FIX-1.4a: On SQLite (flat tier), ``db.commit()`` serializes the
        whole database file, and multiple concurrent commits produce
        ``database is locked`` SQLAlchemy errors. Detect the engine and
        drop ``max_workers`` to 1 when SQLite is the backend. MySQL
        (regional/central) keeps the bounded parallel dispatch.

        FIX-1.4b: ``_dispatch_experiment`` returns ``{'success': False,
        'reason': ...}`` on logical failure (e.g. already-dispatched
        experiment, missing goal recipe) WITHOUT raising. Earlier code
        counted those as successful dispatches, so a session with 100%
        logical-failure returns would sit at status='running' forever.
        Branch on the ``success`` flag so the terminal-state rule in
        _execute_cycle (``running if dispatched > 0 else failed``) fires
        correctly.
        """
        if not winners:
            return

        max_parallel = AUTO_EVOLVE_MAX_PARALLEL_DISPATCH
        if _is_sqlite_backend():
            max_parallel = 1  # FIX-1.4a: serialize on flat/SQLite
            logger.debug(
                f"[{session.session_id}] SQLite backend detected — "
                f"serializing dispatch to avoid 'database is locked'")

        max_workers = min(len(winners), max_parallel)
        with ThreadPoolExecutor(
                max_workers=max_workers,
                thread_name_prefix=f'auto-evolve-{session.session_id}') as pool:
            futures = {
                pool.submit(self._dispatch_experiment, session, exp, user_id): exp
                for exp in winners
            }
            for future in as_completed(futures):
                exp = futures[future]
                try:
                    goal_result = future.result()
                except Exception as e:
                    # Hard failure: the dispatch call itself raised.
                    with self._lock:
                        session.failed += 1
                        session.errors.append(f"Dispatch {exp['id']}: {e}")
                    logger.warning(
                        f"[{session.session_id}] Failed to dispatch "
                        f"{exp['id']}: {e}")
                    continue
                self._record_dispatch_outcome(session, exp, goal_result)

    def _record_dispatch_outcome(self, session: 'EvolveSession',
                                 exp: Dict, goal_result) -> None:
        """Book-keep one _dispatch_experiment return value on the session.

        FIX-1.4b: only a dict with a truthy 'success' key counts as
        dispatched; anything else is routed to the failed counter and
        errors list.
        """
        dispatched_ok = (
            isinstance(goal_result, dict)
            and bool(goal_result.get('success'))
        )
        if dispatched_ok:
            with self._lock:
                session.dispatched += 1
                session.experiments.append(self._experiment_record(
                    exp, goal_result.get('goal_id'), 'dispatched'))
            return

        reason = (
            goal_result.get('reason')
            if isinstance(goal_result, dict)
            else 'non-dict result'
        ) or 'unknown'
        with self._lock:
            session.failed += 1
            session.errors.append(f"Dispatch {exp['id']}: {reason}")
            session.experiments.append(self._experiment_record(
                exp, None, 'failed', reason=reason))
        logger.info(
            f"[{session.session_id}] Dispatch declined "
            f"for {exp['id']}: {reason}")

    @staticmethod
    def _experiment_record(exp: Dict, goal_id, status: str,
                           reason: Optional[str] = None) -> Dict:
        """Build the per-experiment session record (one shape, two callers)."""
        record = {
            'id': exp['id'],
            'title': exp.get('title', ''),
            'type': exp.get('experiment_type', 'traditional'),
            'approval_score': exp.get('_approval_score', 0),
            'super_majority': exp.get('_super_majority', 0),
            'goal_id': goal_id,
            'status': status,
        }
        # 'reason' key is only present on failure records (matches the
        # original payload shape consumed by status endpoints).
        if reason is not None:
            record['reason'] = reason
        return record

    def _dispatch_experiment(self, session: 'EvolveSession',
                             exp: Dict, user_id: str) -> Dict:
        """Dispatch a winning experiment to its type-aware iteration loop.

        Uses ThoughtExperimentService.request_agent_evaluation() which
        creates an agent goal with the type-aware iteration recipe.
        Commits the transaction only when the service reports success.

        NOTE(review): a previous docstring claimed a special dispatch
        path for experiment_type='code_evolution' + target_repo='hevolveai'
        (routing to the HevolveAI evolution orchestrator). No such branch
        exists in this method — confirm where that routing actually lives
        before relying on it.
        """
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import (
            ThoughtExperimentService)

        with db_session(commit=False) as db:
            result = ThoughtExperimentService.request_agent_evaluation(
                db, exp['id'])
            if result.get('success'):
                db.commit()
            return result

    def _emit_event(self, topic: str, data: Dict):
        """Emit progress event via EventBus (best-effort; never raises)."""
        try:
            from core.platform.events import emit_event
            emit_event(topic, data)
        except Exception:
            pass

584 

585 

# ── Singleton ────────────────────────────────────────────────

_orchestrator: Optional[AutoEvolveOrchestrator] = None
_lock = threading.Lock()


def get_auto_evolve_orchestrator() -> AutoEvolveOrchestrator:
    """Return the process-wide AutoEvolveOrchestrator, creating it lazily.

    Double-checked locking: the unlocked fast-path read is safe under
    the GIL; the locked re-check prevents two threads racing the first
    call from constructing two instances.
    """
    global _orchestrator
    if _orchestrator is not None:
        return _orchestrator
    with _lock:
        if _orchestrator is None:
            _orchestrator = AutoEvolveOrchestrator()
        return _orchestrator

600 

601 

# ── Owner Pause/Resume ───────────────────────────────────────

# Paused experiment IDs — maps experiment_id → the user_id that paused it.
_paused_experiments: Dict[str, str] = {}
_pause_lock = threading.Lock()


def pause_experiment_evolution(experiment_id: str, user_id: str) -> Dict:
    """Pause a running experiment's iteration (owner only).

    The experiment stays in 'evaluating' status but the agent goal
    is signalled to stop iterating.
    """
    # Ownership gate: only the creator may pause. Any lookup failure
    # (including an unavailable DB layer) is reported back as the reason.
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService
        with db_session(commit=False) as db:
            record = ThoughtExperimentService.get_experiment_detail(
                db, experiment_id)
            if not record:
                return {'success': False, 'reason': 'not_found'}
            if record.get('creator_id') != user_id:
                return {'success': False, 'reason': 'not_owner',
                        'message': 'Only the experiment creator can pause it'}
    except Exception as e:
        return {'success': False, 'reason': str(e)}

    with _pause_lock:
        _paused_experiments[experiment_id] = user_id

    logger.info(f"Experiment {experiment_id} paused by {user_id}")
    return {'success': True, 'experiment_id': experiment_id, 'status': 'paused'}


def resume_experiment_evolution(experiment_id: str, user_id: str) -> Dict:
    """Resume a paused experiment's iteration (owner only)."""
    with _pause_lock:
        pauser = _paused_experiments.get(experiment_id)
        if not pauser:
            return {'success': False, 'reason': 'not_paused'}
        if pauser != user_id:
            return {'success': False, 'reason': 'not_owner',
                    'message': 'Only the user who paused can resume'}
        _paused_experiments.pop(experiment_id)

    logger.info(f"Experiment {experiment_id} resumed by {user_id}")
    return {'success': True, 'experiment_id': experiment_id, 'status': 'resumed'}


def is_experiment_paused(experiment_id: str) -> bool:
    """Check whether an experiment's evolution is currently paused."""
    with _pause_lock:
        return experiment_id in _paused_experiments


def get_paused_experiments() -> List[str]:
    """Return a snapshot list of paused experiment IDs."""
    with _pause_lock:
        return list(_paused_experiments)

662 

663 

664# ── Agent Tool Functions ───────────────────────────────────── 

665 

def start_auto_evolve(max_experiments: int = 5,
                      min_approval_score: float = 0.3,
                      user_id: str = 'system') -> str:
    """Kick off one democratic auto-evolve cycle and return session info.

    The orchestrator runs the pipeline asynchronously:
    1. Gathers eligible thought experiments (voting/evaluating phase)
    2. Filters through ConstitutionalFilter
    3. Tallies democratic votes (human + agent, weighted)
    4. Selects top-N by approval score
    5. Dispatches each to its type-aware iteration loop

    Software experiments → autoresearch (edit→run→metric→iterate)
    Traditional experiments → reason_and_refine (hypothesize→score→refine)
    Physical AI experiments → observe_and_measure

    Args:
        max_experiments: Max experiments to dispatch (default 5)
        min_approval_score: Minimum weighted vote score to qualify (default 0.3)
        user_id: Who triggered the cycle

    Returns:
        JSON with session_id and status
    """
    orchestrator = get_auto_evolve_orchestrator()
    outcome = orchestrator.start(
        max_experiments=int(max_experiments),
        min_approval_score=float(min_approval_score),
        user_id=user_id,
    )
    return json.dumps(outcome)

698 

699 

def get_auto_evolve_status() -> str:
    """Return JSON progress for the current auto-evolve cycle.

    Includes candidates gathered, filtered, selected, dispatched, and
    per-experiment status (or an idle marker when no cycle is active).
    """
    return json.dumps(get_auto_evolve_orchestrator().get_status())

708 

709 

def pause_evolve_experiment(experiment_id: str, user_id: str) -> str:
    """Pause a running thought experiment's evolution loop.

    Only the experiment creator (owner) can pause their experiment.
    The experiment stays in 'evaluating' status but iteration stops.

    Args:
        experiment_id: The ThoughtExperiment ID to pause
        user_id: ID of the user requesting pause (must be creator)

    Returns:
        JSON with success status
    """
    return json.dumps(pause_experiment_evolution(experiment_id, user_id))

725 

726 

def resume_evolve_experiment(experiment_id: str, user_id: str) -> str:
    """Resume a paused thought experiment's evolution loop.

    Only the user who paused it can resume.

    Args:
        experiment_id: The ThoughtExperiment ID to resume
        user_id: ID of the user requesting resume (must be pauser)

    Returns:
        JSON with success status
    """
    return json.dumps(resume_experiment_evolution(experiment_id, user_id))

741 

742 

# Tool registration for ServiceToolRegistry.
# Each entry: 'name' exposed to agents, 'func' callable defined above,
# 'description' shown to the agent, and 'tags' used for tool discovery.
AUTO_EVOLVE_TOOLS = [
    {
        'name': 'start_auto_evolve',
        'func': start_auto_evolve,
        'description': (
            'Start democratic auto-evolve cycle: gather thought experiments, '
            'constitutional filter, vote tally, dispatch winners to '
            'autonomous iteration loops.'
        ),
        'tags': ['auto_evolve', 'thought_experiment'],
    },
    {
        'name': 'get_auto_evolve_status',
        'func': get_auto_evolve_status,
        'description': 'Get progress of the current auto-evolve cycle.',
        'tags': ['auto_evolve'],
    },
    {
        'name': 'pause_evolve_experiment',
        'func': pause_evolve_experiment,
        'description': (
            'Pause a running thought experiment evolution (owner only). '
            'Stops iteration but keeps evaluating status.'
        ),
        'tags': ['auto_evolve', 'thought_experiment'],
    },
    {
        'name': 'resume_evolve_experiment',
        'func': resume_evolve_experiment,
        'description': 'Resume a paused thought experiment evolution (owner only).',
        'tags': ['auto_evolve', 'thought_experiment'],
    },
]