Coverage for integrations / agent_engine / auto_evolve.py: 62.3%

247 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Auto Evolve Orchestrator — Democratic thought experiment → autonomous iteration. 

3 

4Single entry point for the full evolution loop: 

51. GATHER — collect eligible thought experiments 

62. FILTER — constitutional gate (ConstitutionalFilter) 

73. VOTE — tally democratic votes (human + agent, weighted) 

84. SELECT — top-N experiments by approval score 

95. DISPATCH — route each winner to its type-aware iteration loop 

106. TRACK — monitor progress, feed results back to evolution stack 

11 

12Triggered by: 

13- Admin "Auto Evolve" button 

14- Agent tool `start_auto_evolve` 

15- Scheduled cron (optional) 

16 

17All iteration is agent-native — the AutoEvolveOrchestrator doesn't run 

18experiments itself. It selects which experiments DESERVE to run, then 

19dispatches them through the existing agent goal system. 

20""" 

21import json 

22import logging 

23import threading 

24import time 

25import uuid 

26from concurrent.futures import ThreadPoolExecutor, as_completed 

27from dataclasses import dataclass, field 

28from typing import Dict, List, Optional 

29 

# Module logger — child of the 'hevolve' hierarchy so app-level handlers apply.
logger = logging.getLogger('hevolve.auto_evolve')

# PRODUCT_MAP §10: DISPATCH uses parallel_dispatch, bounded by this constant.
# Keep conservative — each dispatched experiment spawns an agent goal which may
# fan out further downstream. 4 keeps the blast radius small on flat-tier
# desktops while still honouring the "parallel" contract from the spec.
AUTO_EVOLVE_MAX_PARALLEL_DISPATCH = 4

# PRODUCT_MAP §10: super-majority threshold for VOTE stage — candidates must
# clear 2/3 of the weighted tally, not a simple majority. Expressed as a
# fraction of the maximum possible score so callers can still tune per-session
# via min_approval_score (which is applied as an absolute-score floor in
# addition to this ratio).
AUTO_EVOLVE_SUPERMAJORITY_RATIO = 2.0 / 3.0

# Active-learning bias for the VOTE stage. When the world model
# (HevolveAI side, queried via world_model_bridge.get_learning_feedback)
# reports HIGH epistemic_uncertainty, the system has the most to learn
# from running new experiments — so we boost approval scores slightly
# before sort, biasing the SELECT phase toward the experiments that
# would reduce uncertainty fastest. Classic AL acquisition.
#
# Bounded multiplier: the bias never re-orders candidates that the
# super-majority gate already rejected, and the boost is small enough
# (≤ 1 + AL_MAX_BOOST) that a high-quality consensus pick still beats
# a marginally-passing high-uncertainty pick. The point is a NUDGE on
# tie-breaking, not a coup.
AUTO_EVOLVE_AL_MAX_BOOST = 0.25  # +25% upper bound on the AL multiplier

# Trust gate for the AL signal itself. Prediction-error estimates
# from a half-trained world model are themselves unreliable — the
# model can be over-confident on things it has never actually seen,
# or under-confident on things it has seen but not yet generalized.
# Using such an uncalibrated signal to bias selection would amplify
# the model's own blind spots into the experiment queue.
#
# Maturity signal: HevolveAI's EmbodiedLearner exposes a
# `learning_steps` counter via get_stats() — the actual count of
# learning updates the provider has executed. We require at least
# AL_TRUST_MIN_STEPS before trusting avg_prediction_error as an AL
# acquisition signal; below that, the multiplier is forced to 1.0
# (pure democratic vote decides the rank). Above the floor, the
# boost ramps in linearly to its full AL_MAX_BOOST value at
# AL_TRUST_FULL_STEPS.
#
# Step threshold tuning: 100 = "model has done enough updates to
# have a non-trivial loss surface but isn't yet generalized";
# 10_000 = "fully matured", at which point the prediction-error
# signal is trusted at full weight. Both tunable once we have
# real-world calibration data — kept conservative so the bias
# doesn't fire prematurely.
AUTO_EVOLVE_AL_TRUST_MIN_STEPS = 100
AUTO_EVOLVE_AL_TRUST_FULL_STEPS = 10_000

83 

84 

85def _is_sqlite_backend() -> bool: 

86 """Return True when the active DB engine is SQLite (flat tier). 

87 

88 FIX-1.4a support: SQLite serializes writes at the file level, so 

89 parallel dispatch that commits concurrently produces ``database is 

90 locked`` errors. Callers use this to fall back to a single worker 

91 and preserve the DISPATCH contract without the race. 

92 

93 Detection order: 

94 1. Ask SQLAlchemy directly via the shared ``engine`` (authoritative). 

95 2. Sniff the ``HEVOLVE_DB_URL`` env var (covers first-boot path 

96 where the engine hasn't been created yet). 

97 3. Default to ``True`` — SQLite is the flat-tier default, so the 

98 safer assumption on failure is "serialize." 

99 """ 

100 try: 

101 from integrations.social.models import engine as _engine 

102 dialect = getattr(getattr(_engine, 'dialect', None), 'name', '') or '' 

103 if dialect: 

104 return dialect.lower() == 'sqlite' 

105 except Exception: 

106 pass 

107 try: 

108 import os 

109 url = os.environ.get('HEVOLVE_DB_URL', '') or '' 

110 if url: 

111 return url.lower().startswith('sqlite') 

112 except Exception: 

113 pass 

114 return True # flat-tier default = SQLite 

115 

116 

@dataclass
class EvolveSession:
    """Mutable progress record for one auto-evolve cycle.

    Counters are updated by the orchestrator as the cycle moves through
    its phases; to_dict() is the wire shape served to status queries and
    event payloads.
    """
    # Short random ID — used in thread names, log prefixes, and events.
    session_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    # pending | selecting | dispatching | running | completed | failed
    status: str = 'pending'
    started_at: float = 0.0
    candidates: int = 0
    filtered: int = 0
    selected: int = 0
    dispatched: int = 0
    completed: int = 0
    failed: int = 0
    experiments: List[Dict] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """Serialize the session for status endpoints and event payloads."""
        # elapsed_s reads 0 until start() stamps started_at.
        elapsed = round(time.time() - self.started_at, 1) if self.started_at else 0
        snapshot = {
            'session_id': self.session_id,
            'status': self.status,
            'elapsed_s': elapsed,
        }
        for counter in ('candidates', 'filtered', 'selected',
                        'dispatched', 'completed', 'failed'):
            snapshot[counter] = getattr(self, counter)
        snapshot['experiments'] = self.experiments
        snapshot['errors'] = self.errors
        return snapshot

146 

147 

class AutoEvolveOrchestrator:
    """Democratic selection + autonomous iteration dispatch.

    The orchestrator doesn't run experiments — it selects which ones
    to run based on constitutional + democratic criteria, then dispatches
    them through the agent goal system for type-aware iteration.
    """

    def __init__(self):
        # Most recent session (active or terminal); kept for status queries.
        # Forward-ref annotation: EvolveSession is defined earlier in this file.
        self._active_session: Optional['EvolveSession'] = None
        # Guards _active_session and all per-session counters mutated by
        # the parallel dispatch workers.
        self._lock = threading.Lock()

    def start(self, max_experiments: int = 5,
              min_approval_score: float = 0.3,
              statuses: Optional[List[str]] = None,
              user_id: str = 'system') -> Dict:
        """Start an auto-evolve cycle in a background thread.

        Args:
            max_experiments: Max experiments to dispatch in this cycle
            min_approval_score: Minimum weighted approval score to qualify
            statuses: Which experiment statuses to consider
                (default: ['voting', 'evaluating'])
            user_id: Who triggered the evolve cycle

        Returns:
            {'success': True, 'session_id', 'status'} when launched, or
            {'success': False, 'reason', 'session'} when a cycle is
            already in flight.
        """
        session = EvolveSession()
        session.started_at = time.time()
        session.status = 'selecting'

        # FIX: the previous code checked the active session and installed
        # the new one under two SEPARATE lock acquisitions, so concurrent
        # start() calls could both pass the check (TOCTOU) and clobber each
        # other's session. Check-and-install is now atomic. The guard also
        # rejects 'selecting' and 'dispatching' — those phases are equally
        # "a cycle is in flight"; the old `== 'running'` test let a second
        # start() replace a session mid-selection.
        with self._lock:
            active = self._active_session
            if active and active.status in ('selecting', 'dispatching', 'running'):
                return {
                    'success': False,
                    'reason': 'Auto-evolve cycle already running',
                    'session': active.to_dict(),
                }
            self._active_session = session

        # Run the cycle in a daemon thread; start() returns immediately.
        def _run():
            try:
                self._execute_cycle(session, max_experiments,
                                    min_approval_score,
                                    statuses or ['voting', 'evaluating'],
                                    user_id)
            except Exception as e:
                session.status = 'failed'
                session.errors.append(str(e))
                logger.exception(f"[{session.session_id}] Auto-evolve failed: {e}")

        t = threading.Thread(target=_run, daemon=True,
                             name=f'auto-evolve-{session.session_id}')
        t.start()

        return {
            'success': True,
            'session_id': session.session_id,
            'status': 'selecting',
        }

    def get_status(self) -> Dict:
        """Get current auto-evolve session status (idle marker when none)."""
        with self._lock:
            if self._active_session:
                return self._active_session.to_dict()
        return {'status': 'idle', 'message': 'No active auto-evolve session'}

    def _execute_cycle(self, session: 'EvolveSession',
                       max_experiments: int,
                       min_approval_score: float,
                       statuses: List[str],
                       user_id: str):
        """Run GATHER → FILTER → VOTE → SELECT → DISPATCH for one session.

        Runs on the background thread started by start(); any uncaught
        exception is converted to status='failed' there.
        """

        # Phase 1: GATHER candidates
        candidates = self._gather_candidates(session, statuses)
        session.candidates = len(candidates)
        if not candidates:
            session.status = 'completed'
            session.errors.append('No eligible experiments found')
            self._emit_event('auto_evolve.no_candidates', session.to_dict())
            return

        # Phase 2: FILTER through constitutional gate
        approved = self._constitutional_filter(session, candidates)
        session.filtered = len(approved)

        # Phase 3: VOTE tally + rank
        ranked = self._rank_by_votes(session, approved, min_approval_score)
        session.selected = len(ranked)

        if not ranked:
            session.status = 'completed'
            session.errors.append(
                f'No experiments met approval threshold ({min_approval_score})')
            self._emit_event('auto_evolve.none_approved', session.to_dict())
            return

        # Phase 4: SELECT top-N
        winners = ranked[:max_experiments]

        # Phase 5: DISPATCH to type-aware iteration (parallel per PRODUCT_MAP §10)
        session.status = 'dispatching'
        self._emit_event('auto_evolve.dispatching', {
            'count': len(winners),
            'experiments': [w['id'] for w in winners],
        })

        self._dispatch_winners_parallel(session, winners, user_id)

        # Terminal-state rule: 'running' only if at least one experiment
        # actually entered the agent goal system.
        session.status = 'running' if session.dispatched > 0 else 'failed'
        self._emit_event('auto_evolve.started', session.to_dict())

        logger.info(f"[{session.session_id}] Auto-evolve dispatched "
                    f"{session.dispatched}/{len(winners)} experiments")

    def _gather_candidates(self, session: 'EvolveSession',
                           statuses: List[str]) -> List[Dict]:
        """Gather eligible thought experiments from DB.

        Best-effort: any DB/import failure is logged and yields an empty
        list, which _execute_cycle turns into a clean 'completed' session.
        """
        try:
            from integrations.social.models import db_session
            from integrations.social.thought_experiment_service import (
                ThoughtExperimentService)

            with db_session(commit=False) as db:
                all_experiments = []
                for status in statuses:
                    exps = ThoughtExperimentService.get_active_experiments(
                        db, status=status, limit=50)
                    all_experiments.extend(exps)
                return all_experiments
        except Exception as e:
            logger.warning(f"[{session.session_id}] Gather failed: {e}")
            return []

    def _constitutional_filter(self, session: 'EvolveSession',
                               candidates: List[Dict]) -> List[Dict]:
        """Filter candidates through ConstitutionalFilter.

        When the filter module isn't installed, every candidate passes
        through (same pass-through behavior as before; the import is now
        attempted once instead of once per candidate). Exceptions raised
        by the filter itself still propagate to _execute_cycle, matching
        the original error surface.
        """
        try:
            from security.hive_guardrails import ConstitutionalFilter
        except ImportError:
            # No filter available — pass all candidates through.
            return list(candidates)

        approved = []
        for exp in candidates:
            text = f"{exp.get('title', '')}: {exp.get('hypothesis', '')}"
            check = ConstitutionalFilter.check_prompt(text)
            # check_prompt returns either an (ok, ...) tuple or a dict with
            # an 'approved' key — accept both shapes, defaulting to approve.
            ok = check[0] if isinstance(check, tuple) else check.get('approved', True)
            if ok:
                approved.append(exp)
            else:
                logger.debug(f"[{session.session_id}] Filtered out: {exp.get('id')}")
        return approved

    def _rank_by_votes(self, session: 'EvolveSession',
                       candidates: List[Dict],
                       min_score: float) -> List[Dict]:
        """Tally votes and rank by approval score.

        Two gates apply per PRODUCT_MAP §10:
        1. weighted_score >= min_score (caller-provided absolute floor)
        2. total_for / (total_for + total_against) >= 2/3 super-majority

        Both must hold for an experiment to qualify for dispatch. The
        super-majority gate protects against a small but highly-weighted
        vocal minority flipping a low-participation tally into approval.

        Once the gates pass, the rank is biased by an active-learning
        signal pulled from the world model (HevolveAI) — see
        _active_learning_multiplier docstring. The bias only nudges
        order; it never re-admits a candidate the gates rejected.

        On tally failure the input list is returned unranked (best-effort
        fallback preserved from the original implementation).
        """
        scored = []
        try:
            from integrations.social.models import db_session
            from integrations.social.thought_experiment_service import (
                ThoughtExperimentService)

            with db_session(commit=False) as db:
                for exp in candidates:
                    tally = ThoughtExperimentService.tally_votes(
                        db, exp['id'])
                    score = tally.get('weighted_score', 0)
                    total_for = tally.get('total_for', 0) or 0
                    total_against = tally.get('total_against', 0) or 0
                    decisive = total_for + total_against
                    # Super-majority: ≥ 2/3 of DECISIVE (non-abstain) weight
                    # must be FOR. Abstains are excluded from denominator.
                    super_ratio = (total_for / decisive) if decisive > 0 else 0.0
                    exp['_approval_score'] = score
                    exp['_super_majority'] = round(super_ratio, 4)
                    exp['_tally'] = tally
                    if (score >= min_score
                            and super_ratio >= AUTO_EVOLVE_SUPERMAJORITY_RATIO):
                        scored.append(exp)
                    else:
                        logger.debug(
                            f"[{session.session_id}] Rejected {exp.get('id')}: "
                            f"score={score} super_ratio={super_ratio:.3f} "
                            f"(need score>={min_score} and "
                            f"ratio>={AUTO_EVOLVE_SUPERMAJORITY_RATIO:.3f})"
                        )
        except Exception as e:
            logger.warning(f"[{session.session_id}] Vote tally failed: {e}")
            return candidates  # Fall through unranked

        # Active-learning bias: pull a global epistemic-uncertainty
        # multiplier from the world model and apply it as a small
        # boost to each gated-in candidate's approval score before
        # sort. No-op when the world model isn't reachable.
        al_mult = self._active_learning_multiplier(session)
        for exp in scored:
            base = exp.get('_approval_score', 0) or 0
            exp['_active_learning_multiplier'] = round(al_mult, 4)
            exp['_biased_score'] = round(base * al_mult, 4)

        # Sort by biased score (falls back to unbiased when al_mult=1.0)
        scored.sort(
            key=lambda e: e.get('_biased_score',
                                e.get('_approval_score', 0)),
            reverse=True,
        )
        return scored

    def _active_learning_multiplier(self, session: 'EvolveSession') -> float:
        """Pull the world model's current avg_prediction_error +
        learning_steps via agent_baseline_service._collect_world_model_metrics()
        — the same normalized signals persisted into every baseline
        snapshot — and project them into a [1.0, 1+AL_MAX_BOOST]
        multiplier, gated on model maturity.

        Why colocated with _rank_by_votes (not in baseline_service):
        AL acquisition is a SELECTION concern (which candidates does
        the system want to run NEXT), not a SNAPSHOT concern (what did
        the system look like at this moment). baseline_service owns
        the COLLECTOR (single source for HevolveAI feedback shape);
        this method owns the SELECTION USE of that signal.

        Two-stage trust gate:
          stage 1 — does the AL signal exist? No: return 1.0 (no bias).
          stage 2 — has the world model done enough learning_steps to
              produce a CALIBRATED prediction-error estimate?
              < AL_TRUST_MIN_STEPS: return 1.0 (uncertainty about
                  uncertainty — using the signal would amplify the
                  model's own blind spots into the experiment queue).
              >= AL_TRUST_FULL_STEPS: full AL_MAX_BOOST.
              in between: linear ramp.

        Returns 1.0 in any failure path so a missing / immature world
        model falls back to pure democratic vote — preserves the
        system's pre-wiring behavior.

        HevolveAI field names verified against
        embodied_learner.py::EmbodiedLearner.get_stats — do not
        invent (`epistemic_uncertainty`, `learning_progress`, etc.
        do NOT exist on get_stats; the real keys are
        avg_prediction_error and learning_steps).
        """
        try:
            from integrations.agent_engine.agent_baseline_service import (
                AgentBaselineService)
            wm = AgentBaselineService._collect_world_model_metrics()
        except Exception:
            return 1.0
        if not wm:
            return 1.0

        # Stage 1: AL signal present? Higher prediction error =
        # more to learn from this domain = stronger acquisition
        # value. Already clamped to [0,1] by the collector.
        err = wm.get('al_signal')
        if not isinstance(err, (int, float)):
            return 1.0

        # Stage 2: has the model trained enough that its prediction-
        # error estimate is calibrated?
        steps = wm.get('learning_steps')
        if not isinstance(steps, int) or steps < AUTO_EVOLVE_AL_TRUST_MIN_STEPS:
            logger.debug(
                f"[{session.session_id}] World model still in bootstrap "
                f"(learning_steps={steps} < {AUTO_EVOLVE_AL_TRUST_MIN_STEPS}) "
                f"— skipping AL bias (al_signal={err:.4f} considered uncalibrated)"
            )
            return 1.0

        # Linear ramp from 0 at MIN_STEPS to 1 at FULL_STEPS
        ramp_span = AUTO_EVOLVE_AL_TRUST_FULL_STEPS - AUTO_EVOLVE_AL_TRUST_MIN_STEPS
        trust_factor = (steps - AUTO_EVOLVE_AL_TRUST_MIN_STEPS) / ramp_span
        trust_factor = max(0.0, min(1.0, trust_factor))

        mult = 1.0 + (err * AUTO_EVOLVE_AL_MAX_BOOST * trust_factor)
        logger.debug(
            f"[{session.session_id}] AL bias active: "
            f"al_signal={err:.4f}, learning_steps={steps}, "
            f"trust_factor={trust_factor:.4f} → mult={mult:.4f}"
        )
        return mult

    def _dispatch_winners_parallel(self, session: 'EvolveSession',
                                   winners: List[Dict], user_id: str) -> None:
        """Fan-out winning experiments to type-aware iteration in parallel.

        PRODUCT_MAP §10 DISPATCH stage. Uses a bounded ThreadPoolExecutor
        (cap = AUTO_EVOLVE_MAX_PARALLEL_DISPATCH) so a large approved set
        can't stampede the agent goal system. Each experiment still goes
        through _dispatch_experiment (unchanged contract) — this method
        only adds the concurrency primitive.

        Mutations on `session` (dispatched/failed/experiments/errors) are
        serialized by acquiring self._lock around each bookkeeping update
        (see _record_dispatch_outcome).

        FIX-1.4a: On SQLite (flat tier), ``db.commit()`` serializes the
        whole database file, and multiple concurrent commits produce
        ``database is locked`` SQLAlchemy errors. Detect the engine and
        drop ``max_workers`` to 1 when SQLite is the backend. MySQL
        (regional/central) keeps the bounded parallel dispatch.

        FIX-1.4b: ``_dispatch_experiment`` returns ``{'success': False,
        'reason': ...}`` on logical failure (e.g. already-dispatched
        experiment, missing goal recipe) WITHOUT raising. Earlier code
        counted those as successful dispatches, so a session with 100%
        logical-failure returns would sit at status='running' forever.
        Branch on the ``success`` flag so the terminal-state rule in
        _execute_cycle (``running if dispatched > 0 else failed``) fires
        correctly.
        """
        if not winners:
            return

        max_parallel = AUTO_EVOLVE_MAX_PARALLEL_DISPATCH
        if _is_sqlite_backend():
            max_parallel = 1  # FIX-1.4a: serialize on flat/SQLite
            logger.debug(
                f"[{session.session_id}] SQLite backend detected — "
                f"serializing dispatch to avoid 'database is locked'")

        max_workers = min(len(winners), max_parallel)
        with ThreadPoolExecutor(
                max_workers=max_workers,
                thread_name_prefix=f'auto-evolve-{session.session_id}') as pool:
            futures = {
                pool.submit(self._dispatch_experiment, session, exp, user_id): exp
                for exp in winners
            }
            for future in as_completed(futures):
                exp = futures[future]
                try:
                    goal_result = future.result()
                except Exception as e:
                    # Hard failure: the dispatch call itself raised.
                    with self._lock:
                        session.failed += 1
                        session.errors.append(f"Dispatch {exp['id']}: {e}")
                    logger.warning(
                        f"[{session.session_id}] Failed to dispatch "
                        f"{exp['id']}: {e}")
                    continue
                self._record_dispatch_outcome(session, exp, goal_result)

    def _record_dispatch_outcome(self, session: 'EvolveSession',
                                 exp: Dict, goal_result) -> None:
        """Book-keep one _dispatch_experiment return value on the session.

        FIX-1.4b: only a dict with a truthy 'success' key counts as
        dispatched; anything else is routed to the failed counter and
        errors list.
        """
        dispatched_ok = (
            isinstance(goal_result, dict)
            and bool(goal_result.get('success'))
        )
        if dispatched_ok:
            with self._lock:
                session.dispatched += 1
                session.experiments.append(self._experiment_record(
                    exp, goal_result.get('goal_id'), 'dispatched'))
            return

        reason = (
            goal_result.get('reason')
            if isinstance(goal_result, dict)
            else 'non-dict result'
        ) or 'unknown'
        with self._lock:
            session.failed += 1
            session.errors.append(f"Dispatch {exp['id']}: {reason}")
            session.experiments.append(self._experiment_record(
                exp, None, 'failed', reason=reason))
        logger.info(
            f"[{session.session_id}] Dispatch declined "
            f"for {exp['id']}: {reason}")

    @staticmethod
    def _experiment_record(exp: Dict, goal_id, status: str,
                           reason: Optional[str] = None) -> Dict:
        """Build the per-experiment session record (one shape, two callers)."""
        record = {
            'id': exp['id'],
            'title': exp.get('title', ''),
            'type': exp.get('experiment_type', 'traditional'),
            'approval_score': exp.get('_approval_score', 0),
            'super_majority': exp.get('_super_majority', 0),
            'goal_id': goal_id,
            'status': status,
        }
        # 'reason' key is only present on failure records (matches the
        # original payload shape consumed by status endpoints).
        if reason is not None:
            record['reason'] = reason
        return record

    def _dispatch_experiment(self, session: 'EvolveSession',
                             exp: Dict, user_id: str) -> Dict:
        """Dispatch a winning experiment to its type-aware iteration loop.

        Uses ThoughtExperimentService.request_agent_evaluation() which
        creates an agent goal with the type-aware iteration recipe.
        Commits the transaction only when the service reports success.

        NOTE(review): a previous docstring claimed a special dispatch
        path for experiment_type='code_evolution' + target_repo='hevolveai'
        (routing to the HevolveAI evolution orchestrator). No such branch
        exists in this method — confirm where that routing actually lives
        before relying on it.
        """
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import (
            ThoughtExperimentService)

        with db_session(commit=False) as db:
            result = ThoughtExperimentService.request_agent_evaluation(
                db, exp['id'])
            if result.get('success'):
                db.commit()
            return result

    def _emit_event(self, topic: str, data: Dict):
        """Emit progress event via EventBus (best-effort; never raises)."""
        try:
            from core.platform.events import emit_event
            emit_event(topic, data)
        except Exception:
            pass

584 

585 

# ── Singleton ────────────────────────────────────────────────

_orchestrator: Optional[AutoEvolveOrchestrator] = None
_lock = threading.Lock()


def get_auto_evolve_orchestrator() -> AutoEvolveOrchestrator:
    """Return the process-wide AutoEvolveOrchestrator, creating it lazily.

    Double-checked locking: the unlocked fast-path read is safe under
    the GIL; the locked re-check prevents two threads racing the first
    call from constructing two instances.
    """
    global _orchestrator
    if _orchestrator is not None:
        return _orchestrator
    with _lock:
        if _orchestrator is None:
            _orchestrator = AutoEvolveOrchestrator()
        return _orchestrator

600 

601 

# ── Owner Pause/Resume ───────────────────────────────────────

# Paused experiment IDs — maps experiment_id → the user_id that paused it.
_paused_experiments: Dict[str, str] = {}
_pause_lock = threading.Lock()


def pause_experiment_evolution(experiment_id: str, user_id: str) -> Dict:
    """Pause a running experiment's iteration (owner only).

    The experiment stays in 'evaluating' status but the agent goal
    is signalled to stop iterating.
    """
    # Ownership gate: only the creator may pause. Any lookup failure
    # (including an unavailable DB layer) is reported back as the reason.
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService
        with db_session(commit=False) as db:
            record = ThoughtExperimentService.get_experiment_detail(
                db, experiment_id)
            if not record:
                return {'success': False, 'reason': 'not_found'}
            if record.get('creator_id') != user_id:
                return {'success': False, 'reason': 'not_owner',
                        'message': 'Only the experiment creator can pause it'}
    except Exception as e:
        return {'success': False, 'reason': str(e)}

    with _pause_lock:
        _paused_experiments[experiment_id] = user_id

    logger.info(f"Experiment {experiment_id} paused by {user_id}")
    return {'success': True, 'experiment_id': experiment_id, 'status': 'paused'}


def resume_experiment_evolution(experiment_id: str, user_id: str) -> Dict:
    """Resume a paused experiment's iteration (owner only)."""
    with _pause_lock:
        pauser = _paused_experiments.get(experiment_id)
        if not pauser:
            return {'success': False, 'reason': 'not_paused'}
        if pauser != user_id:
            return {'success': False, 'reason': 'not_owner',
                    'message': 'Only the user who paused can resume'}
        _paused_experiments.pop(experiment_id)

    logger.info(f"Experiment {experiment_id} resumed by {user_id}")
    return {'success': True, 'experiment_id': experiment_id, 'status': 'resumed'}


def is_experiment_paused(experiment_id: str) -> bool:
    """Check whether an experiment's evolution is currently paused."""
    with _pause_lock:
        return experiment_id in _paused_experiments


def get_paused_experiments() -> List[str]:
    """Return a snapshot list of paused experiment IDs."""
    with _pause_lock:
        return list(_paused_experiments)

662 

663 

664# ── Agent Tool Functions ───────────────────────────────────── 

665 

def start_auto_evolve(max_experiments: int = 5,
                      min_approval_score: float = 0.3,
                      user_id: str = 'system') -> str:
    """Kick off one democratic auto-evolve cycle and return session info.

    The orchestrator runs the pipeline asynchronously:
    1. Gathers eligible thought experiments (voting/evaluating phase)
    2. Filters through ConstitutionalFilter
    3. Tallies democratic votes (human + agent, weighted)
    4. Selects top-N by approval score
    5. Dispatches each to its type-aware iteration loop

    Software experiments → autoresearch (edit→run→metric→iterate)
    Traditional experiments → reason_and_refine (hypothesize→score→refine)
    Physical AI experiments → observe_and_measure

    Args:
        max_experiments: Max experiments to dispatch (default 5)
        min_approval_score: Minimum weighted vote score to qualify (default 0.3)
        user_id: Who triggered the cycle

    Returns:
        JSON with session_id and status
    """
    orchestrator = get_auto_evolve_orchestrator()
    outcome = orchestrator.start(
        max_experiments=int(max_experiments),
        min_approval_score=float(min_approval_score),
        user_id=user_id,
    )
    return json.dumps(outcome)

698 

699 

def get_auto_evolve_status() -> str:
    """Return JSON progress for the current auto-evolve cycle.

    Includes candidates gathered, filtered, selected, dispatched, and
    per-experiment status (or an idle marker when no cycle is active).
    """
    return json.dumps(get_auto_evolve_orchestrator().get_status())

708 

709 

def pause_evolve_experiment(experiment_id: str, user_id: str) -> str:
    """Pause a running thought experiment's evolution loop.

    Only the experiment creator (owner) can pause their experiment.
    The experiment stays in 'evaluating' status but iteration stops.

    Args:
        experiment_id: The ThoughtExperiment ID to pause
        user_id: ID of the user requesting pause (must be creator)

    Returns:
        JSON with success status
    """
    return json.dumps(pause_experiment_evolution(experiment_id, user_id))

725 

726 

def resume_evolve_experiment(experiment_id: str, user_id: str) -> str:
    """Resume a paused thought experiment's evolution loop.

    Only the user who paused it can resume.

    Args:
        experiment_id: The ThoughtExperiment ID to resume
        user_id: ID of the user requesting resume (must be pauser)

    Returns:
        JSON with success status
    """
    return json.dumps(resume_experiment_evolution(experiment_id, user_id))

741 

742 

# Tool registration for ServiceToolRegistry.
# Each entry: 'name' exposed to agents, 'func' callable defined above,
# 'description' shown to the agent, and 'tags' used for tool discovery.
AUTO_EVOLVE_TOOLS = [
    {
        'name': 'start_auto_evolve',
        'func': start_auto_evolve,
        'description': (
            'Start democratic auto-evolve cycle: gather thought experiments, '
            'constitutional filter, vote tally, dispatch winners to '
            'autonomous iteration loops.'
        ),
        'tags': ['auto_evolve', 'thought_experiment'],
    },
    {
        'name': 'get_auto_evolve_status',
        'func': get_auto_evolve_status,
        'description': 'Get progress of the current auto-evolve cycle.',
        'tags': ['auto_evolve'],
    },
    {
        'name': 'pause_evolve_experiment',
        'func': pause_evolve_experiment,
        'description': (
            'Pause a running thought experiment evolution (owner only). '
            'Stops iteration but keeps evaluating status.'
        ),
        'tags': ['auto_evolve', 'thought_experiment'],
    },
    {
        'name': 'resume_evolve_experiment',
        'func': resume_evolve_experiment,
        'description': 'Resume a paused thought experiment evolution (owner only).',
        'tags': ['auto_evolve', 'thought_experiment'],
    },
]