Coverage for integrations/coding_agent/autoevolve_code_tools.py: 81.8%

379 statements  

coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Auto-Evolve Code Tools — Agent-native code experiment tools. 

3 

4Individual tools for autonomous code experiments: setup → edit → run → score → 

5keep/revert → finalize. The agent's conversation loop (autogen group chat) 

6drives iteration — no hardcoded Python while loop. 

7 

8Inspired by karpathy/autoresearch. Each tool is a single step: 

9 1. autoresearch_setup — create session, run baseline, return session_id 

10 2. autoresearch_edit — LLM proposes + applies one code edit 

11 3. autoresearch_run — run experiment, extract metric, record benchmark 

12 4. autoresearch_decide — keep (git commit) or revert (git checkout) 

13 5. autoresearch_finalize — save report, export learning delta 

14 6. get_autoresearch_status — poll session progress 

15 

16Uses existing infra only: 

17 - AiderNativeBackend for code edits 

18 - run_cmd_subprocess for experiment execution 

19 - BenchmarkTracker for score tracking 

20 - CodingRecipeBridge for saving winning edits as recipes 

21 - AgentBaselineService for evolution snapshots 

22 - EventBus for live progress events 

23""" 

24import json 

25import logging 

26import os 

27import threading 

28import time 

29import uuid 

30from dataclasses import dataclass, field, asdict 

31from typing import Dict, List, Optional, Tuple 

32 

33logger = logging.getLogger('hevolve.autoresearch') 


# ── Result Types ─────────────────────────────────────────────

@dataclass
class ExperimentResult:
    """Result of a single experiment iteration."""
    iteration: int
    hypothesis: str
    metric_name: str
    metric_value: Optional[float]
    baseline_value: Optional[float]
    improved: bool
    files_changed: List[str] = field(default_factory=list)
    edits: List[Dict] = field(default_factory=list)
    run_output: str = ''
    error: str = ''
    duration_s: float = 0.0

    @property
    def delta(self) -> Optional[float]:
        if self.metric_value is not None and self.baseline_value is not None:
            return self.metric_value - self.baseline_value
        return None
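
# Hedged illustration (hand-built instance; the engine normally constructs
# these): delta is metric_value - baseline_value, or None when either side
# is missing.
#
#   r = ExperimentResult(iteration=1, hypothesis='fuse optimizer step',
#                        metric_name='loss', metric_value=3.21,
#                        baseline_value=3.28, improved=True)
#   r.delta  # ≈ -0.07; a negative delta is a gain when lower_is_better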


@dataclass
class AutoResearchSession:
    """Tracks the full autoresearch session state."""
    session_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    experiment_id: str = ''  # ThoughtExperiment ID (if triggered by one)
    goal_id: str = ''  # AgentGoal ID
    repo_path: str = ''  # Working directory
    target_file: str = ''  # The file being modified (like train.py)
    run_command: str = ''  # Command to run the experiment
    metric_name: str = 'score'  # Name of the metric to optimize
    metric_pattern: str = ''  # Regex to extract metric from output
    metric_direction: str = 'higher_is_better'  # or 'lower_is_better'
    max_iterations: int = 50
    time_budget_s: int = 300  # Per-iteration time budget (5 min default)
    spark_budget: int = 200  # Total Spark budget
    spark_consumed: int = 0
    spark_per_iteration: int = 4  # Spark cost per iteration

    # State
    baseline_metric: Optional[float] = None
    best_metric: Optional[float] = None
    best_iteration: int = 0
    current_iteration: int = 0
    status: str = 'pending'  # pending | running | completed | failed | budget_exhausted
    results: List[Dict] = field(default_factory=list)
    start_time: float = 0.0
    total_improvements: int = 0

    # Regression-escape-hatch flags — set LOUDLY when an enforcement layer
    # is unavailable, so downstream consumers (dashboards, tests, audits)
    # can surface "this session didn't enforce baseline / benchmark gain".
    # They stay True while the dependency is present and working, and flip
    # to False when it is missing or fails.
    baseline_enforced: bool = True  # AgentBaselineService captured snapshot
    benchmark_gain_enforced: bool = True  # BenchmarkTracker recorded iteration
    federation_export_enforced: bool = True  # Learning delta exported

    # RSI gates — every promoted improvement must pass both. If a gate
    # layer is unavailable, the flag flips to False and a WARNING is
    # emitted; if a gate layer is present and rejects, counters increment
    # and last_rejection_reason records why. Dashboards + tests read
    # these via to_progress_dict() to verify the recursive self-improvement
    # loop is actually closed.
    constitutional_enforced: bool = True  # ConstitutionalFilter ran
    baseline_delta_enforced: bool = True  # validate_against_baseline ran
    federation_broadcast_enforced: bool = True  # broadcast_delta ran
    constitutional_rejections: int = 0
    baseline_rejections: int = 0
    last_rejection_reason: str = ''

    # Last edit state (for decide step)
    _pending_edits: List[Dict] = field(default_factory=list)
    _pending_files: List[str] = field(default_factory=list)
    _pending_hypothesis: str = ''

    def is_budget_exhausted(self) -> bool:
        return self.spark_consumed + self.spark_per_iteration > self.spark_budget

    def is_improved(self, new_val: float) -> bool:
        if self.best_metric is None:
            return True
        if self.metric_direction == 'lower_is_better':
            return new_val < self.best_metric
        return new_val > self.best_metric
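
    # Hedged arithmetic check (values are the field defaults above): with
    # spark_budget=200 and spark_per_iteration=4 the budget supports
    # 200 / 4 = 50 iterations, matching max_iterations, and
    # is_budget_exhausted() first returns True once spark_consumed exceeds 196.
    #
    #   s = AutoResearchSession(metric_direction='lower_is_better',
    #                           best_metric=3.28)
    #   s.is_improved(3.21)   # True  (lower is better)
    #   s.is_improved(3.30)   # False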


    def to_progress_dict(self) -> Dict:
        return {
            'session_id': self.session_id,
            'status': self.status,
            'iteration': self.current_iteration,
            'max_iterations': self.max_iterations,
            'baseline_metric': self.baseline_metric,
            'best_metric': self.best_metric,
            'best_iteration': self.best_iteration,
            'total_improvements': self.total_improvements,
            'spark_consumed': self.spark_consumed,
            'spark_budget': self.spark_budget,
            'elapsed_s': time.time() - self.start_time if self.start_time else 0,
            # Enforcement flags — if False, dependency was missing/failed and
            # the session ran without that regression-safety layer.
            'baseline_enforced': self.baseline_enforced,
            'benchmark_gain_enforced': self.benchmark_gain_enforced,
            'federation_export_enforced': self.federation_export_enforced,
            # RSI gates — recursive self-improvement loop closure.
            'constitutional_enforced': self.constitutional_enforced,
            'baseline_delta_enforced': self.baseline_delta_enforced,
            'federation_broadcast_enforced': self.federation_broadcast_enforced,
            'constitutional_rejections': self.constitutional_rejections,
            'baseline_rejections': self.baseline_rejections,
            'last_rejection_reason': self.last_rejection_reason,
        }


# ── Engine (session store + utilities) ────────────────────────

class AutoResearchEngine:
    """Session store and utility methods for autoresearch tools.

    NOT a loop — the agent's conversation drives iteration by calling
    individual tool functions in sequence.
    """

    def __init__(self):
        self._active_sessions: Dict[str, AutoResearchSession] = {}
        self._lock = threading.Lock()

    def register_session(self, session: AutoResearchSession):
        with self._lock:
            self._active_sessions[session.session_id] = session

    def unregister_session(self, session_id: str):
        with self._lock:
            self._active_sessions.pop(session_id, None)

    def get_active_sessions(self) -> List[Dict]:
        """Return progress for all active sessions."""
        with self._lock:
            return [s.to_progress_dict() for s in self._active_sessions.values()]

    def get_session(self, session_id: str) -> Optional[AutoResearchSession]:
        with self._lock:
            return self._active_sessions.get(session_id)

    # ── Edit Generation ─────────────────────────────────────

    def generate_and_apply_edit(self, session: AutoResearchSession
                                ) -> Optional[Tuple[str, List[Dict], List[str]]]:
        """Use LLM to generate a hypothesis and code edit, then apply it."""
        try:
            from integrations.coding_agent.aider_native_backend import AiderNativeBackend
            backend = AiderNativeBackend()

            history_summary = self.build_history_summary(session)

            # Query BenchmarkTracker for best-performing tool insights
            benchmark_hint = ''
            try:
                from integrations.coding_agent.benchmark_tracker import get_benchmark_tracker
                tracker = get_benchmark_tracker()
                best = tracker.get_best_tool('autoresearch')
                if best:
                    name, success_rate, avg_time = best
                    benchmark_hint = (
                        f"\nBENCHMARK INSIGHT: Best tool '{name}' has "
                        f"{success_rate:.0%} success rate, avg {avg_time:.1f}s.\n"
                    )
            except Exception:
                pass

            # RSI-5: ε-greedy exploration arm. When HEVOLVE_RSI_EXPLORE=1
            # and the sampled coin lands in the explore bucket, swap the
            # incremental-tuning prompt stance for a radical-mutation
            # stance. The LLM remains the code mutator (no parallel
            # code-generator path), but the instruction distribution
            # shifts — this is the cheapest honest wiring of the
            # stochastic arm without inventing a second mutation backend.
            # Safety: the candidate still passes RSI-1 + RSI-2 gates
            # inside commit_improvement before promotion.
            exploration_hint = ''
            try:
                from integrations.agent_engine.exploration_arm import (
                    select_strategy,
                )
                if select_strategy() == 'explore':
                    exploration_hint = (
                        "\nEXPLORATION MODE: propose a RADICAL / "
                        "ARCHITECTURAL change this iteration — not an "
                        "incremental tweak. Favor ideas that reshape "
                        "the approach; safety gates still run before "
                        "promotion, so a failed bold change costs "
                        "nothing while a successful one opens the "
                        "search space.\n"
                    )
            except Exception:
                pass

            task = (
                f"You are running an autonomous research loop.\n\n"
                f"TARGET FILE: {session.target_file}\n"
                f"METRIC: {session.metric_name} "
                f"({'lower is better' if session.metric_direction == 'lower_is_better' else 'higher is better'})\n"
                f"BASELINE: {session.baseline_metric}\n"
                f"CURRENT BEST: {session.best_metric} (iteration {session.best_iteration})\n"
                f"ITERATION: {session.current_iteration}/{session.max_iterations}\n\n"
                f"EXPERIMENT HISTORY:\n{history_summary}\n\n"
                f"{benchmark_hint}"
                f"{exploration_hint}"
                f"RUN COMMAND: {session.run_command}\n\n"
                f"YOUR TASK:\n"
                f"1. Analyze what worked and what didn't from the history above\n"
                f"2. Propose ONE focused modification to {session.target_file}\n"
                f"3. Explain your hypothesis in one sentence\n"
                f"4. Make the edit using SEARCH/REPLACE blocks\n\n"
                f"RULES:\n"
                f"- One change per iteration — small, testable, reversible\n"
                f"- If you're stuck, try combinations of previous improvements\n"
                f"- If all ideas seem tried, try something radical or architectural\n"
                f"- Simplicity wins — a 0.001 gain from deleting code beats a 0.001 gain from 20 lines\n"
                f"- NEVER modify the evaluation metric or test harness\n"
            )

            context = {
                'working_dir': session.repo_path,
                'files': [session.target_file],
            }

            result = backend.execute(task, context, timeout=120)

            if not result.get('success'):
                return None

            output = result.get('output', '')
            hypothesis = output.split('\n')[0][:200] if output else 'Unknown hypothesis'
            edits = result.get('edits', [])
            files_changed = result.get('files_changed', [])

            return hypothesis, edits, files_changed

        except Exception as e:
            logger.warning(f"[{session.session_id}] Edit generation failed: {e}")
            return None
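
    # Hedged note on the return shape (derived from the code above): on
    # success, generate_and_apply_edit returns a 3-tuple such as
    #
    #   ('Increase LR warmup to stabilize early loss',  # hypothesis
    #    [{'file': 'train.py', ...}],                   # edit dicts (backend-defined)
    #    ['train.py'])                                  # files_changed
    #
    # The inner dict keys are whatever AiderNativeBackend emits; the dict
    # shown here is an assumption for illustration.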


    # ── Experiment Execution ─────────────────────────────────

    def run_experiment(self, session: AutoResearchSession,
                       is_baseline: bool = False) -> ExperimentResult:
        """Run the experiment command and extract the metric."""
        import re

        result = ExperimentResult(
            iteration=0 if is_baseline else session.current_iteration,
            hypothesis='baseline' if is_baseline else '',
            metric_name=session.metric_name,
            metric_value=None,
            baseline_value=session.baseline_metric,
            improved=False,
        )

        start = time.time()
        try:
            from integrations.coding_agent.aider_core.run_cmd import run_cmd_subprocess
            exit_code, output = run_cmd_subprocess(
                session.run_command,
                cwd=session.repo_path,
                timeout=session.time_budget_s,
            )
            result.run_output = output[-5000:] if len(output) > 5000 else output
            result.duration_s = time.time() - start

            if exit_code != 0:
                lines = output.split('\n')
                tb_start = None
                for i, line in enumerate(lines):
                    if 'Traceback' in line or 'Error' in line:
                        tb_start = i
                        break
                if tb_start is not None:
                    result.error = '\n'.join(lines[tb_start:tb_start + 20])
                else:
                    result.error = f'Exit code {exit_code}: {lines[-3:]}'
                return result

            metric_val = self.extract_metric(output, session)
            result.metric_value = metric_val

            if metric_val is not None and is_baseline:
                result.improved = False

            self.record_benchmark(session, result)

        except Exception as e:
            result.error = str(e)
            result.duration_s = time.time() - start

        return result

    def record_benchmark(self, session: AutoResearchSession,
                         result: ExperimentResult):
        """Record experiment result in BenchmarkTracker for evolution tracking.

        BenchmarkTracker is how we prove that iter N actually beat iter N-1 on
        the hive-shared leaderboard. If it's unavailable or raises, the
        session's `benchmark_gain_enforced` flag is flipped to False and a
        WARNING is emitted — callers (dashboards, tests) can then surface
        "this session ran WITHOUT benchmark gain enforcement" rather than
        silently treating 0 gain as a valid result.
        """
        try:
            from integrations.coding_agent.benchmark_tracker import get_benchmark_tracker
            tracker = get_benchmark_tracker()
            tracker.record(
                task_type='autoresearch',
                tool_name='aider_native_backend',
                completion_time_s=result.duration_s,
                success=not result.error and result.metric_value is not None,
                model_name=session.metric_name,
                user_id=session.goal_id or session.session_id,
            )
        except ImportError as e:
            session.benchmark_gain_enforced = False
            logger.warning(
                "[%s] BenchmarkTracker unavailable (ImportError: %s) — "
                "iter %d ran WITHOUT benchmark-gain enforcement. "
                "Set session.benchmark_gain_enforced=False.",
                session.session_id, e, result.iteration,
            )
        except Exception as e:
            session.benchmark_gain_enforced = False
            logger.warning(
                "[%s] BenchmarkTracker record failed (%s: %s) — "
                "iter %d ran WITHOUT benchmark-gain enforcement.",
                session.session_id, type(e).__name__, e, result.iteration,
            )

    def extract_metric(self, output: str, session: AutoResearchSession
                       ) -> Optional[float]:
        """Extract the target metric from experiment output."""
        import re

        if session.metric_pattern:
            match = re.search(session.metric_pattern, output)
            if match:
                try:
                    return float(match.group(1))
                except (ValueError, IndexError):
                    pass

        patterns = [
            rf'{re.escape(session.metric_name)}[:\s=]+([0-9]+\.?[0-9]*)',
            rf'^{re.escape(session.metric_name)}[:\s]+([0-9]+\.?[0-9]*)',
            r'(\d+) passed',
            r'(?:score|result|metric|accuracy|loss|bpb)[:\s=]+([0-9]+\.?[0-9]*)',
        ]

        for pat in patterns:
            match = re.search(pat, output, re.IGNORECASE | re.MULTILINE)
            if match:
                try:
                    return float(match.group(1))
                except (ValueError, IndexError):
                    continue

        return None
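
    # Hedged examples of output lines the fallback patterns above would
    # match (illustrative snippets, not from a real run):
    #
    #   "val_loss: 3.2803"     -> 3.2803  via the metric_name pattern
    #                                     (when metric_name == 'val_loss')
    #   "12 passed, 1 warning" -> 12.0    via r'(\d+) passed'
    #   "accuracy = 0.913"     -> 0.913   via the generic score/loss pattern
    #
    # The first pattern that matches wins, so a custom metric_pattern with a
    # capturing group(1) is the safest way to pin the exact metric.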


    # ── Git State Management ─────────────────────────────────

    def revert_changes(self, session: AutoResearchSession):
        """Revert the working directory to the last good state."""
        try:
            from integrations.coding_agent.aider_core.run_cmd import run_cmd_subprocess
            run_cmd_subprocess(
                f'git checkout -- {session.target_file}',
                cwd=session.repo_path,
                timeout=10,
            )
        except Exception as e:
            logger.warning(f"[{session.session_id}] Revert failed: {e}")

    # ── RSI gates ─────────────────────────────────────────────
    # These two gates close the recursive self-improvement loop: no
    # candidate becomes the new baseline unless ConstitutionalFilter
    # allows it AND AgentBaselineService.validate_against_baseline
    # reports no regression. Both gates fail-open when the dependency
    # is missing, but flip the corresponding *_enforced flag to False
    # and log LOUDLY so the dashboard/test can see the bypass.

    def _constitutional_gate(self, session: 'AutoResearchSession',
                             result: 'ExperimentResult') -> Tuple[bool, str]:
        """Check the hypothesis + edit summary against ConstitutionalFilter.

        Returns (allowed, reason). Fail-open on ImportError or an
        unexpected gate error, flipping session.constitutional_enforced
        to False so the dashboard can surface the missing gate;
        fail-CLOSED on guardrail tamper (RuntimeError).
        """
        try:
            from security.hive_guardrails import ConstitutionalFilter
        except ImportError as e:
            session.constitutional_enforced = False
            logger.warning(
                "[%s] ConstitutionalFilter unavailable (ImportError: %s) — "
                "iter %d promoted WITHOUT constitutional gate. "
                "Set session.constitutional_enforced=False.",
                session.session_id, e, result.iteration,
            )
            return True, 'gate_unavailable'

        prompt_text = ' '.join(filter(None, [
            result.hypothesis or '',
            session.metric_name or '',
            ' '.join(result.files_changed or []),
        ]))
        try:
            allowed, reason = ConstitutionalFilter.check_prompt(prompt_text)
        except RuntimeError as e:
            # Guardrail tamper — fail-CLOSED. This is the one case where
            # the gate is authoritative: if the guardrail values were
            # mutated in memory we must NOT promote.
            session.constitutional_enforced = True
            logger.critical(
                "[%s] ConstitutionalFilter TAMPER on iter %d: %s — "
                "refusing to promote.",
                session.session_id, result.iteration, e,
            )
            return False, f'guardrail_tamper: {e}'
        except Exception as e:
            session.constitutional_enforced = False
            logger.warning(
                "[%s] ConstitutionalFilter.check_prompt failed "
                "(%s: %s) — iter %d promoted WITHOUT gate.",
                session.session_id, type(e).__name__, e, result.iteration,
            )
            return True, 'gate_errored'

        return allowed, reason

    def _baseline_delta_gate(self, session: 'AutoResearchSession'
                             ) -> Tuple[bool, List[str], str]:
        """Run AgentBaselineService.validate_against_baseline.

        Returns (passed, regressions, reason). Fail-open if the service is
        unavailable or no baseline exists yet (first-run case), but
        flip session.baseline_delta_enforced=False on the unavailable path.
        """
        try:
            from integrations.agent_engine.agent_baseline_service import (
                AgentBaselineService,
            )
        except ImportError as e:
            session.baseline_delta_enforced = False
            logger.warning(
                "[%s] AgentBaselineService unavailable for delta gate "
                "(ImportError: %s) — promoted WITHOUT baseline compare.",
                session.session_id, e,
            )
            return True, [], 'gate_unavailable'

        try:
            prompt_id = session.experiment_id or session.session_id
            result = AgentBaselineService.validate_against_baseline(
                prompt_id=prompt_id, flow_id=0,
            )
        except Exception as e:
            session.baseline_delta_enforced = False
            logger.warning(
                "[%s] validate_against_baseline errored (%s: %s) — "
                "promoted WITHOUT baseline compare.",
                session.session_id, type(e).__name__, e,
            )
            return True, [], 'gate_errored'

        passed = bool(result.get('passed', True))
        regressions = list(result.get('regressions', []) or [])
        reason = result.get('reason', '') or (
            'no_regressions' if passed else 'regressions_detected'
        )
        return passed, regressions, reason
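
    # Gate outcome summary (restating the behavior implemented above):
    #
    #   dependency ImportError -> allow, *_enforced flag flips False, WARNING
    #   gate call errors       -> allow, *_enforced flag flips False, WARNING
    #   guardrail tamper       -> REJECT (fail-closed), CRITICAL log
    #   gate rejects candidate -> REJECT in commit_improvement, rejection
    #                             counter increments, last_rejection_reason set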

    def commit_improvement(self, session: AutoResearchSession,
                           result: ExperimentResult) -> bool:
        """Commit the improvement to git and save as recipe step.

        Returns True if the candidate was actually promoted (gates passed +
        git commit attempted), False if a gate rejected it (pending edits
        reverted, session rejection counters incremented).

        RSI gate chain (both must pass to promote):
          1. ConstitutionalFilter — hypothesis/edit summary free of
             violation patterns.
          2. AgentBaselineService.validate_against_baseline — no
             cross-metric regression vs the latest live snapshot.

        Fail-open on missing dependencies (flags flip loud), fail-closed
        on guardrail tamper.
        """
        # ── RSI-1: constitutional gate ──
        allowed, cons_reason = self._constitutional_gate(session, result)
        if not allowed:
            session.constitutional_rejections += 1
            session.last_rejection_reason = f'constitutional: {cons_reason}'
            logger.warning(
                "[%s] Iter %d REJECTED by ConstitutionalFilter: %s — "
                "reverting pending edits.",
                session.session_id, result.iteration, cons_reason,
            )
            self.revert_changes(session)
            self.emit_progress(session, 'autoresearch.rejected', {
                'iteration': result.iteration,
                'gate': 'constitutional',
                'reason': cons_reason,
            })
            return False

        # ── RSI-2: baseline delta gate ──
        passed, regressions, base_reason = self._baseline_delta_gate(session)
        if not passed:
            session.baseline_rejections += 1
            session.last_rejection_reason = (
                f'baseline_regression: {"; ".join(regressions) or base_reason}'
            )
            logger.warning(
                "[%s] Iter %d REJECTED by baseline delta: %s — "
                "reverting pending edits.",
                session.session_id, result.iteration,
                regressions or base_reason,
            )
            self.revert_changes(session)
            self.emit_progress(session, 'autoresearch.rejected', {
                'iteration': result.iteration,
                'gate': 'baseline_delta',
                'regressions': regressions,
                'reason': base_reason,
            })
            return False

        # ── Gates passed — proceed with existing commit + recipe + snapshot ──
        try:
            from integrations.coding_agent.aider_core.run_cmd import run_cmd_subprocess
            msg = (f"autoresearch iter {result.iteration}: "
                   f"{session.metric_name}={result.metric_value} "
                   f"(was {result.baseline_value})")
            run_cmd_subprocess(
                f'git add {session.target_file} && git commit -m "{msg}"',
                cwd=session.repo_path,
                timeout=15,
            )
        except Exception as e:
            logger.debug(f"[{session.session_id}] Git commit skipped: {e}")

        try:
            from integrations.coding_agent.recipe_bridge import CodingRecipeBridge
            bridge = CodingRecipeBridge()
            bridge.capture_edit_as_recipe_step(
                task=f'autoresearch: {session.metric_name} optimization',
                tool_name='autoresearch',
                file_edits=result.edits,
                working_dir=session.repo_path,
            )
        except ImportError as e:
            logger.warning(
                "[%s] CodingRecipeBridge unavailable (ImportError: %s) — "
                "iter %d improvement NOT captured as recipe step.",
                session.session_id, e, result.iteration,
            )
        except Exception as e:
            logger.warning(
                "[%s] CodingRecipeBridge.capture failed (%s: %s) — "
                "iter %d improvement NOT captured as recipe step.",
                session.session_id, type(e).__name__, e, result.iteration,
            )

        # AgentBaselineService snapshot is the regression escape-hatch the
        # audit flagged: if we silently skip it, a later benchmark-based
        # rollback has no anchor to roll back TO. Make the absence LOUD
        # and set the session flag so dashboards/tests can read it.
        try:
            from integrations.agent_engine.agent_baseline_service import AgentBaselineService
            AgentBaselineService.capture_snapshot(
                prompt_id=session.experiment_id or session.session_id,
                flow_id='autoresearch',
                trigger='autoresearch_improvement',
                user_id=session.goal_id or 'system',
            )
        except ImportError as e:
            session.baseline_enforced = False
            logger.warning(
                "[%s] AgentBaselineService unavailable (ImportError: %s) — "
                "iter %d kept WITHOUT baseline snapshot. "
                "Set session.baseline_enforced=False — regression rollback "
                "will have no anchor for this iteration.",
                session.session_id, e, result.iteration,
            )
        except Exception as e:
            session.baseline_enforced = False
            logger.warning(
                "[%s] AgentBaselineService.capture_snapshot failed "
                "(%s: %s) — iter %d kept WITHOUT baseline snapshot.",
                session.session_id, type(e).__name__, e, result.iteration,
            )

        self.emit_progress(session, 'autoresearch.promoted', {
            'iteration': result.iteration,
            'metric_value': result.metric_value,
            'baseline_value': result.baseline_value,
        })
        return True

    # ── History & Reporting ──────────────────────────────────

    def build_history_summary(self, session: AutoResearchSession) -> str:
        """Build a compact summary of previous iterations for the LLM."""
        if not session.results:
            return 'No previous iterations.'

        lines = []
        for r in session.results[-10:]:
            status = 'IMPROVED' if r.get('improved') else 'reverted'
            val = r.get('metric_value', '?')
            hyp = r.get('hypothesis', '')[:80]
            err = r.get('error', '')[:50]
            if err:
                lines.append(f"  iter {r.get('iteration', '?')}: CRASHED — {err}")
            else:
                lines.append(f"  iter {r.get('iteration', '?')}: {val} ({status}) — {hyp}")

        return '\n'.join(lines)
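
    # Hedged example of the summary handed to the LLM (shape mirrors the
    # f-strings above; values invented for illustration):
    #
    #   iter 1: 3.21 (IMPROVED) — reduce LR warmup steps
    #   iter 2: CRASHED — NameError: name 'cfg' is not defined
    #   iter 3: 3.25 (reverted) — widen MLP hidden dim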


    def save_report(self, session: AutoResearchSession):
        """Save the session report to agent_data for persistence."""
        try:
            report_dir = os.path.join(
                os.path.dirname(__file__), '..', '..', 'agent_data', 'autoresearch')
            os.makedirs(report_dir, exist_ok=True)

            report_path = os.path.join(report_dir, f'{session.session_id}.json')
            report = {
                'session': session.to_progress_dict(),
                'config': {
                    'repo_path': session.repo_path,
                    'target_file': session.target_file,
                    'run_command': session.run_command,
                    'metric_name': session.metric_name,
                    'metric_direction': session.metric_direction,
                    'max_iterations': session.max_iterations,
                    'time_budget_s': session.time_budget_s,
                },
                'results': session.results,
            }
            with open(report_path, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, default=str)
            logger.info(f"[{session.session_id}] Report saved: {report_path}")
        except Exception as e:
            logger.warning(f"[{session.session_id}] Report save failed: {e}")

        self.export_learning_delta(session)


    def export_learning_delta(self, session: AutoResearchSession):
        """Export session results as a federated learning delta AND
        broadcast them to peer Hive nodes.

        Two layers:
          1. BenchmarkTracker.export_learning_delta — prepare the
             delta payload. If unavailable, federation_export_enforced
             flips False and a WARNING is emitted.
          2. FederatedAggregator.broadcast_delta — actually transmit
             the delta to known peers. If unavailable OR if the peer
             POST leg errors, federation_broadcast_enforced flips False
             and a WARNING is emitted. ScopeGuard inside broadcast_delta
             is the authoritative egress gate (PII / secrets blocked).

        This closes RSI-3: promoted improvements actually propagate across
        the Hive so "the most" user-owned nodes benefit, not just the
        instance that ran the iteration.
        """
        delta = None
        try:
            from integrations.coding_agent.benchmark_tracker import get_benchmark_tracker
            tracker = get_benchmark_tracker()
            delta = tracker.export_learning_delta() or {}
            delta['autoresearch'] = {
                'session_id': session.session_id,
                'experiment_id': session.experiment_id,
                'metric_name': session.metric_name,
                'baseline': session.baseline_metric,
                'best': session.best_metric,
                'total_improvements': session.total_improvements,
                'iterations': session.current_iteration,
                'constitutional_rejections': session.constitutional_rejections,
                'baseline_rejections': session.baseline_rejections,
            }
            logger.info(
                "[%s] Learning delta prepared for federation "
                "(improvements=%d, rejections=c%d/b%d)",
                session.session_id, session.total_improvements,
                session.constitutional_rejections, session.baseline_rejections,
            )
        except ImportError as e:
            session.federation_export_enforced = False
            logger.warning(
                "[%s] BenchmarkTracker unavailable (ImportError: %s) — "
                "learning delta NOT exported; hive will not learn from "
                "this session. Set session.federation_export_enforced=False.",
                session.session_id, e,
            )
            return
        except Exception as e:
            session.federation_export_enforced = False
            logger.warning(
                "[%s] Learning delta export failed (%s: %s) — "
                "hive will not learn from this session.",
                session.session_id, type(e).__name__, e,
            )
            return

        # ── RSI-3: broadcast to peer Hive nodes ──
        # This is the "federate" leg. Without it, improvements stay
        # local and "the most" never benefits. ScopeGuard inside
        # broadcast_delta is the authoritative egress gate.
        try:
            from integrations.agent_engine.federated_aggregator import (
                get_federated_aggregator,
            )
            aggregator = get_federated_aggregator()
            aggregator.broadcast_delta(delta)
            logger.info(
                "[%s] Learning delta broadcast to hive peers via "
                "FederatedAggregator", session.session_id,
            )
        except ImportError as e:
            session.federation_broadcast_enforced = False
            logger.warning(
                "[%s] FederatedAggregator unavailable (ImportError: %s) — "
                "delta NOT broadcast to peers; hive will not learn from "
                "this session. Set session.federation_broadcast_enforced=False.",
                session.session_id, e,
            )
        except Exception as e:
            session.federation_broadcast_enforced = False
            logger.warning(
                "[%s] FederatedAggregator.broadcast_delta failed "
                "(%s: %s) — peers did not receive this session's delta.",
                session.session_id, type(e).__name__, e,
            )


    def emit_progress(self, session: AutoResearchSession,
                      event_topic: str, data: Optional[Dict] = None):
        """Emit progress event via EventBus for live tracker updates."""
        try:
            from core.platform.events import emit_event
            payload = data or {}
            payload['session_id'] = session.session_id
            payload['experiment_id'] = session.experiment_id
            payload['goal_id'] = session.goal_id
            emit_event(event_topic, payload)
        except Exception:
            pass

795 

796# ── Singleton ──────────────────────────────────────────────── 

797 

798_engine: Optional[AutoResearchEngine] = None 

799_engine_lock = threading.Lock() 

800 

801 

802def get_autoresearch_engine() -> AutoResearchEngine: 

803 """Get or create the singleton AutoResearchEngine.""" 

804 global _engine 

805 if _engine is None: 

806 with _engine_lock: 

807 if _engine is None: 

808 _engine = AutoResearchEngine() 

809 return _engine 


# ── Agent Tool Functions (step-based) ────────────────────────
# The agent calls these in sequence. The agent's conversation loop
# IS the iteration loop — no hardcoded Python while loop.


def autoresearch_setup(repo_path: str, target_file: str, run_command: str,
                       metric_name: str = 'score',
                       metric_pattern: str = '',
                       metric_direction: str = 'higher_is_better',
                       max_iterations: int = 50,
                       time_budget_s: int = 300,
                       experiment_id: str = '',
                       goal_id: str = '') -> str:
    """Set up an autoresearch session and run the baseline experiment.

    Call this FIRST. Creates a session, runs the unmodified code to capture
    the baseline metric, and returns a session_id for subsequent steps.

    Agent loop pattern:
      1. autoresearch_setup(...) → get session_id + baseline
      2. autoresearch_edit(session_id) → propose code edit
      3. autoresearch_run(session_id) → run + score
      4. autoresearch_decide(session_id) → keep or revert
      5. Repeat 2-4 until converged or budget exhausted
      6. autoresearch_finalize(session_id) → save report

    Args:
        repo_path: Path to the git repository
        target_file: The file to modify (relative to repo_path)
        run_command: Shell command to run the experiment
        metric_name: Name of the metric to optimize
        metric_pattern: Regex with group(1) to extract metric from output
        metric_direction: 'higher_is_better' or 'lower_is_better'
        max_iterations: Maximum iterations before stopping
        time_budget_s: Per-iteration time budget in seconds
        experiment_id: ThoughtExperiment ID (if triggered by one)
        goal_id: AgentGoal ID

    Returns:
        JSON with session_id, baseline_metric, and status
    """
    if not os.path.isdir(repo_path):
        return json.dumps({'error': f'repo_path not found: {repo_path}'})

    target_path = os.path.join(repo_path, target_file)
    if not os.path.isfile(target_path):
        return json.dumps({'error': f'target_file not found: {target_file}'})

    session = AutoResearchSession(
        experiment_id=experiment_id,
        goal_id=goal_id,
        repo_path=repo_path,
        target_file=target_file,
        run_command=run_command,
        metric_name=metric_name,
        metric_pattern=metric_pattern,
        metric_direction=metric_direction,
        max_iterations=max_iterations,
        time_budget_s=time_budget_s,
    )
    session.status = 'running'
    session.start_time = time.time()

    engine = get_autoresearch_engine()
    engine.register_session(session)

    # Run baseline
    baseline = engine.run_experiment(session, is_baseline=True)
    if baseline.error:
        session.status = 'failed'
        session.results.append(asdict(baseline))
        engine.emit_progress(session, 'autoresearch.failed',
                             {'error': f'Baseline failed: {baseline.error}'})
        return json.dumps({
            'error': f'Baseline failed: {baseline.error}',
            'session_id': session.session_id,
        })

    session.baseline_metric = baseline.metric_value
    session.best_metric = baseline.metric_value
    session.results.append(asdict(baseline))
    engine.emit_progress(session, 'autoresearch.started')
    engine.emit_progress(session, 'autoresearch.baseline',
                         {'baseline': baseline.metric_value})

    return json.dumps({
        'session_id': session.session_id,
        'status': 'running',
        'baseline_metric': baseline.metric_value,
        'metric_name': metric_name,
        'metric_direction': metric_direction,
        'max_iterations': max_iterations,
        'instruction': (
            'Baseline captured. Now call autoresearch_edit to propose a code '
            'change, then autoresearch_run to test it, then autoresearch_decide '
            'to keep or revert. Repeat until converged or budget exhausted.'
        ),
    })
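
# Hedged illustration of a successful setup return (field names from the
# json.dumps call above; values invented):
#
#   {"session_id": "a1b2c3d4e5f6", "status": "running",
#    "baseline_metric": 3.28, "metric_name": "loss",
#    "metric_direction": "lower_is_better", "max_iterations": 50,
#    "instruction": "Baseline captured. Now call autoresearch_edit ..."}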


def autoresearch_edit(session_id: str) -> str:
    """Propose and apply one code edit for an autoresearch session.

    Uses LLM + AiderNativeBackend to generate a hypothesis and apply
    the code modification. Call autoresearch_run next to test it.

    Args:
        session_id: The session ID from autoresearch_setup

    Returns:
        JSON with hypothesis, files_changed, and budget status
    """
    engine = get_autoresearch_engine()
    session = engine.get_session(session_id)
    if not session:
        return json.dumps({'error': f'Session {session_id} not found'})

    if session.is_budget_exhausted():
        session.status = 'budget_exhausted'
        return json.dumps({
            'budget_exhausted': True,
            'spark_consumed': session.spark_consumed,
            'spark_budget': session.spark_budget,
            'instruction': 'Budget exhausted. Call autoresearch_finalize to save report.',
        })

    session.current_iteration += 1
    edit_result = engine.generate_and_apply_edit(session)

    if not edit_result:
        return json.dumps({
            'success': False,
            'iteration': session.current_iteration,
            'reason': 'No edit generated by LLM',
            'instruction': 'Try calling autoresearch_edit again for a new hypothesis.',
        })

    hypothesis, edits, files_changed = edit_result
    session._pending_hypothesis = hypothesis
    session._pending_edits = edits
    session._pending_files = files_changed

    return json.dumps({
        'success': True,
        'iteration': session.current_iteration,
        'hypothesis': hypothesis,
        'files_changed': files_changed,
        'instruction': 'Edit applied. Call autoresearch_run to test this change.',
    })


def autoresearch_run(session_id: str) -> str:
    """Run the experiment after an edit and extract the metric.

    Executes the run_command, extracts the target metric from output,
    and records the result in BenchmarkTracker.

    Args:
        session_id: The session ID from autoresearch_setup

    Returns:
        JSON with metric_value, improved, and comparison to best
    """
    engine = get_autoresearch_engine()
    session = engine.get_session(session_id)
    if not session:
        return json.dumps({'error': f'Session {session_id} not found'})

    result = engine.run_experiment(session, is_baseline=False)
    result.iteration = session.current_iteration
    result.hypothesis = session._pending_hypothesis
    result.edits = session._pending_edits
    result.files_changed = session._pending_files

    session.spark_consumed += session.spark_per_iteration

    # Determine improvement
    improved = False
    if result.error:
        improved = False
    elif result.metric_value is not None and session.is_improved(result.metric_value):
        improved = True

    result.improved = improved
    result.baseline_value = session.best_metric

    # Store for decide step
    session.results.append(asdict(result))
    engine.emit_progress(session, 'autoresearch.iteration', asdict(result))

    return json.dumps({
        'iteration': session.current_iteration,
        'metric_value': result.metric_value,
        'best_metric': session.best_metric,
        'improved': improved,
        'error': result.error or None,
        'duration_s': round(result.duration_s, 1),
        'instruction': (
            f'{"IMPROVED" if improved else "No improvement"}. '
            f'Call autoresearch_decide to {"keep" if improved else "revert"} this change.'
        ),
    })


def autoresearch_decide(session_id: str) -> str:
    """Keep or revert the last edit based on the experiment result.

    If the last run improved the metric, commits the change and saves
    it as a recipe step. If not, reverts via git checkout.

    Args:
        session_id: The session ID from autoresearch_setup

    Returns:
        JSON with decision, current best, and next step advice
    """
    engine = get_autoresearch_engine()
    session = engine.get_session(session_id)
    if not session:
        return json.dumps({'error': f'Session {session_id} not found'})

    if not session.results:
        return json.dumps({'error': 'No experiment results to decide on'})

    last_result_dict = session.results[-1]
    improved = last_result_dict.get('improved', False)
    metric_value = last_result_dict.get('metric_value')
    error = last_result_dict.get('error', '')

    if error or not improved:
        # Revert — in-session metric failed to improve.
        engine.revert_changes(session)
        decision = 'reverted'
        logger.info(f"[{session.session_id}] Iter {session.current_iteration} "
                    f"reverted: {metric_value} vs best {session.best_metric}")
    else:
        # Candidate passed the in-session metric check — hand it to
        # commit_improvement which runs the RSI gates (constitutional +
        # baseline delta) and returns False if either rejects. On
        # rejection it reverts the pending edits itself and bumps the
        # appropriate rejection counter, so we only update best_metric /
        # best_iteration / total_improvements when the promote actually
        # landed. This enforces the monotonic-vs-today's-baseline
        # guarantee globally, not just on the metric being optimized.
        prior_best = session.best_metric
        result = ExperimentResult(
            iteration=session.current_iteration,
            hypothesis=session._pending_hypothesis,
            metric_name=session.metric_name,
            metric_value=metric_value,
            baseline_value=prior_best,
            improved=True,
            edits=session._pending_edits,
            files_changed=session._pending_files,
        )
        committed = engine.commit_improvement(session, result)
        if committed:
            session.best_metric = metric_value
            session.best_iteration = session.current_iteration
            session.total_improvements += 1
            decision = 'kept'
            logger.info(
                f"[{session.session_id}] Iter {session.current_iteration} "
                f"IMPROVED: {metric_value} (was {prior_best})")
        else:
            # RSI gate rejected. best_metric stays at prior_best and the
            # pending edits were already reverted by commit_improvement.
            decision = 'rejected_by_gate'
            logger.info(
                f"[{session.session_id}] Iter {session.current_iteration} "
                f"gated (reason={session.last_rejection_reason}); "
                f"best remains {prior_best}")

    # Clear pending state
    session._pending_hypothesis = ''
    session._pending_edits = []
    session._pending_files = []

    # Convergence check
    should_continue = (
        session.current_iteration < session.max_iterations
        and not session.is_budget_exhausted()
    )

    return json.dumps({
        'decision': decision,
        'iteration': session.current_iteration,
        'best_metric': session.best_metric,
        'best_iteration': session.best_iteration,
        'total_improvements': session.total_improvements,
        'spark_consumed': session.spark_consumed,
        'should_continue': should_continue,
        'instruction': (
            'Call autoresearch_edit for the next iteration.'
            if should_continue else
            'Done iterating. Call autoresearch_finalize to save the report.'
        ),
    })


def autoresearch_finalize(session_id: str) -> str:
    """Finalize an autoresearch session — save report and export deltas.

    Call this when iteration is complete (converged, budget exhausted,
    or max iterations reached). Saves the session report and exports
    learning deltas for hive-wide federation.

    Args:
        session_id: The session ID from autoresearch_setup

    Returns:
        JSON with final session summary
    """
    engine = get_autoresearch_engine()
    session = engine.get_session(session_id)
    if not session:
        return json.dumps({'error': f'Session {session_id} not found'})

    if session.status == 'running':
        session.status = 'completed'

    engine.save_report(session)
    engine.emit_progress(session, 'autoresearch.completed',
                         session.to_progress_dict())
    engine.unregister_session(session_id)

    return json.dumps({
        'status': session.status,
        'session_id': session.session_id,
        'baseline_metric': session.baseline_metric,
        'best_metric': session.best_metric,
        'best_iteration': session.best_iteration,
        'total_improvements': session.total_improvements,
        'total_iterations': session.current_iteration,
        'spark_consumed': session.spark_consumed,
        'elapsed_s': round(time.time() - session.start_time, 1),
    })


def get_autoresearch_status(session_id: str = '') -> str:
    """Get the status of an autoresearch session or all active sessions.

    Args:
        session_id: Specific session ID, or empty for all active sessions

    Returns:
        JSON with session progress
    """
    engine = get_autoresearch_engine()

    if session_id:
        session = engine.get_session(session_id)
        if session:
            return json.dumps(session.to_progress_dict())
        # Check saved reports
        report_path = os.path.join(
            os.path.dirname(__file__), '..', '..', 'agent_data',
            'autoresearch', f'{session_id}.json')
        if os.path.isfile(report_path):
            with open(report_path, 'r', encoding='utf-8') as f:
                return f.read()
        return json.dumps({'error': f'Session {session_id} not found'})

    return json.dumps({'active_sessions': engine.get_active_sessions()})


# ── Backward-compatible alias ─────────────────────────────────
# launch_experiment_autoresearch in thought_experiment_tools.py calls this

def start_autoresearch(repo_path: str, target_file: str, run_command: str,
                       metric_name: str = 'score', metric_pattern: str = '',
                       metric_direction: str = 'higher_is_better',
                       max_iterations: int = 50, time_budget_s: int = 300,
                       experiment_id: str = '', goal_id: str = '',
                       hive_parallel: bool = False,
                       num_variants: int = 3) -> str:
    """Backward-compatible wrapper — delegates to autoresearch_setup.

    The hive_parallel and num_variants parameters are accepted but
    ignored (hive dispatch is now handled by the agent via compute
    mesh tools).
    """
    return autoresearch_setup(
        repo_path=repo_path, target_file=target_file,
        run_command=run_command, metric_name=metric_name,
        metric_pattern=metric_pattern, metric_direction=metric_direction,
        max_iterations=max_iterations, time_budget_s=time_budget_s,
        experiment_id=experiment_id, goal_id=goal_id,
    )


# Tool registration list (consumed by ServiceToolRegistry)
AUTOEVOLVE_CODE_TOOLS = [
    {
        'name': 'autoresearch_setup',
        'func': autoresearch_setup,
        'description': (
            'Set up a code research session and run baseline. Returns session_id. '
            'Call autoresearch_edit → autoresearch_run → autoresearch_decide in a loop.'
        ),
        'tags': ['autoresearch', 'coding'],
    },
    {
        'name': 'autoresearch_edit',
        'func': autoresearch_edit,
        'description': 'Propose and apply one LLM-generated code edit.',
        'tags': ['autoresearch', 'coding'],
    },
    {
        'name': 'autoresearch_run',
        'func': autoresearch_run,
        'description': 'Run the experiment after an edit and extract the metric.',
        'tags': ['autoresearch', 'coding'],
    },
    {
        'name': 'autoresearch_decide',
        'func': autoresearch_decide,
        'description': 'Keep (git commit) or revert (git checkout) the last edit.',
        'tags': ['autoresearch', 'coding'],
    },
    {
        'name': 'autoresearch_finalize',
        'func': autoresearch_finalize,
        'description': 'Save session report and export learning deltas to federation.',
        'tags': ['autoresearch', 'coding'],
    },
    {
        'name': 'get_autoresearch_status',
        'func': get_autoresearch_status,
        'description': 'Get progress of an autoresearch session or list all active sessions.',
        'tags': ['autoresearch'],
    },
]
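
# Hedged sketch of how a registry might consume this list.
# ServiceToolRegistry's real API is not shown in this module;
# `register_tool` below is an assumed method name, purely illustrative:
#
#   registry = ServiceToolRegistry()  # hypothetical constructor
#   for spec in AUTOEVOLVE_CODE_TOOLS:
#       registry.register_tool(name=spec['name'], func=spec['func'],
#                              description=spec['description'],
#                              tags=spec['tags'])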


# Backward-compat alias
AUTORESEARCH_TOOLS = AUTOEVOLVE_CODE_TOOLS
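

# Hedged end-to-end sketch. `_example_agent_loop` below is a hypothetical
# helper added for illustration only: it is not registered as a tool and
# nothing in this module calls it. In production the agent's conversation
# loop drives this sequence (no hardcoded while loop); the code only shows
# the intended call ordering and JSON handling, using functions defined in
# this module.
def _example_agent_loop(repo_path: str) -> dict:
    setup = json.loads(autoresearch_setup(
        repo_path=repo_path, target_file='train.py',
        run_command='python train.py',
        metric_name='loss', metric_direction='lower_is_better'))
    if 'error' in setup:
        return setup  # repo/target missing or baseline crashed
    sid = setup['session_id']
    for _ in range(setup['max_iterations']):
        edit = json.loads(autoresearch_edit(sid))
        if edit.get('budget_exhausted') or not edit.get('success'):
            break  # a real agent might retry a failed edit instead
        json.loads(autoresearch_run(sid))           # run + score the edit
        decide = json.loads(autoresearch_decide(sid))
        if not decide.get('should_continue', False):
            break
    return json.loads(autoresearch_finalize(sid))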