Coverage for integrations / coding_agent / autoevolve_code_tools.py: 81.8%
379 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Auto-Evolve Code Tools — Agent-native code experiment tools.
4Individual tools for autonomous code experiments: setup → edit → run → score →
5keep/revert → finalize. The agent's conversation loop (autogen group chat)
6drives iteration — no hardcoded Python while loop.
8Inspired by karpathy/autoresearch. Each tool is a single step:
9 1. autoresearch_setup — create session, run baseline, return session_id
10 2. autoresearch_edit — LLM proposes + applies one code edit
11 3. autoresearch_run — run experiment, extract metric, record benchmark
12 4. autoresearch_decide — keep (git commit) or revert (git checkout)
13 5. autoresearch_finalize — save report, export learning delta
14 6. get_autoresearch_status — poll session progress
16Uses existing infra only:
17 - AiderNativeBackend for code edits
18 - run_cmd_subprocess for experiment execution
19 - BenchmarkTracker for score tracking
20 - CodingRecipeBridge for saving winning edits as recipes
21 - AgentBaselineService for evolution snapshots
22 - EventBus for live progress events
23"""
24import json
25import logging
26import os
27import threading
28import time
29import uuid
30from dataclasses import dataclass, field, asdict
31from typing import Dict, List, Optional, Tuple
33logger = logging.getLogger('hevolve.autoresearch')
36# ── Result Types ─────────────────────────────────────────────
@dataclass
class ExperimentResult:
    """Outcome of one experiment run (baseline or candidate edit).

    Captures the hypothesis tested, the extracted metric, the files and
    edits involved, and any crash output so the agent loop can decide
    whether to keep or revert the change.
    """
    iteration: int                      # 0 for the baseline run
    hypothesis: str                     # one-line rationale ('baseline' for iter 0)
    metric_name: str
    metric_value: Optional[float]       # None when run failed / metric unparsed
    baseline_value: Optional[float]
    improved: bool
    files_changed: List[str] = field(default_factory=list)
    edits: List[Dict] = field(default_factory=list)
    run_output: str = ''                # tail of stdout/stderr from the run
    error: str = ''                     # traceback / exit-code summary, '' on success
    duration_s: float = 0.0

    @property
    def delta(self) -> Optional[float]:
        """Signed change versus baseline, or None when either side is missing."""
        if self.metric_value is None or self.baseline_value is None:
            return None
        return self.metric_value - self.baseline_value
@dataclass
class AutoResearchSession:
    """Full state of one autoresearch session (config + progress + flags)."""
    session_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    experiment_id: str = ''             # ThoughtExperiment ID (if triggered by one)
    goal_id: str = ''                   # AgentGoal ID
    repo_path: str = ''                 # Working directory
    target_file: str = ''               # The file being modified (like train.py)
    run_command: str = ''               # Command to run the experiment
    metric_name: str = 'score'          # Name of the metric to optimize
    metric_pattern: str = ''            # Regex to extract metric from output
    metric_direction: str = 'higher_is_better'  # or 'lower_is_better'
    max_iterations: int = 50
    time_budget_s: int = 300            # Per-iteration time budget (5 min default)
    spark_budget: int = 200             # Total Spark budget
    spark_consumed: int = 0
    spark_per_iteration: int = 4        # Spark cost per iteration

    # State
    baseline_metric: Optional[float] = None
    best_metric: Optional[float] = None
    best_iteration: int = 0
    current_iteration: int = 0
    status: str = 'pending'  # pending | running | completed | failed | budget_exhausted
    results: List[Dict] = field(default_factory=list)
    start_time: float = 0.0
    total_improvements: int = 0

    # Regression-escape-hatch flags — set LOUDLY when an enforcement layer
    # is unavailable, so downstream consumers (dashboards, tests, audits)
    # can surface "this session didn't enforce baseline / benchmark gain".
    # These are cleared (False) when the dependency is missing or broken.
    baseline_enforced: bool = True          # AgentBaselineService captured snapshot
    benchmark_gain_enforced: bool = True    # BenchmarkTracker recorded iteration
    federation_export_enforced: bool = True  # Learning delta exported

    # RSI gates — every promoted improvement must pass both. If a gate
    # layer is unavailable, the flag flips to False and a WARNING is
    # emitted; if a gate layer is present and rejects, counters increment
    # and last_rejection_reason records why. Dashboards + tests read
    # these via to_progress_dict() to verify the recursive self-improvement
    # loop is actually closed.
    constitutional_enforced: bool = True    # ConstitutionalFilter ran
    baseline_delta_enforced: bool = True    # validate_against_baseline ran
    federation_broadcast_enforced: bool = True  # broadcast_delta ran
    constitutional_rejections: int = 0
    baseline_rejections: int = 0
    last_rejection_reason: str = ''

    # Last edit state (for decide step)
    _pending_edits: List[Dict] = field(default_factory=list)
    _pending_files: List[str] = field(default_factory=list)
    _pending_hypothesis: str = ''

    def is_budget_exhausted(self) -> bool:
        """True when one more iteration would overrun the Spark budget."""
        remaining = self.spark_budget - self.spark_consumed
        return remaining < self.spark_per_iteration

    def is_improved(self, new_val: float) -> bool:
        """True when new_val beats the current best (direction-aware)."""
        best = self.best_metric
        if best is None:
            # Nothing recorded yet — any measured value is an improvement.
            return True
        if self.metric_direction == 'lower_is_better':
            return new_val < best
        return new_val > best

    def to_progress_dict(self) -> Dict:
        """Flatten session progress + enforcement flags for dashboards/tests."""
        if self.start_time:
            elapsed = time.time() - self.start_time
        else:
            elapsed = 0
        return {
            'session_id': self.session_id,
            'status': self.status,
            'iteration': self.current_iteration,
            'max_iterations': self.max_iterations,
            'baseline_metric': self.baseline_metric,
            'best_metric': self.best_metric,
            'best_iteration': self.best_iteration,
            'total_improvements': self.total_improvements,
            'spark_consumed': self.spark_consumed,
            'spark_budget': self.spark_budget,
            'elapsed_s': elapsed,
            # Enforcement flags — if False, dependency was missing/failed and
            # the session ran without that regression-safety layer.
            'baseline_enforced': self.baseline_enforced,
            'benchmark_gain_enforced': self.benchmark_gain_enforced,
            'federation_export_enforced': self.federation_export_enforced,
            # RSI gates — recursive self-improvement loop closure.
            'constitutional_enforced': self.constitutional_enforced,
            'baseline_delta_enforced': self.baseline_delta_enforced,
            'federation_broadcast_enforced': self.federation_broadcast_enforced,
            'constitutional_rejections': self.constitutional_rejections,
            'baseline_rejections': self.baseline_rejections,
            'last_rejection_reason': self.last_rejection_reason,
        }
152# ── Engine (session store + utilities) ────────────────────────
154class AutoResearchEngine:
155 """Session store and utility methods for autoresearch tools.
157 NOT a loop — the agent's conversation drives iteration by calling
158 individual tool functions in sequence.
159 """
161 def __init__(self):
162 self._active_sessions: Dict[str, AutoResearchSession] = {}
163 self._lock = threading.Lock()
165 def register_session(self, session: AutoResearchSession):
166 with self._lock:
167 self._active_sessions[session.session_id] = session
169 def unregister_session(self, session_id: str):
170 with self._lock:
171 self._active_sessions.pop(session_id, None)
173 def get_active_sessions(self) -> List[Dict]:
174 """Return progress for all active sessions."""
175 with self._lock:
176 return [s.to_progress_dict() for s in self._active_sessions.values()]
178 def get_session(self, session_id: str) -> Optional[AutoResearchSession]:
179 with self._lock:
180 return self._active_sessions.get(session_id)
182 # ── Edit Generation ─────────────────────────────────────
    def generate_and_apply_edit(self, session: AutoResearchSession
                                ) -> Optional[Tuple[str, List[Dict], List[str]]]:
        """Use LLM to generate a hypothesis and code edit, then apply it.

        Builds a prompt from the session's metric config, recent history,
        an optional BenchmarkTracker hint, and an optional ε-greedy
        "explore" stance, then hands it to AiderNativeBackend, which
        applies SEARCH/REPLACE edits to the working directory in place.

        Returns:
            (hypothesis, edits, files_changed) on success — hypothesis is
            the first line of the backend's output truncated to 200 chars
            — or None when the backend reports failure or anything raises.
        """
        try:
            from integrations.coding_agent.aider_native_backend import AiderNativeBackend
            backend = AiderNativeBackend()

            history_summary = self.build_history_summary(session)

            # Query BenchmarkTracker for best-performing tool insights.
            # Best-effort only — a missing tracker just omits the hint.
            benchmark_hint = ''
            try:
                from integrations.coding_agent.benchmark_tracker import get_benchmark_tracker
                tracker = get_benchmark_tracker()
                best = tracker.get_best_tool('autoresearch')
                if best:
                    name, success_rate, avg_time = best
                    benchmark_hint = (
                        f"\nBENCHMARK INSIGHT: Best tool '{name}' has "
                        f"{success_rate:.0%} success rate, avg {avg_time:.1f}s.\n"
                    )
            except Exception:
                pass

            # RSI-5: ε-greedy exploration arm. When HEVOLVE_RSI_EXPLORE=1
            # and the sampled coin lands in the explore bucket, swap the
            # incremental-tuning prompt stance for a radical-mutation
            # stance. The LLM remains the code mutator (no parallel
            # code-generator path), but the instruction distribution
            # shifts — this is the cheapest honest wiring of the
            # stochastic arm without inventing a second mutation backend.
            # Safety: the candidate still passes RSI-1 + RSI-2 gates
            # inside commit_improvement before promotion.
            exploration_hint = ''
            try:
                from integrations.agent_engine.exploration_arm import (
                    select_strategy,
                )
                if select_strategy() == 'explore':
                    exploration_hint = (
                        "\nEXPLORATION MODE: propose a RADICAL / "
                        "ARCHITECTURAL change this iteration — not an "
                        "incremental tweak. Favor ideas that reshape "
                        "the approach; safety gates still run before "
                        "promotion, so a failed bold change costs "
                        "nothing while a successful one opens the "
                        "search space.\n"
                    )
            except Exception:
                pass

            task = (
                f"You are running an autonomous research loop.\n\n"
                f"TARGET FILE: {session.target_file}\n"
                f"METRIC: {session.metric_name} "
                f"({'lower is better' if session.metric_direction == 'lower_is_better' else 'higher is better'})\n"
                f"BASELINE: {session.baseline_metric}\n"
                f"CURRENT BEST: {session.best_metric} (iteration {session.best_iteration})\n"
                f"ITERATION: {session.current_iteration}/{session.max_iterations}\n\n"
                f"EXPERIMENT HISTORY:\n{history_summary}\n\n"
                f"{benchmark_hint}"
                f"{exploration_hint}"
                f"RUN COMMAND: {session.run_command}\n\n"
                f"YOUR TASK:\n"
                f"1. Analyze what worked and what didn't from the history above\n"
                f"2. Propose ONE focused modification to {session.target_file}\n"
                f"3. Explain your hypothesis in one sentence\n"
                f"4. Make the edit using SEARCH/REPLACE blocks\n\n"
                f"RULES:\n"
                f"- One change per iteration — small, testable, reversible\n"
                f"- If you're stuck, try combinations of previous improvements\n"
                f"- If all ideas seem tried, try something radical or architectural\n"
                f"- Simplicity wins — a 0.001 gain from deleting code beats a 0.001 gain from 20 lines\n"
                f"- NEVER modify the evaluation metric or test harness\n"
            )

            context = {
                'working_dir': session.repo_path,
                'files': [session.target_file],
            }

            result = backend.execute(task, context, timeout=120)

            if not result.get('success'):
                return None

            # First output line serves as the hypothesis summary.
            output = result.get('output', '')
            hypothesis = output.split('\n')[0][:200] if output else 'Unknown hypothesis'
            edits = result.get('edits', [])
            files_changed = result.get('files_changed', [])

            return hypothesis, edits, files_changed

        except Exception as e:
            logger.warning(f"[{session.session_id}] Edit generation failed: {e}")
            return None
281 # ── Experiment Execution ─────────────────────────────────
283 def run_experiment(self, session: AutoResearchSession,
284 is_baseline: bool = False) -> ExperimentResult:
285 """Run the experiment command and extract the metric."""
286 import re
288 result = ExperimentResult(
289 iteration=0 if is_baseline else session.current_iteration,
290 hypothesis='baseline' if is_baseline else '',
291 metric_name=session.metric_name,
292 metric_value=None,
293 baseline_value=session.baseline_metric,
294 improved=False,
295 )
297 start = time.time()
298 try:
299 from integrations.coding_agent.aider_core.run_cmd import run_cmd_subprocess
300 exit_code, output = run_cmd_subprocess(
301 session.run_command,
302 cwd=session.repo_path,
303 timeout=session.time_budget_s,
304 )
305 result.run_output = output[-5000:] if len(output) > 5000 else output
306 result.duration_s = time.time() - start
308 if exit_code != 0:
309 lines = output.split('\n')
310 tb_start = None
311 for i, line in enumerate(lines):
312 if 'Traceback' in line or 'Error' in line:
313 tb_start = i
314 break
315 if tb_start is not None:
316 result.error = '\n'.join(lines[tb_start:tb_start + 20])
317 else:
318 result.error = f'Exit code {exit_code}: {lines[-3:]}'
319 return result
321 metric_val = self.extract_metric(output, session)
322 result.metric_value = metric_val
324 if metric_val is not None and is_baseline:
325 result.improved = False
327 self.record_benchmark(session, result)
329 except Exception as e:
330 result.error = str(e)
331 result.duration_s = time.time() - start
333 return result
    def record_benchmark(self, session: AutoResearchSession,
                         result: ExperimentResult):
        """Record experiment result in BenchmarkTracker for evolution tracking.

        BenchmarkTracker is how we prove that iter N actually beat iter N-1 on
        the hive-shared leaderboard. If it's unavailable or raises, the
        session's `benchmark_gain_enforced` flag is flipped to False and a
        WARNING is emitted — callers (dashboards, tests) can then surface
        "this session ran WITHOUT benchmark gain enforcement" rather than
        silently treating 0 gain as a valid result.
        """
        try:
            from integrations.coding_agent.benchmark_tracker import get_benchmark_tracker
            tracker = get_benchmark_tracker()
            # "Success" here means: the run did not error AND a metric
            # value was actually parsed from its output.
            tracker.record(
                task_type='autoresearch',
                tool_name='aider_native_backend',
                completion_time_s=result.duration_s,
                success=not result.error and result.metric_value is not None,
                model_name=session.metric_name,
                user_id=session.goal_id or session.session_id,
            )
        except ImportError as e:
            # Tracker module missing entirely — loud fail-open.
            session.benchmark_gain_enforced = False
            logger.warning(
                "[%s] BenchmarkTracker unavailable (ImportError: %s) — "
                "iter %d ran WITHOUT benchmark-gain enforcement. "
                "Set session.benchmark_gain_enforced=False.",
                session.session_id, e, result.iteration,
            )
        except Exception as e:
            # Tracker present but record() failed — same loud fail-open.
            session.benchmark_gain_enforced = False
            logger.warning(
                "[%s] BenchmarkTracker record failed (%s: %s) — "
                "iter %d ran WITHOUT benchmark-gain enforcement.",
                session.session_id, type(e).__name__, e, result.iteration,
            )
373 def extract_metric(self, output: str, session: AutoResearchSession
374 ) -> Optional[float]:
375 """Extract the target metric from experiment output."""
376 import re
378 if session.metric_pattern:
379 match = re.search(session.metric_pattern, output)
380 if match:
381 try:
382 return float(match.group(1))
383 except (ValueError, IndexError):
384 pass
386 patterns = [
387 rf'{re.escape(session.metric_name)}[:\s=]+([0-9]+\.?[0-9]*)',
388 rf'^{re.escape(session.metric_name)}[:\s]+([0-9]+\.?[0-9]*)',
389 r'(\d+) passed',
390 r'(?:score|result|metric|accuracy|loss|bpb)[:\s=]+([0-9]+\.?[0-9]*)',
391 ]
393 for pat in patterns:
394 match = re.search(pat, output, re.IGNORECASE | re.MULTILINE)
395 if match:
396 try:
397 return float(match.group(1))
398 except (ValueError, IndexError):
399 continue
401 return None
403 # ── Git State Management ─────────────────────────────────
    def revert_changes(self, session: AutoResearchSession):
        """Revert the target file to its last committed state.

        Runs ``git checkout -- <target_file>`` in the session repo so a
        rejected or crashed candidate edit leaves no residue. Failures
        are logged at WARNING and otherwise swallowed — revert is
        best-effort by design.

        NOTE(review): target_file is interpolated into the shell command
        unquoted; a path containing spaces or shell metacharacters would
        break the command — assumed to come from trusted session setup.
        """
        try:
            from integrations.coding_agent.aider_core.run_cmd import run_cmd_subprocess
            run_cmd_subprocess(
                f'git checkout -- {session.target_file}',
                cwd=session.repo_path,
                timeout=10,
            )
        except Exception as e:
            logger.warning(f"[{session.session_id}] Revert failed: {e}")
417 # ── RSI gates ─────────────────────────────────────────────
418 # These two gates close the recursive self-improvement loop: no
419 # candidate becomes the new baseline unless ConstitutionalFilter
420 # allows it AND AgentBaselineService.validate_against_baseline
421 # reports no regression. Both gates fail-open when the dependency
422 # is missing, but flip the corresponding *_enforced flag to False
423 # and log LOUDLY so the dashboard/test can see the bypass.
    def _constitutional_gate(self, session: 'AutoResearchSession',
                             result: 'ExperimentResult') -> Tuple[bool, str]:
        """Check the hypothesis + edit summary against ConstitutionalFilter.

        Returns (allowed, reason).

        Failure modes:
          - Filter missing (ImportError) or check_prompt raising a
            generic error: fail-OPEN, but flip
            session.constitutional_enforced=False so the dashboard can
            surface the missing gate.
          - check_prompt raising RuntimeError (guardrail hash tamper):
            fail-CLOSED — the candidate is rejected outright.
        """
        try:
            from security.hive_guardrails import ConstitutionalFilter
        except ImportError as e:
            session.constitutional_enforced = False
            logger.warning(
                "[%s] ConstitutionalFilter unavailable (ImportError: %s) — "
                "iter %d promoted WITHOUT constitutional gate. "
                "Set session.constitutional_enforced=False.",
                session.session_id, e, result.iteration,
            )
            return True, 'gate_unavailable'

        # The gate sees a compact text summary only: hypothesis, metric
        # name, and the changed-file list — not the full diff.
        prompt_text = ' '.join(filter(None, [
            result.hypothesis or '',
            session.metric_name or '',
            ' '.join(result.files_changed or []),
        ]))
        try:
            allowed, reason = ConstitutionalFilter.check_prompt(prompt_text)
        except RuntimeError as e:
            # Guardrail tamper — fail-CLOSED. This is the one case where
            # the gate is authoritative: if the guardrail values were
            # mutated in memory we must NOT promote.
            session.constitutional_enforced = True
            logger.critical(
                "[%s] ConstitutionalFilter TAMPER on iter %d: %s — "
                "refusing to promote.",
                session.session_id, result.iteration, e,
            )
            return False, f'guardrail_tamper: {e}'
        except Exception as e:
            session.constitutional_enforced = False
            logger.warning(
                "[%s] ConstitutionalFilter.check_prompt failed "
                "(%s: %s) — iter %d promoted WITHOUT gate.",
                session.session_id, type(e).__name__, e, result.iteration,
            )
            return True, 'gate_errored'

        return allowed, reason
    def _baseline_delta_gate(self, session: 'AutoResearchSession'
                             ) -> Tuple[bool, List[str], str]:
        """Run AgentBaselineService.validate_against_baseline.

        Returns (passed, regressions, reason). Fail-open if the service is
        unavailable or errors (first-run / no-baseline case included), but
        flip session.baseline_delta_enforced=False on those paths so the
        bypass is visible downstream.
        """
        try:
            from integrations.agent_engine.agent_baseline_service import (
                AgentBaselineService,
            )
        except ImportError as e:
            session.baseline_delta_enforced = False
            logger.warning(
                "[%s] AgentBaselineService unavailable for delta gate "
                "(ImportError: %s) — promoted WITHOUT baseline compare.",
                session.session_id, e,
            )
            return True, [], 'gate_unavailable'

        try:
            prompt_id = session.experiment_id or session.session_id
            # NOTE(review): flow_id=0 looks like a "latest/any flow"
            # sentinel — confirm against AgentBaselineService's API.
            result = AgentBaselineService.validate_against_baseline(
                prompt_id=prompt_id, flow_id=0,
            )
        except Exception as e:
            session.baseline_delta_enforced = False
            logger.warning(
                "[%s] validate_against_baseline errored (%s: %s) — "
                "promoted WITHOUT baseline compare.",
                session.session_id, type(e).__name__, e,
            )
            return True, [], 'gate_errored'

        # Defensive parsing: missing keys default to "passed, no info".
        passed = bool(result.get('passed', True))
        regressions = list(result.get('regressions', []) or [])
        reason = result.get('reason', '') or (
            'no_regressions' if passed else 'regressions_detected'
        )
        return passed, regressions, reason
    def commit_improvement(self, session: AutoResearchSession,
                           result: ExperimentResult) -> bool:
        """Commit the improvement to git and save as recipe step.

        Returns True if the candidate was actually promoted (gates passed +
        git commit attempted), False if a gate rejected it (pending edits
        reverted, session rejection counters incremented).

        RSI gate chain (both must pass to promote):
          1. ConstitutionalFilter — hypothesis/edit summary free of
             violation patterns.
          2. AgentBaselineService.validate_against_baseline — no
             cross-metric regression vs the latest live snapshot.

        Fail-open on missing dependencies (flags flip loud), fail-closed
        on guardrail tamper. After promotion: git commit (best-effort),
        recipe capture (best-effort), baseline snapshot (loud on failure),
        then an 'autoresearch.promoted' event.
        """
        # ── RSI-1: constitutional gate ──
        allowed, cons_reason = self._constitutional_gate(session, result)
        if not allowed:
            session.constitutional_rejections += 1
            session.last_rejection_reason = f'constitutional: {cons_reason}'
            logger.warning(
                "[%s] Iter %d REJECTED by ConstitutionalFilter: %s — "
                "reverting pending edits.",
                session.session_id, result.iteration, cons_reason,
            )
            self.revert_changes(session)
            self.emit_progress(session, 'autoresearch.rejected', {
                'iteration': result.iteration,
                'gate': 'constitutional',
                'reason': cons_reason,
            })
            return False

        # ── RSI-2: baseline delta gate ──
        passed, regressions, base_reason = self._baseline_delta_gate(session)
        if not passed:
            session.baseline_rejections += 1
            session.last_rejection_reason = (
                f'baseline_regression: {"; ".join(regressions) or base_reason}'
            )
            logger.warning(
                "[%s] Iter %d REJECTED by baseline delta: %s — "
                "reverting pending edits.",
                session.session_id, result.iteration,
                regressions or base_reason,
            )
            self.revert_changes(session)
            self.emit_progress(session, 'autoresearch.rejected', {
                'iteration': result.iteration,
                'gate': 'baseline_delta',
                'regressions': regressions,
                'reason': base_reason,
            })
            return False

        # ── Gates passed — proceed with existing commit + recipe + snapshot ──
        # Commit failure is non-fatal: the edit stays applied either way.
        # NOTE(review): target_file and msg are interpolated into a shell
        # string unquoted beyond the "" around msg — a target path with
        # spaces would break the command; assumed trusted session config.
        try:
            from integrations.coding_agent.aider_core.run_cmd import run_cmd_subprocess
            msg = (f"autoresearch iter {result.iteration}: "
                   f"{session.metric_name}={result.metric_value} "
                   f"(was {result.baseline_value})")
            run_cmd_subprocess(
                f'git add {session.target_file} && git commit -m "{msg}"',
                cwd=session.repo_path,
                timeout=15,
            )
        except Exception as e:
            logger.debug(f"[{session.session_id}] Git commit skipped: {e}")

        # Best-effort: capture the winning edit as a reusable recipe step.
        try:
            from integrations.coding_agent.recipe_bridge import CodingRecipeBridge
            bridge = CodingRecipeBridge()
            bridge.capture_edit_as_recipe_step(
                task=f'autoresearch: {session.metric_name} optimization',
                tool_name='autoresearch',
                file_edits=result.edits,
                working_dir=session.repo_path,
            )
        except ImportError as e:
            logger.warning(
                "[%s] CodingRecipeBridge unavailable (ImportError: %s) — "
                "iter %d improvement NOT captured as recipe step.",
                session.session_id, e, result.iteration,
            )
        except Exception as e:
            logger.warning(
                "[%s] CodingRecipeBridge.capture failed (%s: %s) — "
                "iter %d improvement NOT captured as recipe step.",
                session.session_id, type(e).__name__, e, result.iteration,
            )

        # AgentBaselineService snapshot is the regression escape-hatch the
        # audit flagged: if we silently skip it, a later benchmark-based
        # rollback has no anchor to roll back TO. Make the absence LOUD
        # and set the session flag so dashboards/tests can read it.
        try:
            from integrations.agent_engine.agent_baseline_service import AgentBaselineService
            AgentBaselineService.capture_snapshot(
                prompt_id=session.experiment_id or session.session_id,
                flow_id='autoresearch',
                trigger='autoresearch_improvement',
                user_id=session.goal_id or 'system',
            )
        except ImportError as e:
            session.baseline_enforced = False
            logger.warning(
                "[%s] AgentBaselineService unavailable (ImportError: %s) — "
                "iter %d kept WITHOUT baseline snapshot. "
                "Set session.baseline_enforced=False — regression rollback "
                "will have no anchor for this iteration.",
                session.session_id, e, result.iteration,
            )
        except Exception as e:
            session.baseline_enforced = False
            logger.warning(
                "[%s] AgentBaselineService.capture_snapshot failed "
                "(%s: %s) — iter %d kept WITHOUT baseline snapshot.",
                session.session_id, type(e).__name__, e, result.iteration,
            )

        self.emit_progress(session, 'autoresearch.promoted', {
            'iteration': result.iteration,
            'metric_value': result.metric_value,
            'baseline_value': result.baseline_value,
        })
        return True
645 # ── History & Reporting ──────────────────────────────────
647 def build_history_summary(self, session: AutoResearchSession) -> str:
648 """Build a compact summary of previous iterations for the LLM."""
649 if not session.results:
650 return 'No previous iterations.'
652 lines = []
653 for r in session.results[-10:]:
654 status = 'IMPROVED' if r.get('improved') else 'reverted'
655 val = r.get('metric_value', '?')
656 hyp = r.get('hypothesis', '')[:80]
657 err = r.get('error', '')[:50]
658 if err:
659 lines.append(f" iter {r.get('iteration', '?')}: CRASHED — {err}")
660 else:
661 lines.append(f" iter {r.get('iteration', '?')}: {val} ({status}) — {hyp}")
663 return '\n'.join(lines)
    def save_report(self, session: AutoResearchSession):
        """Save the session report to agent_data for persistence.

        Writes agent_data/autoresearch/<session_id>.json (relative to the
        repository root, two levels above this module) containing the
        progress dict, the run configuration, and all per-iteration
        results. Always follows up with export_learning_delta, even if
        the local write failed — federation does not depend on the disk
        report.
        """
        try:
            report_dir = os.path.join(
                os.path.dirname(__file__), '..', '..', 'agent_data', 'autoresearch')
            os.makedirs(report_dir, exist_ok=True)

            report_path = os.path.join(report_dir, f'{session.session_id}.json')
            report = {
                'session': session.to_progress_dict(),
                'config': {
                    'repo_path': session.repo_path,
                    'target_file': session.target_file,
                    'run_command': session.run_command,
                    'metric_name': session.metric_name,
                    'metric_direction': session.metric_direction,
                    'max_iterations': session.max_iterations,
                    'time_budget_s': session.time_budget_s,
                },
                'results': session.results,
            }
            # default=str stringifies any non-JSON-native value instead of
            # aborting the whole dump.
            with open(report_path, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, default=str)
            logger.info(f"[{session.session_id}] Report saved: {report_path}")
        except Exception as e:
            logger.warning(f"[{session.session_id}] Report save failed: {e}")

        self.export_learning_delta(session)
    def export_learning_delta(self, session: AutoResearchSession):
        """Export session results as a federated learning delta AND
        broadcast them to peer Hive nodes.

        Two layers:
          1. BenchmarkTracker.export_learning_delta — prepare the
             delta payload. If unavailable, federation_export_enforced
             flips False, a WARNING is emitted, and we return early
             (nothing to broadcast).
          2. FederatedAggregator.broadcast_delta — actually transmit
             the delta to known peers. If unavailable OR if the peer
             POST leg errors, federation_broadcast_enforced flips False
             and a WARNING is emitted. ScopeGuard inside broadcast_delta
             is the authoritative egress gate (PII / secrets blocked).

        This closes RSI-3: promoted improvements actually propagate across
        the Hive so "the most" user-owned nodes benefit, not just the
        instance that ran the iteration.
        """
        delta = None
        try:
            from integrations.coding_agent.benchmark_tracker import get_benchmark_tracker
            tracker = get_benchmark_tracker()
            # Augment the tracker's payload with this session's summary.
            delta = tracker.export_learning_delta() or {}
            delta['autoresearch'] = {
                'session_id': session.session_id,
                'experiment_id': session.experiment_id,
                'metric_name': session.metric_name,
                'baseline': session.baseline_metric,
                'best': session.best_metric,
                'total_improvements': session.total_improvements,
                'iterations': session.current_iteration,
                'constitutional_rejections': session.constitutional_rejections,
                'baseline_rejections': session.baseline_rejections,
            }
            logger.info(
                "[%s] Learning delta prepared for federation "
                "(improvements=%d, rejections=c%d/b%d)",
                session.session_id, session.total_improvements,
                session.constitutional_rejections, session.baseline_rejections,
            )
        except ImportError as e:
            session.federation_export_enforced = False
            logger.warning(
                "[%s] BenchmarkTracker unavailable (ImportError: %s) — "
                "learning delta NOT exported; hive will not learn from "
                "this session. Set session.federation_export_enforced=False.",
                session.session_id, e,
            )
            return
        except Exception as e:
            session.federation_export_enforced = False
            logger.warning(
                "[%s] Learning delta export failed (%s: %s) — "
                "hive will not learn from this session.",
                session.session_id, type(e).__name__, e,
            )
            return

        # ── RSI-3: broadcast to peer Hive nodes ──
        # This is the "federate" leg. Without it, improvements stay
        # local and "the most" never benefits. ScopeGuard inside
        # broadcast_delta is the authoritative egress gate.
        try:
            from integrations.agent_engine.federated_aggregator import (
                get_federated_aggregator,
            )
            aggregator = get_federated_aggregator()
            aggregator.broadcast_delta(delta)
            logger.info(
                "[%s] Learning delta broadcast to hive peers via "
                "FederatedAggregator", session.session_id,
            )
        except ImportError as e:
            session.federation_broadcast_enforced = False
            logger.warning(
                "[%s] FederatedAggregator unavailable (ImportError: %s) — "
                "delta NOT broadcast to peers; hive will not learn from "
                "this session. Set session.federation_broadcast_enforced=False.",
                session.session_id, e,
            )
        except Exception as e:
            session.federation_broadcast_enforced = False
            logger.warning(
                "[%s] FederatedAggregator.broadcast_delta failed "
                "(%s: %s) — peers did not receive this session's delta.",
                session.session_id, type(e).__name__, e,
            )
782 def emit_progress(self, session: AutoResearchSession,
783 event_topic: str, data: Dict = None):
784 """Emit progress event via EventBus for live tracker updates."""
785 try:
786 from core.platform.events import emit_event
787 payload = data or {}
788 payload['session_id'] = session.session_id
789 payload['experiment_id'] = session.experiment_id
790 payload['goal_id'] = session.goal_id
791 emit_event(event_topic, payload)
792 except Exception:
793 pass
796# ── Singleton ────────────────────────────────────────────────
# Lazily-created module singleton, guarded by its own lock.
_engine: Optional[AutoResearchEngine] = None
_engine_lock = threading.Lock()


def get_autoresearch_engine() -> AutoResearchEngine:
    """Return the process-wide AutoResearchEngine, creating it on first use.

    Double-checked locking: the fast path returns without taking the
    lock once the singleton exists; the re-check inside the lock stops
    two threads racing past the first check from both constructing one.
    """
    global _engine
    if _engine is not None:
        return _engine
    with _engine_lock:
        if _engine is None:
            _engine = AutoResearchEngine()
    return _engine
812# ── Agent Tool Functions (step-based) ────────────────────────
813# The agent calls these in sequence. The agent's conversation loop
814# IS the iteration loop — no hardcoded Python while loop.
def autoresearch_setup(repo_path: str, target_file: str, run_command: str,
                       metric_name: str = 'score',
                       metric_pattern: str = '',
                       metric_direction: str = 'higher_is_better',
                       max_iterations: int = 50,
                       time_budget_s: int = 300,
                       experiment_id: str = '',
                       goal_id: str = '') -> str:
    """Create an autoresearch session and capture the baseline metric.

    Entry point of the step-based loop — call this FIRST. Validates the
    inputs, registers a new session with the engine, runs the unmodified
    code once to establish the baseline, and returns a session_id that
    the agent threads through the remaining tools.

    Agent loop pattern:
        1. autoresearch_setup(...) → get session_id + baseline
        2. autoresearch_edit(session_id) → propose code edit
        3. autoresearch_run(session_id) → run + score
        4. autoresearch_decide(session_id) → keep or revert
        5. Repeat 2-4 until converged or budget exhausted
        6. autoresearch_finalize(session_id) → save report

    Args:
        repo_path: Path to the git repository.
        target_file: File to modify, relative to repo_path.
        run_command: Shell command that executes the experiment.
        metric_name: Name of the metric being optimized.
        metric_pattern: Regex whose group(1) extracts the metric.
        metric_direction: 'higher_is_better' or 'lower_is_better'.
        max_iterations: Hard cap on iterations.
        time_budget_s: Per-iteration time budget in seconds.
        experiment_id: Originating ThoughtExperiment ID, if any.
        goal_id: Associated AgentGoal ID, if any.

    Returns:
        JSON string with session_id, baseline_metric, and status, or an
        'error' key when validation or the baseline run fails.
    """
    # Validate inputs up front so no session is registered for bad paths.
    if not os.path.isdir(repo_path):
        return json.dumps({'error': f'repo_path not found: {repo_path}'})
    if not os.path.isfile(os.path.join(repo_path, target_file)):
        return json.dumps({'error': f'target_file not found: {target_file}'})

    sess = AutoResearchSession(
        experiment_id=experiment_id,
        goal_id=goal_id,
        repo_path=repo_path,
        target_file=target_file,
        run_command=run_command,
        metric_name=metric_name,
        metric_pattern=metric_pattern,
        metric_direction=metric_direction,
        max_iterations=max_iterations,
        time_budget_s=time_budget_s,
    )
    sess.status = 'running'
    sess.start_time = time.time()

    eng = get_autoresearch_engine()
    eng.register_session(sess)

    # Baseline run: execute the untouched code once so later iterations
    # have a reference point to beat.
    base = eng.run_experiment(sess, is_baseline=True)
    sess.results.append(asdict(base))
    if base.error:
        sess.status = 'failed'
        eng.emit_progress(sess, 'autoresearch.failed',
                          {'error': f'Baseline failed: {base.error}'})
        return json.dumps({
            'error': f'Baseline failed: {base.error}',
            'session_id': sess.session_id,
        })

    sess.baseline_metric = base.metric_value
    sess.best_metric = base.metric_value
    eng.emit_progress(sess, 'autoresearch.started')
    eng.emit_progress(sess, 'autoresearch.baseline',
                      {'baseline': base.metric_value})

    return json.dumps({
        'session_id': sess.session_id,
        'status': 'running',
        'baseline_metric': base.metric_value,
        'metric_name': metric_name,
        'metric_direction': metric_direction,
        'max_iterations': max_iterations,
        'instruction': (
            'Baseline captured. Now call autoresearch_edit to propose a code '
            'change, then autoresearch_run to test it, then autoresearch_decide '
            'to keep or revert. Repeat until converged or budget exhausted.'
        ),
    })
def autoresearch_edit(session_id: str) -> str:
    """Generate and apply a single code edit for the session.

    Asks the LLM (via AiderNativeBackend) for a hypothesis plus the
    matching code modification, applies it, and stashes the pending
    state so autoresearch_run / autoresearch_decide can evaluate it.

    Args:
        session_id: Session ID returned by autoresearch_setup.

    Returns:
        JSON with hypothesis, files_changed, and budget status.
    """
    eng = get_autoresearch_engine()
    sess = eng.get_session(session_id)
    if sess is None:
        return json.dumps({'error': f'Session {session_id} not found'})

    # Budget gate: do not burn an iteration once spark is spent.
    if sess.is_budget_exhausted():
        sess.status = 'budget_exhausted'
        return json.dumps({
            'budget_exhausted': True,
            'spark_consumed': sess.spark_consumed,
            'spark_budget': sess.spark_budget,
            'instruction': 'Budget exhausted. Call autoresearch_finalize to save report.',
        })

    sess.current_iteration += 1
    proposal = eng.generate_and_apply_edit(sess)
    if not proposal:
        return json.dumps({
            'success': False,
            'iteration': sess.current_iteration,
            'reason': 'No edit generated by LLM',
            'instruction': 'Try calling autoresearch_edit again for a new hypothesis.',
        })

    hyp, applied_edits, touched_files = proposal
    # Park the proposal on the session; run/decide consume it next.
    sess._pending_hypothesis = hyp
    sess._pending_edits = applied_edits
    sess._pending_files = touched_files

    return json.dumps({
        'success': True,
        'iteration': sess.current_iteration,
        'hypothesis': hyp,
        'files_changed': touched_files,
        'instruction': 'Edit applied. Call autoresearch_run to test this change.',
    })
def autoresearch_run(session_id: str) -> str:
    """Execute the experiment for the pending edit and score it.

    Runs the session's run_command, extracts the target metric from the
    output, charges the spark budget, and records the iteration result
    (in BenchmarkTracker via the engine) for the decide step.

    Args:
        session_id: Session ID returned by autoresearch_setup.

    Returns:
        JSON with metric_value, improved flag, and comparison to best.
    """
    eng = get_autoresearch_engine()
    sess = eng.get_session(session_id)
    if sess is None:
        return json.dumps({'error': f'Session {session_id} not found'})

    outcome = eng.run_experiment(sess, is_baseline=False)
    outcome.iteration = sess.current_iteration
    outcome.hypothesis = sess._pending_hypothesis
    outcome.edits = sess._pending_edits
    outcome.files_changed = sess._pending_files
    sess.spark_consumed += sess.spark_per_iteration

    # An iteration counts as an improvement only when it ran cleanly,
    # produced a metric, and beat the session's current best.
    improved = (
        not outcome.error
        and outcome.metric_value is not None
        and sess.is_improved(outcome.metric_value)
    )
    outcome.improved = improved
    outcome.baseline_value = sess.best_metric

    # Record for the decide step and surface progress on the event bus.
    sess.results.append(asdict(outcome))
    eng.emit_progress(sess, 'autoresearch.iteration', asdict(outcome))

    verdict = 'IMPROVED' if improved else 'No improvement'
    action = 'keep' if improved else 'revert'
    return json.dumps({
        'iteration': sess.current_iteration,
        'metric_value': outcome.metric_value,
        'best_metric': sess.best_metric,
        'improved': improved,
        'error': outcome.error or None,
        'duration_s': round(outcome.duration_s, 1),
        'instruction': f'{verdict}. Call autoresearch_decide to {action} this change.',
    })
def autoresearch_decide(session_id: str) -> str:
    """Commit or roll back the most recent edit based on its result.

    An improving run is handed to commit_improvement, which applies the
    RSI gates (constitutional + baseline delta) before git-committing
    and saving a recipe step; anything else is reverted via git
    checkout.

    Args:
        session_id: Session ID returned by autoresearch_setup.

    Returns:
        JSON with the decision, current best, and next-step advice.
    """
    eng = get_autoresearch_engine()
    sess = eng.get_session(session_id)
    if sess is None:
        return json.dumps({'error': f'Session {session_id} not found'})
    if not sess.results:
        return json.dumps({'error': 'No experiment results to decide on'})

    latest = sess.results[-1]
    run_improved = latest.get('improved', False)
    run_metric = latest.get('metric_value')
    run_error = latest.get('error', '')

    if run_error or not run_improved:
        # In-session metric failed to improve — roll the edit back.
        eng.revert_changes(sess)
        decision = 'reverted'
        logger.info(f"[{sess.session_id}] Iter {sess.current_iteration} "
                    f"reverted: {run_metric} vs best {sess.best_metric}")
    else:
        # The candidate beat the in-session metric. commit_improvement
        # still runs the RSI gates (constitutional + baseline delta) and
        # returns False if either rejects; on rejection it reverts the
        # pending edits itself and bumps the appropriate rejection
        # counter. So best_metric / best_iteration / total_improvements
        # only advance when the promotion actually landed — enforcing
        # the monotonic-vs-today's-baseline guarantee globally, not just
        # on the metric being optimized.
        prior_best = sess.best_metric
        candidate = ExperimentResult(
            iteration=sess.current_iteration,
            hypothesis=sess._pending_hypothesis,
            metric_name=sess.metric_name,
            metric_value=run_metric,
            baseline_value=prior_best,
            improved=True,
            edits=sess._pending_edits,
            files_changed=sess._pending_files,
        )
        if eng.commit_improvement(sess, candidate):
            sess.best_metric = run_metric
            sess.best_iteration = sess.current_iteration
            sess.total_improvements += 1
            decision = 'kept'
            logger.info(
                f"[{sess.session_id}] Iter {sess.current_iteration} "
                f"IMPROVED: {run_metric} (was {prior_best})")
        else:
            # Gate rejection: commit_improvement already reverted the
            # pending edits; best_metric stays at prior_best.
            decision = 'rejected_by_gate'
            logger.info(
                f"[{sess.session_id}] Iter {sess.current_iteration} "
                f"gated (reason={sess.last_rejection_reason}); "
                f"best remains {prior_best}")

    # Drop the pending-edit bookkeeping now that a decision was made.
    sess._pending_hypothesis = ''
    sess._pending_edits = []
    sess._pending_files = []

    # Convergence check: iterate again only while both caps hold.
    keep_going = (sess.current_iteration < sess.max_iterations
                  and not sess.is_budget_exhausted())

    return json.dumps({
        'decision': decision,
        'iteration': sess.current_iteration,
        'best_metric': sess.best_metric,
        'best_iteration': sess.best_iteration,
        'total_improvements': sess.total_improvements,
        'spark_consumed': sess.spark_consumed,
        'should_continue': keep_going,
        'instruction': (
            'Call autoresearch_edit for the next iteration.'
            if keep_going else
            'Done iterating. Call autoresearch_finalize to save the report.'
        ),
    })
def autoresearch_finalize(session_id: str) -> str:
    """Close out a session: persist the report and export deltas.

    Call once iteration is done (converged, budget exhausted, or the
    iteration cap was hit). Marks a still-running session as completed,
    saves the report, exports learning deltas for hive-wide federation,
    emits the final progress event, and drops the session from the
    active registry.

    Args:
        session_id: Session ID returned by autoresearch_setup.

    Returns:
        JSON with the final session summary.
    """
    eng = get_autoresearch_engine()
    sess = eng.get_session(session_id)
    if sess is None:
        return json.dumps({'error': f'Session {session_id} not found'})

    # Only promote the status when nothing else (failure, budget) set it.
    if sess.status == 'running':
        sess.status = 'completed'

    eng.save_report(sess)
    eng.emit_progress(sess, 'autoresearch.completed', sess.to_progress_dict())
    eng.unregister_session(session_id)

    summary = {
        'status': sess.status,
        'session_id': sess.session_id,
        'baseline_metric': sess.baseline_metric,
        'best_metric': sess.best_metric,
        'best_iteration': sess.best_iteration,
        'total_improvements': sess.total_improvements,
        'total_iterations': sess.current_iteration,
        'spark_consumed': sess.spark_consumed,
        'elapsed_s': round(time.time() - sess.start_time, 1),
    }
    return json.dumps(summary)
def get_autoresearch_status(session_id: str = '') -> str:
    """Get the status of an autoresearch session or all active sessions.

    Active sessions are served live from the engine; sessions that have
    already been finalized fall back to the saved report on disk.

    Args:
        session_id: Specific session ID, or empty for all active sessions

    Returns:
        JSON with session progress, or {'error': ...} when the session
        is unknown and no saved report exists.
    """
    engine = get_autoresearch_engine()

    if session_id:
        session = engine.get_session(session_id)
        if session:
            return json.dumps(session.to_progress_dict())
        # Check saved reports for already-finalized sessions.
        report_path = os.path.join(
            os.path.dirname(__file__), '..', '..', 'agent_data',
            'autoresearch', f'{session_id}.json')
        try:
            # Reports are JSON; read them back as UTF-8 explicitly rather
            # than relying on the platform default encoding. EAFP also
            # closes the isfile→open race of the previous version.
            with open(report_path, 'r', encoding='utf-8') as f:
                return f.read()
        except OSError:
            # Missing or unreadable report — treat both as unknown.
            pass
        return json.dumps({'error': f'Session {session_id} not found'})

    return json.dumps({'active_sessions': engine.get_active_sessions()})
1178# ── Backward-compatible alias ─────────────────────────────────
1179# launch_experiment_autoresearch in thought_experiment_tools.py calls this
def start_autoresearch(repo_path: str, target_file: str, run_command: str,
                       metric_name: str = 'score', metric_pattern: str = '',
                       metric_direction: str = 'higher_is_better',
                       max_iterations: int = 50, time_budget_s: int = 300,
                       experiment_id: str = '', goal_id: str = '',
                       hive_parallel: bool = False,
                       num_variants: int = 3) -> str:
    """Backward-compatible wrapper — delegates to autoresearch_setup.

    The hive_parallel and num_variants parameters are accepted only for
    signature compatibility with older callers and are ignored (hive
    dispatch is now handled by the agent via compute mesh tools).

    Returns:
        The JSON string from autoresearch_setup (session_id,
        baseline_metric, status — or an 'error' key on failure).
    """
    return autoresearch_setup(
        repo_path=repo_path, target_file=target_file,
        run_command=run_command, metric_name=metric_name,
        metric_pattern=metric_pattern, metric_direction=metric_direction,
        max_iterations=max_iterations, time_budget_s=time_budget_s,
        experiment_id=experiment_id, goal_id=goal_id,
    )
1202# Tool registration list (consumed by ServiceToolRegistry)
# Each entry maps a tool name to its callable plus the description the
# agent's LLM sees when selecting tools; tags drive registry filtering.
AUTOEVOLVE_CODE_TOOLS = [
    # Step 1 — create a session and capture the baseline metric.
    {
        'name': 'autoresearch_setup',
        'func': autoresearch_setup,
        'description': (
            'Set up a code research session and run baseline. Returns session_id. '
            'Call autoresearch_edit → autoresearch_run → autoresearch_decide in a loop.'
        ),
        'tags': ['autoresearch', 'coding'],
    },
    # Step 2 — propose and apply one code edit.
    {
        'name': 'autoresearch_edit',
        'func': autoresearch_edit,
        'description': 'Propose and apply one LLM-generated code edit.',
        'tags': ['autoresearch', 'coding'],
    },
    # Step 3 — run the experiment and extract the metric.
    {
        'name': 'autoresearch_run',
        'func': autoresearch_run,
        'description': 'Run the experiment after an edit and extract the metric.',
        'tags': ['autoresearch', 'coding'],
    },
    # Step 4 — commit the edit if it improved, otherwise revert it.
    {
        'name': 'autoresearch_decide',
        'func': autoresearch_decide,
        'description': 'Keep (git commit) or revert (git checkout) the last edit.',
        'tags': ['autoresearch', 'coding'],
    },
    # Step 5 — persist the report and export learning deltas.
    {
        'name': 'autoresearch_finalize',
        'func': autoresearch_finalize,
        'description': 'Save session report and export learning deltas to federation.',
        'tags': ['autoresearch', 'coding'],
    },
    # Polling helper — usable at any point in the loop.
    {
        'name': 'get_autoresearch_status',
        'func': get_autoresearch_status,
        'description': 'Get progress of an autoresearch session or list all active sessions.',
        'tags': ['autoresearch'],
    },
]

# Backward-compat alias
AUTORESEARCH_TOOLS = AUTOEVOLVE_CODE_TOOLS