Coverage for integrations / coding_agent / autoevolve_code_tools.py: 81.8%
379 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Auto-Evolve Code Tools — Agent-native code experiment tools.
4Individual tools for autonomous code experiments: setup → edit → run → score →
5keep/revert → finalize. The agent's conversation loop (autogen group chat)
6drives iteration — no hardcoded Python while loop.
8Inspired by karpathy/autoresearch. Each tool is a single step:
9 1. autoresearch_setup — create session, run baseline, return session_id
10 2. autoresearch_edit — LLM proposes + applies one code edit
11 3. autoresearch_run — run experiment, extract metric, record benchmark
12 4. autoresearch_decide — keep (git commit) or revert (git checkout)
13 5. autoresearch_finalize — save report, export learning delta
14 6. get_autoresearch_status — poll session progress
16Uses existing infra only:
17 - AiderNativeBackend for code edits
18 - run_cmd_subprocess for experiment execution
19 - BenchmarkTracker for score tracking
20 - CodingRecipeBridge for saving winning edits as recipes
21 - AgentBaselineService for evolution snapshots
22 - EventBus for live progress events
23"""
24import json
25import logging
26import os
27import threading
28import time
29import uuid
30from dataclasses import dataclass, field, asdict
31from typing import Dict, List, Optional, Tuple
33logger = logging.getLogger('hevolve.autoresearch')
36# ── Result Types ─────────────────────────────────────────────
@dataclass
class ExperimentResult:
    """Outcome of one experiment run (baseline or candidate edit).

    Captures the hypothesis tested, the extracted metric, the files and
    edits involved, and any crash output so the agent loop can decide
    whether to keep or revert the change.
    """
    iteration: int                      # 0 for the baseline run
    hypothesis: str                     # one-line rationale ('baseline' for iter 0)
    metric_name: str
    metric_value: Optional[float]       # None when run failed / metric unparsed
    baseline_value: Optional[float]
    improved: bool
    files_changed: List[str] = field(default_factory=list)
    edits: List[Dict] = field(default_factory=list)
    run_output: str = ''                # tail of stdout/stderr from the run
    error: str = ''                     # traceback / exit-code summary, '' on success
    duration_s: float = 0.0

    @property
    def delta(self) -> Optional[float]:
        """Signed change versus baseline, or None when either side is missing."""
        if self.metric_value is None or self.baseline_value is None:
            return None
        return self.metric_value - self.baseline_value
@dataclass
class AutoResearchSession:
    """Full state of one autoresearch session (config + progress + flags)."""
    session_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    experiment_id: str = ''             # ThoughtExperiment ID (if triggered by one)
    goal_id: str = ''                   # AgentGoal ID
    repo_path: str = ''                 # Working directory
    target_file: str = ''               # The file being modified (like train.py)
    run_command: str = ''               # Command to run the experiment
    metric_name: str = 'score'          # Name of the metric to optimize
    metric_pattern: str = ''            # Regex to extract metric from output
    metric_direction: str = 'higher_is_better'  # or 'lower_is_better'
    max_iterations: int = 50
    time_budget_s: int = 300            # Per-iteration time budget (5 min default)
    spark_budget: int = 200             # Total Spark budget
    spark_consumed: int = 0
    spark_per_iteration: int = 4        # Spark cost per iteration

    # State
    baseline_metric: Optional[float] = None
    best_metric: Optional[float] = None
    best_iteration: int = 0
    current_iteration: int = 0
    status: str = 'pending'  # pending | running | completed | failed | budget_exhausted
    results: List[Dict] = field(default_factory=list)
    start_time: float = 0.0
    total_improvements: int = 0

    # Regression-escape-hatch flags — set LOUDLY when an enforcement layer
    # is unavailable, so downstream consumers (dashboards, tests, audits)
    # can surface "this session didn't enforce baseline / benchmark gain".
    # These are cleared (False) when the dependency is missing or broken.
    baseline_enforced: bool = True          # AgentBaselineService captured snapshot
    benchmark_gain_enforced: bool = True    # BenchmarkTracker recorded iteration
    federation_export_enforced: bool = True  # Learning delta exported

    # RSI gates — every promoted improvement must pass both. If a gate
    # layer is unavailable, the flag flips to False and a WARNING is
    # emitted; if a gate layer is present and rejects, counters increment
    # and last_rejection_reason records why. Dashboards + tests read
    # these via to_progress_dict() to verify the recursive self-improvement
    # loop is actually closed.
    constitutional_enforced: bool = True    # ConstitutionalFilter ran
    baseline_delta_enforced: bool = True    # validate_against_baseline ran
    federation_broadcast_enforced: bool = True  # broadcast_delta ran
    constitutional_rejections: int = 0
    baseline_rejections: int = 0
    last_rejection_reason: str = ''

    # Last edit state (for decide step)
    _pending_edits: List[Dict] = field(default_factory=list)
    _pending_files: List[str] = field(default_factory=list)
    _pending_hypothesis: str = ''

    def is_budget_exhausted(self) -> bool:
        """True when one more iteration would overrun the Spark budget."""
        remaining = self.spark_budget - self.spark_consumed
        return remaining < self.spark_per_iteration

    def is_improved(self, new_val: float) -> bool:
        """True when new_val beats the current best (direction-aware)."""
        best = self.best_metric
        if best is None:
            # Nothing recorded yet — any measured value is an improvement.
            return True
        if self.metric_direction == 'lower_is_better':
            return new_val < best
        return new_val > best

    def to_progress_dict(self) -> Dict:
        """Flatten session progress + enforcement flags for dashboards/tests."""
        if self.start_time:
            elapsed = time.time() - self.start_time
        else:
            elapsed = 0
        return {
            'session_id': self.session_id,
            'status': self.status,
            'iteration': self.current_iteration,
            'max_iterations': self.max_iterations,
            'baseline_metric': self.baseline_metric,
            'best_metric': self.best_metric,
            'best_iteration': self.best_iteration,
            'total_improvements': self.total_improvements,
            'spark_consumed': self.spark_consumed,
            'spark_budget': self.spark_budget,
            'elapsed_s': elapsed,
            # Enforcement flags — if False, dependency was missing/failed and
            # the session ran without that regression-safety layer.
            'baseline_enforced': self.baseline_enforced,
            'benchmark_gain_enforced': self.benchmark_gain_enforced,
            'federation_export_enforced': self.federation_export_enforced,
            # RSI gates — recursive self-improvement loop closure.
            'constitutional_enforced': self.constitutional_enforced,
            'baseline_delta_enforced': self.baseline_delta_enforced,
            'federation_broadcast_enforced': self.federation_broadcast_enforced,
            'constitutional_rejections': self.constitutional_rejections,
            'baseline_rejections': self.baseline_rejections,
            'last_rejection_reason': self.last_rejection_reason,
        }
152# ── Engine (session store + utilities) ────────────────────────
154class AutoResearchEngine:
155 """Session store and utility methods for autoresearch tools.
157 NOT a loop — the agent's conversation drives iteration by calling
158 individual tool functions in sequence.
159 """
161 def __init__(self):
162 self._active_sessions: Dict[str, AutoResearchSession] = {}
163 self._lock = threading.Lock()
165 def register_session(self, session: AutoResearchSession):
166 with self._lock:
167 self._active_sessions[session.session_id] = session
169 def unregister_session(self, session_id: str):
170 with self._lock:
171 self._active_sessions.pop(session_id, None)
173 def get_active_sessions(self) -> List[Dict]:
174 """Return progress for all active sessions."""
175 with self._lock:
176 return [s.to_progress_dict() for s in self._active_sessions.values()]
178 def get_session(self, session_id: str) -> Optional[AutoResearchSession]:
179 with self._lock:
180 return self._active_sessions.get(session_id)
182 # ── Edit Generation ─────────────────────────────────────
    def generate_and_apply_edit(self, session: AutoResearchSession
                                ) -> Optional[Tuple[str, List[Dict], List[str]]]:
        """Use LLM to generate a hypothesis and code edit, then apply it.

        Builds a prompt from the session's metric config, recent history,
        an optional BenchmarkTracker hint, and an optional ε-greedy
        "explore" stance, then hands it to AiderNativeBackend, which
        applies SEARCH/REPLACE edits to the working directory in place.

        Returns:
            (hypothesis, edits, files_changed) on success — hypothesis is
            the first line of the backend's output truncated to 200 chars
            — or None when the backend reports failure or anything raises.
        """
        try:
            from integrations.coding_agent.aider_native_backend import AiderNativeBackend
            backend = AiderNativeBackend()

            history_summary = self.build_history_summary(session)

            # Query BenchmarkTracker for best-performing tool insights.
            # Best-effort only — a missing tracker just omits the hint.
            benchmark_hint = ''
            try:
                from integrations.coding_agent.benchmark_tracker import get_benchmark_tracker
                tracker = get_benchmark_tracker()
                best = tracker.get_best_tool('autoresearch')
                if best:
                    name, success_rate, avg_time = best
                    benchmark_hint = (
                        f"\nBENCHMARK INSIGHT: Best tool '{name}' has "
                        f"{success_rate:.0%} success rate, avg {avg_time:.1f}s.\n"
                    )
            except Exception:
                pass

            # RSI-5: ε-greedy exploration arm. When HEVOLVE_RSI_EXPLORE=1
            # and the sampled coin lands in the explore bucket, swap the
            # incremental-tuning prompt stance for a radical-mutation
            # stance. The LLM remains the code mutator (no parallel
            # code-generator path), but the instruction distribution
            # shifts — this is the cheapest honest wiring of the
            # stochastic arm without inventing a second mutation backend.
            # Safety: the candidate still passes RSI-1 + RSI-2 gates
            # inside commit_improvement before promotion.
            exploration_hint = ''
            try:
                from integrations.agent_engine.exploration_arm import (
                    select_strategy,
                )
                if select_strategy() == 'explore':
                    exploration_hint = (
                        "\nEXPLORATION MODE: propose a RADICAL / "
                        "ARCHITECTURAL change this iteration — not an "
                        "incremental tweak. Favor ideas that reshape "
                        "the approach; safety gates still run before "
                        "promotion, so a failed bold change costs "
                        "nothing while a successful one opens the "
                        "search space.\n"
                    )
            except Exception:
                pass

            task = (
                f"You are running an autonomous research loop.\n\n"
                f"TARGET FILE: {session.target_file}\n"
                f"METRIC: {session.metric_name} "
                f"({'lower is better' if session.metric_direction == 'lower_is_better' else 'higher is better'})\n"
                f"BASELINE: {session.baseline_metric}\n"
                f"CURRENT BEST: {session.best_metric} (iteration {session.best_iteration})\n"
                f"ITERATION: {session.current_iteration}/{session.max_iterations}\n\n"
                f"EXPERIMENT HISTORY:\n{history_summary}\n\n"
                f"{benchmark_hint}"
                f"{exploration_hint}"
                f"RUN COMMAND: {session.run_command}\n\n"
                f"YOUR TASK:\n"
                f"1. Analyze what worked and what didn't from the history above\n"
                f"2. Propose ONE focused modification to {session.target_file}\n"
                f"3. Explain your hypothesis in one sentence\n"
                f"4. Make the edit using SEARCH/REPLACE blocks\n\n"
                f"RULES:\n"
                f"- One change per iteration — small, testable, reversible\n"
                f"- If you're stuck, try combinations of previous improvements\n"
                f"- If all ideas seem tried, try something radical or architectural\n"
                f"- Simplicity wins — a 0.001 gain from deleting code beats a 0.001 gain from 20 lines\n"
                f"- NEVER modify the evaluation metric or test harness\n"
            )

            context = {
                'working_dir': session.repo_path,
                'files': [session.target_file],
            }

            result = backend.execute(task, context, timeout=120)

            if not result.get('success'):
                return None

            # First output line serves as the hypothesis summary.
            output = result.get('output', '')
            hypothesis = output.split('\n')[0][:200] if output else 'Unknown hypothesis'
            edits = result.get('edits', [])
            files_changed = result.get('files_changed', [])

            return hypothesis, edits, files_changed

        except Exception as e:
            logger.warning(f"[{session.session_id}] Edit generation failed: {e}")
            return None
281 # ── Experiment Execution ─────────────────────────────────
283 def run_experiment(self, session: AutoResearchSession,
284 is_baseline: bool = False) -> ExperimentResult:
285 """Run the experiment command and extract the metric."""
286 import re
288 result = ExperimentResult(
289 iteration=0 if is_baseline else session.current_iteration,
290 hypothesis='baseline' if is_baseline else '',
291 metric_name=session.metric_name,
292 metric_value=None,
293 baseline_value=session.baseline_metric,
294 improved=False,
295 )
297 start = time.time()
298 try:
299 from integrations.coding_agent.aider_core.run_cmd import run_cmd_subprocess
300 exit_code, output = run_cmd_subprocess(
301 session.run_command,
302 cwd=session.repo_path,
303 timeout=session.time_budget_s,
304 )
305 result.run_output = output[-5000:] if len(output) > 5000 else output
306 result.duration_s = time.time() - start
308 if exit_code != 0:
309 lines = output.split('\n')
310 tb_start = None
311 for i, line in enumerate(lines):
312 if 'Traceback' in line or 'Error' in line:
313 tb_start = i
314 break
315 if tb_start is not None:
316 result.error = '\n'.join(lines[tb_start:tb_start + 20])
317 else:
318 result.error = f'Exit code {exit_code}: {lines[-3:]}'
319 return result
321 metric_val = self.extract_metric(output, session)
322 result.metric_value = metric_val
324 if metric_val is not None and is_baseline:
325 result.improved = False
327 self.record_benchmark(session, result)
329 except Exception as e:
330 result.error = str(e)
331 result.duration_s = time.time() - start
333 return result
    def record_benchmark(self, session: AutoResearchSession,
                         result: ExperimentResult):
        """Record experiment result in BenchmarkTracker for evolution tracking.

        BenchmarkTracker is how we prove that iter N actually beat iter N-1 on
        the hive-shared leaderboard. If it's unavailable or raises, the
        session's `benchmark_gain_enforced` flag is flipped to False and a
        WARNING is emitted — callers (dashboards, tests) can then surface
        "this session ran WITHOUT benchmark gain enforcement" rather than
        silently treating 0 gain as a valid result.
        """
        try:
            from integrations.coding_agent.benchmark_tracker import get_benchmark_tracker
            tracker = get_benchmark_tracker()
            # "Success" here means: the run did not error AND a metric
            # value was actually parsed from its output.
            tracker.record(
                task_type='autoresearch',
                tool_name='aider_native_backend',
                completion_time_s=result.duration_s,
                success=not result.error and result.metric_value is not None,
                model_name=session.metric_name,
                user_id=session.goal_id or session.session_id,
            )
        except ImportError as e:
            # Tracker module missing entirely — loud fail-open.
            session.benchmark_gain_enforced = False
            logger.warning(
                "[%s] BenchmarkTracker unavailable (ImportError: %s) — "
                "iter %d ran WITHOUT benchmark-gain enforcement. "
                "Set session.benchmark_gain_enforced=False.",
                session.session_id, e, result.iteration,
            )
        except Exception as e:
            # Tracker present but record() failed — same loud fail-open.
            session.benchmark_gain_enforced = False
            logger.warning(
                "[%s] BenchmarkTracker record failed (%s: %s) — "
                "iter %d ran WITHOUT benchmark-gain enforcement.",
                session.session_id, type(e).__name__, e, result.iteration,
            )
373 def extract_metric(self, output: str, session: AutoResearchSession
374 ) -> Optional[float]:
375 """Extract the target metric from experiment output."""
376 import re
378 if session.metric_pattern:
379 match = re.search(session.metric_pattern, output)
380 if match:
381 try:
382 return float(match.group(1))
383 except (ValueError, IndexError):
384 pass
386 patterns = [
387 rf'{re.escape(session.metric_name)}[:\s=]+([0-9]+\.?[0-9]*)',
388 rf'^{re.escape(session.metric_name)}[:\s]+([0-9]+\.?[0-9]*)',
389 r'(\d+) passed',
390 r'(?:score|result|metric|accuracy|loss|bpb)[:\s=]+([0-9]+\.?[0-9]*)',
391 ]
393 for pat in patterns:
394 match = re.search(pat, output, re.IGNORECASE | re.MULTILINE)
395 if match:
396 try:
397 return float(match.group(1))
398 except (ValueError, IndexError):
399 continue
401 return None
403 # ── Git State Management ─────────────────────────────────
    def revert_changes(self, session: AutoResearchSession):
        """Revert the target file to its last committed state.

        Runs ``git checkout -- <target_file>`` in the session repo so a
        rejected or crashed candidate edit leaves no residue. Failures
        are logged at WARNING and otherwise swallowed — revert is
        best-effort by design.

        NOTE(review): target_file is interpolated into the shell command
        unquoted; a path containing spaces or shell metacharacters would
        break the command — assumed to come from trusted session setup.
        """
        try:
            from integrations.coding_agent.aider_core.run_cmd import run_cmd_subprocess
            run_cmd_subprocess(
                f'git checkout -- {session.target_file}',
                cwd=session.repo_path,
                timeout=10,
            )
        except Exception as e:
            logger.warning(f"[{session.session_id}] Revert failed: {e}")
417 # ── RSI gates ─────────────────────────────────────────────
418 # These two gates close the recursive self-improvement loop: no
419 # candidate becomes the new baseline unless ConstitutionalFilter
420 # allows it AND AgentBaselineService.validate_against_baseline
421 # reports no regression. Both gates fail-open when the dependency
422 # is missing, but flip the corresponding *_enforced flag to False
423 # and log LOUDLY so the dashboard/test can see the bypass.
    def _constitutional_gate(self, session: 'AutoResearchSession',
                             result: 'ExperimentResult') -> Tuple[bool, str]:
        """Check the hypothesis + edit summary against ConstitutionalFilter.

        Returns (allowed, reason).

        Failure modes:
          - Filter missing (ImportError) or check_prompt raising a
            generic error: fail-OPEN, but flip
            session.constitutional_enforced=False so the dashboard can
            surface the missing gate.
          - check_prompt raising RuntimeError (guardrail hash tamper):
            fail-CLOSED — the candidate is rejected outright.
        """
        try:
            from security.hive_guardrails import ConstitutionalFilter
        except ImportError as e:
            session.constitutional_enforced = False
            logger.warning(
                "[%s] ConstitutionalFilter unavailable (ImportError: %s) — "
                "iter %d promoted WITHOUT constitutional gate. "
                "Set session.constitutional_enforced=False.",
                session.session_id, e, result.iteration,
            )
            return True, 'gate_unavailable'

        # The gate sees a compact text summary only: hypothesis, metric
        # name, and the changed-file list — not the full diff.
        prompt_text = ' '.join(filter(None, [
            result.hypothesis or '',
            session.metric_name or '',
            ' '.join(result.files_changed or []),
        ]))
        try:
            allowed, reason = ConstitutionalFilter.check_prompt(prompt_text)
        except RuntimeError as e:
            # Guardrail tamper — fail-CLOSED. This is the one case where
            # the gate is authoritative: if the guardrail values were
            # mutated in memory we must NOT promote.
            session.constitutional_enforced = True
            logger.critical(
                "[%s] ConstitutionalFilter TAMPER on iter %d: %s — "
                "refusing to promote.",
                session.session_id, result.iteration, e,
            )
            return False, f'guardrail_tamper: {e}'
        except Exception as e:
            session.constitutional_enforced = False
            logger.warning(
                "[%s] ConstitutionalFilter.check_prompt failed "
                "(%s: %s) — iter %d promoted WITHOUT gate.",
                session.session_id, type(e).__name__, e, result.iteration,
            )
            return True, 'gate_errored'

        return allowed, reason
    def _baseline_delta_gate(self, session: 'AutoResearchSession'
                             ) -> Tuple[bool, List[str], str]:
        """Run AgentBaselineService.validate_against_baseline.

        Returns (passed, regressions, reason). Fail-open if the service is
        unavailable or errors (first-run / no-baseline case included), but
        flip session.baseline_delta_enforced=False on those paths so the
        bypass is visible downstream.
        """
        try:
            from integrations.agent_engine.agent_baseline_service import (
                AgentBaselineService,
            )
        except ImportError as e:
            session.baseline_delta_enforced = False
            logger.warning(
                "[%s] AgentBaselineService unavailable for delta gate "
                "(ImportError: %s) — promoted WITHOUT baseline compare.",
                session.session_id, e,
            )
            return True, [], 'gate_unavailable'

        try:
            prompt_id = session.experiment_id or session.session_id
            # NOTE(review): flow_id=0 looks like a "latest/any flow"
            # sentinel — confirm against AgentBaselineService's API.
            result = AgentBaselineService.validate_against_baseline(
                prompt_id=prompt_id, flow_id=0,
            )
        except Exception as e:
            session.baseline_delta_enforced = False
            logger.warning(
                "[%s] validate_against_baseline errored (%s: %s) — "
                "promoted WITHOUT baseline compare.",
                session.session_id, type(e).__name__, e,
            )
            return True, [], 'gate_errored'

        # Defensive parsing: missing keys default to "passed, no info".
        passed = bool(result.get('passed', True))
        regressions = list(result.get('regressions', []) or [])
        reason = result.get('reason', '') or (
            'no_regressions' if passed else 'regressions_detected'
        )
        return passed, regressions, reason
    def commit_improvement(self, session: AutoResearchSession,
                           result: ExperimentResult) -> bool:
        """Commit the improvement to git and save as recipe step.

        Returns True if the candidate was actually promoted (gates passed +
        git commit attempted), False if a gate rejected it (pending edits
        reverted, session rejection counters incremented).

        RSI gate chain (both must pass to promote):
          1. ConstitutionalFilter — hypothesis/edit summary free of
             violation patterns.
          2. AgentBaselineService.validate_against_baseline — no
             cross-metric regression vs the latest live snapshot.

        Fail-open on missing dependencies (flags flip loud), fail-closed
        on guardrail tamper. After promotion: git commit (best-effort),
        recipe capture (best-effort), baseline snapshot (loud on failure),
        then an 'autoresearch.promoted' event.
        """
        # ── RSI-1: constitutional gate ──
        allowed, cons_reason = self._constitutional_gate(session, result)
        if not allowed:
            session.constitutional_rejections += 1
            session.last_rejection_reason = f'constitutional: {cons_reason}'
            logger.warning(
                "[%s] Iter %d REJECTED by ConstitutionalFilter: %s — "
                "reverting pending edits.",
                session.session_id, result.iteration, cons_reason,
            )
            self.revert_changes(session)
            self.emit_progress(session, 'autoresearch.rejected', {
                'iteration': result.iteration,
                'gate': 'constitutional',
                'reason': cons_reason,
            })
            return False

        # ── RSI-2: baseline delta gate ──
        passed, regressions, base_reason = self._baseline_delta_gate(session)
        if not passed:
            session.baseline_rejections += 1
            session.last_rejection_reason = (
                f'baseline_regression: {"; ".join(regressions) or base_reason}'
            )
            logger.warning(
                "[%s] Iter %d REJECTED by baseline delta: %s — "
                "reverting pending edits.",
                session.session_id, result.iteration,
                regressions or base_reason,
            )
            self.revert_changes(session)
            self.emit_progress(session, 'autoresearch.rejected', {
                'iteration': result.iteration,
                'gate': 'baseline_delta',
                'regressions': regressions,
                'reason': base_reason,
            })
            return False

        # ── Gates passed — proceed with existing commit + recipe + snapshot ──
        # Commit failure is non-fatal: the edit stays applied either way.
        # NOTE(review): target_file and msg are interpolated into a shell
        # string unquoted beyond the "" around msg — a target path with
        # spaces would break the command; assumed trusted session config.
        try:
            from integrations.coding_agent.aider_core.run_cmd import run_cmd_subprocess
            msg = (f"autoresearch iter {result.iteration}: "
                   f"{session.metric_name}={result.metric_value} "
                   f"(was {result.baseline_value})")
            run_cmd_subprocess(
                f'git add {session.target_file} && git commit -m "{msg}"',
                cwd=session.repo_path,
                timeout=15,
            )
        except Exception as e:
            logger.debug(f"[{session.session_id}] Git commit skipped: {e}")

        # Best-effort: capture the winning edit as a reusable recipe step.
        try:
            from integrations.coding_agent.recipe_bridge import CodingRecipeBridge
            bridge = CodingRecipeBridge()
            bridge.capture_edit_as_recipe_step(
                task=f'autoresearch: {session.metric_name} optimization',
                tool_name='autoresearch',
                file_edits=result.edits,
                working_dir=session.repo_path,
            )
        except ImportError as e:
            logger.warning(
                "[%s] CodingRecipeBridge unavailable (ImportError: %s) — "
                "iter %d improvement NOT captured as recipe step.",
                session.session_id, e, result.iteration,
            )
        except Exception as e:
            logger.warning(
                "[%s] CodingRecipeBridge.capture failed (%s: %s) — "
                "iter %d improvement NOT captured as recipe step.",
                session.session_id, type(e).__name__, e, result.iteration,
            )

        # AgentBaselineService snapshot is the regression escape-hatch the
        # audit flagged: if we silently skip it, a later benchmark-based
        # rollback has no anchor to roll back TO. Make the absence LOUD
        # and set the session flag so dashboards/tests can read it.
        try:
            from integrations.agent_engine.agent_baseline_service import AgentBaselineService
            AgentBaselineService.capture_snapshot(
                prompt_id=session.experiment_id or session.session_id,
                flow_id='autoresearch',
                trigger='autoresearch_improvement',
                user_id=session.goal_id or 'system',
            )
        except ImportError as e:
            session.baseline_enforced = False
            logger.warning(
                "[%s] AgentBaselineService unavailable (ImportError: %s) — "
                "iter %d kept WITHOUT baseline snapshot. "
                "Set session.baseline_enforced=False — regression rollback "
                "will have no anchor for this iteration.",
                session.session_id, e, result.iteration,
            )
        except Exception as e:
            session.baseline_enforced = False
            logger.warning(
                "[%s] AgentBaselineService.capture_snapshot failed "
                "(%s: %s) — iter %d kept WITHOUT baseline snapshot.",
                session.session_id, type(e).__name__, e, result.iteration,
            )

        self.emit_progress(session, 'autoresearch.promoted', {
            'iteration': result.iteration,
            'metric_value': result.metric_value,
            'baseline_value': result.baseline_value,
        })
        return True
645 # ── History & Reporting ──────────────────────────────────
647 def build_history_summary(self, session: AutoResearchSession) -> str:
648 """Build a compact summary of previous iterations for the LLM."""
649 if not session.results:
650 return 'No previous iterations.'
652 lines = []
653 for r in session.results[-10:]:
654 status = 'IMPROVED' if r.get('improved') else 'reverted'
655 val = r.get('metric_value', '?')
656 hyp = r.get('hypothesis', '')[:80]
657 err = r.get('error', '')[:50]
658 if err:
659 lines.append(f" iter {r.get('iteration', '?')}: CRASHED — {err}")
660 else:
661 lines.append(f" iter {r.get('iteration', '?')}: {val} ({status}) — {hyp}")
663 return '\n'.join(lines)
    def save_report(self, session: AutoResearchSession):
        """Save the session report to agent_data for persistence.

        Writes agent_data/autoresearch/<session_id>.json (relative to the
        repository root, two levels above this module) containing the
        progress dict, the run configuration, and all per-iteration
        results. Always follows up with export_learning_delta, even if
        the local write failed — federation does not depend on the disk
        report.
        """
        try:
            report_dir = os.path.join(
                os.path.dirname(__file__), '..', '..', 'agent_data', 'autoresearch')
            os.makedirs(report_dir, exist_ok=True)

            report_path = os.path.join(report_dir, f'{session.session_id}.json')
            report = {
                'session': session.to_progress_dict(),
                'config': {
                    'repo_path': session.repo_path,
                    'target_file': session.target_file,
                    'run_command': session.run_command,
                    'metric_name': session.metric_name,
                    'metric_direction': session.metric_direction,
                    'max_iterations': session.max_iterations,
                    'time_budget_s': session.time_budget_s,
                },
                'results': session.results,
            }
            # default=str stringifies any non-JSON-native value instead of
            # aborting the whole dump.
            with open(report_path, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, default=str)
            logger.info(f"[{session.session_id}] Report saved: {report_path}")
        except Exception as e:
            logger.warning(f"[{session.session_id}] Report save failed: {e}")

        self.export_learning_delta(session)
    def export_learning_delta(self, session: AutoResearchSession):
        """Export session results as a federated learning delta AND
        broadcast them to peer Hive nodes.

        Two layers:
          1. BenchmarkTracker.export_learning_delta — prepare the
             delta payload. If unavailable, federation_export_enforced
             flips False, a WARNING is emitted, and we return early
             (nothing to broadcast).
          2. FederatedAggregator.broadcast_delta — actually transmit
             the delta to known peers. If unavailable OR if the peer
             POST leg errors, federation_broadcast_enforced flips False
             and a WARNING is emitted. ScopeGuard inside broadcast_delta
             is the authoritative egress gate (PII / secrets blocked).

        This closes RSI-3: promoted improvements actually propagate across
        the Hive so "the most" user-owned nodes benefit, not just the
        instance that ran the iteration.
        """
        delta = None
        try:
            from integrations.coding_agent.benchmark_tracker import get_benchmark_tracker
            tracker = get_benchmark_tracker()
            # Augment the tracker's payload with this session's summary.
            delta = tracker.export_learning_delta() or {}
            delta['autoresearch'] = {
                'session_id': session.session_id,
                'experiment_id': session.experiment_id,
                'metric_name': session.metric_name,
                'baseline': session.baseline_metric,
                'best': session.best_metric,
                'total_improvements': session.total_improvements,
                'iterations': session.current_iteration,
                'constitutional_rejections': session.constitutional_rejections,
                'baseline_rejections': session.baseline_rejections,
            }
            logger.info(
                "[%s] Learning delta prepared for federation "
                "(improvements=%d, rejections=c%d/b%d)",
                session.session_id, session.total_improvements,
                session.constitutional_rejections, session.baseline_rejections,
            )
        except ImportError as e:
            session.federation_export_enforced = False
            logger.warning(
                "[%s] BenchmarkTracker unavailable (ImportError: %s) — "
                "learning delta NOT exported; hive will not learn from "
                "this session. Set session.federation_export_enforced=False.",
                session.session_id, e,
            )
            return
        except Exception as e:
            session.federation_export_enforced = False
            logger.warning(
                "[%s] Learning delta export failed (%s: %s) — "
                "hive will not learn from this session.",
                session.session_id, type(e).__name__, e,
            )
            return

        # ── RSI-3: broadcast to peer Hive nodes ──
        # This is the "federate" leg. Without it, improvements stay
        # local and "the most" never benefits. ScopeGuard inside
        # broadcast_delta is the authoritative egress gate.
        try:
            from integrations.agent_engine.federated_aggregator import (
                get_federated_aggregator,
            )
            aggregator = get_federated_aggregator()
            aggregator.broadcast_delta(delta)
            logger.info(
                "[%s] Learning delta broadcast to hive peers via "
                "FederatedAggregator", session.session_id,
            )
        except ImportError as e:
            session.federation_broadcast_enforced = False
            logger.warning(
                "[%s] FederatedAggregator unavailable (ImportError: %s) — "
                "delta NOT broadcast to peers; hive will not learn from "
                "this session. Set session.federation_broadcast_enforced=False.",
                session.session_id, e,
            )
        except Exception as e:
            session.federation_broadcast_enforced = False
            logger.warning(
                "[%s] FederatedAggregator.broadcast_delta failed "
                "(%s: %s) — peers did not receive this session's delta.",
                session.session_id, type(e).__name__, e,
            )
782 def emit_progress(self, session: AutoResearchSession,
783 event_topic: str, data: Dict = None):
784 """Emit progress event via EventBus for live tracker updates."""
785 try:
786 from core.platform.events import emit_event
787 payload = data or {}
788 payload['session_id'] = session.session_id
789 payload['experiment_id'] = session.experiment_id
790 payload['goal_id'] = session.goal_id
791 emit_event(event_topic, payload)
792 except Exception:
793 pass
796# ── Singleton ────────────────────────────────────────────────
# Lazily-created module singleton, guarded by its own lock.
_engine: Optional[AutoResearchEngine] = None
_engine_lock = threading.Lock()


def get_autoresearch_engine() -> AutoResearchEngine:
    """Return the process-wide AutoResearchEngine, creating it on first use.

    Double-checked locking: the fast path returns without taking the
    lock once the singleton exists; the re-check inside the lock stops
    two threads racing past the first check from both constructing one.
    """
    global _engine
    if _engine is not None:
        return _engine
    with _engine_lock:
        if _engine is None:
            _engine = AutoResearchEngine()
    return _engine
812# ── Agent Tool Functions (step-based) ────────────────────────
813# The agent calls these in sequence. The agent's conversation loop
814# IS the iteration loop — no hardcoded Python while loop.
def autoresearch_setup(repo_path: str, target_file: str, run_command: str,
                       metric_name: str = 'score',
                       metric_pattern: str = '',
                       metric_direction: str = 'higher_is_better',
                       max_iterations: int = 50,
                       time_budget_s: int = 300,
                       experiment_id: str = '',
                       goal_id: str = '') -> str:
    """Create an autoresearch session and capture the baseline metric.

    Entry point of the step-based loop — call this FIRST. Validates the
    inputs, registers a new session with the engine, runs the unmodified
    code once to establish the baseline, and returns a session_id that
    the agent threads through the remaining tools.

    Agent loop pattern:
        1. autoresearch_setup(...) → get session_id + baseline
        2. autoresearch_edit(session_id) → propose code edit
        3. autoresearch_run(session_id) → run + score
        4. autoresearch_decide(session_id) → keep or revert
        5. Repeat 2-4 until converged or budget exhausted
        6. autoresearch_finalize(session_id) → save report

    Args:
        repo_path: Path to the git repository.
        target_file: File to modify, relative to repo_path.
        run_command: Shell command that executes the experiment.
        metric_name: Name of the metric being optimized.
        metric_pattern: Regex whose group(1) extracts the metric.
        metric_direction: 'higher_is_better' or 'lower_is_better'.
        max_iterations: Hard cap on iterations.
        time_budget_s: Per-iteration time budget in seconds.
        experiment_id: Originating ThoughtExperiment ID, if any.
        goal_id: Associated AgentGoal ID, if any.

    Returns:
        JSON string with session_id, baseline_metric, and status, or an
        'error' key when validation or the baseline run fails.
    """
    # Validate inputs up front so no session is registered for bad paths.
    if not os.path.isdir(repo_path):
        return json.dumps({'error': f'repo_path not found: {repo_path}'})
    if not os.path.isfile(os.path.join(repo_path, target_file)):
        return json.dumps({'error': f'target_file not found: {target_file}'})

    sess = AutoResearchSession(
        experiment_id=experiment_id,
        goal_id=goal_id,
        repo_path=repo_path,
        target_file=target_file,
        run_command=run_command,
        metric_name=metric_name,
        metric_pattern=metric_pattern,
        metric_direction=metric_direction,
        max_iterations=max_iterations,
        time_budget_s=time_budget_s,
    )
    sess.status = 'running'
    sess.start_time = time.time()

    eng = get_autoresearch_engine()
    eng.register_session(sess)

    # Baseline run: execute the untouched code once so later iterations
    # have a reference point to beat.
    base = eng.run_experiment(sess, is_baseline=True)
    sess.results.append(asdict(base))
    if base.error:
        sess.status = 'failed'
        eng.emit_progress(sess, 'autoresearch.failed',
                          {'error': f'Baseline failed: {base.error}'})
        return json.dumps({
            'error': f'Baseline failed: {base.error}',
            'session_id': sess.session_id,
        })

    sess.baseline_metric = base.metric_value
    sess.best_metric = base.metric_value
    eng.emit_progress(sess, 'autoresearch.started')
    eng.emit_progress(sess, 'autoresearch.baseline',
                      {'baseline': base.metric_value})

    return json.dumps({
        'session_id': sess.session_id,
        'status': 'running',
        'baseline_metric': base.metric_value,
        'metric_name': metric_name,
        'metric_direction': metric_direction,
        'max_iterations': max_iterations,
        'instruction': (
            'Baseline captured. Now call autoresearch_edit to propose a code '
            'change, then autoresearch_run to test it, then autoresearch_decide '
            'to keep or revert. Repeat until converged or budget exhausted.'
        ),
    })
def autoresearch_edit(session_id: str) -> str:
    """Generate and apply a single code edit for the session.

    Asks the LLM (via AiderNativeBackend) for a hypothesis plus the
    matching code modification, applies it, and stashes the pending
    state so autoresearch_run / autoresearch_decide can evaluate it.

    Args:
        session_id: Session ID returned by autoresearch_setup.

    Returns:
        JSON with hypothesis, files_changed, and budget status.
    """
    eng = get_autoresearch_engine()
    sess = eng.get_session(session_id)
    if sess is None:
        return json.dumps({'error': f'Session {session_id} not found'})

    # Budget gate: do not burn an iteration once spark is spent.
    if sess.is_budget_exhausted():
        sess.status = 'budget_exhausted'
        return json.dumps({
            'budget_exhausted': True,
            'spark_consumed': sess.spark_consumed,
            'spark_budget': sess.spark_budget,
            'instruction': 'Budget exhausted. Call autoresearch_finalize to save report.',
        })

    sess.current_iteration += 1
    proposal = eng.generate_and_apply_edit(sess)
    if not proposal:
        return json.dumps({
            'success': False,
            'iteration': sess.current_iteration,
            'reason': 'No edit generated by LLM',
            'instruction': 'Try calling autoresearch_edit again for a new hypothesis.',
        })

    hyp, applied_edits, touched_files = proposal
    # Park the proposal on the session; run/decide consume it next.
    sess._pending_hypothesis = hyp
    sess._pending_edits = applied_edits
    sess._pending_files = touched_files

    return json.dumps({
        'success': True,
        'iteration': sess.current_iteration,
        'hypothesis': hyp,
        'files_changed': touched_files,
        'instruction': 'Edit applied. Call autoresearch_run to test this change.',
    })
def autoresearch_run(session_id: str) -> str:
    """Execute the experiment for the pending edit and score it.

    Runs the session's run_command, extracts the target metric from the
    output, charges the spark budget, and records the iteration result
    (in BenchmarkTracker via the engine) for the decide step.

    Args:
        session_id: Session ID returned by autoresearch_setup.

    Returns:
        JSON with metric_value, improved flag, and comparison to best.
    """
    eng = get_autoresearch_engine()
    sess = eng.get_session(session_id)
    if sess is None:
        return json.dumps({'error': f'Session {session_id} not found'})

    outcome = eng.run_experiment(sess, is_baseline=False)
    outcome.iteration = sess.current_iteration
    outcome.hypothesis = sess._pending_hypothesis
    outcome.edits = sess._pending_edits
    outcome.files_changed = sess._pending_files
    sess.spark_consumed += sess.spark_per_iteration

    # An iteration counts as an improvement only when it ran cleanly,
    # produced a metric, and beat the session's current best.
    improved = (
        not outcome.error
        and outcome.metric_value is not None
        and sess.is_improved(outcome.metric_value)
    )
    outcome.improved = improved
    outcome.baseline_value = sess.best_metric

    # Record for the decide step and surface progress on the event bus.
    sess.results.append(asdict(outcome))
    eng.emit_progress(sess, 'autoresearch.iteration', asdict(outcome))

    verdict = 'IMPROVED' if improved else 'No improvement'
    action = 'keep' if improved else 'revert'
    return json.dumps({
        'iteration': sess.current_iteration,
        'metric_value': outcome.metric_value,
        'best_metric': sess.best_metric,
        'improved': improved,
        'error': outcome.error or None,
        'duration_s': round(outcome.duration_s, 1),
        'instruction': f'{verdict}. Call autoresearch_decide to {action} this change.',
    })
def autoresearch_decide(session_id: str) -> str:
    """Commit or roll back the most recent edit based on its result.

    An improving run is handed to commit_improvement, which applies the
    RSI gates (constitutional + baseline delta) before git-committing
    and saving a recipe step; anything else is reverted via git
    checkout.

    Args:
        session_id: Session ID returned by autoresearch_setup.

    Returns:
        JSON with the decision, current best, and next-step advice.
    """
    eng = get_autoresearch_engine()
    sess = eng.get_session(session_id)
    if sess is None:
        return json.dumps({'error': f'Session {session_id} not found'})
    if not sess.results:
        return json.dumps({'error': 'No experiment results to decide on'})

    latest = sess.results[-1]
    run_improved = latest.get('improved', False)
    run_metric = latest.get('metric_value')
    run_error = latest.get('error', '')

    if run_error or not run_improved:
        # In-session metric failed to improve — roll the edit back.
        eng.revert_changes(sess)
        decision = 'reverted'
        logger.info(f"[{sess.session_id}] Iter {sess.current_iteration} "
                    f"reverted: {run_metric} vs best {sess.best_metric}")
    else:
        # The candidate beat the in-session metric. commit_improvement
        # still runs the RSI gates (constitutional + baseline delta) and
        # returns False if either rejects; on rejection it reverts the
        # pending edits itself and bumps the appropriate rejection
        # counter. So best_metric / best_iteration / total_improvements
        # only advance when the promotion actually landed — enforcing
        # the monotonic-vs-today's-baseline guarantee globally, not just
        # on the metric being optimized.
        prior_best = sess.best_metric
        candidate = ExperimentResult(
            iteration=sess.current_iteration,
            hypothesis=sess._pending_hypothesis,
            metric_name=sess.metric_name,
            metric_value=run_metric,
            baseline_value=prior_best,
            improved=True,
            edits=sess._pending_edits,
            files_changed=sess._pending_files,
        )
        if eng.commit_improvement(sess, candidate):
            sess.best_metric = run_metric
            sess.best_iteration = sess.current_iteration
            sess.total_improvements += 1
            decision = 'kept'
            logger.info(
                f"[{sess.session_id}] Iter {sess.current_iteration} "
                f"IMPROVED: {run_metric} (was {prior_best})")
        else:
            # Gate rejection: commit_improvement already reverted the
            # pending edits; best_metric stays at prior_best.
            decision = 'rejected_by_gate'
            logger.info(
                f"[{sess.session_id}] Iter {sess.current_iteration} "
                f"gated (reason={sess.last_rejection_reason}); "
                f"best remains {prior_best}")

    # Drop the pending-edit bookkeeping now that a decision was made.
    sess._pending_hypothesis = ''
    sess._pending_edits = []
    sess._pending_files = []

    # Convergence check: iterate again only while both caps hold.
    keep_going = (sess.current_iteration < sess.max_iterations
                  and not sess.is_budget_exhausted())

    return json.dumps({
        'decision': decision,
        'iteration': sess.current_iteration,
        'best_metric': sess.best_metric,
        'best_iteration': sess.best_iteration,
        'total_improvements': sess.total_improvements,
        'spark_consumed': sess.spark_consumed,
        'should_continue': keep_going,
        'instruction': (
            'Call autoresearch_edit for the next iteration.'
            if keep_going else
            'Done iterating. Call autoresearch_finalize to save the report.'
        ),
    })
def autoresearch_finalize(session_id: str) -> str:
    """Close out a session: persist the report and export deltas.

    Call once iteration is done (converged, budget exhausted, or the
    iteration cap was hit). Marks a still-running session as completed,
    saves the report, exports learning deltas for hive-wide federation,
    emits the final progress event, and drops the session from the
    active registry.

    Args:
        session_id: Session ID returned by autoresearch_setup.

    Returns:
        JSON with the final session summary.
    """
    eng = get_autoresearch_engine()
    sess = eng.get_session(session_id)
    if sess is None:
        return json.dumps({'error': f'Session {session_id} not found'})

    # Only promote the status when nothing else (failure, budget) set it.
    if sess.status == 'running':
        sess.status = 'completed'

    eng.save_report(sess)
    eng.emit_progress(sess, 'autoresearch.completed', sess.to_progress_dict())
    eng.unregister_session(session_id)

    summary = {
        'status': sess.status,
        'session_id': sess.session_id,
        'baseline_metric': sess.baseline_metric,
        'best_metric': sess.best_metric,
        'best_iteration': sess.best_iteration,
        'total_improvements': sess.total_improvements,
        'total_iterations': sess.current_iteration,
        'spark_consumed': sess.spark_consumed,
        'elapsed_s': round(time.time() - sess.start_time, 1),
    }
    return json.dumps(summary)
def get_autoresearch_status(session_id: str = '') -> str:
    """Get the status of an autoresearch session or all active sessions.

    Active sessions are served live from the engine; sessions that have
    already been finalized fall back to the saved report on disk.

    Args:
        session_id: Specific session ID, or empty for all active sessions

    Returns:
        JSON with session progress, or {'error': ...} when the session
        is unknown and no saved report exists.
    """
    engine = get_autoresearch_engine()

    if session_id:
        session = engine.get_session(session_id)
        if session:
            return json.dumps(session.to_progress_dict())
        # Check saved reports for already-finalized sessions.
        report_path = os.path.join(
            os.path.dirname(__file__), '..', '..', 'agent_data',
            'autoresearch', f'{session_id}.json')
        try:
            # Reports are JSON; read them back as UTF-8 explicitly rather
            # than relying on the platform default encoding. EAFP also
            # closes the isfile→open race of the previous version.
            with open(report_path, 'r', encoding='utf-8') as f:
                return f.read()
        except OSError:
            # Missing or unreadable report — treat both as unknown.
            pass
        return json.dumps({'error': f'Session {session_id} not found'})

    return json.dumps({'active_sessions': engine.get_active_sessions()})
1178# ── Backward-compatible alias ─────────────────────────────────
1179# launch_experiment_autoresearch in thought_experiment_tools.py calls this
def start_autoresearch(repo_path: str, target_file: str, run_command: str,
                       metric_name: str = 'score', metric_pattern: str = '',
                       metric_direction: str = 'higher_is_better',
                       max_iterations: int = 50, time_budget_s: int = 300,
                       experiment_id: str = '', goal_id: str = '',
                       hive_parallel: bool = False,
                       num_variants: int = 3) -> str:
    """Backward-compatible wrapper — delegates to autoresearch_setup.

    The hive_parallel and num_variants parameters are accepted only for
    signature compatibility with older callers and are ignored (hive
    dispatch is now handled by the agent via compute mesh tools).

    Returns:
        The JSON string from autoresearch_setup (session_id,
        baseline_metric, status — or an 'error' key on failure).
    """
    return autoresearch_setup(
        repo_path=repo_path, target_file=target_file,
        run_command=run_command, metric_name=metric_name,
        metric_pattern=metric_pattern, metric_direction=metric_direction,
        max_iterations=max_iterations, time_budget_s=time_budget_s,
        experiment_id=experiment_id, goal_id=goal_id,
    )
1202# Tool registration list (consumed by ServiceToolRegistry)
# Each entry maps a tool name to its callable plus the description the
# agent's LLM sees when selecting tools; tags drive registry filtering.
AUTOEVOLVE_CODE_TOOLS = [
    # Step 1 — create a session and capture the baseline metric.
    {
        'name': 'autoresearch_setup',
        'func': autoresearch_setup,
        'description': (
            'Set up a code research session and run baseline. Returns session_id. '
            'Call autoresearch_edit → autoresearch_run → autoresearch_decide in a loop.'
        ),
        'tags': ['autoresearch', 'coding'],
    },
    # Step 2 — propose and apply one code edit.
    {
        'name': 'autoresearch_edit',
        'func': autoresearch_edit,
        'description': 'Propose and apply one LLM-generated code edit.',
        'tags': ['autoresearch', 'coding'],
    },
    # Step 3 — run the experiment and extract the metric.
    {
        'name': 'autoresearch_run',
        'func': autoresearch_run,
        'description': 'Run the experiment after an edit and extract the metric.',
        'tags': ['autoresearch', 'coding'],
    },
    # Step 4 — commit the edit if it improved, otherwise revert it.
    {
        'name': 'autoresearch_decide',
        'func': autoresearch_decide,
        'description': 'Keep (git commit) or revert (git checkout) the last edit.',
        'tags': ['autoresearch', 'coding'],
    },
    # Step 5 — persist the report and export learning deltas.
    {
        'name': 'autoresearch_finalize',
        'func': autoresearch_finalize,
        'description': 'Save session report and export learning deltas to federation.',
        'tags': ['autoresearch', 'coding'],
    },
    # Polling helper — usable at any point in the loop.
    {
        'name': 'get_autoresearch_status',
        'func': get_autoresearch_status,
        'description': 'Get progress of an autoresearch session or list all active sessions.',
        'tags': ['autoresearch'],
    },
]

# Backward-compat alias
AUTORESEARCH_TOOLS = AUTOEVOLVE_CODE_TOOLS