Coverage for integrations / agent_engine / thought_experiment_tools.py: 68.1%

138 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Thought Experiment Agent Tools — AutoGen tools for thought experiment coordination. 

3 

48 tools for creating, voting, evaluating, and managing thought experiments. 

5Includes autoresearch integration: software thought experiments can spawn 

6autonomous edit→run→score→iterate loops at hive scale. 

7Tier 2 tools (agent_engine context). Same pattern as learning_tools.py. 

8""" 

9import json 

10import logging 

11import threading 

12 

# Per-experiment lock registry used by score_hypothesis_result to serialize
# read-modify-write access to the iteration-history JSON files.
try:
    # Preferred backend: TTL-bounded cache keeps the lock table from growing
    # without limit across many experiments.
    from core.session_cache import TTLCache
    _file_locks = TTLCache(ttl_seconds=86400, max_size=50000, name='thought_exp_locks')
except ImportError:
    # core.session_cache unavailable (e.g. minimal deploys): fall back to a
    # plain dict — same mapping interface, just unbounded.
    _file_locks = {}
# Guards insertion of new per-experiment locks into _file_locks.
_file_locks_guard = threading.Lock()

logger = logging.getLogger('hevolve_social')

21 

22 

def create_thought_experiment(creator_id: str, title: str,
                              hypothesis: str,
                              expected_outcome: str = '',
                              intent_category: str = 'technology',
                              is_core_ip: bool = False) -> str:
    """Create a new constitutional thought experiment.

    Returns a JSON string: the created experiment on success, a failure
    reason when the service rejects the input, or ``{'error': ...}`` if
    anything raises.
    """
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session() as db:
            created = ThoughtExperimentService.create_experiment(
                db, creator_id, title, hypothesis,
                expected_outcome=expected_outcome,
                intent_category=intent_category,
                is_core_ip=is_core_ip)
        if not created:
            return json.dumps({
                'success': False,
                'reason': 'Blocked by ConstitutionalFilter or invalid input',
            })
        return json.dumps({'success': True, 'experiment': created})
    except Exception as exc:
        return json.dumps({'error': str(exc)})

48 

49 

def cast_experiment_vote(experiment_id: str, voter_id: str,
                         vote_value: int = 0,
                         reasoning: str = '',
                         suggestion: str = '',
                         voter_type: str = 'agent',
                         confidence: float = 0.8) -> str:
    """Cast a vote on a thought experiment (as agent or human).

    Returns a JSON string: the recorded vote on success, a failure reason
    when the experiment is missing or not open for voting, or an error.
    """
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session() as db:
            vote = ThoughtExperimentService.cast_vote(
                db, experiment_id, voter_id,
                vote_value=int(vote_value),
                reasoning=reasoning,
                suggestion=suggestion,
                voter_type=voter_type,
                confidence=float(confidence))
        if vote:
            response = {'success': True, 'vote': vote}
        else:
            response = {
                'success': False,
                'reason': 'Experiment not found or not in voting phase',
            }
        return json.dumps(response)
    except Exception as exc:
        return json.dumps({'error': str(exc)})

78 

79 

def evaluate_thought_experiment(experiment_id: str, agent_id: str,
                                score: float = 0.0,
                                confidence: float = 0.8,
                                reasoning: str = '',
                                evidence: str = '') -> str:
    """Record an agent evaluation for a thought experiment.

    Returns a JSON string with the updated experiment, a 'not_found'
    failure, or ``{'error': ...}`` on exception.
    """
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session() as db:
            updated = ThoughtExperimentService.record_agent_evaluation(
                db, experiment_id, agent_id,
                score=float(score),
                confidence=float(confidence),
                reasoning=reasoning,
                evidence=evidence)
        if not updated:
            return json.dumps({'success': False, 'reason': 'not_found'})
        return json.dumps({'success': True, 'experiment': updated})
    except Exception as exc:
        return json.dumps({'error': str(exc)})

103 

104 

def get_experiment_status(experiment_id: str = '',
                          status_filter: str = '') -> str:
    """Get experiment detail or list experiments by status.

    With an experiment_id, returns that experiment's detail; otherwise
    lists active experiments, optionally filtered by status.
    """
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        # Read-only lookups: no commit needed.
        with db_session(commit=False) as db:
            if not experiment_id:
                listing = ThoughtExperimentService.get_active_experiments(
                    db, status=status_filter or None)
                return json.dumps({
                    'success': True,
                    'experiments': listing,
                    'count': len(listing),
                })
            detail = ThoughtExperimentService.get_experiment_detail(
                db, experiment_id)
            return json.dumps({'success': True, 'experiment': detail})
    except Exception as exc:
        return json.dumps({'error': str(exc)})

127 

128 

def tally_experiment_votes(experiment_id: str) -> str:
    """Return the current vote tally for an experiment as a JSON string."""
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        # Read-only: open the session without committing.
        with db_session(commit=False) as db:
            counts = ThoughtExperimentService.tally_votes(db, experiment_id)
        return json.dumps({'success': True, 'tally': counts})
    except Exception as exc:
        return json.dumps({'error': str(exc)})

140 

141 

def advance_experiment(experiment_id: str,
                       target_status: str = '') -> str:
    """Advance experiment to next lifecycle phase or a specific status.

    Returns a JSON string with the advanced experiment, a failure reason
    when the transition is invalid, or ``{'error': ...}`` on exception.
    """
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session() as db:
            advanced = ThoughtExperimentService.advance_status(
                db, experiment_id,
                target_status=target_status or None)
        if not advanced:
            return json.dumps({
                'success': False,
                'reason': 'Cannot advance (invalid status or not found)',
            })
        return json.dumps({'success': True, 'experiment': advanced})
    except Exception as exc:
        return json.dumps({'error': str(exc)})

162 

163 

def iterate_hypothesis(experiment_id: str, hypothesis: str,
                       approach: str = '', evidence: str = '',
                       iteration: int = 0) -> str:
    """Propose and evaluate a hypothesis iteration for ANY thought experiment.

    Generic iteration tool — works for traditional, research, physical_ai,
    or any other experiment type. Meant to be called in a loop: propose
    hypothesis → gather evidence → score → refine → repeat. Software
    experiments should use launch_experiment_autoresearch instead.

    Honors the owner pause flag: when the experiment creator has paused
    evolution, this returns a pause signal and the agent must stop.

    Args:
        experiment_id: The ThoughtExperiment ID
        hypothesis: The refined hypothesis for this iteration
        approach: How you plan to test/evaluate this hypothesis
        evidence: Evidence or reasoning supporting this iteration
        iteration: Current iteration number (for tracking)

    Returns:
        JSON with iteration record and experiment context
    """
    # Owner pause check — auto_evolve is an optional dependency; when it is
    # absent we simply skip the check.
    try:
        from integrations.agent_engine.auto_evolve import is_experiment_paused
        if is_experiment_paused(experiment_id):
            return json.dumps({
                'success': False,
                'paused': True,
                'reason': 'Experiment paused by owner. Stop iterating.',
                'instruction': 'The experiment owner has paused evolution. '
                               'Do NOT continue iterating. Wait for resume.',
            })
    except ImportError:
        pass

    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        # Read-only fetch of the experiment context.
        with db_session(commit=False) as db:
            detail = ThoughtExperimentService.get_experiment_detail(
                db, experiment_id)
        if not detail:
            return json.dumps({'error': 'Experiment not found'})

        # Record of the proposed iteration, echoed back for the agent.
        proposal = {
            'iteration': iteration,
            'hypothesis': hypothesis,
            'approach': approach,
            'evidence': evidence,
            'status': 'proposed',
        }
        experiment_context = {
            'id': experiment_id,
            'title': detail.get('title', ''),
            'original_hypothesis': detail.get('hypothesis', ''),
            'expected_outcome': detail.get('expected_outcome', ''),
            'intent_category': detail.get('intent_category', ''),
            'status': detail.get('status', ''),
        }
        return json.dumps({
            'success': True,
            'iteration': proposal,
            'experiment': experiment_context,
            'instruction': (
                'Now evaluate this hypothesis. Use score_hypothesis_result '
                'to record your score (-2 to +2). Consider: evidence quality, '
                'clarity, feasibility, and expected impact.'
            ),
        })
    except Exception as exc:
        return json.dumps({'error': str(exc)})

241 

242 

def score_hypothesis_result(experiment_id: str, iteration: int,
                            score: float, reasoning: str,
                            evidence_quality: float = 0.0,
                            clarity: float = 0.0,
                            feasibility: float = 0.0,
                            impact: float = 0.0) -> str:
    """Score a hypothesis iteration using a structured rubric.

    Generic scoring tool for all experiment types. The agent uses this
    after evaluating a hypothesis to decide whether to keep iterating
    or converge on a conclusion. The score is appended to a per-experiment
    JSON history file, and a trend analysis over the full series drives
    the continuation advice.

    Args:
        experiment_id: The ThoughtExperiment ID
        iteration: Iteration number being scored
        score: Overall score (-2 to +2, clamped)
        reasoning: Why this score
        evidence_quality: Sub-score for evidence (0-1, clamped)
        clarity: Sub-score for hypothesis clarity (0-1, clamped)
        feasibility: Sub-score for feasibility (0-1, clamped)
        impact: Sub-score for expected impact (0-1, clamped)

    Returns:
        JSON with score record, trend analysis, and continuation advice
    """
    import os
    import tempfile

    def _unit(value: float) -> float:
        # Clamp a rubric sub-score into [0, 1].
        return max(0.0, min(1.0, float(value)))

    score = max(-2.0, min(2.0, float(score)))

    # History lives under agent_data/experiment_iterations, one file per
    # experiment, keyed by experiment_id.
    data_dir = os.path.join(
        os.path.dirname(__file__), '..', '..', 'agent_data', 'experiment_iterations')
    os.makedirs(data_dir, exist_ok=True)
    history_path = os.path.join(data_dir, f'{experiment_id}.json')

    # Per-experiment lock prevents read-modify-write race
    with _file_locks_guard:
        if experiment_id not in _file_locks:
            _file_locks[experiment_id] = threading.Lock()
        lock = _file_locks[experiment_id]

    with lock:
        history = []
        if os.path.isfile(history_path):
            try:
                with open(history_path, 'r', encoding='utf-8') as f:
                    history = json.load(f)
            except Exception:
                history = []
        # Fix: a corrupted file can parse to a non-list value; start fresh
        # instead of crashing on .append below.
        if not isinstance(history, list):
            history = []

        record = {
            'iteration': iteration,
            'score': score,
            'reasoning': reasoning,
            'rubric': {
                'evidence_quality': _unit(evidence_quality),
                'clarity': _unit(clarity),
                'feasibility': _unit(feasibility),
                'impact': _unit(impact),
            },
        }
        history.append(record)

        # Atomic write: temp file + rename prevents partial writes
        try:
            fd, tmp_path = tempfile.mkstemp(dir=data_dir, suffix='.tmp')
            try:
                with os.fdopen(fd, 'w', encoding='utf-8') as f:
                    json.dump(history, f, indent=2, default=str)
                os.replace(tmp_path, history_path)
            except Exception:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass
                raise
        except Exception:
            # Persistence is best-effort, but surface the failure in the
            # logs instead of swallowing it silently (fix: was a bare pass).
            logger.warning('Failed to persist iteration history for %s',
                           experiment_id, exc_info=True)

    # Trend analysis over the full (persisted + current) score series.
    scores = [h['score'] for h in history]
    best_score = max(scores)
    # Fix: report the recorded iteration *number* of the best entry, not its
    # list position — they differ when callers pass non-sequential iterations.
    best_pos = scores.index(best_score)
    best_iter = history[best_pos].get('iteration', best_pos)
    improving = len(scores) >= 2 and scores[-1] > scores[-2]
    stagnant = len(scores) >= 3 and len(set(scores[-3:])) == 1

    # Convergence advice
    if stagnant:
        advice = 'CONVERGE — 3 consecutive same scores. Record final evaluation.'
    elif len(scores) >= 10:
        advice = 'BUDGET — 10 iterations reached. Record final evaluation with best hypothesis.'
    elif improving:
        advice = 'CONTINUE — score is improving. Iterate with refined hypothesis.'
    elif score >= 1.5:
        advice = 'STRONG — high score. Consider recording final evaluation.'
    else:
        advice = 'CONTINUE — try a different approach or angle.'

    return json.dumps({
        'success': True,
        'record': record,
        'trend': {
            'total_iterations': len(scores),
            'best_score': best_score,
            'best_iteration': best_iter,
            'improving': improving,
            'stagnant': stagnant,
        },
        'advice': advice,
    })

354 

355 

def get_iteration_history(experiment_id: str, last_n: int = 10) -> str:
    """Get the iteration history for a thought experiment.

    Returns past hypothesis iterations with scores and summary statistics.
    The agent uses this to inform its next hypothesis refinement.

    Args:
        experiment_id: The ThoughtExperiment ID
        last_n: Number of recent iterations to return (default 10)

    Returns:
        JSON with iteration history and summary statistics
    """
    import os

    history_path = os.path.join(
        os.path.dirname(__file__), '..', '..', 'agent_data',
        'experiment_iterations', f'{experiment_id}.json')

    # No history file yet: report an empty history with guidance.
    if not os.path.isfile(history_path):
        return json.dumps({
            'success': True,
            'history': [],
            'summary': 'No iterations yet. Use iterate_hypothesis to start.',
        })

    try:
        with open(history_path, 'r', encoding='utf-8') as fh:
            history = json.load(fh)
    except Exception:
        history = []

    window = min(int(last_n), len(history))
    recent = history[-window:] if window > 0 else history

    scores = [entry['score'] for entry in history]
    if scores:
        summary = {
            'total_iterations': len(history),
            'best_score': max(scores),
            'worst_score': min(scores),
            'avg_score': round(sum(scores) / len(scores), 2),
            'improving_trend': len(scores) >= 2 and scores[-1] > scores[-2],
        }
    else:
        summary = {
            'total_iterations': len(history),
            'best_score': None,
            'worst_score': None,
            'avg_score': None,
            'improving_trend': False,
        }

    return json.dumps({
        'success': True,
        'history': recent,
        'summary': summary,
    })

407 

408 

def launch_experiment_autoresearch(experiment_id: str,
                                   repo_path: str,
                                   target_file: str,
                                   run_command: str,
                                   metric_name: str = 'score',
                                   metric_pattern: str = '',
                                   metric_direction: str = 'higher_is_better',
                                   max_iterations: int = 50,
                                   time_budget_s: int = 300,
                                   hive_parallel: bool = False) -> str:
    """Launch an autoresearch loop for a software thought experiment.

    When a thought experiment has experiment_type='software' and reaches the
    evaluating phase, this tool starts the autonomous edit→run→score→iterate
    loop. The engine modifies target_file, runs run_command, extracts the
    metric, keeps improvements, and iterates until budget or max_iterations.

    At hive scale: when hive_parallel=True, multiple hypothesis variants run
    simultaneously across compute mesh peers (tournament selection picks best).

    Args:
        experiment_id: The ThoughtExperiment ID to attach results to
        repo_path: Path to the git repository
        target_file: The file to modify (relative to repo_path)
        run_command: Shell command to run the experiment
        metric_name: Name of the metric to optimize
        metric_pattern: Regex with group(1) to extract metric from output
        metric_direction: 'higher_is_better' or 'lower_is_better'
        max_iterations: Max iterations before stopping
        time_budget_s: Per-iteration time budget in seconds
        hive_parallel: If True, run parallel variants across hive peers

    Returns:
        JSON with session_id and status
    """
    try:
        from integrations.coding_agent.autoevolve_code_tools import start_autoresearch

        # Delegate to the coding-agent engine with all knobs forwarded.
        params = dict(
            repo_path=repo_path,
            target_file=target_file,
            run_command=run_command,
            metric_name=metric_name,
            metric_pattern=metric_pattern,
            metric_direction=metric_direction,
            max_iterations=max_iterations,
            time_budget_s=time_budget_s,
            experiment_id=experiment_id,
            hive_parallel=hive_parallel,
        )
        return start_autoresearch(**params)
    except Exception as exc:
        return json.dumps({'error': str(exc)})

460 

461 

def get_experiment_research_status(session_id: str = '') -> str:
    """Get autoresearch loop progress for a thought experiment.

    Args:
        session_id: The autoresearch session ID (returned by launch_experiment_autoresearch)

    Returns:
        JSON with iteration count, best metric, improvements, budget consumed
    """
    try:
        from integrations.coding_agent.autoevolve_code_tools import get_autoresearch_status
    except Exception as exc:
        return json.dumps({'error': str(exc)})
    try:
        return get_autoresearch_status(session_id)
    except Exception as exc:
        return json.dumps({'error': str(exc)})

476 

477 

478# ─── Tool Registration ─── 

479 

# Registry specs: (name, callable, description, tags). Kept as flat tuples
# and expanded into the dict shape the agent-engine loader expects.
_TOOL_SPECS = [
    ('create_thought_experiment', create_thought_experiment,
     'Create a new constitutional thought experiment',
     ['thought_experiment']),
    ('cast_experiment_vote', cast_experiment_vote,
     'Cast a vote on a thought experiment',
     ['thought_experiment']),
    ('evaluate_thought_experiment', evaluate_thought_experiment,
     'Record an agent evaluation for a thought experiment',
     ['thought_experiment']),
    ('get_experiment_status', get_experiment_status,
     'Get experiment detail or list experiments by status',
     ['thought_experiment']),
    ('tally_experiment_votes', tally_experiment_votes,
     'Get the current vote tally for an experiment',
     ['thought_experiment']),
    ('advance_experiment', advance_experiment,
     'Advance experiment to next lifecycle phase',
     ['thought_experiment']),
    ('iterate_hypothesis', iterate_hypothesis,
     'Propose and track a hypothesis iteration for any thought experiment. '
     'Use in a loop: propose → evidence → score → refine → repeat.',
     ['thought_experiment', 'iteration']),
    ('score_hypothesis_result', score_hypothesis_result,
     'Score a hypothesis iteration with structured rubric (evidence, '
     'clarity, feasibility, impact). Returns trend analysis and '
     'continuation advice.',
     ['thought_experiment', 'iteration']),
    ('get_iteration_history', get_iteration_history,
     'Get past hypothesis iterations with scores and trends. '
     'Use to inform the next hypothesis refinement.',
     ['thought_experiment', 'iteration']),
    ('launch_experiment_autoresearch', launch_experiment_autoresearch,
     'Launch an autoresearch loop for a SOFTWARE thought experiment: '
     'edit code, run experiments, score, keep best, iterate at hive scale. '
     'For non-code experiments, use iterate_hypothesis instead.',
     ['thought_experiment', 'autoresearch']),
    ('get_experiment_research_status', get_experiment_research_status,
     'Get autoresearch loop progress for a thought experiment',
     ['thought_experiment', 'autoresearch']),
]

THOUGHT_EXPERIMENT_TOOLS = [
    {'name': name, 'func': func, 'description': description, 'tags': tags}
    for name, func, description, tags in _TOOL_SPECS
]