Coverage for integrations / agent_engine / thought_experiment_tools.py: 68.1%

138 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Thought Experiment Agent Tools — AutoGen tools for thought experiment coordination. 

3 

48 tools for creating, voting, evaluating, and managing thought experiments. 

5Includes autoresearch integration: software thought experiments can spawn 

6autonomous edit→run→score→iterate loops at hive scale. 

7Tier 2 tools (agent_engine context). Same pattern as learning_tools.py. 

8""" 

9import json 

10import logging 

11import threading 

12 

# Per-experiment lock registry used by score_hypothesis_result to serialize
# read-modify-write access to the iteration-history JSON files.
try:
    # Preferred backend: TTL-bounded cache keeps the lock table from growing
    # without limit across many experiments.
    from core.session_cache import TTLCache
    _file_locks = TTLCache(ttl_seconds=86400, max_size=50000, name='thought_exp_locks')
except ImportError:
    # core.session_cache unavailable (e.g. minimal deploys): fall back to a
    # plain dict — same mapping interface, just unbounded.
    _file_locks = {}
# Guards insertion of new per-experiment locks into _file_locks.
_file_locks_guard = threading.Lock()

logger = logging.getLogger('hevolve_social')

21 

22 

def create_thought_experiment(creator_id: str, title: str,
                              hypothesis: str,
                              expected_outcome: str = '',
                              intent_category: str = 'technology',
                              is_core_ip: bool = False) -> str:
    """Create a new constitutional thought experiment.

    Returns a JSON string: the created experiment on success, a failure
    reason when the service rejects the input, or ``{'error': ...}`` if
    anything raises.
    """
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session() as db:
            created = ThoughtExperimentService.create_experiment(
                db, creator_id, title, hypothesis,
                expected_outcome=expected_outcome,
                intent_category=intent_category,
                is_core_ip=is_core_ip)
        if not created:
            return json.dumps({
                'success': False,
                'reason': 'Blocked by ConstitutionalFilter or invalid input',
            })
        return json.dumps({'success': True, 'experiment': created})
    except Exception as exc:
        return json.dumps({'error': str(exc)})

48 

49 

def cast_experiment_vote(experiment_id: str, voter_id: str,
                         vote_value: int = 0,
                         reasoning: str = '',
                         suggestion: str = '',
                         voter_type: str = 'agent',
                         confidence: float = 0.8) -> str:
    """Cast a vote on a thought experiment (as agent or human).

    Returns a JSON string: the recorded vote on success, a failure reason
    when the experiment is missing or not open for voting, or an error.
    """
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session() as db:
            vote = ThoughtExperimentService.cast_vote(
                db, experiment_id, voter_id,
                vote_value=int(vote_value),
                reasoning=reasoning,
                suggestion=suggestion,
                voter_type=voter_type,
                confidence=float(confidence))
        if vote:
            response = {'success': True, 'vote': vote}
        else:
            response = {
                'success': False,
                'reason': 'Experiment not found or not in voting phase',
            }
        return json.dumps(response)
    except Exception as exc:
        return json.dumps({'error': str(exc)})

78 

79 

def evaluate_thought_experiment(experiment_id: str, agent_id: str,
                                score: float = 0.0,
                                confidence: float = 0.8,
                                reasoning: str = '',
                                evidence: str = '') -> str:
    """Record an agent evaluation for a thought experiment.

    Returns a JSON string with the updated experiment, a 'not_found'
    failure, or ``{'error': ...}`` on exception.
    """
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session() as db:
            updated = ThoughtExperimentService.record_agent_evaluation(
                db, experiment_id, agent_id,
                score=float(score),
                confidence=float(confidence),
                reasoning=reasoning,
                evidence=evidence)
        if not updated:
            return json.dumps({'success': False, 'reason': 'not_found'})
        return json.dumps({'success': True, 'experiment': updated})
    except Exception as exc:
        return json.dumps({'error': str(exc)})

103 

104 

def get_experiment_status(experiment_id: str = '',
                          status_filter: str = '') -> str:
    """Get experiment detail or list experiments by status.

    With an experiment_id, returns that experiment's detail; otherwise
    lists active experiments, optionally filtered by status.
    """
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        # Read-only lookups: no commit needed.
        with db_session(commit=False) as db:
            if not experiment_id:
                listing = ThoughtExperimentService.get_active_experiments(
                    db, status=status_filter or None)
                return json.dumps({
                    'success': True,
                    'experiments': listing,
                    'count': len(listing),
                })
            detail = ThoughtExperimentService.get_experiment_detail(
                db, experiment_id)
            return json.dumps({'success': True, 'experiment': detail})
    except Exception as exc:
        return json.dumps({'error': str(exc)})

127 

128 

def tally_experiment_votes(experiment_id: str) -> str:
    """Return the current vote tally for an experiment as a JSON string."""
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        # Read-only: open the session without committing.
        with db_session(commit=False) as db:
            counts = ThoughtExperimentService.tally_votes(db, experiment_id)
        return json.dumps({'success': True, 'tally': counts})
    except Exception as exc:
        return json.dumps({'error': str(exc)})

140 

141 

def advance_experiment(experiment_id: str,
                       target_status: str = '') -> str:
    """Advance experiment to next lifecycle phase or a specific status.

    Returns a JSON string with the advanced experiment, a failure reason
    when the transition is invalid, or ``{'error': ...}`` on exception.
    """
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session() as db:
            advanced = ThoughtExperimentService.advance_status(
                db, experiment_id,
                target_status=target_status or None)
        if not advanced:
            return json.dumps({
                'success': False,
                'reason': 'Cannot advance (invalid status or not found)',
            })
        return json.dumps({'success': True, 'experiment': advanced})
    except Exception as exc:
        return json.dumps({'error': str(exc)})

162 

163 

def iterate_hypothesis(experiment_id: str, hypothesis: str,
                       approach: str = '', evidence: str = '',
                       iteration: int = 0) -> str:
    """Propose and evaluate a hypothesis iteration for ANY thought experiment.

    Generic iteration tool — works for traditional, research, physical_ai,
    or any other experiment type. Meant to be called in a loop: propose
    hypothesis → gather evidence → score → refine → repeat. Software
    experiments should use launch_experiment_autoresearch instead.

    Honors the owner pause flag: when the experiment creator has paused
    evolution, this returns a pause signal and the agent must stop.

    Args:
        experiment_id: The ThoughtExperiment ID
        hypothesis: The refined hypothesis for this iteration
        approach: How you plan to test/evaluate this hypothesis
        evidence: Evidence or reasoning supporting this iteration
        iteration: Current iteration number (for tracking)

    Returns:
        JSON with iteration record and experiment context
    """
    # Owner pause check — auto_evolve is an optional dependency; when it is
    # absent we simply skip the check.
    try:
        from integrations.agent_engine.auto_evolve import is_experiment_paused
        if is_experiment_paused(experiment_id):
            return json.dumps({
                'success': False,
                'paused': True,
                'reason': 'Experiment paused by owner. Stop iterating.',
                'instruction': 'The experiment owner has paused evolution. '
                               'Do NOT continue iterating. Wait for resume.',
            })
    except ImportError:
        pass

    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        # Read-only fetch of the experiment context.
        with db_session(commit=False) as db:
            detail = ThoughtExperimentService.get_experiment_detail(
                db, experiment_id)
        if not detail:
            return json.dumps({'error': 'Experiment not found'})

        # Record of the proposed iteration, echoed back for the agent.
        proposal = {
            'iteration': iteration,
            'hypothesis': hypothesis,
            'approach': approach,
            'evidence': evidence,
            'status': 'proposed',
        }
        experiment_context = {
            'id': experiment_id,
            'title': detail.get('title', ''),
            'original_hypothesis': detail.get('hypothesis', ''),
            'expected_outcome': detail.get('expected_outcome', ''),
            'intent_category': detail.get('intent_category', ''),
            'status': detail.get('status', ''),
        }
        return json.dumps({
            'success': True,
            'iteration': proposal,
            'experiment': experiment_context,
            'instruction': (
                'Now evaluate this hypothesis. Use score_hypothesis_result '
                'to record your score (-2 to +2). Consider: evidence quality, '
                'clarity, feasibility, and expected impact.'
            ),
        })
    except Exception as exc:
        return json.dumps({'error': str(exc)})

241 

242 

def score_hypothesis_result(experiment_id: str, iteration: int,
                            score: float, reasoning: str,
                            evidence_quality: float = 0.0,
                            clarity: float = 0.0,
                            feasibility: float = 0.0,
                            impact: float = 0.0) -> str:
    """Score a hypothesis iteration using a structured rubric.

    Generic scoring tool for all experiment types. The agent uses this
    after evaluating a hypothesis to decide whether to keep iterating
    or converge on a conclusion. The score is appended to a per-experiment
    JSON history file, and a trend analysis over the full series drives
    the continuation advice.

    Args:
        experiment_id: The ThoughtExperiment ID
        iteration: Iteration number being scored
        score: Overall score (-2 to +2, clamped)
        reasoning: Why this score
        evidence_quality: Sub-score for evidence (0-1, clamped)
        clarity: Sub-score for hypothesis clarity (0-1, clamped)
        feasibility: Sub-score for feasibility (0-1, clamped)
        impact: Sub-score for expected impact (0-1, clamped)

    Returns:
        JSON with score record, trend analysis, and continuation advice
    """
    import os
    import tempfile

    def _unit(value: float) -> float:
        # Clamp a rubric sub-score into [0, 1].
        return max(0.0, min(1.0, float(value)))

    score = max(-2.0, min(2.0, float(score)))

    # History lives under agent_data/experiment_iterations, one file per
    # experiment, keyed by experiment_id.
    data_dir = os.path.join(
        os.path.dirname(__file__), '..', '..', 'agent_data', 'experiment_iterations')
    os.makedirs(data_dir, exist_ok=True)
    history_path = os.path.join(data_dir, f'{experiment_id}.json')

    # Per-experiment lock prevents read-modify-write race
    with _file_locks_guard:
        if experiment_id not in _file_locks:
            _file_locks[experiment_id] = threading.Lock()
        lock = _file_locks[experiment_id]

    with lock:
        history = []
        if os.path.isfile(history_path):
            try:
                with open(history_path, 'r', encoding='utf-8') as f:
                    history = json.load(f)
            except Exception:
                history = []
        # Fix: a corrupted file can parse to a non-list value; start fresh
        # instead of crashing on .append below.
        if not isinstance(history, list):
            history = []

        record = {
            'iteration': iteration,
            'score': score,
            'reasoning': reasoning,
            'rubric': {
                'evidence_quality': _unit(evidence_quality),
                'clarity': _unit(clarity),
                'feasibility': _unit(feasibility),
                'impact': _unit(impact),
            },
        }
        history.append(record)

        # Atomic write: temp file + rename prevents partial writes
        try:
            fd, tmp_path = tempfile.mkstemp(dir=data_dir, suffix='.tmp')
            try:
                with os.fdopen(fd, 'w', encoding='utf-8') as f:
                    json.dump(history, f, indent=2, default=str)
                os.replace(tmp_path, history_path)
            except Exception:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass
                raise
        except Exception:
            # Persistence is best-effort, but surface the failure in the
            # logs instead of swallowing it silently (fix: was a bare pass).
            logger.warning('Failed to persist iteration history for %s',
                           experiment_id, exc_info=True)

    # Trend analysis over the full (persisted + current) score series.
    scores = [h['score'] for h in history]
    best_score = max(scores)
    # Fix: report the recorded iteration *number* of the best entry, not its
    # list position — they differ when callers pass non-sequential iterations.
    best_pos = scores.index(best_score)
    best_iter = history[best_pos].get('iteration', best_pos)
    improving = len(scores) >= 2 and scores[-1] > scores[-2]
    stagnant = len(scores) >= 3 and len(set(scores[-3:])) == 1

    # Convergence advice
    if stagnant:
        advice = 'CONVERGE — 3 consecutive same scores. Record final evaluation.'
    elif len(scores) >= 10:
        advice = 'BUDGET — 10 iterations reached. Record final evaluation with best hypothesis.'
    elif improving:
        advice = 'CONTINUE — score is improving. Iterate with refined hypothesis.'
    elif score >= 1.5:
        advice = 'STRONG — high score. Consider recording final evaluation.'
    else:
        advice = 'CONTINUE — try a different approach or angle.'

    return json.dumps({
        'success': True,
        'record': record,
        'trend': {
            'total_iterations': len(scores),
            'best_score': best_score,
            'best_iteration': best_iter,
            'improving': improving,
            'stagnant': stagnant,
        },
        'advice': advice,
    })

354 

355 

def get_iteration_history(experiment_id: str, last_n: int = 10) -> str:
    """Get the iteration history for a thought experiment.

    Returns past hypothesis iterations with scores and summary statistics.
    The agent uses this to inform its next hypothesis refinement.

    Args:
        experiment_id: The ThoughtExperiment ID
        last_n: Number of recent iterations to return (default 10)

    Returns:
        JSON with iteration history and summary statistics
    """
    import os

    history_path = os.path.join(
        os.path.dirname(__file__), '..', '..', 'agent_data',
        'experiment_iterations', f'{experiment_id}.json')

    # No history file yet: report an empty history with guidance.
    if not os.path.isfile(history_path):
        return json.dumps({
            'success': True,
            'history': [],
            'summary': 'No iterations yet. Use iterate_hypothesis to start.',
        })

    try:
        with open(history_path, 'r', encoding='utf-8') as fh:
            history = json.load(fh)
    except Exception:
        history = []

    window = min(int(last_n), len(history))
    recent = history[-window:] if window > 0 else history

    scores = [entry['score'] for entry in history]
    if scores:
        summary = {
            'total_iterations': len(history),
            'best_score': max(scores),
            'worst_score': min(scores),
            'avg_score': round(sum(scores) / len(scores), 2),
            'improving_trend': len(scores) >= 2 and scores[-1] > scores[-2],
        }
    else:
        summary = {
            'total_iterations': len(history),
            'best_score': None,
            'worst_score': None,
            'avg_score': None,
            'improving_trend': False,
        }

    return json.dumps({
        'success': True,
        'history': recent,
        'summary': summary,
    })

407 

408 

def launch_experiment_autoresearch(experiment_id: str,
                                   repo_path: str,
                                   target_file: str,
                                   run_command: str,
                                   metric_name: str = 'score',
                                   metric_pattern: str = '',
                                   metric_direction: str = 'higher_is_better',
                                   max_iterations: int = 50,
                                   time_budget_s: int = 300,
                                   hive_parallel: bool = False) -> str:
    """Launch an autoresearch loop for a software thought experiment.

    When a thought experiment has experiment_type='software' and reaches the
    evaluating phase, this tool starts the autonomous edit→run→score→iterate
    loop. The engine modifies target_file, runs run_command, extracts the
    metric, keeps improvements, and iterates until budget or max_iterations.

    At hive scale: when hive_parallel=True, multiple hypothesis variants run
    simultaneously across compute mesh peers (tournament selection picks best).

    Args:
        experiment_id: The ThoughtExperiment ID to attach results to
        repo_path: Path to the git repository
        target_file: The file to modify (relative to repo_path)
        run_command: Shell command to run the experiment
        metric_name: Name of the metric to optimize
        metric_pattern: Regex with group(1) to extract metric from output
        metric_direction: 'higher_is_better' or 'lower_is_better'
        max_iterations: Max iterations before stopping
        time_budget_s: Per-iteration time budget in seconds
        hive_parallel: If True, run parallel variants across hive peers

    Returns:
        JSON with session_id and status
    """
    try:
        from integrations.coding_agent.autoevolve_code_tools import start_autoresearch

        # Delegate to the coding-agent engine with all knobs forwarded.
        params = dict(
            repo_path=repo_path,
            target_file=target_file,
            run_command=run_command,
            metric_name=metric_name,
            metric_pattern=metric_pattern,
            metric_direction=metric_direction,
            max_iterations=max_iterations,
            time_budget_s=time_budget_s,
            experiment_id=experiment_id,
            hive_parallel=hive_parallel,
        )
        return start_autoresearch(**params)
    except Exception as exc:
        return json.dumps({'error': str(exc)})

460 

461 

def get_experiment_research_status(session_id: str = '') -> str:
    """Get autoresearch loop progress for a thought experiment.

    Args:
        session_id: The autoresearch session ID (returned by launch_experiment_autoresearch)

    Returns:
        JSON with iteration count, best metric, improvements, budget consumed
    """
    try:
        from integrations.coding_agent.autoevolve_code_tools import get_autoresearch_status
    except Exception as exc:
        return json.dumps({'error': str(exc)})
    try:
        return get_autoresearch_status(session_id)
    except Exception as exc:
        return json.dumps({'error': str(exc)})

476 

477 

478# ─── Tool Registration ─── 

479 

# Registry specs: (name, callable, description, tags). Kept as flat tuples
# and expanded into the dict shape the agent-engine loader expects.
_TOOL_SPECS = [
    ('create_thought_experiment', create_thought_experiment,
     'Create a new constitutional thought experiment',
     ['thought_experiment']),
    ('cast_experiment_vote', cast_experiment_vote,
     'Cast a vote on a thought experiment',
     ['thought_experiment']),
    ('evaluate_thought_experiment', evaluate_thought_experiment,
     'Record an agent evaluation for a thought experiment',
     ['thought_experiment']),
    ('get_experiment_status', get_experiment_status,
     'Get experiment detail or list experiments by status',
     ['thought_experiment']),
    ('tally_experiment_votes', tally_experiment_votes,
     'Get the current vote tally for an experiment',
     ['thought_experiment']),
    ('advance_experiment', advance_experiment,
     'Advance experiment to next lifecycle phase',
     ['thought_experiment']),
    ('iterate_hypothesis', iterate_hypothesis,
     'Propose and track a hypothesis iteration for any thought experiment. '
     'Use in a loop: propose → evidence → score → refine → repeat.',
     ['thought_experiment', 'iteration']),
    ('score_hypothesis_result', score_hypothesis_result,
     'Score a hypothesis iteration with structured rubric (evidence, '
     'clarity, feasibility, impact). Returns trend analysis and '
     'continuation advice.',
     ['thought_experiment', 'iteration']),
    ('get_iteration_history', get_iteration_history,
     'Get past hypothesis iterations with scores and trends. '
     'Use to inform the next hypothesis refinement.',
     ['thought_experiment', 'iteration']),
    ('launch_experiment_autoresearch', launch_experiment_autoresearch,
     'Launch an autoresearch loop for a SOFTWARE thought experiment: '
     'edit code, run experiments, score, keep best, iterate at hive scale. '
     'For non-code experiments, use iterate_hypothesis instead.',
     ['thought_experiment', 'autoresearch']),
    ('get_experiment_research_status', get_experiment_research_status,
     'Get autoresearch loop progress for a thought experiment',
     ['thought_experiment', 'autoresearch']),
]

THOUGHT_EXPERIMENT_TOOLS = [
    {'name': name, 'func': func, 'description': description, 'tags': tags}
    for name, func, description, tags in _TOOL_SPECS
]