Coverage for integrations / agent_engine / thought_experiment_tools.py: 68.1%
138 statements
coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Thought Experiment Agent Tools — AutoGen tools for thought experiment coordination.
48 tools for creating, voting, evaluating, and managing thought experiments.
5Includes autoresearch integration: software thought experiments can spawn
6autonomous edit→run→score→iterate loops at hive scale.
7Tier 2 tools (agent_engine context). Same pattern as learning_tools.py.
8"""
9import json
10import logging
11import threading
13try:
14 from core.session_cache import TTLCache
15 _file_locks = TTLCache(ttl_seconds=86400, max_size=50000, name='thought_exp_locks')
16except ImportError:
17 _file_locks = {}
18_file_locks_guard = threading.Lock()
20logger = logging.getLogger('hevolve_social')

def create_thought_experiment(creator_id: str, title: str,
                              hypothesis: str,
                              expected_outcome: str = '',
                              intent_category: str = 'technology',
                              is_core_ip: bool = False) -> str:
    """Create a new constitutional thought experiment."""
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session() as db:
            result = ThoughtExperimentService.create_experiment(
                db, creator_id, title, hypothesis,
                expected_outcome=expected_outcome,
                intent_category=intent_category,
                is_core_ip=is_core_ip)
            if result:
                return json.dumps({'success': True, 'experiment': result})
            else:
                return json.dumps({
                    'success': False,
                    'reason': 'Blocked by ConstitutionalFilter or invalid input',
                })
    except Exception as e:
        return json.dumps({'error': str(e)})
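
# Usage sketch: every tool in this module returns a JSON string envelope;
# callers decode it and branch on 'success'/'error'. The field values below
# are hypothetical.
def _example_create():
    raw = create_thought_experiment(
        creator_id='agent-001',
        title='Cheaper vector search',
        hypothesis='Product quantization halves RAM at under 2% recall loss',
    )
    reply = json.loads(raw)
    if reply.get('success'):
        return reply['experiment']
    # 'reason' covers ConstitutionalFilter blocks; 'error' covers exceptions.
    raise RuntimeError(reply.get('reason') or reply.get('error'))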

def cast_experiment_vote(experiment_id: str, voter_id: str,
                         vote_value: int = 0,
                         reasoning: str = '',
                         suggestion: str = '',
                         voter_type: str = 'agent',
                         confidence: float = 0.8) -> str:
    """Cast a vote on a thought experiment (as agent or human)."""
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session() as db:
            result = ThoughtExperimentService.cast_vote(
                db, experiment_id, voter_id,
                vote_value=int(vote_value),
                reasoning=reasoning,
                suggestion=suggestion,
                voter_type=voter_type,
                confidence=float(confidence))
            if result:
                return json.dumps({'success': True, 'vote': result})
            else:
                return json.dumps({
                    'success': False,
                    'reason': 'Experiment not found or not in voting phase',
                })
    except Exception as e:
        return json.dumps({'error': str(e)})

def evaluate_thought_experiment(experiment_id: str, agent_id: str,
                                score: float = 0.0,
                                confidence: float = 0.8,
                                reasoning: str = '',
                                evidence: str = '') -> str:
    """Record an agent evaluation for a thought experiment."""
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session() as db:
            result = ThoughtExperimentService.record_agent_evaluation(
                db, experiment_id, agent_id,
                score=float(score),
                confidence=float(confidence),
                reasoning=reasoning,
                evidence=evidence)
            if result:
                return json.dumps({'success': True, 'experiment': result})
            else:
                return json.dumps({'success': False, 'reason': 'not_found'})
    except Exception as e:
        return json.dumps({'error': str(e)})

def get_experiment_status(experiment_id: str = '',
                          status_filter: str = '') -> str:
    """Get experiment detail or list experiments by status."""
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session(commit=False) as db:
            if experiment_id:
                result = ThoughtExperimentService.get_experiment_detail(
                    db, experiment_id)
                return json.dumps({'success': True, 'experiment': result})
            else:
                results = ThoughtExperimentService.get_active_experiments(
                    db, status=status_filter or None)
                return json.dumps({
                    'success': True,
                    'experiments': results,
                    'count': len(results),
                })
    except Exception as e:
        return json.dumps({'error': str(e)})

def tally_experiment_votes(experiment_id: str) -> str:
    """Get the current vote tally for an experiment."""
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session(commit=False) as db:
            tally = ThoughtExperimentService.tally_votes(db, experiment_id)
            return json.dumps({'success': True, 'tally': tally})
    except Exception as e:
        return json.dumps({'error': str(e)})

def advance_experiment(experiment_id: str,
                       target_status: str = '') -> str:
    """Advance experiment to the next lifecycle phase or a specific status."""
    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session() as db:
            result = ThoughtExperimentService.advance_status(
                db, experiment_id,
                target_status=target_status or None)
            if result:
                return json.dumps({'success': True, 'experiment': result})
            else:
                return json.dumps({
                    'success': False,
                    'reason': 'Cannot advance (invalid status or not found)',
                })
    except Exception as e:
        return json.dumps({'error': str(e)})
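
# Lifecycle sketch. Omitting target_status advances to the next phase in
# order; the explicit status name below is an assumption ('evaluating' is
# mentioned in docstrings in this module, but the service's full status
# vocabulary is not shown here).
def _example_advance(experiment_id: str):
    nxt = json.loads(advance_experiment(experiment_id))  # auto-next phase
    jumped = json.loads(advance_experiment(experiment_id, 'evaluating'))  # explicit target
    return nxt, jumped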

def iterate_hypothesis(experiment_id: str, hypothesis: str,
                       approach: str = '', evidence: str = '',
                       iteration: int = 0) -> str:
    """Propose and evaluate a hypothesis iteration for ANY thought experiment.

    This is the generic iteration tool — works for traditional, research,
    physical_ai, or any experiment type. The agent calls this in a loop:
    propose hypothesis → gather evidence → score → refine → repeat.

    For software experiments, use launch_experiment_autoresearch instead.

    Respects owner pause — if the experiment creator paused evolution,
    this tool returns a pause signal and the agent should stop iterating.

    Args:
        experiment_id: The ThoughtExperiment ID
        hypothesis: The refined hypothesis for this iteration
        approach: How you plan to test/evaluate this hypothesis
        evidence: Evidence or reasoning supporting this iteration
        iteration: Current iteration number (for tracking)

    Returns:
        JSON with iteration record and experiment context
    """
    # Check owner pause
    try:
        from integrations.agent_engine.auto_evolve import is_experiment_paused
        if is_experiment_paused(experiment_id):
            return json.dumps({
                'success': False,
                'paused': True,
                'reason': 'Experiment paused by owner. Stop iterating.',
                'instruction': 'The experiment owner has paused evolution. '
                               'Do NOT continue iterating. Wait for resume.',
            })
    except ImportError:
        pass

    try:
        from integrations.social.models import db_session
        from integrations.social.thought_experiment_service import ThoughtExperimentService

        with db_session(commit=False) as db:
            detail = ThoughtExperimentService.get_experiment_detail(
                db, experiment_id)
            if not detail:
                return json.dumps({'error': 'Experiment not found'})

            # Build the iteration record
            iteration_record = {
                'iteration': iteration,
                'hypothesis': hypothesis,
                'approach': approach,
                'evidence': evidence,
                'status': 'proposed',
            }

            # Return context for the agent to evaluate
            return json.dumps({
                'success': True,
                'iteration': iteration_record,
                'experiment': {
                    'id': experiment_id,
                    'title': detail.get('title', ''),
                    'original_hypothesis': detail.get('hypothesis', ''),
                    'expected_outcome': detail.get('expected_outcome', ''),
                    'intent_category': detail.get('intent_category', ''),
                    'status': detail.get('status', ''),
                },
                'instruction': (
                    'Now evaluate this hypothesis. Use score_hypothesis_result '
                    'to record your score (-2 to +2). Consider: evidence quality, '
                    'clarity, feasibility, and expected impact.'
                ),
            })
    except Exception as e:
        return json.dumps({'error': str(e)})
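
# Single-step sketch of the propose→evaluate handoff, including the owner
# pause signal described in the docstring above. All argument values are
# hypothetical.
def _example_iterate(experiment_id: str):
    reply = json.loads(iterate_hypothesis(
        experiment_id,
        hypothesis='Quantizing to 8 bits keeps recall within 2%',
        approach='Compare recall@10 on a held-out query set',
        evidence='Prior run showed a 1.4% recall drop at 8 bits',
        iteration=1,
    ))
    if reply.get('paused'):
        return None  # owner paused evolution: stop iterating
    return reply     # next step: record a score via score_hypothesis_result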

def score_hypothesis_result(experiment_id: str, iteration: int,
                            score: float, reasoning: str,
                            evidence_quality: float = 0.0,
                            clarity: float = 0.0,
                            feasibility: float = 0.0,
                            impact: float = 0.0) -> str:
    """Score a hypothesis iteration using a structured rubric.

    Generic scoring tool for all experiment types. The agent uses this
    after evaluating a hypothesis to decide whether to keep iterating
    or converge on a conclusion.

    Args:
        experiment_id: The ThoughtExperiment ID
        iteration: Iteration number being scored
        score: Overall score (-2 to +2)
        reasoning: Why this score
        evidence_quality: Sub-score for evidence (0-1)
        clarity: Sub-score for hypothesis clarity (0-1)
        feasibility: Sub-score for feasibility (0-1)
        impact: Sub-score for expected impact (0-1)

    Returns:
        JSON with score record, trend analysis, and continuation advice
    """
    import os
    import tempfile

    score = max(-2.0, min(2.0, float(score)))

    # Load or create the iteration history file
    data_dir = os.path.join(
        os.path.dirname(__file__), '..', '..', 'agent_data', 'experiment_iterations')
    os.makedirs(data_dir, exist_ok=True)
    history_path = os.path.join(data_dir, f'{experiment_id}.json')

    # Per-experiment lock prevents a read-modify-write race
    with _file_locks_guard:
        if experiment_id not in _file_locks:
            _file_locks[experiment_id] = threading.Lock()
        lock = _file_locks[experiment_id]

    with lock:
        history = []
        if os.path.isfile(history_path):
            try:
                with open(history_path, 'r', encoding='utf-8') as f:
                    history = json.load(f)
            except Exception:
                history = []

        record = {
            'iteration': iteration,
            'score': score,
            'reasoning': reasoning,
            'rubric': {
                'evidence_quality': max(0.0, min(1.0, float(evidence_quality))),
                'clarity': max(0.0, min(1.0, float(clarity))),
                'feasibility': max(0.0, min(1.0, float(feasibility))),
                'impact': max(0.0, min(1.0, float(impact))),
            },
        }
        history.append(record)

        # Atomic write: temp file + rename prevents partial writes
        try:
            fd, tmp_path = tempfile.mkstemp(dir=data_dir, suffix='.tmp')
            try:
                with os.fdopen(fd, 'w', encoding='utf-8') as f:
                    json.dump(history, f, indent=2, default=str)
                os.replace(tmp_path, history_path)
            except Exception:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass
                raise
        except Exception:
            # Persistence failure is non-fatal for this call, but should not
            # be silent: the next call would otherwise lose this record.
            logger.warning('Failed to persist iteration history for %s',
                           experiment_id, exc_info=True)

    # Trend analysis
    scores = [h['score'] for h in history]
    best_score = max(scores)
    # Report the recorded iteration number, not the position in the list
    best_iter = history[scores.index(best_score)]['iteration']
    improving = len(scores) >= 2 and scores[-1] > scores[-2]
    stagnant = len(scores) >= 3 and len(set(scores[-3:])) == 1

    # Convergence advice
    if stagnant:
        advice = 'CONVERGE — 3 consecutive identical scores. Record final evaluation.'
    elif len(scores) >= 10:
        advice = 'BUDGET — 10 iterations reached. Record final evaluation with best hypothesis.'
    elif improving:
        advice = 'CONTINUE — score is improving. Iterate with a refined hypothesis.'
    elif score >= 1.5:
        advice = 'STRONG — high score. Consider recording a final evaluation.'
    else:
        advice = 'CONTINUE — try a different approach or angle.'

    return json.dumps({
        'success': True,
        'record': record,
        'trend': {
            'total_iterations': len(scores),
            'best_score': best_score,
            'best_iteration': best_iter,
            'improving': improving,
            'stagnant': stagnant,
        },
        'advice': advice,
    })
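
# Loop sketch tying iterate_hypothesis to score_hypothesis_result. The
# refine() callback is a hypothetical stand-in for the agent's own reasoning;
# the advice prefixes (CONVERGE/BUDGET/STRONG) come from the code above, and
# the fixed score/reasoning values are placeholders for agent judgments.
def _example_loop(experiment_id: str, refine):
    hypothesis, evidence = 'initial hypothesis', ''
    for i in range(1, 11):
        step = json.loads(iterate_hypothesis(
            experiment_id, hypothesis, evidence=evidence, iteration=i))
        if step.get('paused') or not step.get('success'):
            break
        verdict = json.loads(score_hypothesis_result(
            experiment_id, i, score=0.5, reasoning='agent-judged rationale'))
        if verdict['advice'].split(' ', 1)[0] in ('CONVERGE', 'BUDGET', 'STRONG'):
            break
        hypothesis, evidence = refine(step, verdict)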

def get_iteration_history(experiment_id: str, last_n: int = 10) -> str:
    """Get the iteration history for a thought experiment.

    Returns past hypothesis iterations with scores and trend analysis.
    The agent uses this to inform its next hypothesis refinement.

    Args:
        experiment_id: The ThoughtExperiment ID
        last_n: Number of recent iterations to return (default 10)

    Returns:
        JSON with iteration history and summary statistics
    """
    import os

    data_dir = os.path.join(
        os.path.dirname(__file__), '..', '..', 'agent_data', 'experiment_iterations')
    history_path = os.path.join(data_dir, f'{experiment_id}.json')

    if not os.path.isfile(history_path):
        return json.dumps({
            'success': True,
            'history': [],
            'summary': 'No iterations yet. Use iterate_hypothesis to start.',
        })

    try:
        with open(history_path, 'r', encoding='utf-8') as f:
            history = json.load(f)
    except Exception:
        history = []

    last_n = min(int(last_n), len(history))
    recent = history[-last_n:] if last_n > 0 else history

    scores = [h['score'] for h in history]
    summary = {
        'total_iterations': len(history),
        'best_score': max(scores) if scores else None,
        'worst_score': min(scores) if scores else None,
        'avg_score': round(sum(scores) / len(scores), 2) if scores else None,
        'improving_trend': (
            len(scores) >= 2 and scores[-1] > scores[-2]
        ),
    }

    return json.dumps({
        'success': True,
        'history': recent,
        'summary': summary,
    })
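
# Sketch: seed the next refinement from past iterations. Note the envelope
# asymmetry above: 'summary' is a hint string when there is no history yet,
# and a statistics dict otherwise.
def _example_history(experiment_id: str):
    data = json.loads(get_iteration_history(experiment_id, last_n=5))
    if not data['history']:
        return None  # nothing recorded yet
    # An agent would condition its next hypothesis on the best score and
    # the reasoning strings of recent records.
    return data['summary']['best_score'], data['history'][-1]['reasoning']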

def launch_experiment_autoresearch(experiment_id: str,
                                   repo_path: str,
                                   target_file: str,
                                   run_command: str,
                                   metric_name: str = 'score',
                                   metric_pattern: str = '',
                                   metric_direction: str = 'higher_is_better',
                                   max_iterations: int = 50,
                                   time_budget_s: int = 300,
                                   hive_parallel: bool = False) -> str:
    """Launch an autoresearch loop for a software thought experiment.

    When a thought experiment has experiment_type='software' and reaches the
    evaluating phase, this tool starts the autonomous edit→run→score→iterate
    loop. The engine modifies target_file, runs run_command, extracts the
    metric, keeps improvements, and iterates until the budget or
    max_iterations is exhausted.

    At hive scale: when hive_parallel=True, multiple hypothesis variants run
    simultaneously across compute mesh peers (tournament selection picks the
    best).

    Args:
        experiment_id: The ThoughtExperiment ID to attach results to
        repo_path: Path to the git repository
        target_file: The file to modify (relative to repo_path)
        run_command: Shell command to run the experiment
        metric_name: Name of the metric to optimize
        metric_pattern: Regex with group(1) to extract the metric from output
        metric_direction: 'higher_is_better' or 'lower_is_better'
        max_iterations: Max iterations before stopping
        time_budget_s: Per-iteration time budget in seconds
        hive_parallel: If True, run parallel variants across hive peers

    Returns:
        JSON with session_id and status
    """
    try:
        from integrations.coding_agent.autoevolve_code_tools import start_autoresearch
        return start_autoresearch(
            repo_path=repo_path,
            target_file=target_file,
            run_command=run_command,
            metric_name=metric_name,
            metric_pattern=metric_pattern,
            metric_direction=metric_direction,
            max_iterations=max_iterations,
            time_budget_s=time_budget_s,
            experiment_id=experiment_id,
            hive_parallel=hive_parallel,
        )
    except Exception as e:
        return json.dumps({'error': str(e)})

def get_experiment_research_status(session_id: str = '') -> str:
    """Get autoresearch loop progress for a thought experiment.

    Args:
        session_id: The autoresearch session ID (returned by launch_experiment_autoresearch)

    Returns:
        JSON with iteration count, best metric, improvements, budget consumed
    """
    try:
        from integrations.coding_agent.autoevolve_code_tools import get_autoresearch_status
        return get_autoresearch_status(session_id)
    except Exception as e:
        return json.dumps({'error': str(e)})
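
# Launch-and-poll sketch for a software experiment. The repo path, file,
# command, and the metric regex (group(1) captures the number, per the Args
# above) are all hypothetical; 'session_id' is the field the launch docstring
# promises in its return envelope.
def _example_autoresearch(experiment_id: str):
    launched = json.loads(launch_experiment_autoresearch(
        experiment_id,
        repo_path='/srv/repos/demo',
        target_file='model/tuner.py',
        run_command='python bench.py',
        metric_name='recall',
        metric_pattern=r'recall=([0-9.]+)',
        metric_direction='higher_is_better',
        max_iterations=20,
    ))
    if 'error' in launched:
        return launched
    return json.loads(get_experiment_research_status(launched['session_id']))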

# ─── Tool Registration ───

THOUGHT_EXPERIMENT_TOOLS = [
    {
        'name': 'create_thought_experiment',
        'func': create_thought_experiment,
        'description': 'Create a new constitutional thought experiment',
        'tags': ['thought_experiment'],
    },
    {
        'name': 'cast_experiment_vote',
        'func': cast_experiment_vote,
        'description': 'Cast a vote on a thought experiment',
        'tags': ['thought_experiment'],
    },
    {
        'name': 'evaluate_thought_experiment',
        'func': evaluate_thought_experiment,
        'description': 'Record an agent evaluation for a thought experiment',
        'tags': ['thought_experiment'],
    },
    {
        'name': 'get_experiment_status',
        'func': get_experiment_status,
        'description': 'Get experiment detail or list experiments by status',
        'tags': ['thought_experiment'],
    },
    {
        'name': 'tally_experiment_votes',
        'func': tally_experiment_votes,
        'description': 'Get the current vote tally for an experiment',
        'tags': ['thought_experiment'],
    },
    {
        'name': 'advance_experiment',
        'func': advance_experiment,
        'description': 'Advance experiment to next lifecycle phase',
        'tags': ['thought_experiment'],
    },
    {
        'name': 'iterate_hypothesis',
        'func': iterate_hypothesis,
        'description': (
            'Propose and track a hypothesis iteration for any thought experiment. '
            'Use in a loop: propose → evidence → score → refine → repeat.'
        ),
        'tags': ['thought_experiment', 'iteration'],
    },
    {
        'name': 'score_hypothesis_result',
        'func': score_hypothesis_result,
        'description': (
            'Score a hypothesis iteration with structured rubric (evidence, '
            'clarity, feasibility, impact). Returns trend analysis and '
            'continuation advice.'
        ),
        'tags': ['thought_experiment', 'iteration'],
    },
    {
        'name': 'get_iteration_history',
        'func': get_iteration_history,
        'description': (
            'Get past hypothesis iterations with scores and trends. '
            'Use to inform the next hypothesis refinement.'
        ),
        'tags': ['thought_experiment', 'iteration'],
    },
    {
        'name': 'launch_experiment_autoresearch',
        'func': launch_experiment_autoresearch,
        'description': (
            'Launch an autoresearch loop for a SOFTWARE thought experiment: '
            'edit code, run experiments, score, keep best, iterate at hive scale. '
            'For non-code experiments, use iterate_hypothesis instead.'
        ),
        'tags': ['thought_experiment', 'autoresearch'],
    },
    {
        'name': 'get_experiment_research_status',
        'func': get_experiment_research_status,
        'description': 'Get autoresearch loop progress for a thought experiment',
        'tags': ['thought_experiment', 'autoresearch'],
    },
]
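
# Consumption sketch: how a registry might index these specs by name and
# decode the JSON envelope on dispatch. The real agent_engine registration
# path (and how it consumes the 'tags' field) is not shown in this module.
TOOLS_BY_NAME = {spec['name']: spec['func'] for spec in THOUGHT_EXPERIMENT_TOOLS}

def dispatch_tool(tool_name: str, **kwargs) -> dict:
    """Invoke a registered tool and decode its JSON string result."""
    return json.loads(TOOLS_BY_NAME[tool_name](**kwargs))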