Coverage for integrations/service_tools/media_agent.py: 49.1%

275 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Unified Media Generation Agent — single AutoGen tool for all media modalities. 

3 

4Any agent in the system can call `generate_media()` to produce content across 

5image, audio (speech/music), and video modalities. The agent auto-selects the 

6best available tool, auto-starts services if needed, and returns results in a 

7consistent JSON format. 

8 

9Routing table: 

10 image → txt2img (external service) 

11 audio_speech → tts_audio_suite (auto-start sidecar) 

12 audio_music → acestep (external: uv run acestep-api) 

13 audio_speech_music → tts_audio_suite + acestep 

14 video → wan2gp (VRAM >= 8GB) | ltx2 fallback 

15 video_with_audio → video tool + tts_audio_suite 

16 

17Companion tool: `check_media_status()` for polling async tasks. 

18""" 

19 

20import json 

21import logging 

22import time 

23from typing import Annotated, Optional 

24 

25logger = logging.getLogger(__name__) 

26 

# Valid output modalities
# Accepted values for generate_media()'s `output_modality` argument.
# The combined modalities (audio_speech_music, video_with_audio) fan out
# to more than one backend tool in a single call.
VALID_MODALITIES = {
    'image', 'audio_speech', 'audio_music',
    'audio_speech_music', 'video', 'video_with_audio',
}

32 

33 

34# ═══════════════════════════════════════════════════════════════ 

35# Runtime capability introspection 

36# ═══════════════════════════════════════════════════════════════ 

37 

38def _can_do(model_type: str, capability: str = None) -> bool: 

39 """Universal capability check — delegates to orchestrator. 

40 

41 Works for any model type or dynamic service category. 

42 Single source of truth: orchestrator merges catalog + services + runtime. 

43 """ 

44 try: 

45 from integrations.service_tools.model_orchestrator import get_orchestrator 

46 return get_orchestrator().can_do(model_type, capability) 

47 except Exception: 

48 return False 

49 

50 

51# ═══════════════════════════════════════════════════════════════ 

52# Auto-start helpers 

53# ═══════════════════════════════════════════════════════════════ 

54 

def _ensure_tool_running(tool_name: str) -> bool:
    """Ensure a managed tool is up, auto-starting it when necessary.

    Returns True when the tool is running (either already, or after a
    successful setup); False when it cannot be started.
    """
    try:
        from integrations.service_tools.runtime_manager import runtime_tool_manager
        current = runtime_tool_manager.get_tool_status(tool_name)
        if current.get('running'):
            return True
        # Not up yet — attempt a cold start via the runtime manager.
        started = runtime_tool_manager.setup_tool(tool_name)
        return started.get('running', False)
    except Exception as e:
        logger.warning(f"Auto-start failed for {tool_name}: {e}")
        return False

67 

68 

def populate_videogen_catalog(catalog) -> int:
    """Register all video generation model variants into the ModelCatalog.

    This is the single source of truth for video gen model names, VRAM
    thresholds, and capabilities — replacing the hardcoded free_gb >= 8.0
    threshold in _select_video_tool().

    Called by ModelCatalog._populate_videogen_models().
    Returns number of new entries added.
    """
    from integrations.service_tools.model_catalog import ModelEntry, ModelType

    # Per-model spec: id, display name, resource needs (GB), quality/speed
    # scores, minimum capability tier, CPU-support flags, capability dict.
    specs = [
        dict(id='video_gen-wan2gp', name='Wan2GP',
             vram=8.0, ram=12.0, disk=15.0, quality=0.88, speed=0.65,
             tier='full', cpu=False, offload=False,
             caps={'txt2vid': True, 'img2vid': True, 'resolution': '512x320',
                   'fps': 24, 'async_task': True}),
        dict(id='video_gen-ltx2', name='LTX-Video-2',
             vram=4.0, ram=8.0, disk=10.0, quality=0.78, speed=0.78,
             tier='standard', cpu=True, offload=True,
             caps={'txt2vid': True, 'img2vid': False, 'resolution': '832x480',
                   'fps': 24, 'async_task': True, 'cpu_offload': True}),
    ]

    new_entries = 0
    for spec in specs:
        if catalog.get(spec['id']) is not None:
            continue  # already registered — keep this function idempotent
        catalog.register(ModelEntry(
            id=spec['id'], name=spec['name'], model_type=ModelType.VIDEO_GEN,
            source='huggingface',
            vram_gb=spec['vram'], ram_gb=spec['ram'], disk_gb=spec['disk'],
            min_capability_tier=spec['tier'],
            backend='sidecar',
            supports_gpu=True, supports_cpu=spec['cpu'],
            supports_cpu_offload=spec['offload'],
            cpu_offload_method='restart_cpu' if spec['offload'] else 'none',
            idle_timeout_s=600,
            capabilities=spec['caps'],
            quality_score=spec['quality'], speed_score=spec['speed'],
            tags=['local', 'video_gen'],
        ), persist=False)
        new_entries += 1
    return new_entries

122 

123 

def populate_audiogen_catalog(catalog) -> int:
    """Register audio generation models (music, singing) into ModelCatalog.

    Same pattern as populate_videogen_catalog — capabilities-based routing
    so the orchestrator can select the right model for music/singing tasks.

    Called by ModelCatalog._populate_audiogen_models().
    Returns number of new entries added.
    """
    from integrations.service_tools.model_catalog import ModelEntry, ModelType

    # Per-model spec: id, display name, resource needs (GB), quality/speed
    # scores, minimum capability tier, CPU-support flags, capability dict.
    specs = [
        dict(id='audio_gen-acestep', name='ACE-Step 1.5',
             vram=6.0, ram=6.0, disk=4.0, quality=0.85, speed=0.90,
             tier='standard', cpu=False, offload=False,
             caps={'music_gen': True, 'singing': True, 'lyrics_input': True,
                   'genre_control': True, 'tempo_control': True,
                   'max_duration_s': 120, 'async_task': True}),
        dict(id='audio_gen-diffrhythm', name='DiffRhythm v1.2',
             vram=4.0, ram=4.0, disk=3.0, quality=0.80, speed=0.75,
             tier='standard', cpu=True, offload=True,
             caps={'music_gen': False, 'singing': True, 'singing_voice': True,
                   'lyrics_input': True, 'voice_conversion': True,
                   'max_duration_s': 60, 'async_task': False}),
    ]

    new_entries = 0
    for spec in specs:
        if catalog.get(spec['id']) is not None:
            continue  # already registered — keep this function idempotent
        catalog.register(ModelEntry(
            id=spec['id'], name=spec['name'], model_type=ModelType.AUDIO_GEN,
            source='huggingface',
            vram_gb=spec['vram'], ram_gb=spec['ram'], disk_gb=spec['disk'],
            min_capability_tier=spec['tier'],
            backend='sidecar',
            supports_gpu=True, supports_cpu=spec['cpu'],
            supports_cpu_offload=spec['offload'],
            cpu_offload_method='restart_cpu' if spec['offload'] else 'none',
            idle_timeout_s=600,
            capabilities=spec['caps'],
            quality_score=spec['quality'], speed_score=spec['speed'],
            tags=['local', 'audio_gen'],
        ), persist=False)
        new_entries += 1
    return new_entries

176 

177 

178def _select_audio_tool(task: str = 'music') -> str: 

179 """Select the best audio generation tool for a task. 

180 

181 Consults ModelCatalog via orchestrator — same pattern as _select_video_tool. 

182 task: 'music' → ACE Step, 'sing'/'lyrics' → DiffRhythm (fallback ACE Step) 

183 """ 

184 cap_key = 'singing_voice' if task in ('sing', 'lyrics') else 'music_gen' 

185 try: 

186 from integrations.service_tools.model_orchestrator import get_orchestrator 

187 entry = get_orchestrator().select_best( 

188 'audio_gen', require_capability={cap_key: True}) 

189 if entry: 

190 _CATALOG_TO_TOOL = { 

191 'audio_gen-acestep': 'acestep', 

192 'audio_gen-diffrhythm': 'diffrhythm', 

193 } 

194 tool = _CATALOG_TO_TOOL.get(entry.id) 

195 if tool: 

196 return tool 

197 except Exception: 

198 pass 

199 # Fallback defaults 

200 return 'diffrhythm' if task in ('sing', 'lyrics') else 'acestep' 

201 

202 

203def _select_video_tool() -> str: 

204 """Select the best video generation tool for current hardware. 

205 

206 Consults ModelCatalog (single source of truth for VRAM thresholds). 

207 Falls back to direct VRAM query if catalog is unavailable. 

208 

209 Returns 'wan2gp' or 'ltx2'. 

210 """ 

211 # ── Primary path: ask the catalog/orchestrator ─────────────────────────── 

212 try: 

213 from integrations.service_tools.model_orchestrator import get_orchestrator 

214 entry = get_orchestrator().select_best('video_gen') 

215 if entry: 

216 # Map catalog ID → tool name used by service_tool_registry 

217 _CATALOG_TO_TOOL = { 

218 'video_gen-wan2gp': 'wan2gp', 

219 'video_gen-ltx2': 'ltx2', 

220 } 

221 tool = _CATALOG_TO_TOOL.get(entry.id) 

222 if tool: 

223 return tool 

224 except Exception: 

225 pass 

226 

227 # ── Fallback: direct VRAM query ────────────────────────────────────────── 

228 try: 

229 from integrations.service_tools.vram_manager import vram_manager 

230 info = vram_manager.detect_gpu() 

231 free_gb = info.get('free_gb', 0) 

232 if free_gb >= 8.0: 

233 return 'wan2gp' 

234 except Exception: 

235 pass 

236 return 'ltx2' 

237 

238 

239def _get_tool_base_url(tool_name: str) -> Optional[str]: 

240 """Get the base URL for a registered tool.""" 

241 try: 

242 from integrations.service_tools.registry import service_tool_registry 

243 tool = service_tool_registry._tools.get(tool_name) 

244 if tool: 

245 return tool.base_url.rstrip('/') 

246 except Exception: 

247 pass 

248 return None 

249 

250 

251# ═══════════════════════════════════════════════════════════════ 

252# Modality handlers 

253# ═══════════════════════════════════════════════════════════════ 

254 

255def _generate_image(context: str, input_text: str, style: str) -> dict: 

256 """Route to txt2img external service.""" 

257 prompt = input_text or context 

258 if style: 

259 prompt = f"{prompt}, {style} style" 

260 try: 

261 from helper import txt2img 

262 img_url = txt2img(prompt) 

263 return { 

264 'status': 'completed', 

265 'output_modality': 'image', 

266 'results': [{'type': 'image', 'url': img_url, 'format': 'png'}], 

267 'model_used': 'txt2img', 

268 } 

269 except Exception as e: 

270 return {'status': 'error', 'error': str(e), 'output_modality': 'image'} 

271 

272 

def _generate_audio_speech(context: str, input_text: str, duration: int) -> dict:
    """Route to TTS-Audio-Suite for speech synthesis."""
    text = input_text or context

    # The TTS sidecar may be cold — attempt an auto-start first.
    if not _ensure_tool_running('tts_audio_suite'):
        return {
            'status': 'error',
            'error': 'TTS-Audio-Suite not available and auto-start failed',
            'output_modality': 'audio_speech',
        }

    base_url = _get_tool_base_url('tts_audio_suite')
    if not base_url:
        return {'status': 'error', 'error': 'TTS-Audio-Suite not registered',
                'output_modality': 'audio_speech'}

    try:
        from core.http_pool import pooled_post
        resp = pooled_post(
            f"{base_url}/synthesize",
            json={'text': text},
            headers={'Content-Type': 'application/json'},
            timeout=120,
        )
        if resp.status_code != 200:
            return {'status': 'error', 'error': f'TTS HTTP {resp.status_code}',
                    'output_modality': 'audio_speech'}
        payload = resp.json()
        # Backends differ on the key name; accept either.
        audio_url = payload.get('audio_url') or payload.get('url', '')
        return {
            'status': 'completed',
            'output_modality': 'audio_speech',
            'results': [{'type': 'audio', 'url': audio_url,
                         'format': 'wav'}],
            'model_used': 'tts_audio_suite',
        }
    except Exception as e:
        return {'status': 'error', 'error': str(e),
                'output_modality': 'audio_speech'}

311 

312 

def _generate_audio_music(context: str, input_text: str,
                          duration: int, style: str) -> dict:
    """Route to AceStep for AI music generation."""
    prompt = input_text or context
    if style:
        prompt = f"[{style}] {prompt}"

    # Fall back to AceStep's default local port when not registered.
    base_url = _get_tool_base_url('acestep') or 'http://localhost:8001'

    request_body = {
        'prompt': prompt,
        'duration': duration or 30,
    }
    if style:
        request_body['genre'] = style

    try:
        from core.http_pool import pooled_post
        resp = pooled_post(
            f"{base_url}/release_task",
            json=request_body,
            headers={'Content-Type': 'application/json'},
            timeout=30,
        )
        if resp.status_code != 200:
            return {'status': 'error', 'error': f'AceStep HTTP {resp.status_code}',
                    'output_modality': 'audio_music'}
        # AceStep is async — hand back a pollable task id.
        task_id = resp.json().get('task_id', '')
        return {
            'status': 'pending',
            'output_modality': 'audio_music',
            'task_id': f'acestep_{task_id}',
            'poll_tool': 'check_media_status',
            'message': 'Music generation started. Use check_media_status(task_id) to check progress.',
            'model_used': 'acestep',
        }
    except Exception as e:
        return {'status': 'error', 'error': str(e),
                'output_modality': 'audio_music'}

355 

356 

def _generate_video(context: str, input_text: str,
                    duration: int, style: str, model: str) -> dict:
    """Route to wan2gp or ltx2 for video generation."""
    prompt = input_text or context
    if style:
        prompt = f"{prompt}, {style}"

    # Honor an explicit model choice; otherwise pick per hardware/catalog.
    tool = _select_video_tool() if model == 'auto' else model

    # Wan2GP needs its sidecar running; degrade to LTX2 if it won't start.
    if tool == 'wan2gp' and not _ensure_tool_running('wan2gp'):
        tool = 'ltx2'

    if tool == 'wan2gp':
        return _generate_video_wan2gp(prompt, duration)
    return _generate_video_ltx2(prompt, duration)

376 

377 

def _generate_video_wan2gp(prompt: str, duration: int) -> dict:
    """Submit video generation to Wan2GP."""
    base_url = _get_tool_base_url('wan2gp')
    if not base_url:
        return {'status': 'error', 'error': 'Wan2GP not registered',
                'output_modality': 'video'}

    try:
        from core.http_pool import pooled_post
        # ~24 fps; at least 49 frames (≈2 s) even for tiny durations.
        frame_count = max(49, (duration or 2) * 24 + 1)
        resp = pooled_post(
            f"{base_url}/generate",
            json={'prompt': prompt, 'num_frames': frame_count,
                  'width': 512, 'height': 320},
            headers={'Content-Type': 'application/json'},
            timeout=30,
        )
        if resp.status_code != 200:
            return {'status': 'error', 'error': f'Wan2GP HTTP {resp.status_code}',
                    'output_modality': 'video'}
        # Wan2GP is async — hand back a pollable task id.
        task_id = resp.json().get('task_id', '')
        return {
            'status': 'pending',
            'output_modality': 'video',
            'task_id': f'wan2gp_{task_id}',
            'poll_tool': 'check_media_status',
            'message': 'Video generation started. Use check_media_status(task_id) to check progress.',
            'model_used': 'wan2gp',
        }
    except Exception as e:
        return {'status': 'error', 'error': str(e), 'output_modality': 'video'}

411 

412 

413def _generate_video_ltx2(prompt: str, duration: int) -> dict: 

414 """Submit video generation to LTX2 server (port 5002).""" 

415 ltx_url = 'http://localhost:5002' 

416 try: 

417 from core.http_pool import pooled_post 

418 num_frames = max(49, (duration or 2) * 24 + 1) 

419 resp = pooled_post( 

420 f"{ltx_url}/generate", 

421 json={ 

422 'prompt': prompt, 

423 'negative_prompt': 'worst quality, inconsistent motion, blurry', 

424 'num_frames': num_frames, 

425 'width': 832, 'height': 480, 

426 'num_inference_steps': 30, 

427 'guidance_scale': 3.0, 

428 'fps': 24, 

429 }, 

430 headers={'Content-Type': 'application/json'}, 

431 timeout=600, 

432 ) 

433 if resp.status_code == 200: 

434 data = resp.json() 

435 video_url = (data.get('video_url') or data.get('output_url') 

436 or data.get('video_path', '')) 

437 if video_url: 

438 return { 

439 'status': 'completed', 

440 'output_modality': 'video', 

441 'results': [{'type': 'video', 'url': video_url, 

442 'format': 'mp4'}], 

443 'model_used': 'ltx2', 

444 } 

445 # Async task pattern 

446 task_id = data.get('task_id', '') 

447 if task_id: 

448 return { 

449 'status': 'pending', 

450 'output_modality': 'video', 

451 'task_id': f'ltx2_{task_id}', 

452 'poll_tool': 'check_media_status', 

453 'message': 'Video generation started. Use check_media_status(task_id).', 

454 'model_used': 'ltx2', 

455 } 

456 return {'status': 'error', 'error': f'LTX2 HTTP {resp.status_code}', 

457 'output_modality': 'video'} 

458 except Exception as e: 

459 return {'status': 'error', 'error': str(e), 'output_modality': 'video'} 

460 

461 

462# ═══════════════════════════════════════════════════════════════ 

463# Core AutoGen tool functions 

464# ═══════════════════════════════════════════════════════════════ 

465 

def _compose_speech_music(context, input_text, duration, style) -> dict:
    """Fan out to speech + music backends for 'audio_speech_music'."""
    speech = _generate_audio_speech(context, input_text, duration)
    music = _generate_audio_music(context, input_text, duration, style)
    results = []
    if speech.get('status') == 'completed':
        results.extend(speech.get('results', []))
    if music.get('status') == 'completed':
        results.extend(music.get('results', []))
    # Music runs async — surface the pending task so callers can poll.
    if music.get('status') == 'pending':
        return {
            'status': 'partial',
            'output_modality': 'audio_speech_music',
            'results': results,
            'pending_task_id': music.get('task_id'),
            'poll_tool': 'check_media_status',
            'message': 'Speech complete. Music generation pending.',
        }
    if results:
        return {
            'status': 'completed',
            'output_modality': 'audio_speech_music',
            'results': results,
        }
    return {
        'status': 'error',
        'output_modality': 'audio_speech_music',
        'error': 'Both speech and music generation failed',
        'speech_error': speech.get('error'),
        'music_error': music.get('error'),
    }


def _compose_video_speech(context, input_text, duration, style, model) -> dict:
    """Fan out to video + speech-narration backends for 'video_with_audio'."""
    video = _generate_video(context, input_text, duration, style, model)
    speech = _generate_audio_speech(context, input_text, duration)
    results = []
    if video.get('status') == 'completed':
        results.extend(video.get('results', []))
    if speech.get('status') == 'completed':
        results.extend(speech.get('results', []))
    # Video runs async — surface the pending task; speech may already be done.
    if video.get('status') == 'pending':
        return {
            'status': 'pending',
            'output_modality': 'video_with_audio',
            'task_id': video.get('task_id'),
            'poll_tool': 'check_media_status',
            'speech_results': speech.get('results', []),
            'message': 'Video generation pending. Speech may be ready.',
        }
    if results:
        return {
            'status': 'completed',
            'output_modality': 'video_with_audio',
            'results': results,
        }
    return {
        'status': 'error',
        'output_modality': 'video_with_audio',
        'error': 'Media generation failed',
        'video_error': video.get('error'),
        'speech_error': speech.get('error'),
    }


def generate_media(
    context: Annotated[str, "What to generate — a natural language description"],
    output_modality: Annotated[str, (
        "Output type: 'image' | 'audio_speech' | 'audio_music' | "
        "'audio_speech_music' | 'video' | 'video_with_audio'"
    )],
    input_text: Annotated[Optional[str], "Text input (prompt, lyrics, script)"] = None,
    input_audio: Annotated[Optional[str], "Path to audio file (for voice cloning)"] = None,
    input_image: Annotated[Optional[str], "Path to image file (for img2vid)"] = None,
    model: Annotated[str, "Model: 'auto' or specific name"] = "auto",
    duration: Annotated[Optional[int], "Duration in seconds (audio/video)"] = None,
    style: Annotated[Optional[str], "Style hint (realistic, cartoon, cinematic)"] = None,
) -> str:
    """Unified media generation tool.

    Auto-selects the best available tool, auto-starts services if needed,
    and returns results in a consistent JSON format.

    Runtime capability-aware: checks what this node can do before attempting.
    Returns clear guidance when a modality is unavailable (not cryptic errors).

    Returns:
        JSON string with at least 'status' and 'generation_time_seconds'
        (fix: the capability-gate path previously omitted the timing key
        that every other return path reports). Async backends return
        status='pending' plus a 'task_id' for check_media_status().

    NOTE(review): input_audio / input_image are accepted for interface
    stability but are not routed anywhere in this function yet.
    """
    t0 = time.time()
    modality = output_modality.lower().strip()

    if modality not in VALID_MODALITIES:
        result = {
            'status': 'error',
            'error': f"Invalid output_modality '{output_modality}'. "
                     f"Valid: {sorted(VALID_MODALITIES)}",
        }
        result['generation_time_seconds'] = round(time.time() - t0, 2)
        return json.dumps(result)

    # Runtime capability gate — universal orchestrator check
    _MODALITY_TO_CHECK = {
        'audio_speech': ('tts', None),
        'audio_speech_music': ('tts', None),
        'audio_music': ('audio_gen', 'music_gen'),
        'video': ('video_gen', 'txt2vid'),
        'video_with_audio': ('video_gen', 'txt2vid'),
        'image': ('image_gen', None),
    }
    check = _MODALITY_TO_CHECK.get(modality)
    if check and not _can_do(*check):
        return json.dumps({
            'status': 'unavailable',
            'error': f'{modality} not available on this node right now.',
            'modality': modality,
            'suggestion': f'Describe the {modality.replace("_", " ")} in text instead, '
                          f'or delegate to a node with {check[0]} capability.',
            # Consistency: every return path reports elapsed time.
            'generation_time_seconds': round(time.time() - t0, 2),
        })

    try:
        if modality == 'image':
            result = _generate_image(context, input_text, style)
        elif modality == 'audio_speech':
            result = _generate_audio_speech(context, input_text, duration)
        elif modality == 'audio_music':
            result = _generate_audio_music(context, input_text, duration, style)
        elif modality == 'audio_speech_music':
            result = _compose_speech_music(context, input_text, duration, style)
        elif modality == 'video':
            result = _generate_video(context, input_text, duration, style, model)
        elif modality == 'video_with_audio':
            result = _compose_video_speech(context, input_text, duration, style, model)
        else:
            # Unreachable while _MODALITY_TO_CHECK covers VALID_MODALITIES.
            result = {'status': 'error', 'error': f'Unhandled modality: {modality}'}
    except Exception as e:
        logger.error(f"generate_media error: {e}", exc_info=True)
        result = {'status': 'error', 'error': str(e)}

    elapsed = round(time.time() - t0, 2)
    result['generation_time_seconds'] = elapsed

    # Feed generation result to HevolveAI for dense error signal learning
    # Success: generated output becomes prediction target
    # Error: error pattern informs modality routing confidence
    try:
        from integrations.agent_engine.world_model_bridge import get_world_model_bridge
        bridge = get_world_model_bridge()
        bridge.submit_output_feedback(
            output_modality=result.get('output_modality', modality),
            status=result.get('status', 'error'),
            context=context[:2000],
            model_used=result.get('model_used', 'unknown'),
            error_message=result.get('error'),
            generation_time_seconds=elapsed,
        )
    except Exception as e:
        logger.debug(f"[MediaAgent] Output feedback to HevolveAI skipped: {e}")

    return json.dumps(result)

625 

626 

def check_media_status(
    task_id: Annotated[str, "Task ID from a pending generate_media call (e.g. 'wan2gp_abc123')"],
) -> str:
    """Check status of an async media generation task.

    Parses the tool prefix from task_id and queries the correct backend.
    Returns JSON with status, progress percentage, and URL when done.
    """
    if '_' not in task_id:
        return json.dumps({'status': 'error',
                           'error': f'Invalid task_id format: {task_id}'})

    tool_prefix, raw_id = task_id.split('_', 1)

    # Resolve the polling endpoint for this tool prefix.
    if tool_prefix == 'wan2gp':
        base_url = _get_tool_base_url('wan2gp')
        check_path = '/check_result'
    elif tool_prefix == 'acestep':
        base_url = _get_tool_base_url('acestep') or 'http://localhost:8001'
        check_path = '/query_result'
    elif tool_prefix == 'ltx2':
        base_url = 'http://localhost:5002'
        check_path = '/check_result'
    else:
        return json.dumps({'status': 'error',
                           'error': f'Unknown tool prefix: {tool_prefix}'})

    if not base_url:
        return json.dumps({'status': 'error',
                           'error': f'{tool_prefix} service not available'})

    try:
        from core.http_pool import pooled_post
        resp = pooled_post(
            f"{base_url}{check_path}",
            json={'task_id': raw_id},
            headers={'Content-Type': 'application/json'},
            timeout=30,
        )
        if resp.status_code != 200:
            return json.dumps({'status': 'error',
                               'error': f'HTTP {resp.status_code}'})

        data = resp.json()
        # Normalize the heterogeneous backend responses into one shape.
        status = data.get('status', 'unknown')
        result_url = (data.get('video_url') or data.get('audio_url')
                      or data.get('url') or data.get('output_url', ''))
        report = {
            'task_id': task_id,
            'status': status,
            'progress': data.get('progress', data.get('percentage', 0)),
        }
        if status in ('completed', 'done', 'finished') and result_url:
            media_kind = 'video' if tool_prefix in ('wan2gp', 'ltx2') else 'audio'
            report['results'] = [{'type': media_kind, 'url': result_url}]
        return json.dumps(report)
    except Exception as e:
        return json.dumps({'status': 'error', 'error': str(e)})

691 

692 

693# ═══════════════════════════════════════════════════════════════ 

def synthesize_multilingual_audio(
    text: Annotated[str, (
        "Text to synthesize. May contain multiple languages (auto-detected by script) "
        "and media tags: <music genre='jazz' duration='10'>prompt</music>, "
        "<sing duration='15'>lyrics</sing>, <lyrics>song text</lyrics>. "
        "Each segment is routed to the best available engine."
    )],
    output_path: Annotated[Optional[str], "Path for output WAV. Auto-generated if omitted."] = None,
    task_id: Annotated[Optional[str], "Agent ledger task_id for pause/resume tracking."] = None,
) -> str:
    """Synthesize mixed-language text + media tags into one audio file.

    Compute-aware: uses ModelOrchestrator to select the best model per
    segment. Agents can pause/resume via the agent_ledger task_id.
    Returns JSON with status, output_path, and degraded_segments (if any
    segment type was unavailable on this node).
    """
    # Capability gate: bail out early on text-only nodes.
    if not _can_do('tts'):
        return json.dumps({
            'status': 'unavailable',
            'error': 'Audio synthesis not available on this node (text-only mode).',
            'suggestion': 'Return text content directly — the user will read it.',
        })

    try:
        from tts.tts_engine import get_tts_engine
        engine = get_tts_engine()
        if not engine:
            return json.dumps({'status': 'error', 'error': 'TTS engine not available'})

        from tts.language_segmenter import segment
        all_segments = segment(text)
        if not all_segments:
            return json.dumps({'status': 'error', 'error': 'No segments found in text'})

        # Partition into runnable vs degraded (capability missing on node).
        # Speech segments are always runnable; the _can_do checks only run
        # for the segment types that need them (short-circuit).
        degraded, runnable = [], []
        for seg in all_segments:
            seg_type = seg.get('type', 'speech')
            if seg_type == 'music' and not _can_do('audio_gen', 'music_gen'):
                degraded.append({'type': seg_type, 'text': seg.get('text', ''),
                                 'reason': 'music gen service offline'})
            elif seg_type in ('sing', 'lyrics') and not _can_do('audio_gen', 'singing'):
                degraded.append({'type': seg_type, 'text': seg.get('text', ''),
                                 'reason': 'singing voice service offline'})
            else:
                runnable.append(seg)

        synthesized_path = None
        if runnable:
            synthesized_path = engine._synthesize_multilingual(
                runnable, output_path=output_path, task_id=task_id)

        summary = {
            'status': 'completed' if synthesized_path else 'partial',
            'output_path': synthesized_path,
            'segments_total': len(all_segments),
            'segments_synthesized': len(runnable),
            'segment_types': [s.get('type', 'speech') for s in runnable],
        }
        if degraded:
            summary['degraded_segments'] = degraded
            summary['status'] = 'partial'
        return json.dumps(summary)
    except Exception as e:
        return json.dumps({'status': 'error', 'error': str(e)})

762 

763 

764# ═══════════════════════════════════════════════════════════════ 

765# Registration 

766# ═══════════════════════════════════════════════════════════════ 

767 

def register_media_tools(helper, assistant):
    """Register generate_media + check_media_status as AutoGen tools.

    Called from create_recipe.py alongside other tool registrations.
    Follows the same pattern as register_marketing_tools().
    """
    tool_specs = (
        (
            generate_media,
            'generate_media',
            'Unified media generation: create images, speech, music, or video from text. '
            'Supports output_modality: image, audio_speech, audio_music, '
            'audio_speech_music, video, video_with_audio. Auto-selects best tool.',
        ),
        (
            check_media_status,
            'check_media_status',
            'Check status of an async media generation task. '
            'Pass the task_id from a pending generate_media result.',
        ),
        (
            synthesize_multilingual_audio,
            'synthesize_multilingual_audio',
            'Synthesize mixed-language text into one audio file. '
            'Auto-detects languages by script (Tamil, Hindi, English, etc.) '
            'and routes each segment to the best TTS engine. '
            'Supports <music>, <sing>, <lyrics> tags for music/singing. '
            'Pass task_id for pause/resume via agent ledger.',
        ),
    )

    for func, tool_name, description in tool_specs:
        # Expose the schema to the LLM, then bind the executor side.
        helper.register_for_llm(name=tool_name, description=description)(func)
        assistant.register_for_execution(name=tool_name)(func)

    logger.info(f"Registered {len(tool_specs)} media generation tools")

803 logger.info(f"Registered {len(tools)} media generation tools")