Coverage for integrations / service_tools / media_agent.py: 49.1%
275 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Unified Media Generation Agent — single AutoGen tool for all media modalities.
4Any agent in the system can call `generate_media()` to produce content across
5image, audio (speech/music), and video modalities. The agent auto-selects the
6best available tool, auto-starts services if needed, and returns results in a
7consistent JSON format.
9Routing table:
10 image → txt2img (external service)
11 audio_speech → tts_audio_suite (auto-start sidecar)
12 audio_music → acestep (external: uv run acestep-api)
13 audio_speech_music → tts_audio_suite + acestep
14 video → wan2gp (VRAM >= 8GB) | ltx2 fallback
15 video_with_audio → video tool + tts_audio_suite
17Companion tool: `check_media_status()` for polling async tasks.
18"""
20import json
21import logging
22import time
23from typing import Annotated, Optional
# Module-level logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)

# Output modalities accepted by generate_media(). See the routing table in
# the module docstring for how each modality is served.
VALID_MODALITIES = {
    'image', 'audio_speech', 'audio_music',
    'audio_speech_music', 'video', 'video_with_audio',
}
34# ═══════════════════════════════════════════════════════════════
35# Runtime capability introspection
36# ═══════════════════════════════════════════════════════════════
38def _can_do(model_type: str, capability: str = None) -> bool:
39 """Universal capability check — delegates to orchestrator.
41 Works for any model type or dynamic service category.
42 Single source of truth: orchestrator merges catalog + services + runtime.
43 """
44 try:
45 from integrations.service_tools.model_orchestrator import get_orchestrator
46 return get_orchestrator().can_do(model_type, capability)
47 except Exception:
48 return False
51# ═══════════════════════════════════════════════════════════════
52# Auto-start helpers
53# ═══════════════════════════════════════════════════════════════
def _ensure_tool_running(tool_name: str) -> bool:
    """Ensure *tool_name* is up, launching it on demand.

    Returns True when the tool is (or becomes) available, False otherwise.
    """
    try:
        from integrations.service_tools.runtime_manager import runtime_tool_manager

        if runtime_tool_manager.get_tool_status(tool_name).get('running'):
            return True
        # Not running yet — attempt an auto-start and report the outcome.
        return runtime_tool_manager.setup_tool(tool_name).get('running', False)
    except Exception as e:
        logger.warning(f"Auto-start failed for {tool_name}: {e}")
        return False
def populate_videogen_catalog(catalog) -> int:
    """Register all video generation model variants into the ModelCatalog.

    Single source of truth for video gen model names, VRAM thresholds, and
    capabilities — replacing the hardcoded free_gb >= 8.0 threshold in
    _select_video_tool().

    Called by ModelCatalog._populate_videogen_models().
    Returns number of new entries added.
    """
    from integrations.service_tools.model_catalog import ModelEntry, ModelType

    # Declarative specs; one dict per model variant.
    specs = [
        {
            'id': 'video_gen-wan2gp', 'name': 'Wan2GP',
            'vram': 8.0, 'ram': 12.0, 'disk': 15.0,
            'quality': 0.88, 'speed': 0.65, 'tier': 'full',
            'cpu': False, 'offload': False,
            'caps': {'txt2vid': True, 'img2vid': True, 'resolution': '512x320',
                     'fps': 24, 'async_task': True},
        },
        {
            'id': 'video_gen-ltx2', 'name': 'LTX-Video-2',
            'vram': 4.0, 'ram': 8.0, 'disk': 10.0,
            'quality': 0.78, 'speed': 0.78, 'tier': 'standard',
            'cpu': True, 'offload': True,
            'caps': {'txt2vid': True, 'img2vid': False, 'resolution': '832x480',
                     'fps': 24, 'async_task': True, 'cpu_offload': True},
        },
    ]

    added = 0
    for spec in specs:
        if catalog.get(spec['id']) is not None:
            continue  # already registered — keep the existing entry
        catalog.register(ModelEntry(
            id=spec['id'], name=spec['name'], model_type=ModelType.VIDEO_GEN,
            source='huggingface',
            vram_gb=spec['vram'], ram_gb=spec['ram'], disk_gb=spec['disk'],
            min_capability_tier=spec['tier'],
            backend='sidecar',
            supports_gpu=True, supports_cpu=spec['cpu'],
            supports_cpu_offload=spec['offload'],
            cpu_offload_method='restart_cpu' if spec['offload'] else 'none',
            idle_timeout_s=600,
            capabilities=spec['caps'],
            quality_score=spec['quality'], speed_score=spec['speed'],
            tags=['local', 'video_gen'],
        ), persist=False)
        added += 1
    return added
def populate_audiogen_catalog(catalog) -> int:
    """Register audio generation models (music, singing) into ModelCatalog.

    Same pattern as populate_videogen_catalog — capabilities-based routing
    so the orchestrator can select the right model for music/singing tasks.

    Called by ModelCatalog._populate_audiogen_models().
    Returns number of new entries added.
    """
    from integrations.service_tools.model_catalog import ModelEntry, ModelType

    # Declarative specs; one dict per model variant.
    specs = [
        {
            'id': 'audio_gen-acestep', 'name': 'ACE-Step 1.5',
            'vram': 6.0, 'ram': 6.0, 'disk': 4.0,
            'quality': 0.85, 'speed': 0.90, 'tier': 'standard',
            'cpu': False, 'offload': False,
            'caps': {'music_gen': True, 'singing': True, 'lyrics_input': True,
                     'genre_control': True, 'tempo_control': True,
                     'max_duration_s': 120, 'async_task': True},
        },
        {
            'id': 'audio_gen-diffrhythm', 'name': 'DiffRhythm v1.2',
            'vram': 4.0, 'ram': 4.0, 'disk': 3.0,
            'quality': 0.80, 'speed': 0.75, 'tier': 'standard',
            'cpu': True, 'offload': True,
            'caps': {'music_gen': False, 'singing': True, 'singing_voice': True,
                     'lyrics_input': True, 'voice_conversion': True,
                     'max_duration_s': 60, 'async_task': False},
        },
    ]

    added = 0
    for spec in specs:
        if catalog.get(spec['id']) is not None:
            continue  # already registered — keep the existing entry
        catalog.register(ModelEntry(
            id=spec['id'], name=spec['name'], model_type=ModelType.AUDIO_GEN,
            source='huggingface',
            vram_gb=spec['vram'], ram_gb=spec['ram'], disk_gb=spec['disk'],
            min_capability_tier=spec['tier'],
            backend='sidecar',
            supports_gpu=True, supports_cpu=spec['cpu'],
            supports_cpu_offload=spec['offload'],
            cpu_offload_method='restart_cpu' if spec['offload'] else 'none',
            idle_timeout_s=600,
            capabilities=spec['caps'],
            quality_score=spec['quality'], speed_score=spec['speed'],
            tags=['local', 'audio_gen'],
        ), persist=False)
        added += 1
    return added
178def _select_audio_tool(task: str = 'music') -> str:
179 """Select the best audio generation tool for a task.
181 Consults ModelCatalog via orchestrator — same pattern as _select_video_tool.
182 task: 'music' → ACE Step, 'sing'/'lyrics' → DiffRhythm (fallback ACE Step)
183 """
184 cap_key = 'singing_voice' if task in ('sing', 'lyrics') else 'music_gen'
185 try:
186 from integrations.service_tools.model_orchestrator import get_orchestrator
187 entry = get_orchestrator().select_best(
188 'audio_gen', require_capability={cap_key: True})
189 if entry:
190 _CATALOG_TO_TOOL = {
191 'audio_gen-acestep': 'acestep',
192 'audio_gen-diffrhythm': 'diffrhythm',
193 }
194 tool = _CATALOG_TO_TOOL.get(entry.id)
195 if tool:
196 return tool
197 except Exception:
198 pass
199 # Fallback defaults
200 return 'diffrhythm' if task in ('sing', 'lyrics') else 'acestep'
203def _select_video_tool() -> str:
204 """Select the best video generation tool for current hardware.
206 Consults ModelCatalog (single source of truth for VRAM thresholds).
207 Falls back to direct VRAM query if catalog is unavailable.
209 Returns 'wan2gp' or 'ltx2'.
210 """
211 # ── Primary path: ask the catalog/orchestrator ───────────────────────────
212 try:
213 from integrations.service_tools.model_orchestrator import get_orchestrator
214 entry = get_orchestrator().select_best('video_gen')
215 if entry:
216 # Map catalog ID → tool name used by service_tool_registry
217 _CATALOG_TO_TOOL = {
218 'video_gen-wan2gp': 'wan2gp',
219 'video_gen-ltx2': 'ltx2',
220 }
221 tool = _CATALOG_TO_TOOL.get(entry.id)
222 if tool:
223 return tool
224 except Exception:
225 pass
227 # ── Fallback: direct VRAM query ──────────────────────────────────────────
228 try:
229 from integrations.service_tools.vram_manager import vram_manager
230 info = vram_manager.detect_gpu()
231 free_gb = info.get('free_gb', 0)
232 if free_gb >= 8.0:
233 return 'wan2gp'
234 except Exception:
235 pass
236 return 'ltx2'
239def _get_tool_base_url(tool_name: str) -> Optional[str]:
240 """Get the base URL for a registered tool."""
241 try:
242 from integrations.service_tools.registry import service_tool_registry
243 tool = service_tool_registry._tools.get(tool_name)
244 if tool:
245 return tool.base_url.rstrip('/')
246 except Exception:
247 pass
248 return None
251# ═══════════════════════════════════════════════════════════════
252# Modality handlers
253# ═══════════════════════════════════════════════════════════════
255def _generate_image(context: str, input_text: str, style: str) -> dict:
256 """Route to txt2img external service."""
257 prompt = input_text or context
258 if style:
259 prompt = f"{prompt}, {style} style"
260 try:
261 from helper import txt2img
262 img_url = txt2img(prompt)
263 return {
264 'status': 'completed',
265 'output_modality': 'image',
266 'results': [{'type': 'image', 'url': img_url, 'format': 'png'}],
267 'model_used': 'txt2img',
268 }
269 except Exception as e:
270 return {'status': 'error', 'error': str(e), 'output_modality': 'image'}
def _generate_audio_speech(context: str, input_text: str, duration: int) -> dict:
    """Synthesize speech via TTS-Audio-Suite (auto-started if needed).

    NOTE(review): *duration* is accepted for signature parity but not
    forwarded — the TTS service appears to derive length from the text.
    """
    text = input_text if input_text else context

    if not _ensure_tool_running('tts_audio_suite'):
        return {
            'status': 'error',
            'error': 'TTS-Audio-Suite not available and auto-start failed',
            'output_modality': 'audio_speech',
        }

    base_url = _get_tool_base_url('tts_audio_suite')
    if not base_url:
        return {'status': 'error', 'error': 'TTS-Audio-Suite not registered',
                'output_modality': 'audio_speech'}

    try:
        from core.http_pool import pooled_post
        resp = pooled_post(
            f"{base_url}/synthesize",
            json={'text': text},
            headers={'Content-Type': 'application/json'},
            timeout=120,
        )
        if resp.status_code != 200:
            return {'status': 'error', 'error': f'TTS HTTP {resp.status_code}',
                    'output_modality': 'audio_speech'}
        payload = resp.json()
        return {
            'status': 'completed',
            'output_modality': 'audio_speech',
            'results': [{'type': 'audio',
                         'url': payload.get('audio_url') or payload.get('url', ''),
                         'format': 'wav'}],
            'model_used': 'tts_audio_suite',
        }
    except Exception as e:
        return {'status': 'error', 'error': str(e),
                'output_modality': 'audio_speech'}
def _generate_audio_music(context: str, input_text: str,
                          duration: int, style: str) -> dict:
    """Kick off async music generation on AceStep; returns a pollable task."""
    prompt = input_text if input_text else context
    if style:
        prompt = f"[{style}] {prompt}"

    # Registered URL if available, otherwise the conventional default port.
    base_url = _get_tool_base_url('acestep') or 'http://localhost:8001'

    payload = {'prompt': prompt, 'duration': duration or 30}
    if style:
        payload['genre'] = style

    try:
        from core.http_pool import pooled_post
        resp = pooled_post(
            f"{base_url}/release_task",
            json=payload,
            headers={'Content-Type': 'application/json'},
            timeout=30,
        )
        if resp.status_code != 200:
            return {'status': 'error', 'error': f'AceStep HTTP {resp.status_code}',
                    'output_modality': 'audio_music'}
        task_id = resp.json().get('task_id', '')
        return {
            'status': 'pending',
            'output_modality': 'audio_music',
            'task_id': f'acestep_{task_id}',
            'poll_tool': 'check_media_status',
            'message': 'Music generation started. Use check_media_status(task_id) to check progress.',
            'model_used': 'acestep',
        }
    except Exception as e:
        return {'status': 'error', 'error': str(e),
                'output_modality': 'audio_music'}
def _generate_video(context: str, input_text: str,
                    duration: int, style: str, model: str) -> dict:
    """Dispatch video generation to wan2gp or ltx2."""
    prompt = input_text if input_text else context
    if style:
        prompt = f"{prompt}, {style}"

    # Honor an explicit model choice; otherwise select by hardware/catalog.
    tool = _select_video_tool() if model == 'auto' else model

    # Wan2GP requires its sidecar — degrade to LTX2 if it won't start.
    if tool == 'wan2gp' and not _ensure_tool_running('wan2gp'):
        tool = 'ltx2'

    if tool == 'wan2gp':
        return _generate_video_wan2gp(prompt, duration)
    return _generate_video_ltx2(prompt, duration)
def _generate_video_wan2gp(prompt: str, duration: int) -> dict:
    """Submit an async video generation task to Wan2GP."""
    base_url = _get_tool_base_url('wan2gp')
    if not base_url:
        return {'status': 'error', 'error': 'Wan2GP not registered',
                'output_modality': 'video'}

    # ~24fps, duration in seconds → frames (minimum 49 frames)
    frames = max(49, (duration or 2) * 24 + 1)
    try:
        from core.http_pool import pooled_post
        resp = pooled_post(
            f"{base_url}/generate",
            json={'prompt': prompt, 'num_frames': frames,
                  'width': 512, 'height': 320},
            headers={'Content-Type': 'application/json'},
            timeout=30,
        )
        if resp.status_code != 200:
            return {'status': 'error', 'error': f'Wan2GP HTTP {resp.status_code}',
                    'output_modality': 'video'}
        task_id = resp.json().get('task_id', '')
        return {
            'status': 'pending',
            'output_modality': 'video',
            'task_id': f'wan2gp_{task_id}',
            'poll_tool': 'check_media_status',
            'message': 'Video generation started. Use check_media_status(task_id) to check progress.',
            'model_used': 'wan2gp',
        }
    except Exception as e:
        return {'status': 'error', 'error': str(e), 'output_modality': 'video'}
413def _generate_video_ltx2(prompt: str, duration: int) -> dict:
414 """Submit video generation to LTX2 server (port 5002)."""
415 ltx_url = 'http://localhost:5002'
416 try:
417 from core.http_pool import pooled_post
418 num_frames = max(49, (duration or 2) * 24 + 1)
419 resp = pooled_post(
420 f"{ltx_url}/generate",
421 json={
422 'prompt': prompt,
423 'negative_prompt': 'worst quality, inconsistent motion, blurry',
424 'num_frames': num_frames,
425 'width': 832, 'height': 480,
426 'num_inference_steps': 30,
427 'guidance_scale': 3.0,
428 'fps': 24,
429 },
430 headers={'Content-Type': 'application/json'},
431 timeout=600,
432 )
433 if resp.status_code == 200:
434 data = resp.json()
435 video_url = (data.get('video_url') or data.get('output_url')
436 or data.get('video_path', ''))
437 if video_url:
438 return {
439 'status': 'completed',
440 'output_modality': 'video',
441 'results': [{'type': 'video', 'url': video_url,
442 'format': 'mp4'}],
443 'model_used': 'ltx2',
444 }
445 # Async task pattern
446 task_id = data.get('task_id', '')
447 if task_id:
448 return {
449 'status': 'pending',
450 'output_modality': 'video',
451 'task_id': f'ltx2_{task_id}',
452 'poll_tool': 'check_media_status',
453 'message': 'Video generation started. Use check_media_status(task_id).',
454 'model_used': 'ltx2',
455 }
456 return {'status': 'error', 'error': f'LTX2 HTTP {resp.status_code}',
457 'output_modality': 'video'}
458 except Exception as e:
459 return {'status': 'error', 'error': str(e), 'output_modality': 'video'}
462# ═══════════════════════════════════════════════════════════════
463# Core AutoGen tool functions
464# ═══════════════════════════════════════════════════════════════
def generate_media(
    context: Annotated[str, "What to generate — a natural language description"],
    output_modality: Annotated[str, (
        "Output type: 'image' | 'audio_speech' | 'audio_music' | "
        "'audio_speech_music' | 'video' | 'video_with_audio'"
    )],
    input_text: Annotated[Optional[str], "Text input (prompt, lyrics, script)"] = None,
    input_audio: Annotated[Optional[str], "Path to audio file (for voice cloning)"] = None,
    input_image: Annotated[Optional[str], "Path to image file (for img2vid)"] = None,
    model: Annotated[str, "Model: 'auto' or specific name"] = "auto",
    duration: Annotated[Optional[int], "Duration in seconds (audio/video)"] = None,
    style: Annotated[Optional[str], "Style hint (realistic, cartoon, cinematic)"] = None,
) -> str:
    """Unified media generation tool.

    Auto-selects the best available tool, auto-starts services if needed,
    and returns results in a consistent JSON format.

    Runtime capability-aware: checks what this node can do before attempting.
    Returns clear guidance when a modality is unavailable (not cryptic errors).

    NOTE(review): input_audio and input_image are accepted for interface
    stability but are not currently forwarded to any handler.

    Returns:
        JSON string containing at least 'status'; every exit path also
        includes 'generation_time_seconds'.
    """
    t0 = time.time()
    modality = output_modality.lower().strip()

    if modality not in VALID_MODALITIES:
        result = {
            'status': 'error',
            'error': f"Invalid output_modality '{output_modality}'. "
                     f"Valid: {sorted(VALID_MODALITIES)}",
        }
        result['generation_time_seconds'] = round(time.time() - t0, 2)
        return json.dumps(result)

    # Runtime capability gate — universal orchestrator check.
    # Maps modality → (model_type, required_capability) for _can_do().
    _MODALITY_TO_CHECK = {
        'audio_speech': ('tts', None),
        'audio_speech_music': ('tts', None),
        'audio_music': ('audio_gen', 'music_gen'),
        'video': ('video_gen', 'txt2vid'),
        'video_with_audio': ('video_gen', 'txt2vid'),
        'image': ('image_gen', None),
    }
    check = _MODALITY_TO_CHECK.get(modality)
    if check and not _can_do(*check):
        return json.dumps({
            'status': 'unavailable',
            'error': f'{modality} not available on this node right now.',
            'modality': modality,
            'suggestion': f'Describe the {modality.replace("_", " ")} in text instead, '
                          f'or delegate to a node with {check[0]} capability.',
            # Consistency fix: every exit path reports elapsed time.
            'generation_time_seconds': round(time.time() - t0, 2),
        })

    try:
        if modality == 'image':
            result = _generate_image(context, input_text, style)

        elif modality == 'audio_speech':
            result = _generate_audio_speech(context, input_text, duration)

        elif modality == 'audio_music':
            result = _generate_audio_music(context, input_text, duration, style)

        elif modality == 'audio_speech_music':
            # Generate both speech (sync) and music (usually async).
            speech = _generate_audio_speech(context, input_text, duration)
            music = _generate_audio_music(context, input_text, duration, style)
            results = []
            if speech.get('status') == 'completed':
                results.extend(speech.get('results', []))
            if music.get('status') == 'completed':
                results.extend(music.get('results', []))
            # If music is pending, include task info so the caller can poll.
            if music.get('status') == 'pending':
                result = {
                    'status': 'partial',
                    'output_modality': 'audio_speech_music',
                    'results': results,
                    'pending_task_id': music.get('task_id'),
                    'poll_tool': 'check_media_status',
                    'message': 'Speech complete. Music generation pending.',
                }
            elif results:
                result = {
                    'status': 'completed',
                    'output_modality': 'audio_speech_music',
                    'results': results,
                }
            else:
                result = {
                    'status': 'error',
                    'output_modality': 'audio_speech_music',
                    'error': 'Both speech and music generation failed',
                    'speech_error': speech.get('error'),
                    'music_error': music.get('error'),
                }

        elif modality == 'video':
            result = _generate_video(context, input_text, duration, style, model)

        elif modality == 'video_with_audio':
            # Generate video + speech narration in one call.
            video = _generate_video(context, input_text, duration, style, model)
            speech = _generate_audio_speech(context, input_text, duration)
            results = []
            if video.get('status') == 'completed':
                results.extend(video.get('results', []))
            if speech.get('status') == 'completed':
                results.extend(speech.get('results', []))
            if video.get('status') == 'pending':
                # Video is async; attach any speech that already finished.
                result = {
                    'status': 'pending',
                    'output_modality': 'video_with_audio',
                    'task_id': video.get('task_id'),
                    'poll_tool': 'check_media_status',
                    'speech_results': speech.get('results', []),
                    'message': 'Video generation pending. Speech may be ready.',
                }
            elif results:
                result = {
                    'status': 'completed',
                    'output_modality': 'video_with_audio',
                    'results': results,
                }
            else:
                result = {
                    'status': 'error',
                    'output_modality': 'video_with_audio',
                    'error': 'Media generation failed',
                    'video_error': video.get('error'),
                    'speech_error': speech.get('error'),
                }
        else:
            # Unreachable while VALID_MODALITIES and the dispatch agree.
            result = {'status': 'error', 'error': f'Unhandled modality: {modality}'}

    except Exception as e:
        logger.error(f"generate_media error: {e}", exc_info=True)
        result = {'status': 'error', 'error': str(e)}

    elapsed = round(time.time() - t0, 2)
    result['generation_time_seconds'] = elapsed

    # Feed generation result to HevolveAI for dense error signal learning
    # Success: generated output becomes prediction target
    # Error: error pattern informs modality routing confidence
    try:
        from integrations.agent_engine.world_model_bridge import get_world_model_bridge
        bridge = get_world_model_bridge()
        bridge.submit_output_feedback(
            output_modality=result.get('output_modality', modality),
            status=result.get('status', 'error'),
            context=context[:2000],
            model_used=result.get('model_used', 'unknown'),
            error_message=result.get('error'),
            generation_time_seconds=elapsed,
        )
    except Exception as e:
        # Feedback is best-effort; never fail the generation over it.
        logger.debug(f"[MediaAgent] Output feedback to HevolveAI skipped: {e}")

    return json.dumps(result)
def check_media_status(
    task_id: Annotated[str, "Task ID from a pending generate_media call (e.g. 'wan2gp_abc123')"],
) -> str:
    """Check status of an async media generation task.

    The tool prefix embedded in task_id selects which backend to query.
    Returns JSON with status, progress percentage, and URL when done.
    """
    if '_' not in task_id:
        return json.dumps({'status': 'error',
                           'error': f'Invalid task_id format: {task_id}'})

    tool_prefix, raw_id = task_id.split('_', 1)

    # Resolve (base_url, path) for the backend that owns this task.
    if tool_prefix == 'wan2gp':
        base_url, check_path = _get_tool_base_url('wan2gp'), '/check_result'
    elif tool_prefix == 'acestep':
        base_url = _get_tool_base_url('acestep') or 'http://localhost:8001'
        check_path = '/query_result'
    elif tool_prefix == 'ltx2':
        base_url, check_path = 'http://localhost:5002', '/check_result'
    else:
        return json.dumps({'status': 'error',
                           'error': f'Unknown tool prefix: {tool_prefix}'})

    if not base_url:
        return json.dumps({'status': 'error',
                           'error': f'{tool_prefix} service not available'})

    try:
        from core.http_pool import pooled_post
        resp = pooled_post(
            f"{base_url}{check_path}",
            json={'task_id': raw_id},
            headers={'Content-Type': 'application/json'},
            timeout=30,
        )
        if resp.status_code != 200:
            return json.dumps({'status': 'error',
                               'error': f'HTTP {resp.status_code}'})

        data = resp.json()
        # Normalize the heterogeneous backend responses to one shape.
        status = data.get('status', 'unknown')
        result_url = (data.get('video_url') or data.get('audio_url')
                      or data.get('url') or data.get('output_url', ''))
        out = {
            'task_id': task_id,
            'status': status,
            'progress': data.get('progress', data.get('percentage', 0)),
        }
        if result_url and status in ('completed', 'done', 'finished'):
            kind = 'video' if tool_prefix in ('wan2gp', 'ltx2') else 'audio'
            out['results'] = [{'type': kind, 'url': result_url}]
        return json.dumps(out)
    except Exception as e:
        return json.dumps({'status': 'error', 'error': str(e)})
693# ═══════════════════════════════════════════════════════════════
def synthesize_multilingual_audio(
    text: Annotated[str, (
        "Text to synthesize. May contain multiple languages (auto-detected by script) "
        "and media tags: <music genre='jazz' duration='10'>prompt</music>, "
        "<sing duration='15'>lyrics</sing>, <lyrics>song text</lyrics>. "
        "Each segment is routed to the best available engine."
    )],
    output_path: Annotated[Optional[str], "Path for output WAV. Auto-generated if omitted."] = None,
    task_id: Annotated[Optional[str], "Agent ledger task_id for pause/resume tracking."] = None,
) -> str:
    """Synthesize mixed-language text + media tags into one audio file.

    Compute-aware: uses ModelOrchestrator to select the best model per
    segment. Agents can pause/resume via the agent_ledger task_id.
    Returns JSON with status, output_path, and degraded_segments (if any
    segment type was unavailable on this node).
    """
    # Runtime capability gate: check what this node can actually do.
    if not _can_do('tts'):
        return json.dumps({
            'status': 'unavailable',
            'error': 'Audio synthesis not available on this node (text-only mode).',
            'suggestion': 'Return text content directly — the user will read it.',
        })

    try:
        from tts.tts_engine import get_tts_engine
        engine = get_tts_engine()
        if not engine:
            return json.dumps({'status': 'error', 'error': 'TTS engine not available'})

        from tts.language_segmenter import segment
        segments = segment(text)
        if not segments:
            return json.dumps({'status': 'error', 'error': 'No segments found in text'})

        # Split segments into what this node can run vs. what it must skip.
        degraded, runnable = [], []
        for seg in segments:
            seg_type = seg.get('type', 'speech')
            if seg_type == 'music' and not _can_do('audio_gen', 'music_gen'):
                degraded.append({'type': seg_type, 'text': seg.get('text', ''),
                                 'reason': 'music gen service offline'})
            elif seg_type in ('sing', 'lyrics') and not _can_do('audio_gen', 'singing'):
                degraded.append({'type': seg_type, 'text': seg.get('text', ''),
                                 'reason': 'singing voice service offline'})
            else:
                runnable.append(seg)

        out_path = None
        if runnable:
            out_path = engine._synthesize_multilingual(
                runnable, output_path=output_path, task_id=task_id)

        resp = {
            'status': 'completed' if out_path else 'partial',
            'output_path': out_path,
            'segments_total': len(segments),
            'segments_synthesized': len(runnable),
            'segment_types': [s.get('type', 'speech') for s in runnable],
        }
        if degraded:
            resp['degraded_segments'] = degraded
            resp['status'] = 'partial'
        return json.dumps(resp)
    except Exception as e:
        return json.dumps({'status': 'error', 'error': str(e)})
764# ═══════════════════════════════════════════════════════════════
765# Registration
766# ═══════════════════════════════════════════════════════════════
def register_media_tools(helper, assistant):
    """Register generate_media + check_media_status as AutoGen tools.

    Called from create_recipe.py alongside other tool registrations.
    Follows the same pattern as register_marketing_tools().
    """
    tool_specs = (
        (
            'generate_media',
            'Unified media generation: create images, speech, music, or video from text. '
            'Supports output_modality: image, audio_speech, audio_music, '
            'audio_speech_music, video, video_with_audio. Auto-selects best tool.',
            generate_media,
        ),
        (
            'check_media_status',
            'Check status of an async media generation task. '
            'Pass the task_id from a pending generate_media result.',
            check_media_status,
        ),
        (
            'synthesize_multilingual_audio',
            'Synthesize mixed-language text into one audio file. '
            'Auto-detects languages by script (Tamil, Hindi, English, etc.) '
            'and routes each segment to the best TTS engine. '
            'Supports <music>, <sing>, <lyrics> tags for music/singing. '
            'Pass task_id for pause/resume via agent ledger.',
            synthesize_multilingual_audio,
        ),
    )

    # Register each tool for both LLM visibility and execution.
    for tool_name, description, fn in tool_specs:
        helper.register_for_llm(name=tool_name, description=description)(fn)
        assistant.register_for_execution(name=tool_name)(fn)

    logger.info(f"Registered {len(tool_specs)} media generation tools")