Coverage for integrations / service_tools / media_agent.py: 49.1%
275 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Unified Media Generation Agent — single AutoGen tool for all media modalities.
4Any agent in the system can call `generate_media()` to produce content across
5image, audio (speech/music), and video modalities. The agent auto-selects the
6best available tool, auto-starts services if needed, and returns results in a
7consistent JSON format.
9Routing table:
10 image → txt2img (external service)
11 audio_speech → tts_audio_suite (auto-start sidecar)
12 audio_music → acestep (external: uv run acestep-api)
13 audio_speech_music → tts_audio_suite + acestep
14 video → wan2gp (VRAM >= 8GB) | ltx2 fallback
15 video_with_audio → video tool + tts_audio_suite
17Companion tool: `check_media_status()` for polling async tasks.
18"""
20import json
21import logging
22import time
23from typing import Annotated, Optional
# Module-level logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)

# Output modalities accepted by generate_media(). See the routing table in
# the module docstring for how each modality is served.
VALID_MODALITIES = {
    'image', 'audio_speech', 'audio_music',
    'audio_speech_music', 'video', 'video_with_audio',
}
34# ═══════════════════════════════════════════════════════════════
35# Runtime capability introspection
36# ═══════════════════════════════════════════════════════════════
38def _can_do(model_type: str, capability: str = None) -> bool:
39 """Universal capability check — delegates to orchestrator.
41 Works for any model type or dynamic service category.
42 Single source of truth: orchestrator merges catalog + services + runtime.
43 """
44 try:
45 from integrations.service_tools.model_orchestrator import get_orchestrator
46 return get_orchestrator().can_do(model_type, capability)
47 except Exception:
48 return False
51# ═══════════════════════════════════════════════════════════════
52# Auto-start helpers
53# ═══════════════════════════════════════════════════════════════
def _ensure_tool_running(tool_name: str) -> bool:
    """Ensure *tool_name* is up, launching it on demand.

    Returns True when the tool is (or becomes) available, False otherwise.
    """
    try:
        from integrations.service_tools.runtime_manager import runtime_tool_manager

        if runtime_tool_manager.get_tool_status(tool_name).get('running'):
            return True
        # Not running yet — attempt an auto-start and report the outcome.
        return runtime_tool_manager.setup_tool(tool_name).get('running', False)
    except Exception as e:
        logger.warning(f"Auto-start failed for {tool_name}: {e}")
        return False
def populate_videogen_catalog(catalog) -> int:
    """Register all video generation model variants into the ModelCatalog.

    Single source of truth for video gen model names, VRAM thresholds, and
    capabilities — replacing the hardcoded free_gb >= 8.0 threshold in
    _select_video_tool().

    Called by ModelCatalog._populate_videogen_models().
    Returns number of new entries added.
    """
    from integrations.service_tools.model_catalog import ModelEntry, ModelType

    # Declarative specs; one dict per model variant.
    specs = [
        {
            'id': 'video_gen-wan2gp', 'name': 'Wan2GP',
            'vram': 8.0, 'ram': 12.0, 'disk': 15.0,
            'quality': 0.88, 'speed': 0.65, 'tier': 'full',
            'cpu': False, 'offload': False,
            'caps': {'txt2vid': True, 'img2vid': True, 'resolution': '512x320',
                     'fps': 24, 'async_task': True},
        },
        {
            'id': 'video_gen-ltx2', 'name': 'LTX-Video-2',
            'vram': 4.0, 'ram': 8.0, 'disk': 10.0,
            'quality': 0.78, 'speed': 0.78, 'tier': 'standard',
            'cpu': True, 'offload': True,
            'caps': {'txt2vid': True, 'img2vid': False, 'resolution': '832x480',
                     'fps': 24, 'async_task': True, 'cpu_offload': True},
        },
    ]

    added = 0
    for spec in specs:
        if catalog.get(spec['id']) is not None:
            continue  # already registered — keep the existing entry
        catalog.register(ModelEntry(
            id=spec['id'], name=spec['name'], model_type=ModelType.VIDEO_GEN,
            source='huggingface',
            vram_gb=spec['vram'], ram_gb=spec['ram'], disk_gb=spec['disk'],
            min_capability_tier=spec['tier'],
            backend='sidecar',
            supports_gpu=True, supports_cpu=spec['cpu'],
            supports_cpu_offload=spec['offload'],
            cpu_offload_method='restart_cpu' if spec['offload'] else 'none',
            idle_timeout_s=600,
            capabilities=spec['caps'],
            quality_score=spec['quality'], speed_score=spec['speed'],
            tags=['local', 'video_gen'],
        ), persist=False)
        added += 1
    return added
def populate_audiogen_catalog(catalog) -> int:
    """Register audio generation models (music, singing) into ModelCatalog.

    Same pattern as populate_videogen_catalog — capabilities-based routing
    so the orchestrator can select the right model for music/singing tasks.

    Called by ModelCatalog._populate_audiogen_models().
    Returns number of new entries added.
    """
    from integrations.service_tools.model_catalog import ModelEntry, ModelType

    # Declarative specs; one dict per model variant.
    specs = [
        {
            'id': 'audio_gen-acestep', 'name': 'ACE-Step 1.5',
            'vram': 6.0, 'ram': 6.0, 'disk': 4.0,
            'quality': 0.85, 'speed': 0.90, 'tier': 'standard',
            'cpu': False, 'offload': False,
            'caps': {'music_gen': True, 'singing': True, 'lyrics_input': True,
                     'genre_control': True, 'tempo_control': True,
                     'max_duration_s': 120, 'async_task': True},
        },
        {
            'id': 'audio_gen-diffrhythm', 'name': 'DiffRhythm v1.2',
            'vram': 4.0, 'ram': 4.0, 'disk': 3.0,
            'quality': 0.80, 'speed': 0.75, 'tier': 'standard',
            'cpu': True, 'offload': True,
            'caps': {'music_gen': False, 'singing': True, 'singing_voice': True,
                     'lyrics_input': True, 'voice_conversion': True,
                     'max_duration_s': 60, 'async_task': False},
        },
    ]

    added = 0
    for spec in specs:
        if catalog.get(spec['id']) is not None:
            continue  # already registered — keep the existing entry
        catalog.register(ModelEntry(
            id=spec['id'], name=spec['name'], model_type=ModelType.AUDIO_GEN,
            source='huggingface',
            vram_gb=spec['vram'], ram_gb=spec['ram'], disk_gb=spec['disk'],
            min_capability_tier=spec['tier'],
            backend='sidecar',
            supports_gpu=True, supports_cpu=spec['cpu'],
            supports_cpu_offload=spec['offload'],
            cpu_offload_method='restart_cpu' if spec['offload'] else 'none',
            idle_timeout_s=600,
            capabilities=spec['caps'],
            quality_score=spec['quality'], speed_score=spec['speed'],
            tags=['local', 'audio_gen'],
        ), persist=False)
        added += 1
    return added
178def _select_audio_tool(task: str = 'music') -> str:
179 """Select the best audio generation tool for a task.
181 Consults ModelCatalog via orchestrator — same pattern as _select_video_tool.
182 task: 'music' → ACE Step, 'sing'/'lyrics' → DiffRhythm (fallback ACE Step)
183 """
184 cap_key = 'singing_voice' if task in ('sing', 'lyrics') else 'music_gen'
185 try:
186 from integrations.service_tools.model_orchestrator import get_orchestrator
187 entry = get_orchestrator().select_best(
188 'audio_gen', require_capability={cap_key: True})
189 if entry:
190 _CATALOG_TO_TOOL = {
191 'audio_gen-acestep': 'acestep',
192 'audio_gen-diffrhythm': 'diffrhythm',
193 }
194 tool = _CATALOG_TO_TOOL.get(entry.id)
195 if tool:
196 return tool
197 except Exception:
198 pass
199 # Fallback defaults
200 return 'diffrhythm' if task in ('sing', 'lyrics') else 'acestep'
203def _select_video_tool() -> str:
204 """Select the best video generation tool for current hardware.
206 Consults ModelCatalog (single source of truth for VRAM thresholds).
207 Falls back to direct VRAM query if catalog is unavailable.
209 Returns 'wan2gp' or 'ltx2'.
210 """
211 # ── Primary path: ask the catalog/orchestrator ───────────────────────────
212 try:
213 from integrations.service_tools.model_orchestrator import get_orchestrator
214 entry = get_orchestrator().select_best('video_gen')
215 if entry:
216 # Map catalog ID → tool name used by service_tool_registry
217 _CATALOG_TO_TOOL = {
218 'video_gen-wan2gp': 'wan2gp',
219 'video_gen-ltx2': 'ltx2',
220 }
221 tool = _CATALOG_TO_TOOL.get(entry.id)
222 if tool:
223 return tool
224 except Exception:
225 pass
227 # ── Fallback: direct VRAM query ──────────────────────────────────────────
228 try:
229 from integrations.service_tools.vram_manager import vram_manager
230 info = vram_manager.detect_gpu()
231 free_gb = info.get('free_gb', 0)
232 if free_gb >= 8.0:
233 return 'wan2gp'
234 except Exception:
235 pass
236 return 'ltx2'
239def _get_tool_base_url(tool_name: str) -> Optional[str]:
240 """Get the base URL for a registered tool."""
241 try:
242 from integrations.service_tools.registry import service_tool_registry
243 tool = service_tool_registry._tools.get(tool_name)
244 if tool:
245 return tool.base_url.rstrip('/')
246 except Exception:
247 pass
248 return None
251# ═══════════════════════════════════════════════════════════════
252# Modality handlers
253# ═══════════════════════════════════════════════════════════════
255def _generate_image(context: str, input_text: str, style: str) -> dict:
256 """Route to txt2img external service."""
257 prompt = input_text or context
258 if style:
259 prompt = f"{prompt}, {style} style"
260 try:
261 from helper import txt2img
262 img_url = txt2img(prompt)
263 return {
264 'status': 'completed',
265 'output_modality': 'image',
266 'results': [{'type': 'image', 'url': img_url, 'format': 'png'}],
267 'model_used': 'txt2img',
268 }
269 except Exception as e:
270 return {'status': 'error', 'error': str(e), 'output_modality': 'image'}
def _generate_audio_speech(context: str, input_text: str, duration: int) -> dict:
    """Synthesize speech via TTS-Audio-Suite (auto-started if needed).

    NOTE(review): *duration* is accepted for signature parity but not
    forwarded — the TTS service appears to derive length from the text.
    """
    text = input_text if input_text else context

    if not _ensure_tool_running('tts_audio_suite'):
        return {
            'status': 'error',
            'error': 'TTS-Audio-Suite not available and auto-start failed',
            'output_modality': 'audio_speech',
        }

    base_url = _get_tool_base_url('tts_audio_suite')
    if not base_url:
        return {'status': 'error', 'error': 'TTS-Audio-Suite not registered',
                'output_modality': 'audio_speech'}

    try:
        from core.http_pool import pooled_post
        resp = pooled_post(
            f"{base_url}/synthesize",
            json={'text': text},
            headers={'Content-Type': 'application/json'},
            timeout=120,
        )
        if resp.status_code != 200:
            return {'status': 'error', 'error': f'TTS HTTP {resp.status_code}',
                    'output_modality': 'audio_speech'}
        payload = resp.json()
        return {
            'status': 'completed',
            'output_modality': 'audio_speech',
            'results': [{'type': 'audio',
                         'url': payload.get('audio_url') or payload.get('url', ''),
                         'format': 'wav'}],
            'model_used': 'tts_audio_suite',
        }
    except Exception as e:
        return {'status': 'error', 'error': str(e),
                'output_modality': 'audio_speech'}
def _generate_audio_music(context: str, input_text: str,
                          duration: int, style: str) -> dict:
    """Kick off async music generation on AceStep; returns a pollable task."""
    prompt = input_text if input_text else context
    if style:
        prompt = f"[{style}] {prompt}"

    # Registered URL if available, otherwise the conventional default port.
    base_url = _get_tool_base_url('acestep') or 'http://localhost:8001'

    payload = {'prompt': prompt, 'duration': duration or 30}
    if style:
        payload['genre'] = style

    try:
        from core.http_pool import pooled_post
        resp = pooled_post(
            f"{base_url}/release_task",
            json=payload,
            headers={'Content-Type': 'application/json'},
            timeout=30,
        )
        if resp.status_code != 200:
            return {'status': 'error', 'error': f'AceStep HTTP {resp.status_code}',
                    'output_modality': 'audio_music'}
        task_id = resp.json().get('task_id', '')
        return {
            'status': 'pending',
            'output_modality': 'audio_music',
            'task_id': f'acestep_{task_id}',
            'poll_tool': 'check_media_status',
            'message': 'Music generation started. Use check_media_status(task_id) to check progress.',
            'model_used': 'acestep',
        }
    except Exception as e:
        return {'status': 'error', 'error': str(e),
                'output_modality': 'audio_music'}
def _generate_video(context: str, input_text: str,
                    duration: int, style: str, model: str) -> dict:
    """Dispatch video generation to wan2gp or ltx2."""
    prompt = input_text if input_text else context
    if style:
        prompt = f"{prompt}, {style}"

    # Honor an explicit model choice; otherwise select by hardware/catalog.
    tool = _select_video_tool() if model == 'auto' else model

    # Wan2GP requires its sidecar — degrade to LTX2 if it won't start.
    if tool == 'wan2gp' and not _ensure_tool_running('wan2gp'):
        tool = 'ltx2'

    if tool == 'wan2gp':
        return _generate_video_wan2gp(prompt, duration)
    return _generate_video_ltx2(prompt, duration)
def _generate_video_wan2gp(prompt: str, duration: int) -> dict:
    """Submit an async video generation task to Wan2GP."""
    base_url = _get_tool_base_url('wan2gp')
    if not base_url:
        return {'status': 'error', 'error': 'Wan2GP not registered',
                'output_modality': 'video'}

    # ~24fps, duration in seconds → frames (minimum 49 frames)
    frames = max(49, (duration or 2) * 24 + 1)
    try:
        from core.http_pool import pooled_post
        resp = pooled_post(
            f"{base_url}/generate",
            json={'prompt': prompt, 'num_frames': frames,
                  'width': 512, 'height': 320},
            headers={'Content-Type': 'application/json'},
            timeout=30,
        )
        if resp.status_code != 200:
            return {'status': 'error', 'error': f'Wan2GP HTTP {resp.status_code}',
                    'output_modality': 'video'}
        task_id = resp.json().get('task_id', '')
        return {
            'status': 'pending',
            'output_modality': 'video',
            'task_id': f'wan2gp_{task_id}',
            'poll_tool': 'check_media_status',
            'message': 'Video generation started. Use check_media_status(task_id) to check progress.',
            'model_used': 'wan2gp',
        }
    except Exception as e:
        return {'status': 'error', 'error': str(e), 'output_modality': 'video'}
413def _generate_video_ltx2(prompt: str, duration: int) -> dict:
414 """Submit video generation to LTX2 server (port 5002)."""
415 ltx_url = 'http://localhost:5002'
416 try:
417 from core.http_pool import pooled_post
418 num_frames = max(49, (duration or 2) * 24 + 1)
419 resp = pooled_post(
420 f"{ltx_url}/generate",
421 json={
422 'prompt': prompt,
423 'negative_prompt': 'worst quality, inconsistent motion, blurry',
424 'num_frames': num_frames,
425 'width': 832, 'height': 480,
426 'num_inference_steps': 30,
427 'guidance_scale': 3.0,
428 'fps': 24,
429 },
430 headers={'Content-Type': 'application/json'},
431 timeout=600,
432 )
433 if resp.status_code == 200:
434 data = resp.json()
435 video_url = (data.get('video_url') or data.get('output_url')
436 or data.get('video_path', ''))
437 if video_url:
438 return {
439 'status': 'completed',
440 'output_modality': 'video',
441 'results': [{'type': 'video', 'url': video_url,
442 'format': 'mp4'}],
443 'model_used': 'ltx2',
444 }
445 # Async task pattern
446 task_id = data.get('task_id', '')
447 if task_id:
448 return {
449 'status': 'pending',
450 'output_modality': 'video',
451 'task_id': f'ltx2_{task_id}',
452 'poll_tool': 'check_media_status',
453 'message': 'Video generation started. Use check_media_status(task_id).',
454 'model_used': 'ltx2',
455 }
456 return {'status': 'error', 'error': f'LTX2 HTTP {resp.status_code}',
457 'output_modality': 'video'}
458 except Exception as e:
459 return {'status': 'error', 'error': str(e), 'output_modality': 'video'}
462# ═══════════════════════════════════════════════════════════════
463# Core AutoGen tool functions
464# ═══════════════════════════════════════════════════════════════
def generate_media(
    context: Annotated[str, "What to generate — a natural language description"],
    output_modality: Annotated[str, (
        "Output type: 'image' | 'audio_speech' | 'audio_music' | "
        "'audio_speech_music' | 'video' | 'video_with_audio'"
    )],
    input_text: Annotated[Optional[str], "Text input (prompt, lyrics, script)"] = None,
    input_audio: Annotated[Optional[str], "Path to audio file (for voice cloning)"] = None,
    input_image: Annotated[Optional[str], "Path to image file (for img2vid)"] = None,
    model: Annotated[str, "Model: 'auto' or specific name"] = "auto",
    duration: Annotated[Optional[int], "Duration in seconds (audio/video)"] = None,
    style: Annotated[Optional[str], "Style hint (realistic, cartoon, cinematic)"] = None,
) -> str:
    """Unified media generation tool.

    Auto-selects the best available tool, auto-starts services if needed,
    and returns results in a consistent JSON format.

    Runtime capability-aware: checks what this node can do before attempting.
    Returns clear guidance when a modality is unavailable (not cryptic errors).

    NOTE(review): input_audio and input_image are accepted for interface
    stability but are not currently forwarded to any handler.

    Returns:
        JSON string containing at least 'status'; every exit path also
        includes 'generation_time_seconds'.
    """
    t0 = time.time()
    modality = output_modality.lower().strip()

    if modality not in VALID_MODALITIES:
        result = {
            'status': 'error',
            'error': f"Invalid output_modality '{output_modality}'. "
                     f"Valid: {sorted(VALID_MODALITIES)}",
        }
        result['generation_time_seconds'] = round(time.time() - t0, 2)
        return json.dumps(result)

    # Runtime capability gate — universal orchestrator check.
    # Maps modality → (model_type, required_capability) for _can_do().
    _MODALITY_TO_CHECK = {
        'audio_speech': ('tts', None),
        'audio_speech_music': ('tts', None),
        'audio_music': ('audio_gen', 'music_gen'),
        'video': ('video_gen', 'txt2vid'),
        'video_with_audio': ('video_gen', 'txt2vid'),
        'image': ('image_gen', None),
    }
    check = _MODALITY_TO_CHECK.get(modality)
    if check and not _can_do(*check):
        return json.dumps({
            'status': 'unavailable',
            'error': f'{modality} not available on this node right now.',
            'modality': modality,
            'suggestion': f'Describe the {modality.replace("_", " ")} in text instead, '
                          f'or delegate to a node with {check[0]} capability.',
            # Consistency fix: every exit path reports elapsed time.
            'generation_time_seconds': round(time.time() - t0, 2),
        })

    try:
        if modality == 'image':
            result = _generate_image(context, input_text, style)

        elif modality == 'audio_speech':
            result = _generate_audio_speech(context, input_text, duration)

        elif modality == 'audio_music':
            result = _generate_audio_music(context, input_text, duration, style)

        elif modality == 'audio_speech_music':
            # Generate both speech (sync) and music (usually async).
            speech = _generate_audio_speech(context, input_text, duration)
            music = _generate_audio_music(context, input_text, duration, style)
            results = []
            if speech.get('status') == 'completed':
                results.extend(speech.get('results', []))
            if music.get('status') == 'completed':
                results.extend(music.get('results', []))
            # If music is pending, include task info so the caller can poll.
            if music.get('status') == 'pending':
                result = {
                    'status': 'partial',
                    'output_modality': 'audio_speech_music',
                    'results': results,
                    'pending_task_id': music.get('task_id'),
                    'poll_tool': 'check_media_status',
                    'message': 'Speech complete. Music generation pending.',
                }
            elif results:
                result = {
                    'status': 'completed',
                    'output_modality': 'audio_speech_music',
                    'results': results,
                }
            else:
                result = {
                    'status': 'error',
                    'output_modality': 'audio_speech_music',
                    'error': 'Both speech and music generation failed',
                    'speech_error': speech.get('error'),
                    'music_error': music.get('error'),
                }

        elif modality == 'video':
            result = _generate_video(context, input_text, duration, style, model)

        elif modality == 'video_with_audio':
            # Generate video + speech narration in one call.
            video = _generate_video(context, input_text, duration, style, model)
            speech = _generate_audio_speech(context, input_text, duration)
            results = []
            if video.get('status') == 'completed':
                results.extend(video.get('results', []))
            if speech.get('status') == 'completed':
                results.extend(speech.get('results', []))
            if video.get('status') == 'pending':
                # Video is async; attach any speech that already finished.
                result = {
                    'status': 'pending',
                    'output_modality': 'video_with_audio',
                    'task_id': video.get('task_id'),
                    'poll_tool': 'check_media_status',
                    'speech_results': speech.get('results', []),
                    'message': 'Video generation pending. Speech may be ready.',
                }
            elif results:
                result = {
                    'status': 'completed',
                    'output_modality': 'video_with_audio',
                    'results': results,
                }
            else:
                result = {
                    'status': 'error',
                    'output_modality': 'video_with_audio',
                    'error': 'Media generation failed',
                    'video_error': video.get('error'),
                    'speech_error': speech.get('error'),
                }
        else:
            # Unreachable while VALID_MODALITIES and the dispatch agree.
            result = {'status': 'error', 'error': f'Unhandled modality: {modality}'}

    except Exception as e:
        logger.error(f"generate_media error: {e}", exc_info=True)
        result = {'status': 'error', 'error': str(e)}

    elapsed = round(time.time() - t0, 2)
    result['generation_time_seconds'] = elapsed

    # Feed generation result to HevolveAI for dense error signal learning
    # Success: generated output becomes prediction target
    # Error: error pattern informs modality routing confidence
    try:
        from integrations.agent_engine.world_model_bridge import get_world_model_bridge
        bridge = get_world_model_bridge()
        bridge.submit_output_feedback(
            output_modality=result.get('output_modality', modality),
            status=result.get('status', 'error'),
            context=context[:2000],
            model_used=result.get('model_used', 'unknown'),
            error_message=result.get('error'),
            generation_time_seconds=elapsed,
        )
    except Exception as e:
        # Feedback is best-effort; never fail the generation over it.
        logger.debug(f"[MediaAgent] Output feedback to HevolveAI skipped: {e}")

    return json.dumps(result)
def check_media_status(
    task_id: Annotated[str, "Task ID from a pending generate_media call (e.g. 'wan2gp_abc123')"],
) -> str:
    """Check status of an async media generation task.

    The tool prefix embedded in task_id selects which backend to query.
    Returns JSON with status, progress percentage, and URL when done.
    """
    if '_' not in task_id:
        return json.dumps({'status': 'error',
                           'error': f'Invalid task_id format: {task_id}'})

    tool_prefix, raw_id = task_id.split('_', 1)

    # Resolve (base_url, path) for the backend that owns this task.
    if tool_prefix == 'wan2gp':
        base_url, check_path = _get_tool_base_url('wan2gp'), '/check_result'
    elif tool_prefix == 'acestep':
        base_url = _get_tool_base_url('acestep') or 'http://localhost:8001'
        check_path = '/query_result'
    elif tool_prefix == 'ltx2':
        base_url, check_path = 'http://localhost:5002', '/check_result'
    else:
        return json.dumps({'status': 'error',
                           'error': f'Unknown tool prefix: {tool_prefix}'})

    if not base_url:
        return json.dumps({'status': 'error',
                           'error': f'{tool_prefix} service not available'})

    try:
        from core.http_pool import pooled_post
        resp = pooled_post(
            f"{base_url}{check_path}",
            json={'task_id': raw_id},
            headers={'Content-Type': 'application/json'},
            timeout=30,
        )
        if resp.status_code != 200:
            return json.dumps({'status': 'error',
                               'error': f'HTTP {resp.status_code}'})

        data = resp.json()
        # Normalize the heterogeneous backend responses to one shape.
        status = data.get('status', 'unknown')
        result_url = (data.get('video_url') or data.get('audio_url')
                      or data.get('url') or data.get('output_url', ''))
        out = {
            'task_id': task_id,
            'status': status,
            'progress': data.get('progress', data.get('percentage', 0)),
        }
        if result_url and status in ('completed', 'done', 'finished'):
            kind = 'video' if tool_prefix in ('wan2gp', 'ltx2') else 'audio'
            out['results'] = [{'type': kind, 'url': result_url}]
        return json.dumps(out)
    except Exception as e:
        return json.dumps({'status': 'error', 'error': str(e)})
693# ═══════════════════════════════════════════════════════════════
def synthesize_multilingual_audio(
    text: Annotated[str, (
        "Text to synthesize. May contain multiple languages (auto-detected by script) "
        "and media tags: <music genre='jazz' duration='10'>prompt</music>, "
        "<sing duration='15'>lyrics</sing>, <lyrics>song text</lyrics>. "
        "Each segment is routed to the best available engine."
    )],
    output_path: Annotated[Optional[str], "Path for output WAV. Auto-generated if omitted."] = None,
    task_id: Annotated[Optional[str], "Agent ledger task_id for pause/resume tracking."] = None,
) -> str:
    """Synthesize mixed-language text + media tags into one audio file.

    Compute-aware: uses ModelOrchestrator to select the best model per
    segment. Agents can pause/resume via the agent_ledger task_id.
    Returns JSON with status, output_path, and degraded_segments (if any
    segment type was unavailable on this node).
    """
    # Runtime capability gate: check what this node can actually do.
    if not _can_do('tts'):
        return json.dumps({
            'status': 'unavailable',
            'error': 'Audio synthesis not available on this node (text-only mode).',
            'suggestion': 'Return text content directly — the user will read it.',
        })

    try:
        from tts.tts_engine import get_tts_engine
        engine = get_tts_engine()
        if not engine:
            return json.dumps({'status': 'error', 'error': 'TTS engine not available'})

        from tts.language_segmenter import segment
        segments = segment(text)
        if not segments:
            return json.dumps({'status': 'error', 'error': 'No segments found in text'})

        # Split segments into what this node can run vs. what it must skip.
        degraded, runnable = [], []
        for seg in segments:
            seg_type = seg.get('type', 'speech')
            if seg_type == 'music' and not _can_do('audio_gen', 'music_gen'):
                degraded.append({'type': seg_type, 'text': seg.get('text', ''),
                                 'reason': 'music gen service offline'})
            elif seg_type in ('sing', 'lyrics') and not _can_do('audio_gen', 'singing'):
                degraded.append({'type': seg_type, 'text': seg.get('text', ''),
                                 'reason': 'singing voice service offline'})
            else:
                runnable.append(seg)

        out_path = None
        if runnable:
            out_path = engine._synthesize_multilingual(
                runnable, output_path=output_path, task_id=task_id)

        resp = {
            'status': 'completed' if out_path else 'partial',
            'output_path': out_path,
            'segments_total': len(segments),
            'segments_synthesized': len(runnable),
            'segment_types': [s.get('type', 'speech') for s in runnable],
        }
        if degraded:
            resp['degraded_segments'] = degraded
            resp['status'] = 'partial'
        return json.dumps(resp)
    except Exception as e:
        return json.dumps({'status': 'error', 'error': str(e)})
764# ═══════════════════════════════════════════════════════════════
765# Registration
766# ═══════════════════════════════════════════════════════════════
def register_media_tools(helper, assistant):
    """Register generate_media + check_media_status as AutoGen tools.

    Called from create_recipe.py alongside other tool registrations.
    Follows the same pattern as register_marketing_tools().
    """
    tool_specs = (
        (
            'generate_media',
            'Unified media generation: create images, speech, music, or video from text. '
            'Supports output_modality: image, audio_speech, audio_music, '
            'audio_speech_music, video, video_with_audio. Auto-selects best tool.',
            generate_media,
        ),
        (
            'check_media_status',
            'Check status of an async media generation task. '
            'Pass the task_id from a pending generate_media result.',
            check_media_status,
        ),
        (
            'synthesize_multilingual_audio',
            'Synthesize mixed-language text into one audio file. '
            'Auto-detects languages by script (Tamil, Hindi, English, etc.) '
            'and routes each segment to the best TTS engine. '
            'Supports <music>, <sing>, <lyrics> tags for music/singing. '
            'Pass task_id for pause/resume via agent ledger.',
            synthesize_multilingual_audio,
        ),
    )

    # Register each tool for both LLM visibility and execution.
    for tool_name, description, fn in tool_specs:
        helper.register_for_llm(name=tool_name, description=description)(fn)
        assistant.register_for_execution(name=tool_name)(fn)

    logger.info(f"Registered {len(tool_specs)} media generation tools")