Coverage for integrations / channels / media / tts.py: 56.8%
271 statements
« prev ^ index » next — coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Text-to-Speech System for audio synthesis.
4Active providers: LuxTTS (offline, 24kHz, voice cloning), Pocket TTS (offline, CPU, MIT).
5Cloud providers (openai, elevenlabs, edge, google, amazon) are disabled — HART OS is
6offline-first with no closed-source TTS dependencies.
7"""
9import asyncio
10from dataclasses import dataclass, field
11from enum import Enum
12from typing import Optional, List, Dict, Any, Union
13from pathlib import Path
14import logging
15import os
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Docker-compatible paths
# Scratch directory for synthesized audio; overridable via TTS_TEMP_DIR.
TEMP_DIR = os.environ.get("TTS_TEMP_DIR", "/tmp/tts")
# Application-level temp directory; overridable via APP_TEMP_DIR.
APP_TEMP_DIR = os.environ.get("APP_TEMP_DIR", "/app/temp")
class TTSProvider(Enum):
    """Supported TTS providers.

    Only LUXTTS and POCKET have synthesis implementations in TTSEngine;
    the cloud entries are kept so existing configs still parse, but they
    are disabled at runtime and return empty audio.  The GPU entries are
    declared here but have no synthesis path in this module.
    """
    LUXTTS = "luxtts"  # Offline: LuxTTS 24kHz — GPU/CPU, voice cloning, Apache 2.0
    POCKET = "pocket"  # Offline: Pocket TTS (Kyutai) — 100M params, CPU, MIT
    CHATTERBOX = "chatterbox"  # GPU: English, emotional, voice cloning, 3.8GB VRAM
    CHATTERBOX_ML = "chatterbox_ml"  # GPU: 23 languages, voice cloning, 12GB VRAM
    COSYVOICE = "cosyvoice"  # GPU: 9 languages (zh/ja/ko/de/es/fr/it/ru/en), 3.5GB
    F5 = "f5_tts"  # GPU: English + Chinese, voice cloning, 1.3GB VRAM
    INDIC_PARLER = "indic_parler"  # GPU: 22 Indic languages + English, 1.8GB VRAM
    ESPEAK = "espeak"  # CPU: 100+ languages, robotic quality, instant
    # Cloud providers — kept for config compatibility, disabled at runtime
    OPENAI = "openai"  # Disabled: closed-source cloud API
    ELEVENLABS = "elevenlabs"  # Disabled: closed-source cloud API
    EDGE = "edge"  # Disabled: closed-source cloud API
    GOOGLE = "google"  # Disabled: closed-source cloud API
    AMAZON = "amazon"  # Disabled: closed-source cloud API
class AudioFormat(Enum):
    """Supported audio output formats.

    Values are the conventional file extensions; TTSEngine.CHANNEL_FORMATS
    maps delivery channels onto these.  Note the active offline providers
    natively emit WAV regardless of the requested format.
    """
    MP3 = "mp3"
    OPUS = "opus"
    WAV = "wav"
    OGG = "ogg"
    AAC = "aac"
    FLAC = "flac"
    PCM = "pcm"  # raw samples, no container
@dataclass
class VoiceInfo:
    """Information about an available voice.

    Plain value object returned by TTSEngine.list_voices(); `metadata`
    carries provider-specific extras (e.g. voice "type").
    """
    id: str
    name: str
    language: str
    gender: Optional[str] = None
    description: Optional[str] = None
    preview_url: Optional[str] = None
    provider: Optional[str] = None
    styles: List[str] = field(default_factory=list)
    sample_rate: int = 24000
    metadata: Dict[str, Any] = field(default_factory=dict)

    # Serialization field order (matches declaration order above).
    _FIELDS = ("id", "name", "language", "gender", "description",
               "preview_url", "provider", "styles", "sample_rate", "metadata")

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of this voice.

        Mutable fields (`styles`, `metadata`) are shared with the instance,
        not copied.
        """
        return {attr: getattr(self, attr) for attr in self._FIELDS}
@dataclass
class SynthesisResult:
    """Result of a TTS synthesis operation.

    Bundles the encoded audio payload with its format and provenance.
    """
    audio: bytes
    format: AudioFormat
    duration: float
    sample_rate: int
    voice_id: str
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Summarize the result as a plain dict.

        The audio payload itself is replaced by its byte count ("size");
        `metadata` is shared with the instance, not copied.
        """
        summary: Dict[str, Any] = {"format": self.format.value}
        summary["duration"] = self.duration
        summary["sample_rate"] = self.sample_rate
        summary["voice_id"] = self.voice_id
        summary["size"] = len(self.audio)
        summary["metadata"] = self.metadata
        return summary
@dataclass
class SSMLConfig:
    """SSML synthesis configuration.

    Each field maps onto an SSML prosody/emphasis attribute; None means
    "leave unset".
    """
    rate: Optional[str] = None      # x-slow, slow, medium, fast, x-fast
    pitch: Optional[str] = None     # x-low, low, medium, high, x-high
    volume: Optional[str] = None    # silent, x-soft, soft, medium, loud, x-loud
    emphasis: Optional[str] = None  # strong, moderate, reduced
    language: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return every setting as a plain dict (unset values stay None)."""
        names = ("rate", "pitch", "volume", "emphasis", "language")
        return {name: getattr(self, name) for name in names}
class TTSEngine:
    """
    Text-to-Speech engine for audio synthesis.

    Converts plain text (or SSML) into audio bytes via a pluggable provider.
    Only the offline providers (LuxTTS, Pocket TTS) perform real synthesis;
    cloud providers are accepted for configuration compatibility but are
    disabled at runtime and return empty audio.
    """

    # Optimal audio container per delivery channel.
    CHANNEL_FORMATS = {
        "telegram": AudioFormat.OGG,
        "discord": AudioFormat.OPUS,
        "whatsapp": AudioFormat.OGG,
        "slack": AudioFormat.MP3,
        "web": AudioFormat.MP3,
        "default": AudioFormat.MP3
    }

    # Default voice ID per provider.
    DEFAULT_VOICES = {
        TTSProvider.LUXTTS: "default",
        TTSProvider.POCKET: "alba",
        TTSProvider.OPENAI: "alloy",
        TTSProvider.ELEVENLABS: "21m00Tcm4TlvDq8ikWAM",  # Rachel
        TTSProvider.EDGE: "en-US-AriaNeural",
        TTSProvider.GOOGLE: "en-US-Standard-A",
        TTSProvider.AMAZON: "Joanna"
    }

    # Default model name per provider.
    DEFAULT_MODELS = {
        TTSProvider.LUXTTS: "luxtts-48k",
        TTSProvider.POCKET: "pocket-100m",
        TTSProvider.OPENAI: "tts-1",
        TTSProvider.ELEVENLABS: "eleven_monolingual_v1",
        TTSProvider.EDGE: "neural",
        TTSProvider.GOOGLE: "standard",
        TTSProvider.AMAZON: "neural"
    }

    # Closed-source cloud APIs — kept for config compatibility, disabled at
    # runtime.  Hoisted here so the membership test is written once instead
    # of three separate inline tuples.
    _CLOUD_PROVIDERS = (TTSProvider.OPENAI, TTSProvider.ELEVENLABS,
                        TTSProvider.EDGE, TTSProvider.GOOGLE,
                        TTSProvider.AMAZON)

    def __init__(
        self,
        provider: Union[TTSProvider, str] = TTSProvider.POCKET,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        default_voice: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize TTS engine.

        Args:
            provider: TTS provider to use (enum member or its string value)
            api_key: API key for the provider (unused by offline providers)
            model: Specific model to use (falls back to DEFAULT_MODELS)
            default_voice: Default voice ID (falls back to DEFAULT_VOICES)
            config: Additional configuration options

        Raises:
            ValueError: if a provider string does not name a known provider.
        """
        if isinstance(provider, str):
            provider = TTSProvider(provider.lower())

        self.provider = provider
        self.api_key = api_key
        self.config = config or {}

        # Set default model and voice per provider
        self.model = model or self.DEFAULT_MODELS.get(provider, "default")
        self.default_voice = default_voice or self.DEFAULT_VOICES.get(provider)

        # Provider-specific client, created lazily on first use
        self._client = None
        self._initialized = False

        # Voice-list cache (1-hour TTL, see list_voices)
        self._voices_cache: Optional[List[VoiceInfo]] = None
        self._cache_timestamp: float = 0

        # Ensure temp directories exist
        self._ensure_temp_dirs()

    def _ensure_temp_dirs(self):
        """Ensure temp directories exist (Docker-compatible, best-effort)."""
        for dir_path in (TEMP_DIR, APP_TEMP_DIR):
            try:
                Path(dir_path).mkdir(parents=True, exist_ok=True)
            except (PermissionError, OSError):
                # In Docker these might already exist or need root; creation
                # failure is non-fatal here.
                pass

    async def _ensure_initialized(self):
        """Ensure provider client is initialized (idempotent)."""
        if self._initialized:
            return

        if self.provider in self._CLOUD_PROVIDERS:
            logger.info("%s provider selected but disabled (closed-source). "
                        "Synthesis calls will return empty audio.", self.provider.value)

        self._initialized = True

    async def synthesize(
        self,
        text: str,
        voice: Optional[str] = None,
        format: Optional[AudioFormat] = None,
        speed: float = 1.0
    ) -> bytes:
        """
        Synthesize text to speech.

        Args:
            text: Text to synthesize
            voice: Voice ID (uses default if not specified)
            format: Output audio format (defaults to MP3)
            speed: Speech speed multiplier, clamped to [0.5, 2.0]

        Returns:
            Audio bytes, or b"" on failure or for disabled/unimplemented
            providers.
        """
        await self._ensure_initialized()

        voice = voice or self.default_voice
        format = format or AudioFormat.MP3
        speed = max(0.5, min(2.0, speed))  # Clamp speed

        logger.info("Synthesizing %d chars with voice %s", len(text), voice)

        # Provider-specific synthesis
        if self.provider == TTSProvider.LUXTTS:
            return await self._synthesize_luxtts(text, voice, format, speed)
        elif self.provider == TTSProvider.POCKET:
            return await self._synthesize_pocket(text, voice, format, speed)
        elif self.provider in self._CLOUD_PROVIDERS:
            return await self._synthesize_cloud_disabled(self.provider.value)

        # Remaining providers (GPU engines, espeak) have no synthesis path
        # in this module — make the empty result visible instead of silent.
        logger.warning("No synthesis implementation for provider %s; "
                       "returning empty audio", self.provider.value)
        return b""

    async def _synthesize_luxtts(
        self,
        text: str,
        voice: str,
        format: AudioFormat,
        speed: float
    ) -> bytes:
        """Synthesize using LuxTTS (offline, GPU/CPU, voice cloning).

        Uses integrations.service_tools.luxtts_tool for actual synthesis,
        then reads the output WAV file and returns raw bytes.  The requested
        `format` is not converted — output is always WAV bytes.
        """
        import json as _json
        try:
            from integrations.service_tools.luxtts_tool import luxtts_synthesize
            # "default" means no cloning reference audio.
            result = _json.loads(luxtts_synthesize(
                text,
                voice_audio=voice if voice != "default" else None,
                speed=speed,
            ))
            if 'error' in result:
                logger.warning("LuxTTS error: %s", result['error'])
                return b""
            wav_path = result.get('path', '')
            if wav_path and os.path.isfile(wav_path):
                with open(wav_path, 'rb') as f:
                    return f.read()
            return b""
        except ImportError:
            logger.warning("luxtts_tool not available")
            return b""
        except Exception as e:
            logger.warning("LuxTTS synthesis failed: %s", e)
            return b""

    async def _synthesize_pocket(
        self,
        text: str,
        voice: str,
        format: AudioFormat,
        speed: float
    ) -> bytes:
        """Synthesize using Pocket TTS (offline, CPU, 100M params).

        Uses integrations.service_tools.pocket_tts_tool for actual synthesis,
        then reads the output WAV file and returns raw bytes.  WAV is the
        native output; no conversion is done for other requested formats
        (caller can convert).  NOTE(review): `speed` is accepted for
        interface parity but is not forwarded to the tool.
        """
        import json as _json
        try:
            from integrations.service_tools.pocket_tts_tool import pocket_tts_synthesize
            result = _json.loads(pocket_tts_synthesize(text, voice))
            if 'error' in result:
                logger.warning("Pocket TTS error: %s", result['error'])
                return b""
            wav_path = result.get('path', '')
            if wav_path and os.path.isfile(wav_path):
                # WAV bytes are returned for every requested format; the
                # original WAV/other-format branches were identical.
                with open(wav_path, 'rb') as f:
                    return f.read()
            return b""
        except ImportError:
            logger.warning("pocket_tts_tool not available")
            return b""
        except Exception as e:
            logger.warning("Pocket TTS synthesis failed: %s", e)
            return b""

    async def _synthesize_cloud_disabled(self, provider_name: str) -> bytes:
        """Return empty bytes for disabled cloud TTS providers.

        HART OS is offline-first — no closed-source TTS APIs.
        Use TTSProvider.POCKET or TTSProvider.LUXTTS instead.
        """
        logger.warning(
            "%s TTS is disabled (closed-source cloud API). "
            "Use POCKET or LUXTTS for offline synthesis.", provider_name
        )
        return b""

    async def synthesize_ssml(
        self,
        ssml: str,
        voice: Optional[str] = None,
        format: Optional[AudioFormat] = None
    ) -> bytes:
        """
        Synthesize SSML to speech.

        Args:
            ssml: SSML markup to synthesize
            voice: Voice ID (uses default if not specified)
            format: Output audio format (defaults to MP3)

        Returns:
            Audio bytes in the specified format, or b"" when nothing can
            be synthesized.
        """
        await self._ensure_initialized()

        voice = voice or self.default_voice
        format = format or AudioFormat.MP3

        logger.info("Synthesizing SSML with voice %s", voice)

        if self.provider in (TTSProvider.LUXTTS, TTSProvider.POCKET):
            # LuxTTS and Pocket TTS don't support SSML — strip tags and
            # synthesize the remaining plain text.
            import re
            plain = re.sub(r'<[^>]+>', '', ssml).strip()
            if plain:
                return await self.synthesize(plain, voice, format)
            return b""

        # Cloud providers: disabled
        logger.warning("SSML synthesis not available (cloud providers disabled)")
        return b""

    async def list_voices(
        self,
        language: Optional[str] = None,
        gender: Optional[str] = None,
        use_cache: bool = True
    ) -> List[VoiceInfo]:
        """
        List available voices.

        Args:
            language: Filter by language-code prefix (e.g. "en" or "en-US")
            gender: Filter by gender ("male", "female", "neutral")
            use_cache: Whether to use the cached voice list (1-hour TTL)

        Returns:
            List of available voices matching the filters.
        """
        await self._ensure_initialized()

        import time

        # Serve from cache when fresh
        if use_cache and self._voices_cache is not None:
            cache_age = time.time() - self._cache_timestamp
            if cache_age < 3600:  # 1 hour cache
                return self._filter_voices(self._voices_cache, language, gender)

        # Fetch voices from provider and refresh the cache
        voices = await self._fetch_voices()
        self._voices_cache = voices
        self._cache_timestamp = time.time()

        return self._filter_voices(voices, language, gender)

    async def _fetch_voices(self) -> List[VoiceInfo]:
        """Fetch available voices from the active provider.

        Best-effort: if the provider tool is missing or fails, fall back to
        a static voice list so callers always get something usable.
        """
        voices: List[VoiceInfo] = []

        if self.provider == TTSProvider.LUXTTS:
            # LuxTTS cloned voices
            try:
                import json as _json
                from integrations.service_tools.luxtts_tool import luxtts_list_voices
                data = _json.loads(luxtts_list_voices())
                for v in data.get('voices', []):
                    voices.append(VoiceInfo(
                        id=v['id'], name=v['name'], language="en",
                        provider="luxtts", sample_rate=24000,
                        metadata={"type": v.get('type', 'cloned')},
                    ))
            except Exception:
                # Broad by design (tool missing, bad JSON, missing keys);
                # the original `(ImportError, Exception)` tuple was redundant
                # since ImportError is already an Exception subclass.
                voices.append(VoiceInfo(
                    id="default", name="Default", language="en",
                    provider="luxtts", sample_rate=24000,
                ))
        elif self.provider == TTSProvider.POCKET:
            # Pocket TTS built-in + cloned voices
            try:
                import json as _json
                from integrations.service_tools.pocket_tts_tool import pocket_tts_list_voices
                data = _json.loads(pocket_tts_list_voices())
                for v in data.get('voices', []):
                    voices.append(VoiceInfo(
                        id=v['id'], name=v['name'], language="en",
                        provider="pocket", metadata={"type": v.get('type', 'builtin')},
                    ))
            except Exception:
                # Fallback: correct 8 built-in voices (pocket-tts 1.1.1)
                for name in ["alba", "marius", "javert", "jean",
                             "fantine", "cosette", "eponine", "azelma"]:
                    voices.append(VoiceInfo(
                        id=name, name=name.title(), language="en", provider="pocket",
                    ))
        # Cloud providers: disabled, no voices to list

        return voices

    def _filter_voices(
        self,
        voices: List[VoiceInfo],
        language: Optional[str],
        gender: Optional[str]
    ) -> List[VoiceInfo]:
        """Filter voices by language prefix and/or exact gender (case-insensitive)."""
        filtered = voices

        if language:
            filtered = [v for v in filtered if v.language.lower().startswith(language.lower())]

        if gender:
            filtered = [v for v in filtered if v.gender and v.gender.lower() == gender.lower()]

        return filtered

    def get_optimal_format(self, channel: str) -> str:
        """
        Get optimal audio format for a channel.

        Args:
            channel: Channel name (telegram, discord, etc.); unknown
                channels fall back to the "default" mapping (mp3)

        Returns:
            Optimal format string (opus, mp3, wav, ogg)
        """
        format_enum = self.CHANNEL_FORMATS.get(
            channel.lower(),
            self.CHANNEL_FORMATS["default"]
        )
        return format_enum.value

    def get_supported_formats(self) -> List[str]:
        """Get list of output format strings supported by the current provider."""
        formats = {
            TTSProvider.LUXTTS: ["wav"],
            TTSProvider.POCKET: ["wav"],
            TTSProvider.OPENAI: ["mp3", "opus", "aac", "flac", "wav", "pcm"],
            TTSProvider.ELEVENLABS: ["mp3", "wav", "ogg"],
            TTSProvider.EDGE: ["mp3", "wav", "ogg"],
            TTSProvider.GOOGLE: ["mp3", "wav", "ogg"],
            TTSProvider.AMAZON: ["mp3", "ogg", "pcm"]
        }
        return formats.get(self.provider, ["mp3", "wav"])

    def get_max_text_length(self) -> int:
        """Get maximum text length (chars) for a single synthesis request."""
        limits = {
            TTSProvider.LUXTTS: 10000,  # Local — no API limits, just memory
            TTSProvider.POCKET: 10000,  # Local — no API limits, just memory
            TTSProvider.OPENAI: 4096,
            TTSProvider.ELEVENLABS: 5000,
            TTSProvider.EDGE: 10000,
            TTSProvider.GOOGLE: 5000,
            TTSProvider.AMAZON: 3000
        }
        return limits.get(self.provider, 4096)

    async def synthesize_long_text(
        self,
        text: str,
        voice: Optional[str] = None,
        format: Optional[AudioFormat] = None
    ) -> bytes:
        """
        Synthesize long text by chunking at sentence boundaries.

        Args:
            text: Text to synthesize (may exceed the provider's max length)
            voice: Voice ID
            format: Output audio format

        Returns:
            Combined audio bytes (see _combine_audio for caveats).
        """
        max_length = self.get_max_text_length()

        if len(text) <= max_length:
            return await self.synthesize(text, voice, format)

        # Split text into provider-sized chunks, synthesize each, combine.
        chunks = self._split_text(text, max_length)

        audio_parts = []
        for chunk in chunks:
            audio = await self.synthesize(chunk, voice, format)
            audio_parts.append(audio)

        return self._combine_audio(audio_parts, format or AudioFormat.MP3)

    def _split_text(self, text: str, max_length: int) -> List[str]:
        """Split text into chunks no longer than max_length.

        Splits at sentence boundaries (. ! ?) and packs sentences greedily;
        a single sentence longer than max_length is hard-split so no chunk
        ever exceeds the provider limit (the original code could emit
        oversized chunks in that case).
        """
        # Simple sentence splitting on terminal punctuation.
        sentences = []
        current = ""
        for char in text:
            current += char
            if char in ".!?":
                sentences.append(current.strip())
                current = ""
        if current.strip():
            sentences.append(current.strip())

        # Hard-split any sentence that alone exceeds the limit.
        pieces = []
        for sentence in sentences:
            while len(sentence) > max_length:
                pieces.append(sentence[:max_length])
                sentence = sentence[max_length:]
            if sentence:
                pieces.append(sentence)

        # Greedily pack sentences into chunks (+1 for the joining space).
        chunks = []
        current_chunk = ""
        for sentence in pieces:
            if len(current_chunk) + len(sentence) + 1 <= max_length:
                current_chunk += (" " if current_chunk else "") + sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence
        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    def _combine_audio(self, parts: List[bytes], format: AudioFormat) -> bytes:
        """Combine multiple audio parts by concatenation.

        Naive byte concatenation generally plays back for MP3/OGG/OPUS
        streams.  For WAV this is incorrect (each part carries its own
        header) — a proper implementation would use an audio library;
        kept as a placeholder to preserve existing behavior.
        """
        if format in (AudioFormat.MP3, AudioFormat.OGG, AudioFormat.OPUS):
            return b"".join(parts)

        # Placeholder for WAV and other formats — see docstring.
        return b"".join(parts)

    async def save_to_file(
        self,
        text: str,
        file_path: str,
        voice: Optional[str] = None,
        format: Optional[AudioFormat] = None
    ) -> str:
        """
        Synthesize text and save the audio to a file.

        Args:
            text: Text to synthesize
            file_path: Output file path (parent dirs created as needed)
            voice: Voice ID
            format: Output audio format

        Returns:
            Path to the saved file.
        """
        audio = await self.synthesize(text, voice, format)

        path = Path(file_path)
        path.parent.mkdir(parents=True, exist_ok=True)

        with open(path, 'wb') as f:
            f.write(audio)

        return str(path)

    def build_ssml(
        self,
        text: str,
        config: Optional[SSMLConfig] = None
    ) -> str:
        """
        Build SSML markup from plain text and configuration.

        Args:
            text: Plain text; XML special characters (&, <, >) are escaped
                so user text cannot break the generated markup
            config: SSML configuration options (config.language is
                currently unused)

        Returns:
            SSML markup string wrapped in <speak>.
        """
        from xml.sax.saxutils import escape

        config = config or SSMLConfig()
        safe_text = escape(text)  # fix: raw '&'/'<' previously produced invalid SSML

        ssml_parts = ['<speak>']

        # Add prosody element if any prosody attribute is configured.
        prosody_attrs = []
        if config.rate:
            prosody_attrs.append(f'rate="{config.rate}"')
        if config.pitch:
            prosody_attrs.append(f'pitch="{config.pitch}"')
        if config.volume:
            prosody_attrs.append(f'volume="{config.volume}"')

        if prosody_attrs:
            ssml_parts.append(f'<prosody {" ".join(prosody_attrs)}>')

        # Wrap the text in <emphasis> if configured.
        if config.emphasis:
            ssml_parts.append(f'<emphasis level="{config.emphasis}">')
            ssml_parts.append(safe_text)
            ssml_parts.append('</emphasis>')
        else:
            ssml_parts.append(safe_text)

        if prosody_attrs:
            ssml_parts.append('</prosody>')

        ssml_parts.append('</speak>')

        return "".join(ssml_parts)

    def estimate_duration(self, text: str, speed: float = 1.0) -> float:
        """
        Estimate audio duration for text.

        Args:
            text: Text to estimate
            speed: Speech speed multiplier (values <= 0 are clamped to 0.1
                to avoid division by zero)

        Returns:
            Estimated duration in seconds.
        """
        # Average speaking rate is ~150 words per minute
        words = len(text.split())
        base_duration = (words / 150) * 60  # seconds
        return base_duration / max(speed, 0.1)

    def get_provider_info(self) -> Dict[str, Any]:
        """Get a capability/status summary for the current provider."""
        return {
            "provider": self.provider.value,
            "model": self.model,
            "default_voice": self.default_voice,
            "max_text_length": self.get_max_text_length(),
            "supported_formats": self.get_supported_formats(),
            "ssml_support": False,  # No active provider supports SSML
            "offline": self.provider not in self._CLOUD_PROVIDERS,
            "disabled": self.provider in self._CLOUD_PROVIDERS,
        }