Coverage for integrations / service_tools / chatterbox_tool.py: 38.2%
76 statements
« prev ^ index » next — coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Chatterbox TTS tool — GPU-accelerated emotional speech synthesis.
4Two variants, each in its own subprocess worker:
5 - Turbo: English, 3.8GB VRAM, [laugh]/[chuckle] tags
6 - ML: 23 languages, 12GB VRAM, voice cloning
8SUBPROCESS ISOLATED: this module is BOTH the parent-side tool AND the
9worker subprocess entry point. Variant is selected via CLI arg:
10 python -m integrations.service_tools.chatterbox_tool turbo
11 python -m integrations.service_tools.chatterbox_tool ml
13CUDA OOM (especially likely with the 12GB ML model on consumer GPUs)
14only kills the worker. Parent receives `transient: true` and falls back.
16Public API (parent side):
17 chatterbox_synthesize(text, language, voice, output_path) → JSON [Turbo]
18 chatterbox_ml_synthesize(text, language, voice, output_path) → JSON [ML]
19 unload_chatterbox() → None
20"""
22from typing import Optional
24import os
25import sys
27from integrations.service_tools.gpu_worker import ToolWorker
# Default reference voice for Chatterbox voice cloning — same path Nunba
# used historically so existing ref audio keeps working.
# NOTE(review): user-specific Downloads path; _resolve_ref_voice only falls
# back to it when the file actually exists, so a missing file is harmless.
_DEFAULT_REF_VOICE = os.path.join(os.path.expanduser('~'), 'Downloads', 'Lily.mp3')

# Silence pad (seconds) appended after every generation to prevent
# chopped endings in playback.
_END_PAD_SECONDS = 0.3
37def _resolve_ref_voice(req: dict) -> str:
38 ref = req.get('voice')
39 if not ref and os.path.isfile(_DEFAULT_REF_VOICE):
40 ref = _DEFAULT_REF_VOICE
41 return ref or ''
def _save_wav_with_padding(wav, sample_rate: int, output_path: str) -> float:
    """Write *wav* to *output_path* with a trailing silence pad.

    The tensor is normalized to (channels, samples), moved to CPU, and
    _END_PAD_SECONDS of zeros are appended so playback endings are not
    clipped. Returns the padded duration in seconds.
    """
    import torch
    import torchaudio

    # (samples,) → (1, samples) so torchaudio sees a channel axis.
    audio = wav.unsqueeze(0) if wav.ndim == 1 else wav
    audio = audio.cpu()

    pad_samples = int(sample_rate * _END_PAD_SECONDS)
    silence = torch.zeros(
        audio.shape[0], pad_samples,
        dtype=audio.dtype, device=audio.device,
    )
    padded = torch.cat([audio, silence], dim=-1)

    torchaudio.save(output_path, padded, sample_rate)
    return padded.shape[-1] / sample_rate
# ── Chatterbox Turbo (English, 3.8GB VRAM) ──────────────────────

def _load_turbo():
    """Load Chatterbox Turbo (English, voice cloning) onto CUDA.

    Ported from Nunba's _LazyChatterboxTurbo:
    - Uses ChatterboxTurboTTS (not the base ChatterboxTTS)
    - On Windows, temporarily patches safetensors' load_file to load to
      CPU first (sequential CUDA loads segfault there — known
      safetensors bug); .to(device) handles the CUDA transfer later.
    """
    from chatterbox.tts_turbo import ChatterboxTurboTTS

    if sys.platform != 'win32':
        return ChatterboxTurboTTS.from_pretrained(device='cuda')

    # Windows path: force CPU-first safetensors loads while the model is
    # being constructed, then restore the original loader.
    import safetensors.torch as _st
    _original_load = _st.load_file

    def _load_to_cpu(path, device=None):
        return _original_load(path, device='cpu')

    _st.load_file = _load_to_cpu
    try:
        return ChatterboxTurboTTS.from_pretrained(device='cuda')
    finally:
        _st.load_file = _original_load
97def _synthesize_turbo(model, req: dict) -> dict:
98 text = req.get('text', '')
99 if not text or not text.strip():
100 return {'error': 'Text is required'}
101 output_path = req.get('output_path')
102 if not output_path:
103 return {'error': 'output_path is required'}
105 ref = _resolve_ref_voice(req)
106 wav = model.generate(text, audio_prompt_path=ref)
107 duration = _save_wav_with_padding(wav, model.sr, output_path)
109 return {
110 'path': output_path,
111 'duration': round(duration, 2),
112 'sample_rate': model.sr,
113 'engine': 'chatterbox-turbo',
114 'device': 'cuda',
115 'voice': ref or 'default',
116 }
# ── Chatterbox Multilingual (23 languages, 12GB VRAM) ───────────

def _load_ml():
    """Load Chatterbox Multilingual (23 languages, 12GB VRAM) onto CUDA.

    Ported from Nunba's _LazyChatterboxMultilingual:
    - Uses ChatterboxMultilingualTTS (not base ChatterboxTTS)
    """
    from chatterbox.tts import ChatterboxMultilingualTTS

    model_cls = ChatterboxMultilingualTTS
    return model_cls.from_pretrained(device='cuda')
131def _synthesize_ml(model, req: dict) -> dict:
132 text = req.get('text', '')
133 if not text or not text.strip():
134 return {'error': 'Text is required'}
135 output_path = req.get('output_path')
136 if not output_path:
137 return {'error': 'output_path is required'}
139 language = req.get('language', 'en')
140 ref = _resolve_ref_voice(req)
141 # ChatterboxMultilingualTTS uses language_id=, not lang=
142 wav = model.generate(text, audio_prompt_path=ref, language_id=language)
143 duration = _save_wav_with_padding(wav, model.sr, output_path)
145 return {
146 'path': output_path,
147 'duration': round(duration, 2),
148 'sample_rate': model.sr,
149 'engine': 'chatterbox-ml',
150 'device': 'cuda',
151 'language': language,
152 'voice': ref or 'default',
153 }
# ── Parent-side: one ToolWorker per variant ──────────────────────

# Turbo worker: English-only, ~3.8GB VRAM. Spawns this same module as a
# subprocess with variant='turbo'; the gpu_worker dispatcher then calls
# _load_turbo/_synthesize_turbo inside the child process.
_turbo = ToolWorker(
    tool_name='chatterbox_turbo',
    tool_module='integrations.service_tools.chatterbox_tool',
    variant='turbo',
    vram_budget='tts_chatterbox_turbo',
    output_subdir='chatterbox/output',
    engine='chatterbox-turbo',
    startup_timeout=120.0,
    request_timeout=120.0,
)

# Multilingual worker: 23 languages, ~12GB VRAM — the larger model gets
# longer startup and request timeouts than Turbo.
_ml = ToolWorker(
    tool_name='chatterbox_ml',
    tool_module='integrations.service_tools.chatterbox_tool',
    variant='ml',
    vram_budget='tts_chatterbox_ml',
    output_subdir='chatterbox/output',
    engine='chatterbox-ml',
    startup_timeout=240.0,  # 12GB model takes a while
    request_timeout=180.0,
)
def chatterbox_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize with Chatterbox Turbo (English, GPU subprocess).

    Args:
        text: Text to speak.
        language: Accepted only for signature parity with
            chatterbox_ml_synthesize and IGNORED — Turbo is English-only,
            so 'en' is always forwarded to the worker.
        voice: Optional reference-audio path for voice cloning.
        output_path: Optional destination path for the generated audio.

    Returns:
        JSON string result from the Turbo worker subprocess.
    """
    # Turbo supports English only; forward 'en' regardless of `language`.
    return _turbo.synthesize(
        text=text, language='en', voice=voice, output_path=output_path,
    )
def chatterbox_ml_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize with Chatterbox ML (23 languages, GPU subprocess)."""
    request = {
        'text': text,
        'language': language,
        'voice': voice,
        'output_path': output_path,
    }
    return _ml.synthesize(**request)
def unload_chatterbox():
    """Stop both Chatterbox worker subprocesses and free VRAM."""
    for worker in (_turbo, _ml):
        worker.stop()
class ChatterboxTool:
    """Register Chatterbox as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Build the service-tool metadata and insert it into the registry."""
        from .registry import ServiceToolInfo, service_tool_registry

        turbo_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with Chatterbox Turbo (English, GPU).",
            "params_schema": {
                "text": {"type": "string"},
                "voice": {"type": "string", "description": "Reference audio path"},
            },
        }
        ml_endpoint = {
            "path": "/synthesize_ml",
            "method": "POST",
            "description": "Synthesize with Chatterbox ML (23 languages, GPU).",
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
                "voice": {"type": "string"},
            },
        }

        tool_info = ServiceToolInfo(
            name="chatterbox",
            description=(
                "GPU-accelerated emotional TTS. Turbo: English + [laugh]/[chuckle] tags, "
                "3.8GB VRAM. ML: 23 languages, 12GB VRAM. Voice cloning. "
                "Requires: pip install chatterbox"
            ),
            base_url="inprocess://chatterbox",
            endpoints={
                "synthesize": turbo_endpoint,
                "synthesize_ml": ml_endpoint,
            },
            tags=["tts", "speech", "voice-cloning", "gpu", "chatterbox"],
            timeout=60,
        )
        tool_info.is_healthy = True
        # NOTE(review): writes straight into the registry's private _tools
        # dict — confirm there is no public register() API to use instead.
        service_tool_registry._tools["chatterbox"] = tool_info
        return True
# NOTE: no `if __name__ == '__main__':` block here. The centralized
# dispatcher in gpu_worker picks up `_load_turbo`/`_synthesize_turbo`
# when spawned with variant='turbo', and `_load_ml`/`_synthesize_ml`
# when variant='ml'.