Coverage for integrations / service_tools / chatterbox_tool.py: 38.2%

76 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Chatterbox TTS tool — GPU-accelerated emotional speech synthesis. 

3 

4Two variants, each in its own subprocess worker: 

5 - Turbo: English, 3.8GB VRAM, [laugh]/[chuckle] tags 

6 - ML: 23 languages, 12GB VRAM, voice cloning 

7 

8SUBPROCESS ISOLATED: this module is BOTH the parent-side tool AND the 

9worker subprocess entry point. Variant is selected via CLI arg: 

10 python -m integrations.service_tools.chatterbox_tool turbo 

11 python -m integrations.service_tools.chatterbox_tool ml 

12 

13CUDA OOM (especially likely with the 12GB ML model on consumer GPUs) 

14only kills the worker. Parent receives `transient: true` and falls back. 

15 

16Public API (parent side): 

17 chatterbox_synthesize(text, language, voice, output_path) → JSON [Turbo] 

18 chatterbox_ml_synthesize(text, language, voice, output_path) → JSON [ML] 

19 unload_chatterbox() → None 

20""" 

21 

22from typing import Optional 

23 

24import os 

25import sys 

26 

27from integrations.service_tools.gpu_worker import ToolWorker 

28 

# Default reference voice for Chatterbox voice cloning — same path Nunba
# used historically so existing ref audio keeps working. Only used when the
# file actually exists on disk (see _resolve_ref_voice).
_DEFAULT_REF_VOICE = os.path.join(os.path.expanduser('~'), 'Downloads', 'Lily.mp3')

# Silence pad, in seconds, appended after every generation to prevent
# chopped endings (applied in _save_wav_with_padding).
_END_PAD_SECONDS = 0.3

35 

36 

37def _resolve_ref_voice(req: dict) -> str: 

38 ref = req.get('voice') 

39 if not ref and os.path.isfile(_DEFAULT_REF_VOICE): 

40 ref = _DEFAULT_REF_VOICE 

41 return ref or '' 

42 

43 

def _save_wav_with_padding(wav, sample_rate: int, output_path: str,
                           pad_seconds: Optional[float] = None) -> float:
    """Save an audio tensor as WAV with trailing silence appended.

    Args:
        wav: audio tensor, shape (samples,) or (channels, samples), on any
            device (moved to CPU before saving).
        sample_rate: sample rate in Hz.
        output_path: destination WAV file path.
        pad_seconds: seconds of silence to append; defaults to the
            module-level _END_PAD_SECONDS pad.

    Returns:
        Duration in seconds of the written audio, pad included.
    """
    import torch
    import torchaudio

    # Resolve the default lazily so the module constant is read at call
    # time (and the signature stays backward compatible).
    if pad_seconds is None:
        pad_seconds = _END_PAD_SECONDS

    # Normalize shape to (channels, samples) — torchaudio.save expects 2-D.
    if wav.ndim == 1:
        wav = wav.unsqueeze(0)
    wav = wav.cpu()

    # Pad trailing silence to prevent chopped endings on playback.
    pad = torch.zeros(
        wav.shape[0], int(sample_rate * pad_seconds),
        dtype=wav.dtype, device=wav.device,
    )
    wav_padded = torch.cat([wav, pad], dim=-1)

    torchaudio.save(output_path, wav_padded, sample_rate)
    return wav_padded.shape[-1] / sample_rate

63 

64 

# ── Chatterbox Turbo (English, 3.8GB VRAM) ──────────────────────

def _load_turbo():
    """Load Chatterbox Turbo (English, voice cloning) on CUDA.

    Ported from Nunba's _LazyChatterboxTurbo:
    - Uses ChatterboxTurboTTS (not the base ChatterboxTTS)
    - On Windows, temporarily patches safetensors to load checkpoints
      to CPU first (sequential CUDA loads segfault there — known
      safetensors bug); the patch is always undone before returning.
    """
    from chatterbox.tts_turbo import ChatterboxTurboTTS

    if sys.platform != 'win32':
        return ChatterboxTurboTTS.from_pretrained(device='cuda')

    # Windows workaround: force safetensors to materialize tensors on CPU
    # and let .to(device) handle the CUDA transfer afterwards.
    import safetensors.torch as _st
    original_load_file = _st.load_file

    def _cpu_only_load(path, device=None):
        return original_load_file(path, device='cpu')

    _st.load_file = _cpu_only_load
    try:
        return ChatterboxTurboTTS.from_pretrained(device='cuda')
    finally:
        # Restore the real loader even if the model load raises.
        _st.load_file = original_load_file

95 

96 

97def _synthesize_turbo(model, req: dict) -> dict: 

98 text = req.get('text', '') 

99 if not text or not text.strip(): 

100 return {'error': 'Text is required'} 

101 output_path = req.get('output_path') 

102 if not output_path: 

103 return {'error': 'output_path is required'} 

104 

105 ref = _resolve_ref_voice(req) 

106 wav = model.generate(text, audio_prompt_path=ref) 

107 duration = _save_wav_with_padding(wav, model.sr, output_path) 

108 

109 return { 

110 'path': output_path, 

111 'duration': round(duration, 2), 

112 'sample_rate': model.sr, 

113 'engine': 'chatterbox-turbo', 

114 'device': 'cuda', 

115 'voice': ref or 'default', 

116 } 

117 

118 

# ── Chatterbox Multilingual (23 languages, 12GB VRAM) ───────────

def _load_ml():
    """Instantiate Chatterbox Multilingual (23 languages, 12GB VRAM) on CUDA.

    Ported from Nunba's _LazyChatterboxMultilingual: uses
    ChatterboxMultilingualTTS, not the base ChatterboxTTS.
    """
    from chatterbox.tts import ChatterboxMultilingualTTS
    model = ChatterboxMultilingualTTS.from_pretrained(device='cuda')
    return model

129 

130 

131def _synthesize_ml(model, req: dict) -> dict: 

132 text = req.get('text', '') 

133 if not text or not text.strip(): 

134 return {'error': 'Text is required'} 

135 output_path = req.get('output_path') 

136 if not output_path: 

137 return {'error': 'output_path is required'} 

138 

139 language = req.get('language', 'en') 

140 ref = _resolve_ref_voice(req) 

141 # ChatterboxMultilingualTTS uses language_id=, not lang= 

142 wav = model.generate(text, audio_prompt_path=ref, language_id=language) 

143 duration = _save_wav_with_padding(wav, model.sr, output_path) 

144 

145 return { 

146 'path': output_path, 

147 'duration': round(duration, 2), 

148 'sample_rate': model.sr, 

149 'engine': 'chatterbox-ml', 

150 'device': 'cuda', 

151 'language': language, 

152 'voice': ref or 'default', 

153 } 

154 

155 

# ── Parent-side: one ToolWorker per variant ──────────────────────
# Each ToolWorker runs this module as a subprocess with the given variant
# CLI arg (see module docstring); stopping a worker frees its VRAM.

# Turbo worker: English-only model; smaller timeouts than ML.
_turbo = ToolWorker(
    tool_name='chatterbox_turbo',
    tool_module='integrations.service_tools.chatterbox_tool',  # this module
    variant='turbo',
    vram_budget='tts_chatterbox_turbo',
    output_subdir='chatterbox/output',
    engine='chatterbox-turbo',
    startup_timeout=120.0,
    request_timeout=120.0,
)

# ML worker: 23-language model; longer timeouts because the 12GB model is
# slower to load and generate.
_ml = ToolWorker(
    tool_name='chatterbox_ml',
    tool_module='integrations.service_tools.chatterbox_tool',  # this module
    variant='ml',
    vram_budget='tts_chatterbox_ml',
    output_subdir='chatterbox/output',
    engine='chatterbox-ml',
    startup_timeout=240.0,  # 12GB model takes a while
    request_timeout=180.0,
)

179 

180 

def chatterbox_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize with Chatterbox Turbo (English, GPU subprocess)."""
    # Turbo is English-only (see module docstring), so `language` exists
    # only for signature parity with chatterbox_ml_synthesize and is not
    # forwarded — the worker always receives 'en'.
    return _turbo.synthesize(
        text=text,
        language='en',
        voice=voice,
        output_path=output_path,
    )

191 

192 

def chatterbox_ml_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize with Chatterbox ML (23 languages, GPU subprocess)."""
    # All arguments are forwarded verbatim to the ML worker subprocess.
    return _ml.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
    )

203 

204 

def unload_chatterbox():
    """Stop both Chatterbox worker subprocesses and free VRAM."""
    for worker in (_turbo, _ml):
        worker.stop()

209 

210 

class ChatterboxTool:
    """Register Chatterbox as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Build the ServiceToolInfo and insert it into the registry."""
        from .registry import ServiceToolInfo, service_tool_registry

        turbo_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with Chatterbox Turbo (English, GPU).",
            "params_schema": {
                "text": {"type": "string"},
                "voice": {"type": "string", "description": "Reference audio path"},
            },
        }
        ml_endpoint = {
            "path": "/synthesize_ml",
            "method": "POST",
            "description": "Synthesize with Chatterbox ML (23 languages, GPU).",
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
                "voice": {"type": "string"},
            },
        }

        tool_info = ServiceToolInfo(
            name="chatterbox",
            description=(
                "GPU-accelerated emotional TTS. Turbo: English + [laugh]/[chuckle] tags, "
                "3.8GB VRAM. ML: 23 languages, 12GB VRAM. Voice cloning. "
                "Requires: pip install chatterbox"
            ),
            base_url="inprocess://chatterbox",
            endpoints={
                "synthesize": turbo_endpoint,
                "synthesize_ml": ml_endpoint,
            },
            tags=["tts", "speech", "voice-cloning", "gpu", "chatterbox"],
            timeout=60,
        )
        # Mark healthy immediately and write straight into the registry's
        # private table (no health probe for in-process tools).
        tool_info.is_healthy = True
        service_tool_registry._tools["chatterbox"] = tool_info
        return True

252 

253 

254# NOTE: no `if __name__ == '__main__':` block here. The centralized 

255# dispatcher in gpu_worker picks up `_load_turbo`/`_synthesize_turbo` 

256# when spawned with variant='turbo', and `_load_ml`/`_synthesize_ml` 

257# when variant='ml'.