Coverage for integrations / service_tools / chatterbox_tool.py: 38.2%

76 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Chatterbox TTS tool — GPU-accelerated emotional speech synthesis. 

3 

4Two variants, each in its own subprocess worker: 

5 - Turbo: English, 3.8GB VRAM, [laugh]/[chuckle] tags 

6 - ML: 23 languages, 12GB VRAM, voice cloning 

7 

8SUBPROCESS ISOLATED: this module is BOTH the parent-side tool AND the 

9worker subprocess entry point. Variant is selected via CLI arg: 

10 python -m integrations.service_tools.chatterbox_tool turbo 

11 python -m integrations.service_tools.chatterbox_tool ml 

12 

13CUDA OOM (especially likely with the 12GB ML model on consumer GPUs) 

14only kills the worker. Parent receives `transient: true` and falls back. 

15 

16Public API (parent side): 

17 chatterbox_synthesize(text, language, voice, output_path) → JSON [Turbo] 

18 chatterbox_ml_synthesize(text, language, voice, output_path) → JSON [ML] 

19 unload_chatterbox() → None 

20""" 

21 

22from typing import Optional 

23 

24import os 

25import sys 

26 

27from integrations.service_tools.gpu_worker import ToolWorker 

28 

# Default reference voice for Chatterbox voice cloning — same path Nunba
# used historically so existing ref audio keeps working. Only used when the
# file actually exists on disk (see _resolve_ref_voice).
_DEFAULT_REF_VOICE = os.path.join(os.path.expanduser('~'), 'Downloads', 'Lily.mp3')

# Silence pad, in seconds, appended after every generation to prevent
# chopped endings (applied in _save_wav_with_padding).
_END_PAD_SECONDS = 0.3

35 

36 

37def _resolve_ref_voice(req: dict) -> str: 

38 ref = req.get('voice') 

39 if not ref and os.path.isfile(_DEFAULT_REF_VOICE): 

40 ref = _DEFAULT_REF_VOICE 

41 return ref or '' 

42 

43 

def _save_wav_with_padding(wav, sample_rate: int, output_path: str,
                           pad_seconds: Optional[float] = None) -> float:
    """Save an audio tensor as WAV with trailing silence appended.

    Args:
        wav: audio tensor, shape (samples,) or (channels, samples), on any
            device (moved to CPU before saving).
        sample_rate: sample rate in Hz.
        output_path: destination WAV file path.
        pad_seconds: seconds of silence to append; defaults to the
            module-level _END_PAD_SECONDS pad.

    Returns:
        Duration in seconds of the written audio, pad included.
    """
    import torch
    import torchaudio

    # Resolve the default lazily so the module constant is read at call
    # time (and the signature stays backward compatible).
    if pad_seconds is None:
        pad_seconds = _END_PAD_SECONDS

    # Normalize shape to (channels, samples) — torchaudio.save expects 2-D.
    if wav.ndim == 1:
        wav = wav.unsqueeze(0)
    wav = wav.cpu()

    # Pad trailing silence to prevent chopped endings on playback.
    pad = torch.zeros(
        wav.shape[0], int(sample_rate * pad_seconds),
        dtype=wav.dtype, device=wav.device,
    )
    wav_padded = torch.cat([wav, pad], dim=-1)

    torchaudio.save(output_path, wav_padded, sample_rate)
    return wav_padded.shape[-1] / sample_rate

63 

64 

# ── Chatterbox Turbo (English, 3.8GB VRAM) ──────────────────────

def _load_turbo():
    """Load Chatterbox Turbo (English, voice cloning) on CUDA.

    Ported from Nunba's _LazyChatterboxTurbo:
    - Uses ChatterboxTurboTTS (not the base ChatterboxTTS)
    - On Windows, temporarily patches safetensors to load checkpoints
      to CPU first (sequential CUDA loads segfault there — known
      safetensors bug); the patch is always undone before returning.
    """
    from chatterbox.tts_turbo import ChatterboxTurboTTS

    if sys.platform != 'win32':
        return ChatterboxTurboTTS.from_pretrained(device='cuda')

    # Windows workaround: force safetensors to materialize tensors on CPU
    # and let .to(device) handle the CUDA transfer afterwards.
    import safetensors.torch as _st
    original_load_file = _st.load_file

    def _cpu_only_load(path, device=None):
        return original_load_file(path, device='cpu')

    _st.load_file = _cpu_only_load
    try:
        return ChatterboxTurboTTS.from_pretrained(device='cuda')
    finally:
        # Restore the real loader even if the model load raises.
        _st.load_file = original_load_file

95 

96 

97def _synthesize_turbo(model, req: dict) -> dict: 

98 text = req.get('text', '') 

99 if not text or not text.strip(): 

100 return {'error': 'Text is required'} 

101 output_path = req.get('output_path') 

102 if not output_path: 

103 return {'error': 'output_path is required'} 

104 

105 ref = _resolve_ref_voice(req) 

106 wav = model.generate(text, audio_prompt_path=ref) 

107 duration = _save_wav_with_padding(wav, model.sr, output_path) 

108 

109 return { 

110 'path': output_path, 

111 'duration': round(duration, 2), 

112 'sample_rate': model.sr, 

113 'engine': 'chatterbox-turbo', 

114 'device': 'cuda', 

115 'voice': ref or 'default', 

116 } 

117 

118 

# ── Chatterbox Multilingual (23 languages, 12GB VRAM) ───────────

def _load_ml():
    """Instantiate Chatterbox Multilingual (23 languages, 12GB VRAM) on CUDA.

    Ported from Nunba's _LazyChatterboxMultilingual: uses
    ChatterboxMultilingualTTS, not the base ChatterboxTTS.
    """
    from chatterbox.tts import ChatterboxMultilingualTTS
    model = ChatterboxMultilingualTTS.from_pretrained(device='cuda')
    return model

129 

130 

131def _synthesize_ml(model, req: dict) -> dict: 

132 text = req.get('text', '') 

133 if not text or not text.strip(): 

134 return {'error': 'Text is required'} 

135 output_path = req.get('output_path') 

136 if not output_path: 

137 return {'error': 'output_path is required'} 

138 

139 language = req.get('language', 'en') 

140 ref = _resolve_ref_voice(req) 

141 # ChatterboxMultilingualTTS uses language_id=, not lang= 

142 wav = model.generate(text, audio_prompt_path=ref, language_id=language) 

143 duration = _save_wav_with_padding(wav, model.sr, output_path) 

144 

145 return { 

146 'path': output_path, 

147 'duration': round(duration, 2), 

148 'sample_rate': model.sr, 

149 'engine': 'chatterbox-ml', 

150 'device': 'cuda', 

151 'language': language, 

152 'voice': ref or 'default', 

153 } 

154 

155 

# ── Parent-side: one ToolWorker per variant ──────────────────────
# Each ToolWorker runs this module as a subprocess with the given variant
# CLI arg (see module docstring); stopping a worker frees its VRAM.

# Turbo worker: English-only model; smaller timeouts than ML.
_turbo = ToolWorker(
    tool_name='chatterbox_turbo',
    tool_module='integrations.service_tools.chatterbox_tool',  # this module
    variant='turbo',
    vram_budget='tts_chatterbox_turbo',
    output_subdir='chatterbox/output',
    engine='chatterbox-turbo',
    startup_timeout=120.0,
    request_timeout=120.0,
)

# ML worker: 23-language model; longer timeouts because the 12GB model is
# slower to load and generate.
_ml = ToolWorker(
    tool_name='chatterbox_ml',
    tool_module='integrations.service_tools.chatterbox_tool',  # this module
    variant='ml',
    vram_budget='tts_chatterbox_ml',
    output_subdir='chatterbox/output',
    engine='chatterbox-ml',
    startup_timeout=240.0,  # 12GB model takes a while
    request_timeout=180.0,
)

179 

180 

def chatterbox_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize with Chatterbox Turbo (English, GPU subprocess)."""
    # Turbo is English-only (see module docstring), so `language` exists
    # only for signature parity with chatterbox_ml_synthesize and is not
    # forwarded — the worker always receives 'en'.
    return _turbo.synthesize(
        text=text,
        language='en',
        voice=voice,
        output_path=output_path,
    )

191 

192 

def chatterbox_ml_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize with Chatterbox ML (23 languages, GPU subprocess)."""
    # All arguments are forwarded verbatim to the ML worker subprocess.
    return _ml.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
    )

203 

204 

def unload_chatterbox():
    """Stop both Chatterbox worker subprocesses and free VRAM."""
    for worker in (_turbo, _ml):
        worker.stop()

209 

210 

class ChatterboxTool:
    """Register Chatterbox as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Build the ServiceToolInfo and insert it into the registry."""
        from .registry import ServiceToolInfo, service_tool_registry

        turbo_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with Chatterbox Turbo (English, GPU).",
            "params_schema": {
                "text": {"type": "string"},
                "voice": {"type": "string", "description": "Reference audio path"},
            },
        }
        ml_endpoint = {
            "path": "/synthesize_ml",
            "method": "POST",
            "description": "Synthesize with Chatterbox ML (23 languages, GPU).",
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
                "voice": {"type": "string"},
            },
        }

        tool_info = ServiceToolInfo(
            name="chatterbox",
            description=(
                "GPU-accelerated emotional TTS. Turbo: English + [laugh]/[chuckle] tags, "
                "3.8GB VRAM. ML: 23 languages, 12GB VRAM. Voice cloning. "
                "Requires: pip install chatterbox"
            ),
            base_url="inprocess://chatterbox",
            endpoints={
                "synthesize": turbo_endpoint,
                "synthesize_ml": ml_endpoint,
            },
            tags=["tts", "speech", "voice-cloning", "gpu", "chatterbox"],
            timeout=60,
        )
        # Mark healthy immediately and write straight into the registry's
        # private table (no health probe for in-process tools).
        tool_info.is_healthy = True
        service_tool_registry._tools["chatterbox"] = tool_info
        return True

252 

253 

254# NOTE: no `if __name__ == '__main__':` block here. The centralized 

255# dispatcher in gpu_worker picks up `_load_turbo`/`_synthesize_turbo` 

256# when spawned with variant='turbo', and `_load_ml`/`_synthesize_ml` 

257# when variant='ml'.