Coverage for integrations/service_tools/pocket_tts

1"""

TTS tool — in-process text-to-speech via Pocket TTS (Kyutai).

Pocket TTS benefits:
  - 100M params — runs at 6x real-time on CPU (no GPU required)
  - MIT license, fully open source
  - Zero-shot voice cloning from 5 seconds of audio
  - ~200ms latency for first audio chunk
  - English (more languages planned by upstream)
  - 100% local, zero cloud costs

Model downloaded lazily on first use to ~/.hevolve/models/tts/

Fallback: espeak-ng (used when pocket-tts is not installed or when
Pocket TTS synthesis fails).

Public API:
  pocket_tts_synthesize(text, voice, output_path, sample_rate) → JSON
  pocket_tts_list_voices() → JSON
  pocket_tts_clone_voice(audio_path, name) → JSON
  unload_pocket_tts() → None

21"""

23import json

24import logging

25import os

26from pathlib import Path

27from typing import Optional

29from .registry import ServiceToolInfo, service_tool_registry

# Module-level logger; every log line in this file goes through it.
logger = logging.getLogger(__name__)

33# ═══════════════════════════════════════════════════════════════

34# Built-in voices (shipped with pocket-tts)

35# ═══════════════════════════════════════════════════════════════

# Names reported as "builtin" by pocket_tts_list_voices(). The same list is
# used there to hide cloned-voice files whose stem equals a built-in name.
_BUILTIN_VOICES = [
    "alba", "marius", "javert", "jean",
    "fantine", "cosette", "eponine", "azelma",
]

42# ═══════════════════════════════════════════════════════════════

43# Cached model (avoid reloading on every call)

44# ═══════════════════════════════════════════════════════════════

# Populated by _load_model() on first use; reset to None by unload_pocket_tts().
_tts_model = None
# Filled by _get_voice_state() and pocket_tts_clone_voice(); emptied by
# unload_pocket_tts().
_voice_states = {}  # voice_name -> voice_state cache

50# ═══════════════════════════════════════════════════════════════

51# Model management

52# ═══════════════════════════════════════════════════════════════

54def _get_tts_dir() -> Path:

55 """Get the TTS model/output storage directory."""

56 try:

57 from .model_storage import model_storage

58 tts_dir = model_storage.get_tool_dir("tts")

59 except (ImportError, Exception):

60 tts_dir = Path(os.path.expanduser("~/.hevolve/models/tts"))

61 tts_dir.mkdir(parents=True, exist_ok=True)

62 return tts_dir

def _get_output_dir() -> Path:
    """Return the ``output`` folder under the TTS directory, creating it if absent."""
    audio_dir = _get_tts_dir().joinpath("output")
    audio_dir.mkdir(parents=True, exist_ok=True)
    return audio_dir

def _get_custom_voices_dir() -> Path:
    """Return the ``voices`` folder (user-cloned voice states), creating it if absent."""
    voices_dir = _get_tts_dir().joinpath("voices")
    voices_dir.mkdir(parents=True, exist_ok=True)
    return voices_dir

def _load_model():
    """Return the shared Pocket TTS model, loading it on the first call.

    The ``pocket_tts`` import happens here rather than at module import, so
    an ImportError surfaces to the caller only when the model is first needed.
    """
    global _tts_model
    if _tts_model is None:
        from pocket_tts import TTSModel

        logger.info("Loading Pocket TTS model (100M params, CPU)...")
        _tts_model = TTSModel.load_model()
        logger.info("Pocket TTS model ready")
    return _tts_model

def _get_voice_state(voice: str):
    """Return the voice state for ``voice``, computing and caching it on a miss.

    Resolution order:
      1. the in-memory ``_voice_states`` cache;
      2. a cloned voice stored as ``<voices dir>/<voice>.safetensors``;
      3. anything else (a built-in voice name or a path to an audio file),
         which is passed to ``model.get_state_for_audio_prompt``.

    Args:
        voice: Built-in voice name, cloned-voice name, or audio file path.

    Returns:
        The voice state object that callers pass to ``model.generate_audio``.
    """
    try:
        return _voice_states[voice]
    except KeyError:
        pass

    model = _load_model()

    # Cloned voices take precedence over built-in names.
    custom_path = _get_custom_voices_dir() / f"{voice}.safetensors"
    if custom_path.exists():
        from safetensors.torch import load_file
        state = load_file(str(custom_path))
        # pocket_tts_clone_voice() saves a non-dict state wrapped as
        # {"voice_embedding": state}. Undo that wrapper so the value loaded
        # from disk matches what the cloning call cached in memory.
        # NOTE(review): assumes a genuine dict state never consists of the
        # single key "voice_embedding" — confirm against pocket-tts.
        if set(state) == {"voice_embedding"}:
            state = state["voice_embedding"]
        logger.info(f"Loaded custom voice: {voice}")
    else:
        # Built-in names and ad-hoc audio file paths go through the same
        # call, so one branch serves both.
        state = model.get_state_for_audio_prompt(voice)

    _voice_states[voice] = state
    return state

119

120

121# ═══════════════════════════════════════════════════════════════

122# espeak-ng fallback (for systems without pocket-tts)

123# ═══════════════════════════════════════════════════════════════

124

125def _espeak_synthesize(text: str, output_path: str, voice: str = "en") -> bool:

126 """Fallback: use espeak-ng for basic TTS."""

127 import subprocess

128 try:

129 _kw = dict(capture_output=True, text=True, timeout=30)

130 if hasattr(subprocess, 'CREATE_NO_WINDOW'):

131 _kw['creationflags'] = subprocess.CREATE_NO_WINDOW

132 result = subprocess.run(

133 ["espeak-ng", "-v", voice, "-w", output_path, text],

134 **_kw)

135 return result.returncode == 0

136 except (FileNotFoundError, subprocess.TimeoutExpired):

137 return False

138

139

140# ═══════════════════════════════════════════════════════════════

141# Public API

142# ═══════════════════════════════════════════════════════════════

143

def _wav_info(path: str):
    """Read (duration_seconds, sample_rate) from a WAV header.

    Returns ``(0, None)`` when the file is missing, unreadable, or not a WAV
    the stdlib ``wave`` module understands.
    """
    import wave
    try:
        with wave.open(path, "rb") as wav:
            rate = wav.getframerate()
            frames = wav.getnframes()
    except (OSError, EOFError, wave.Error):
        return 0, None
    if rate <= 0:
        return 0, None
    return round(frames / rate, 2), rate


def pocket_tts_synthesize(
    text: str,
    voice: str = "alba",
    output_path: Optional[str] = None,
    sample_rate: Optional[int] = None,
) -> str:
    """Synthesize text to speech and write it to a WAV file.

    Tries Pocket TTS first. If it is not installed or synthesis raises,
    falls back to espeak-ng (basic quality, fixed voice ``"en"``).

    Args:
        text: Text to synthesize. Empty or whitespace-only text is rejected.
        voice: Voice name (built-in like 'alba', custom name, or path to .wav).
        output_path: Output .wav path. When None or empty, a name derived
            from a hash of the full text and the voice is generated inside
            the TTS output directory.
        sample_rate: Rate written to the WAV header instead of the model's
            native rate. The audio itself is not resampled, so a different
            value changes playback speed and the reported duration.

    Returns:
        JSON string. On success: 'path', 'duration', 'voice', 'engine', and
        'sample_rate' (for espeak-ng only when it can be read back from the
        file). On failure: a single 'error' key.
    """
    if not text or not text.strip():
        return json.dumps({"error": "Text is required"})

    if not output_path:
        import hashlib
        # Hash the whole text: hashing only a prefix gives different long
        # texts that share their opening the same file name, and the later
        # synthesis silently overwrites the earlier one.
        digest = hashlib.md5(f"{text}:{voice}".encode()).hexdigest()[:12]
        output_path = str(_get_output_dir() / f"tts_{digest}.wav")

    # Try Pocket TTS (preferred)
    import time as _time
    _t0 = _time.monotonic()
    try:
        model = _load_model()
        voice_state = _get_voice_state(voice)
        audio = model.generate_audio(voice_state, text)

        sr = sample_rate or model.sample_rate
        # Tensor-like results expose .numpy(); anything else is written as is.
        audio_np = audio.numpy() if hasattr(audio, 'numpy') else audio

        import scipy.io.wavfile
        scipy.io.wavfile.write(output_path, sr, audio_np)

        duration = len(audio_np) / sr
        # Success log — required for #88: previously there was no
        # runtime confirmation that pocket_tts was actually synthesizing
        # (only failure paths logged). Operators couldn't tell from
        # langchain.log alone whether TTS was working or silently no-oping.
        _elapsed_ms = int((_time.monotonic() - _t0) * 1000)
        logger.info(
            f"pocket_tts synthesized {len(text)}ch → {output_path} "
            f"(sr={sr}Hz, dur={duration:.2f}s, voice={voice}, "
            f"latency={_elapsed_ms}ms)"
        )
        return json.dumps({
            "path": output_path,
            "duration": round(duration, 2),
            "sample_rate": sr,
            "voice": voice,
            "engine": "pocket-tts",
        })
    except ImportError:
        logger.info("pocket-tts not installed, trying espeak-ng fallback")
    except Exception as e:
        logger.warning(f"Pocket TTS synthesis failed: {e}")

    # Fallback: espeak-ng
    if _espeak_synthesize(text, output_path, voice="en"):
        _elapsed_ms = int((_time.monotonic() - _t0) * 1000)
        logger.info(
            f"pocket_tts(espeak fallback) synthesized {len(text)}ch → "
            f"{output_path} (latency={_elapsed_ms}ms)"
        )
        # espeak-ng does not report a duration, so read it back from the WAV
        # it just wrote. Stays 0 if the header cannot be parsed.
        duration, wav_rate = _wav_info(output_path)
        payload = {
            "path": output_path,
            "duration": duration,
            "voice": "en",
            "engine": "espeak-ng",
        }
        if wav_rate is not None:
            payload["sample_rate"] = wav_rate
        return json.dumps(payload)

    return json.dumps({"error": "No TTS engine available (install pocket-tts or espeak-ng)"})

225

226

def pocket_tts_list_voices() -> str:
    """List available TTS voices and the engine that would serve them.

    Built-in voices come first, followed by cloned voices found as
    ``*.safetensors`` files in the custom voices directory (sorted by path).
    A cloned file whose stem equals a built-in name is not listed twice.

    Returns:
        JSON string with 'voices' (list of id/name/type/language dicts),
        'count', 'engine' ("pocket-tts", "espeak-ng" or "none") and
        'builtin_count'.
    """
    voices = [
        {
            "id": name,
            "name": name.title(),
            "type": "builtin",
            "language": "en",
        }
        for name in _BUILTIN_VOICES
    ]

    # Custom cloned voices. _get_custom_voices_dir() creates the directory,
    # so no existence check is needed before globbing.
    voices.extend(
        {
            "id": voice_file.stem,
            "name": voice_file.stem.title(),
            "type": "cloned",
            "language": "en",
        }
        for voice_file in sorted(_get_custom_voices_dir().glob("*.safetensors"))
        if voice_file.stem not in _BUILTIN_VOICES
    )

    # Check which engine is available
    engine = "none"
    try:
        import pocket_tts  # noqa: F401
        engine = "pocket-tts"
    except ImportError:
        import subprocess
        probe_kwargs = dict(capture_output=True, text=True, timeout=5)
        # Only present on Windows: keeps a console window from flashing up.
        if hasattr(subprocess, 'CREATE_NO_WINDOW'):
            probe_kwargs['creationflags'] = subprocess.CREATE_NO_WINDOW
        try:
            probe = subprocess.run(["espeak-ng", "--version"], **probe_kwargs)
        except (OSError, subprocess.TimeoutExpired):
            # OSError covers "not installed" and "not executable" alike; a
            # failed probe leaves engine at "none" instead of raising out of
            # a function that promises a JSON string.
            pass
        else:
            if probe.returncode == 0:
                engine = "espeak-ng"

    return json.dumps({
        "voices": voices,
        "count": len(voices),
        "engine": engine,
        "builtin_count": len(_BUILTIN_VOICES),
    })

282

283

def pocket_tts_clone_voice(audio_path: str, name: str) -> str:
    """Clone a voice from an audio sample (5+ seconds recommended).

    Computes a voice state from the audio with Pocket TTS, saves it as
    ``<voices dir>/<name>.safetensors`` and caches it in memory under the
    normalized name. There is no espeak-ng fallback for cloning.

    Args:
        audio_path: Path to an existing audio sample file.
        name: Name to save the cloned voice as. It is stripped, lower-cased
            and has spaces replaced by '-'. It must be a plain file name:
            path separators and the names '.' / '..' are rejected.

    Returns:
        JSON string with 'cloned', 'name', 'path' keys on success, or a
        single 'error' key on invalid input or failure.
    """
    if not audio_path or not os.path.isfile(audio_path):
        return json.dumps({"error": "Valid audio_path required"})
    if not name or not name.strip():
        return json.dumps({"error": "Voice name required"})

    name = name.strip().lower().replace(" ", "-")

    # The name becomes part of a file path below. Without this check a value
    # such as "../../x" would write outside the custom voices directory.
    if "/" in name or "\\" in name or name in (".", ".."):
        return json.dumps(
            {"error": "Voice name must be a plain file name (no path separators)"}
        )

    try:
        model = _load_model()
        voice_state = model.get_state_for_audio_prompt(audio_path)

        # Export voice embedding for fast loading (not full model weights)
        save_path = _get_custom_voices_dir() / f"{name}.safetensors"
        from safetensors.torch import save_file
        if isinstance(voice_state, dict):
            save_file(voice_state, str(save_path))
        else:
            # save_file is given a dict in both branches; a bare state is
            # wrapped under a fixed key.
            save_file({"voice_embedding": voice_state}, str(save_path))

        # Cache it so the next synthesis with this name skips the disk load.
        _voice_states[name] = voice_state
        logger.info(f"Voice cloned: {name} from {audio_path}")

        return json.dumps({
            "cloned": True,
            "name": name,
            "path": str(save_path),
        })
    except ImportError:
        return json.dumps({"error": "pocket-tts required for voice cloning"})
    except Exception as e:
        return json.dumps({"error": f"Voice cloning failed: {e}"})

329

330

def unload_pocket_tts():
    """Drop the cached model and voice states so their memory can be reclaimed.

    After this call the next synthesis or cloning request reloads the model
    through ``_load_model()``.
    """
    global _tts_model
    _tts_model = None
    # Mutated in place, never rebound, so no ``global`` declaration is needed.
    _voice_states.clear()

    # Collect before touching the CUDA cache: objects that are still waiting
    # for the garbage collector keep their memory in use, so a cache clear
    # that runs first cannot release it.
    import gc
    gc.collect()

    try:
        from .vram_manager import clear_cuda_cache
    except ImportError:
        # Best-effort, like the model_storage lookup in _get_tts_dir(): the
        # model reference is already dropped, so a missing helper should not
        # turn the unload into an error.
        logger.debug("vram_manager unavailable; skipping CUDA cache clear")
    else:
        clear_cuda_cache()

    logger.info("Pocket TTS model unloaded")

343

344

345# ═══════════════════════════════════════════════════════════════

346# Service tool registration

347# ═══════════════════════════════════════════════════════════════

348

class PocketTTSTool:
    """Registers the Pocket TTS service-tool entry.

    The tool is described with an ``inprocess://`` base URL, i.e. there is no
    sidecar server behind it.
    """

    @classmethod
    def register_functions(cls):
        """Build the ``pocket_tts`` ServiceToolInfo and store it in the registry.

        The entry is marked healthy and written under the key ``"pocket_tts"``
        in ``service_tool_registry._tools``, replacing any previous entry.

        Returns:
            True, always.
        """

        def _string_param(description):
            # Every parameter of these endpoints is a plain string.
            return {"type": "string", "description": description}

        synthesize_spec = {
            "path": "/synthesize",
            "method": "POST",
            "description": (
                "Convert text to speech audio. "
                "Input: text (string), voice (optional voice name, "
                "default 'alba'), output_path (optional .wav path). "
                "Returns JSON with audio file path and duration."
            ),
            "params_schema": {
                "text": _string_param("Text to speak"),
                "voice": _string_param("Voice name or .wav path (default: alba)"),
                "output_path": _string_param("Output .wav path (optional)"),
            },
        }

        list_voices_spec = {
            "path": "/voices",
            "method": "GET",
            "description": "List available TTS voices (built-in + cloned).",
            "params_schema": {},
        }

        clone_voice_spec = {
            "path": "/clone",
            "method": "POST",
            "description": (
                "Clone a voice from an audio sample. "
                "Input: audio_path (path to .wav), name (voice name to save). "
                "Requires 5+ seconds of clear speech."
            ),
            "params_schema": {
                "audio_path": _string_param("Path to audio sample"),
                "name": _string_param("Name for the cloned voice"),
            },
        }

        summary = (
            "Offline text-to-speech via Pocket TTS (Kyutai). "
            "100M params, 6x real-time on CPU, zero-shot voice cloning "
            "from 5s audio. Falls back to espeak-ng if unavailable. "
            "MIT license, 100% local, zero cloud costs."
        )

        entry = ServiceToolInfo(
            name="pocket_tts",
            description=summary,
            base_url="inprocess://pocket_tts",
            endpoints={
                "synthesize": synthesize_spec,
                "list_voices": list_voices_spec,
                "clone_voice": clone_voice_spec,
            },
            health_endpoint="/health",
            tags=["tts", "speech", "synthesis", "voice", "offline", "pocket-tts"],
            timeout=60,
        )
        entry.is_healthy = True
        service_tool_registry._tools["pocket_tts"] = entry
        return True

Coverage for integrations / service_tools / pocket_tts_tool.py: 70.6%

136 statements