Coverage for integrations / service_tools / luxtts_tool.py: 49.2%

189 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2LuxTTS tool — high-quality voice cloning TTS via ZipVoice-Distill (sherpa-onnx). 

3 

4Benefits: 

5 - Zero-shot voice cloning from 3+ seconds of audio 

6 - 4-step distilled flow-matching diffusion 

7 - 24kHz output via Vocos vocoder (ONNX INT8) 

8 - Runs on CPU (sherpa-onnx), no GPU required 

9 - ~130MB models, auto-downloaded from GitHub releases 

10 - espeak-ng G2P for multilingual phonemization (EN + ZH) 

11 - Apache 2.0 license 

12 

13Models: sherpa-onnx-zipvoice-distill-int8-zh-en-emilia + vocos_24khz.onnx 

14 

15Fallback chain: LuxTTS (sherpa-onnx) → Pocket TTS → espeak-ng 

16 

17Public API: 

18 luxtts_synthesize(text, voice_audio, output_path, ...) → JSON 

19 luxtts_list_voices() → JSON 

20 luxtts_clone_voice(audio_path, name) → JSON 

21 luxtts_benchmark(text, ...) → JSON 

22 unload_luxtts() → None 

23""" 

24 

25import json 

26import logging 

27import os 

28import time 

29import wave as wave_mod 

30from pathlib import Path 

31from typing import Optional 

32 

33import numpy as np 

34 

35from .registry import ServiceToolInfo, service_tool_registry 

36 

37logger = logging.getLogger(__name__) 

38 

# ═══════════════════════════════════════════════════════════════
# Constants
# ═══════════════════════════════════════════════════════════════

# Output sample rate (Hz) of the Vocos vocoder; all generated WAVs use this.
SAMPLE_RATE = 24000
# Release-asset name of the ZipVoice-Distill model bundle (INT8, Chinese+English).
MODEL_TARBALL = "sherpa-onnx-zipvoice-distill-int8-zh-en-emilia"
# Tarball containing tokens, encoder/decoder ONNX models, lexicon, espeak-ng data.
MODEL_DOWNLOAD_URL = (
    "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/"
    f"{MODEL_TARBALL}.tar.bz2"
)
# Vocos vocoder ONNX model, distributed as a separate release asset.
VOCODER_URL = (
    "https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/"
    "vocos_24khz.onnx"
)

# ═══════════════════════════════════════════════════════════════
# Cached engine (singleton)
# ═══════════════════════════════════════════════════════════════

# Lazily-created sherpa-onnx OfflineTts instance (see _load_engine / unload_luxtts).
_tts_engine = None
# voice_name -> (samples, sample_rate) float32 reference-audio cache.
_prompt_cache = {}

60 

61 

62# ═══════════════════════════════════════════════════════════════ 

63# Directory helpers 

64# ═══════════════════════════════════════════════════════════════ 

65 

66def _get_tts_dir() -> Path: 

67 """Get the LuxTTS model/output storage directory.""" 

68 try: 

69 from .model_storage import model_storage 

70 tts_dir = model_storage.get_tool_dir("luxtts") 

71 except (ImportError, Exception): 

72 tts_dir = Path(os.path.expanduser("~/.hevolve/models/luxtts")) 

73 tts_dir.mkdir(parents=True, exist_ok=True) 

74 return tts_dir 

75 

76 

def _get_output_dir() -> Path:
    """Return (creating on first use) the directory for synthesized WAV output."""
    path = _get_tts_dir().joinpath("output")
    path.mkdir(parents=True, exist_ok=True)
    return path

81 

82 

def _get_voices_dir() -> Path:
    """Return (creating on first use) the directory holding saved voice references."""
    path = _get_tts_dir().joinpath("voices")
    path.mkdir(parents=True, exist_ok=True)
    return path

87 

88 

def _get_model_dir() -> Path:
    """Return the path of the extracted ZipVoice model bundle (may not exist yet)."""
    return _get_tts_dir().joinpath(MODEL_TARBALL)

91 

92 

93# ═══════════════════════════════════════════════════════════════ 

94# Model download & engine init 

95# ═══════════════════════════════════════════════════════════════ 

96 

def _ensure_models() -> Path:
    """Download and extract the ZipVoice models and Vocos vocoder if absent.

    Presence is detected via the encoder ONNX file and the vocoder file;
    downloads are skipped when both already exist.

    Returns:
        Path to the extracted model directory.

    Raises:
        ValueError: if the downloaded tarball contains unsafe member paths.
    """
    model_dir = _get_model_dir()
    encoder_path = model_dir / "encoder.int8.onnx"
    vocoder_path = _get_tts_dir() / "vocos_24khz.onnx"

    if encoder_path.exists() and vocoder_path.exists():
        return model_dir

    import urllib.request
    import tarfile

    # Download and extract the model tarball (tokens, encoder/decoder, G2P data).
    if not encoder_path.exists():
        tarball_path = _get_tts_dir() / f"{MODEL_TARBALL}.tar.bz2"
        if not tarball_path.exists():
            # Fixed: was an f-string with no placeholders.
            logger.info("Downloading ZipVoice models (~109MB)...")
            urllib.request.urlretrieve(MODEL_DOWNLOAD_URL, str(tarball_path))
            logger.info("Download complete.")

        logger.info("Extracting models...")
        with tarfile.open(str(tarball_path), 'r:bz2') as tar:
            # CVE-2007-4559: reject absolute paths and traversal components so
            # a malicious tarball cannot write outside the destination.
            for member in tar.getmembers():
                if member.name.startswith('/') or '..' in member.name:
                    raise ValueError(f"Unsafe tar member: {member.name}")
            tar.extractall(str(_get_tts_dir()))
        tarball_path.unlink(missing_ok=True)
        logger.info("Models extracted.")

    # Download the Vocos vocoder (shipped as a separate release asset).
    if not vocoder_path.exists():
        logger.info("Downloading Vocos vocoder (~54MB)...")
        urllib.request.urlretrieve(VOCODER_URL, str(vocoder_path))
        logger.info("Vocoder downloaded.")

    return model_dir

134 

135 

def _load_engine():
    """Construct (once) and return the sherpa-onnx ZipVoice TTS engine.

    The engine is cached in the module-level ``_tts_engine`` singleton, so
    repeated calls are cheap. Raises RuntimeError when the assembled config
    fails sherpa-onnx validation (e.g. missing model files).
    """
    global _tts_engine

    if _tts_engine is not None:
        return _tts_engine

    import sherpa_onnx

    model_dir = _ensure_models()
    vocoder_path = _get_tts_dir() / "vocos_24khz.onnx"
    num_threads = int(os.environ.get('LUXTTS_CPU_THREADS', '4'))

    # Build the config inside-out: ZipVoice model paths and sampling knobs first.
    zipvoice_cfg = sherpa_onnx.OfflineTtsZipvoiceModelConfig(
        tokens=str(model_dir / "tokens.txt"),
        encoder=str(model_dir / "encoder.int8.onnx"),
        decoder=str(model_dir / "decoder.int8.onnx"),
        data_dir=str(model_dir / "espeak-ng-data"),
        lexicon=str(model_dir / "lexicon.txt"),
        vocoder=str(vocoder_path),
        feat_scale=0.15,
        t_shift=0.4,
        target_rms=0.1,
        guidance_scale=1.2,
    )
    model_cfg = sherpa_onnx.OfflineTtsModelConfig(
        zipvoice=zipvoice_cfg,
        provider='cpu',
        debug=False,
        num_threads=num_threads,
    )
    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=model_cfg,
        max_num_sentences=1,
    )

    if not tts_config.validate():
        raise RuntimeError("ZipVoice TTS config validation failed")

    logger.info(f"Loading ZipVoice TTS engine ({num_threads} threads)...")
    _tts_engine = sherpa_onnx.OfflineTts(tts_config)
    logger.info("ZipVoice TTS engine ready.")
    return _tts_engine

177 

178 

179def _read_prompt_wav(wav_path: str): 

180 """Read a WAV file as float32 samples + sample_rate for sherpa-onnx prompt.""" 

181 with wave_mod.open(wav_path) as f: 

182 assert f.getnchannels() == 1, f"Expected mono, got {f.getnchannels()} channels" 

183 assert f.getsampwidth() == 2, f"Expected 16-bit, got {f.getsampwidth()*8}-bit" 

184 num_samples = f.getnframes() 

185 raw = f.readframes(num_samples) 

186 samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0 

187 return samples, f.getframerate() 

188 

189 

def _get_prompt(voice: Optional[str]):
    """Resolve a voice name (or WAV path) to a (samples, sample_rate) tuple.

    Lookup order: in-memory cache -> saved voice in the voices directory ->
    `voice` interpreted as a literal file path -> the saved "default" voice.
    Returns None when no usable reference audio can be found.
    """
    if voice and voice in _prompt_cache:
        return _prompt_cache[voice]

    wav_path = None
    if voice:
        # Saved voice takes precedence over a raw file path of the same name.
        candidate = _get_voices_dir() / f"{voice}.wav"
        if candidate.exists():
            wav_path = str(candidate)
        elif os.path.isfile(voice):
            wav_path = voice

    if wav_path is None:
        fallback = _get_voices_dir() / "default.wav"
        if fallback.exists():
            wav_path = str(fallback)
            voice = "default"

    if wav_path is None:
        return None

    loaded = _read_prompt_wav(wav_path)
    _prompt_cache[voice] = loaded
    return loaded

218 

219 

220# ═══════════════════════════════════════════════════════════════ 

221# Public API 

222# ═══════════════════════════════════════════════════════════════ 

223 

def luxtts_synthesize(
    text: str,
    voice_audio: Optional[str] = None,
    output_path: Optional[str] = None,
    device: Optional[str] = None,
    num_steps: int = 4,
    speed: float = 1.0,
    rms: float = 0.01,
) -> str:
    """Synthesize text to speech using ZipVoice (sherpa-onnx).

    Args:
        text: Text to synthesize.
        voice_audio: Path to reference voice audio (.wav, mono 16-bit,
            3+ seconds), or the name of a previously cloned voice.
        output_path: Optional output .wav path; auto-generated when None.
        device: Ignored (sherpa-onnx uses CPU; kept for API compat).
        num_steps: Diffusion steps (3-4 optimal). Default 4.
        speed: Playback speed. Default 1.0.
        rms: Ignored (sherpa-onnx handles internally; kept for API compat).

    Returns:
        JSON string with 'path', 'duration', 'device', 'rtf', 'latency_ms'
        on success, or an 'error' key on failure.
    """
    if not text or not text.strip():
        return json.dumps({"error": "Text is required"})

    if output_path is None:
        import hashlib
        # Deterministic name from text prefix + voice so repeated requests
        # map to the same output file.
        digest = hashlib.md5(f"{text[:50]}:{voice_audio or 'default'}".encode()).hexdigest()
        output_path = str(_get_output_dir() / f"luxtts_{digest[:12]}.wav")

    try:
        engine = _load_engine()

        prompt = _get_prompt(voice_audio)
        if prompt is None:
            return json.dumps({
                "error": "voice_audio required — provide a .wav reference (mono 16-bit, 3+ seconds)"
            })

        prompt_samples, prompt_sr = prompt
        # sherpa-onnx needs a transcript of the prompt audio for alignment;
        # a generic sentence is used since the real transcript is unknown.
        prompt_text = "This is a sample of my voice."

        started = time.time()
        audio = engine.generate(
            text,
            prompt_text,
            prompt_samples,
            prompt_sr,
            speed=speed,
            num_steps=num_steps,
        )
        gen_time = time.time() - started

        if len(audio.samples) == 0:
            return json.dumps({"error": "TTS generation produced no audio"})

        import soundfile as sf
        sf.write(output_path, audio.samples, samplerate=audio.sample_rate, subtype='PCM_16')

        duration = len(audio.samples) / audio.sample_rate
        rtf = gen_time / duration if duration > 0 else 0

        result = {
            "path": output_path,
            "duration": round(duration, 2),
            "sample_rate": audio.sample_rate,
            "voice": voice_audio or "default",
            "engine": "zipvoice-sherpa-onnx",
            "device": "cpu",
            "num_steps": num_steps,
            "latency_ms": round(gen_time * 1000, 1),
            "rtf": round(rtf, 4),
            "realtime_factor": round(1.0 / rtf, 1) if rtf > 0 else 0,
        }
        return json.dumps(result)
    except ImportError as e:
        logger.info(f"sherpa-onnx not installed: {e}")
        return json.dumps({"error": f"sherpa-onnx not available (pip install sherpa-onnx): {e}"})
    except Exception as e:
        logger.warning(f"LuxTTS synthesis failed: {e}")
        return json.dumps({"error": f"LuxTTS synthesis failed: {e}"})

308 

309 

def luxtts_list_voices() -> str:
    """List available cloned voices for LuxTTS.

    Returns:
        JSON with a 'voices' list, count, engine availability, and device info.
    """
    voices_dir = _get_voices_dir()
    wav_files = sorted(voices_dir.glob("*.wav")) if voices_dir.exists() else []
    voices = [
        {
            "id": f.stem,
            "name": f.stem.replace("-", " ").replace("_", " ").title(),
            "type": "cloned",
            "format": "wav",
            "path": str(f),
        }
        for f in wav_files
    ]

    # Report whether the sherpa-onnx runtime is importable without loading it.
    try:
        import sherpa_onnx  # noqa: F401
        engine_available = True
    except ImportError:
        engine_available = False

    return json.dumps({
        "voices": voices,
        "count": len(voices),
        "engine": "zipvoice-sherpa-onnx" if engine_available else "not_installed",
        "device": "cpu",
        "sample_rate": SAMPLE_RATE,
    })

342 

343 

def luxtts_clone_voice(audio_path: str, name: str) -> str:
    """Save a voice reference audio for LuxTTS voice cloning.

    LuxTTS encodes the reference at synthesis time, so this just copies
    the audio file to the voices directory for reuse.

    Args:
        audio_path: Path to .wav/.mp3 audio (3+ seconds of clear speech).
        name: Name to save the voice as.

    Returns:
        JSON with 'saved', 'name', 'path' on success, or an 'error' key.
    """
    if not audio_path or not os.path.isfile(audio_path):
        return json.dumps({"error": "Valid audio_path required"})
    if not name or not name.strip():
        return json.dumps({"error": "Voice name required"})

    # Normalize to a filesystem-friendly voice id.
    name = name.strip().lower().replace(" ", "-")
    save_path = _get_voices_dir() / f"{name}.wav"

    try:
        import shutil
        if audio_path.lower().endswith('.wav'):
            shutil.copy2(audio_path, str(save_path))
        else:
            # Non-WAV input: re-encode to WAV via soundfile.
            # NOTE(review): channel count is preserved as-is — presumably
            # callers supply mono audio; _read_prompt_wav rejects multi-channel
            # files at synthesis time. Confirm whether downmixing is wanted.
            import soundfile as sf
            data, sr = sf.read(audio_path)
            sf.write(str(save_path), data, sr)

        # Drop any stale cache entry so the next synthesis reloads the file.
        _prompt_cache.pop(name, None)

        logger.info(f"LuxTTS voice saved: {name} from {audio_path}")
        return json.dumps({
            "saved": True,
            "name": name,
            "path": str(save_path),
        })
    except Exception as e:
        return json.dumps({"error": f"Voice save failed: {e}"})

386 

387 

def luxtts_benchmark(
    text: str = "Hello, this is a benchmark test for measuring text to speech performance.",
    device: Optional[str] = None,
    voice_audio: Optional[str] = None,
    num_runs: int = 3,
) -> str:
    """Benchmark LuxTTS performance on the current hardware.

    Args:
        text: Text to synthesize for benchmarking.
        device: Ignored (kept for API compat).
        voice_audio: Reference voice audio path or voice name.
        num_runs: Number of benchmark runs (default 3).

    Returns:
        JSON with timing statistics, RTF, device info, or an 'error' key.
    """
    try:
        engine = _load_engine()

        prompt = _get_prompt(voice_audio)
        if prompt is None:
            return json.dumps({"error": "voice_audio required for benchmark"})

        prompt_samples, prompt_sr = prompt
        prompt_text = "This is a sample of my voice."

        # One warmup run so one-time session setup is excluded from timings.
        engine.generate(text, prompt_text, prompt_samples, prompt_sr, speed=1.0, num_steps=4)

        times = []
        durations = []
        for _ in range(num_runs):
            started = time.time()
            audio = engine.generate(text, prompt_text, prompt_samples, prompt_sr, speed=1.0, num_steps=4)
            times.append(time.time() - started)
            durations.append(len(audio.samples) / audio.sample_rate)

        avg_time = sum(times) / len(times)
        avg_duration = sum(durations) / len(durations)
        avg_rtf = avg_time / avg_duration if avg_duration > 0 else 0

        stats = {
            "engine": "zipvoice-sherpa-onnx",
            "device": "cpu",
            "num_runs": num_runs,
            "text_length": len(text),
            "avg_gen_time_ms": round(avg_time * 1000, 1),
            "min_gen_time_ms": round(min(times) * 1000, 1),
            "max_gen_time_ms": round(max(times) * 1000, 1),
            "avg_audio_duration_s": round(avg_duration, 2),
            "avg_rtf": round(avg_rtf, 4),
            "avg_realtime_factor": round(1.0 / avg_rtf, 1) if avg_rtf > 0 else 0,
            "sample_rate": SAMPLE_RATE,
        }
        return json.dumps(stats)
    except ImportError as e:
        return json.dumps({"error": f"sherpa-onnx not available: {e}"})
    except Exception as e:
        return json.dumps({"error": f"Benchmark failed: {e}"})

448 

449 

def unload_luxtts():
    """Release the cached LuxTTS engine and prompt cache to free memory."""
    import gc

    global _tts_engine
    _tts_engine = None
    _prompt_cache.clear()
    # Force a collection so the ONNX sessions are reclaimed promptly.
    gc.collect()
    logger.info("LuxTTS engine unloaded")

459 

460 

461# ═══════════════════════════════════════════════════════════════ 

462# Service tool registration 

463# ═══════════════════════════════════════════════════════════════ 

464 

class LuxTTSTool:
    """Register LuxTTS as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Register LuxTTS functions with service_tool_registry.

        Builds a ServiceToolInfo describing the in-process endpoints and
        installs it directly into the registry. Returns True on success.
        """
        tool_info = ServiceToolInfo(
            name="luxtts",
            description=(
                "High-quality voice cloning TTS via ZipVoice-Distill (sherpa-onnx). "
                "Zero-shot voice cloning from 3s audio. 24kHz Vocos vocoder. "
                "CPU ONNX inference. Apache 2.0 license."
            ),
            base_url="inprocess://luxtts",
            endpoints={
                "synthesize": {
                    "path": "/synthesize",
                    "method": "POST",
                    "description": (
                        "Synthesize text to speech with voice cloning. "
                        "Input: text, voice_audio (path to reference .wav), "
                        "num_steps (3-4). "
                        # Fixed: the engine outputs 24kHz (Vocos vocoder), not 48kHz.
                        "Returns 24kHz WAV."
                    ),
                    "params_schema": {
                        "text": {"type": "string", "description": "Text to speak"},
                        "voice_audio": {"type": "string", "description": "Path to reference voice audio"},
                        # Fixed: device is accepted but ignored by luxtts_synthesize (CPU-only).
                        "device": {"type": "string", "description": "Ignored; CPU-only (kept for API compat)"},
                        "num_steps": {"type": "integer", "description": "Diffusion steps (3-4 optimal)"},
                    },
                },
                "list_voices": {
                    "path": "/voices",
                    "method": "GET",
                    "description": "List saved voice references.",
                    "params_schema": {},
                },
                "clone_voice": {
                    "path": "/clone",
                    "method": "POST",
                    "description": "Save a voice reference for reuse (3+ seconds audio).",
                    "params_schema": {
                        "audio_path": {"type": "string", "description": "Path to audio sample"},
                        "name": {"type": "string", "description": "Name for the voice"},
                    },
                },
                "benchmark": {
                    "path": "/benchmark",
                    "method": "POST",
                    "description": "Run performance benchmark on current hardware.",
                    "params_schema": {
                        "text": {"type": "string", "description": "Text to benchmark with"},
                        "device": {"type": "string", "description": "Ignored; CPU-only (kept for API compat)"},
                    },
                },
            },
            health_endpoint="/health",
            # Fixed tags: engine is 24kHz and CPU-only (was "48khz"/"gpu",
            # contradicting SAMPLE_RATE = 24000 and the CPU provider).
            tags=["tts", "speech", "voice-cloning", "luxtts", "24khz", "cpu"],
            timeout=60,
        )
        # In-process tool: mark healthy immediately and register directly.
        tool_info.is_healthy = True
        service_tool_registry._tools["luxtts"] = tool_info
        return True

527 return True