Coverage for integrations / service_tools / luxtts_tool.py: 49.2%

189 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2LuxTTS tool — high-quality voice cloning TTS via ZipVoice-Distill (sherpa-onnx). 

3 

4Benefits: 

5 - Zero-shot voice cloning from 3+ seconds of audio 

6 - 4-step distilled flow-matching diffusion 

7 - 24kHz output via Vocos vocoder (ONNX INT8) 

8 - Runs on CPU (sherpa-onnx), no GPU required 

9 - ~130MB models, auto-downloaded from GitHub releases 

10 - espeak-ng G2P for multilingual phonemization (EN + ZH) 

11 - Apache 2.0 license 

12 

13Models: sherpa-onnx-zipvoice-distill-int8-zh-en-emilia + vocos_24khz.onnx 

14 

15Fallback chain: LuxTTS (sherpa-onnx) → Pocket TTS → espeak-ng 

16 

17Public API: 

18 luxtts_synthesize(text, voice_audio, output_path, ...) → JSON 

19 luxtts_list_voices() → JSON 

20 luxtts_clone_voice(audio_path, name) → JSON 

21 luxtts_benchmark(text, ...) → JSON 

22 unload_luxtts() → None 

23""" 

24 

25import json 

26import logging 

27import os 

28import time 

29import wave as wave_mod 

30from pathlib import Path 

31from typing import Optional 

32 

33import numpy as np 

34 

35from .registry import ServiceToolInfo, service_tool_registry 

36 

37logger = logging.getLogger(__name__) 

38 

# ═══════════════════════════════════════════════════════════════
# Constants
# ═══════════════════════════════════════════════════════════════

# Output sample rate (Hz) of the Vocos vocoder; all generated WAVs use this.
SAMPLE_RATE = 24000
# Release-asset name of the ZipVoice-Distill model bundle (INT8, Chinese+English).
MODEL_TARBALL = "sherpa-onnx-zipvoice-distill-int8-zh-en-emilia"
# Tarball containing tokens, encoder/decoder ONNX models, lexicon, espeak-ng data.
MODEL_DOWNLOAD_URL = (
    "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/"
    f"{MODEL_TARBALL}.tar.bz2"
)
# Vocos vocoder ONNX model, distributed as a separate release asset.
VOCODER_URL = (
    "https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/"
    "vocos_24khz.onnx"
)

# ═══════════════════════════════════════════════════════════════
# Cached engine (singleton)
# ═══════════════════════════════════════════════════════════════

# Lazily-created sherpa-onnx OfflineTts instance (see _load_engine / unload_luxtts).
_tts_engine = None
# voice_name -> (samples, sample_rate) float32 reference-audio cache.
_prompt_cache = {}

60 

61 

62# ═══════════════════════════════════════════════════════════════ 

63# Directory helpers 

64# ═══════════════════════════════════════════════════════════════ 

65 

66def _get_tts_dir() -> Path: 

67 """Get the LuxTTS model/output storage directory.""" 

68 try: 

69 from .model_storage import model_storage 

70 tts_dir = model_storage.get_tool_dir("luxtts") 

71 except (ImportError, Exception): 

72 tts_dir = Path(os.path.expanduser("~/.hevolve/models/luxtts")) 

73 tts_dir.mkdir(parents=True, exist_ok=True) 

74 return tts_dir 

75 

76 

def _get_output_dir() -> Path:
    """Return (creating on first use) the directory for synthesized WAV output."""
    path = _get_tts_dir().joinpath("output")
    path.mkdir(parents=True, exist_ok=True)
    return path

81 

82 

def _get_voices_dir() -> Path:
    """Return (creating on first use) the directory holding saved voice references."""
    path = _get_tts_dir().joinpath("voices")
    path.mkdir(parents=True, exist_ok=True)
    return path

87 

88 

def _get_model_dir() -> Path:
    """Return the path of the extracted ZipVoice model bundle (may not exist yet)."""
    return _get_tts_dir().joinpath(MODEL_TARBALL)

91 

92 

93# ═══════════════════════════════════════════════════════════════ 

94# Model download & engine init 

95# ═══════════════════════════════════════════════════════════════ 

96 

def _ensure_models() -> Path:
    """Download and extract the ZipVoice models and Vocos vocoder if absent.

    Presence is detected via the encoder ONNX file and the vocoder file;
    downloads are skipped when both already exist.

    Returns:
        Path to the extracted model directory.

    Raises:
        ValueError: if the downloaded tarball contains unsafe member paths.
    """
    model_dir = _get_model_dir()
    encoder_path = model_dir / "encoder.int8.onnx"
    vocoder_path = _get_tts_dir() / "vocos_24khz.onnx"

    if encoder_path.exists() and vocoder_path.exists():
        return model_dir

    import urllib.request
    import tarfile

    # Download and extract the model tarball (tokens, encoder/decoder, G2P data).
    if not encoder_path.exists():
        tarball_path = _get_tts_dir() / f"{MODEL_TARBALL}.tar.bz2"
        if not tarball_path.exists():
            # Fixed: was an f-string with no placeholders.
            logger.info("Downloading ZipVoice models (~109MB)...")
            urllib.request.urlretrieve(MODEL_DOWNLOAD_URL, str(tarball_path))
            logger.info("Download complete.")

        logger.info("Extracting models...")
        with tarfile.open(str(tarball_path), 'r:bz2') as tar:
            # CVE-2007-4559: reject absolute paths and traversal components so
            # a malicious tarball cannot write outside the destination.
            for member in tar.getmembers():
                if member.name.startswith('/') or '..' in member.name:
                    raise ValueError(f"Unsafe tar member: {member.name}")
            tar.extractall(str(_get_tts_dir()))
        tarball_path.unlink(missing_ok=True)
        logger.info("Models extracted.")

    # Download the Vocos vocoder (shipped as a separate release asset).
    if not vocoder_path.exists():
        logger.info("Downloading Vocos vocoder (~54MB)...")
        urllib.request.urlretrieve(VOCODER_URL, str(vocoder_path))
        logger.info("Vocoder downloaded.")

    return model_dir

134 

135 

def _load_engine():
    """Construct (once) and return the sherpa-onnx ZipVoice TTS engine.

    The engine is cached in the module-level ``_tts_engine`` singleton, so
    repeated calls are cheap. Raises RuntimeError when the assembled config
    fails sherpa-onnx validation (e.g. missing model files).
    """
    global _tts_engine

    if _tts_engine is not None:
        return _tts_engine

    import sherpa_onnx

    model_dir = _ensure_models()
    vocoder_path = _get_tts_dir() / "vocos_24khz.onnx"
    num_threads = int(os.environ.get('LUXTTS_CPU_THREADS', '4'))

    # Build the config inside-out: ZipVoice model paths and sampling knobs first.
    zipvoice_cfg = sherpa_onnx.OfflineTtsZipvoiceModelConfig(
        tokens=str(model_dir / "tokens.txt"),
        encoder=str(model_dir / "encoder.int8.onnx"),
        decoder=str(model_dir / "decoder.int8.onnx"),
        data_dir=str(model_dir / "espeak-ng-data"),
        lexicon=str(model_dir / "lexicon.txt"),
        vocoder=str(vocoder_path),
        feat_scale=0.15,
        t_shift=0.4,
        target_rms=0.1,
        guidance_scale=1.2,
    )
    model_cfg = sherpa_onnx.OfflineTtsModelConfig(
        zipvoice=zipvoice_cfg,
        provider='cpu',
        debug=False,
        num_threads=num_threads,
    )
    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=model_cfg,
        max_num_sentences=1,
    )

    if not tts_config.validate():
        raise RuntimeError("ZipVoice TTS config validation failed")

    logger.info(f"Loading ZipVoice TTS engine ({num_threads} threads)...")
    _tts_engine = sherpa_onnx.OfflineTts(tts_config)
    logger.info("ZipVoice TTS engine ready.")
    return _tts_engine

177 

178 

179def _read_prompt_wav(wav_path: str): 

180 """Read a WAV file as float32 samples + sample_rate for sherpa-onnx prompt.""" 

181 with wave_mod.open(wav_path) as f: 

182 assert f.getnchannels() == 1, f"Expected mono, got {f.getnchannels()} channels" 

183 assert f.getsampwidth() == 2, f"Expected 16-bit, got {f.getsampwidth()*8}-bit" 

184 num_samples = f.getnframes() 

185 raw = f.readframes(num_samples) 

186 samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0 

187 return samples, f.getframerate() 

188 

189 

def _get_prompt(voice: Optional[str]):
    """Resolve a voice name (or WAV path) to a (samples, sample_rate) tuple.

    Lookup order: in-memory cache -> saved voice in the voices directory ->
    `voice` interpreted as a literal file path -> the saved "default" voice.
    Returns None when no usable reference audio can be found.
    """
    if voice and voice in _prompt_cache:
        return _prompt_cache[voice]

    wav_path = None
    if voice:
        # Saved voice takes precedence over a raw file path of the same name.
        candidate = _get_voices_dir() / f"{voice}.wav"
        if candidate.exists():
            wav_path = str(candidate)
        elif os.path.isfile(voice):
            wav_path = voice

    if wav_path is None:
        fallback = _get_voices_dir() / "default.wav"
        if fallback.exists():
            wav_path = str(fallback)
            voice = "default"

    if wav_path is None:
        return None

    loaded = _read_prompt_wav(wav_path)
    _prompt_cache[voice] = loaded
    return loaded

218 

219 

220# ═══════════════════════════════════════════════════════════════ 

221# Public API 

222# ═══════════════════════════════════════════════════════════════ 

223 

def luxtts_synthesize(
    text: str,
    voice_audio: Optional[str] = None,
    output_path: Optional[str] = None,
    device: Optional[str] = None,
    num_steps: int = 4,
    speed: float = 1.0,
    rms: float = 0.01,
) -> str:
    """Synthesize text to speech using ZipVoice (sherpa-onnx).

    Args:
        text: Text to synthesize.
        voice_audio: Path to reference voice audio (.wav, mono 16-bit,
            3+ seconds), or the name of a previously cloned voice.
        output_path: Optional output .wav path; auto-generated when None.
        device: Ignored (sherpa-onnx uses CPU; kept for API compat).
        num_steps: Diffusion steps (3-4 optimal). Default 4.
        speed: Playback speed. Default 1.0.
        rms: Ignored (sherpa-onnx handles internally; kept for API compat).

    Returns:
        JSON string with 'path', 'duration', 'device', 'rtf', 'latency_ms'
        on success, or an 'error' key on failure.
    """
    if not text or not text.strip():
        return json.dumps({"error": "Text is required"})

    if output_path is None:
        import hashlib
        # Deterministic name from text prefix + voice so repeated requests
        # map to the same output file.
        digest = hashlib.md5(f"{text[:50]}:{voice_audio or 'default'}".encode()).hexdigest()
        output_path = str(_get_output_dir() / f"luxtts_{digest[:12]}.wav")

    try:
        engine = _load_engine()

        prompt = _get_prompt(voice_audio)
        if prompt is None:
            return json.dumps({
                "error": "voice_audio required — provide a .wav reference (mono 16-bit, 3+ seconds)"
            })

        prompt_samples, prompt_sr = prompt
        # sherpa-onnx needs a transcript of the prompt audio for alignment;
        # a generic sentence is used since the real transcript is unknown.
        prompt_text = "This is a sample of my voice."

        started = time.time()
        audio = engine.generate(
            text,
            prompt_text,
            prompt_samples,
            prompt_sr,
            speed=speed,
            num_steps=num_steps,
        )
        gen_time = time.time() - started

        if len(audio.samples) == 0:
            return json.dumps({"error": "TTS generation produced no audio"})

        import soundfile as sf
        sf.write(output_path, audio.samples, samplerate=audio.sample_rate, subtype='PCM_16')

        duration = len(audio.samples) / audio.sample_rate
        rtf = gen_time / duration if duration > 0 else 0

        result = {
            "path": output_path,
            "duration": round(duration, 2),
            "sample_rate": audio.sample_rate,
            "voice": voice_audio or "default",
            "engine": "zipvoice-sherpa-onnx",
            "device": "cpu",
            "num_steps": num_steps,
            "latency_ms": round(gen_time * 1000, 1),
            "rtf": round(rtf, 4),
            "realtime_factor": round(1.0 / rtf, 1) if rtf > 0 else 0,
        }
        return json.dumps(result)
    except ImportError as e:
        logger.info(f"sherpa-onnx not installed: {e}")
        return json.dumps({"error": f"sherpa-onnx not available (pip install sherpa-onnx): {e}"})
    except Exception as e:
        logger.warning(f"LuxTTS synthesis failed: {e}")
        return json.dumps({"error": f"LuxTTS synthesis failed: {e}"})

308 

309 

def luxtts_list_voices() -> str:
    """List available cloned voices for LuxTTS.

    Returns:
        JSON with a 'voices' list, count, engine availability, and device info.
    """
    voices_dir = _get_voices_dir()
    wav_files = sorted(voices_dir.glob("*.wav")) if voices_dir.exists() else []
    voices = [
        {
            "id": f.stem,
            "name": f.stem.replace("-", " ").replace("_", " ").title(),
            "type": "cloned",
            "format": "wav",
            "path": str(f),
        }
        for f in wav_files
    ]

    # Report whether the sherpa-onnx runtime is importable without loading it.
    try:
        import sherpa_onnx  # noqa: F401
        engine_available = True
    except ImportError:
        engine_available = False

    return json.dumps({
        "voices": voices,
        "count": len(voices),
        "engine": "zipvoice-sherpa-onnx" if engine_available else "not_installed",
        "device": "cpu",
        "sample_rate": SAMPLE_RATE,
    })

342 

343 

def luxtts_clone_voice(audio_path: str, name: str) -> str:
    """Save a voice reference audio for LuxTTS voice cloning.

    LuxTTS encodes the reference at synthesis time, so this just copies
    the audio file to the voices directory for reuse.

    Args:
        audio_path: Path to .wav/.mp3 audio (3+ seconds of clear speech).
        name: Name to save the voice as.

    Returns:
        JSON with 'saved', 'name', 'path' on success, or an 'error' key.
    """
    if not audio_path or not os.path.isfile(audio_path):
        return json.dumps({"error": "Valid audio_path required"})
    if not name or not name.strip():
        return json.dumps({"error": "Voice name required"})

    # Normalize to a filesystem-friendly voice id.
    name = name.strip().lower().replace(" ", "-")
    save_path = _get_voices_dir() / f"{name}.wav"

    try:
        import shutil
        if audio_path.lower().endswith('.wav'):
            shutil.copy2(audio_path, str(save_path))
        else:
            # Non-WAV input: re-encode to WAV via soundfile.
            # NOTE(review): channel count is preserved as-is — presumably
            # callers supply mono audio; _read_prompt_wav rejects multi-channel
            # files at synthesis time. Confirm whether downmixing is wanted.
            import soundfile as sf
            data, sr = sf.read(audio_path)
            sf.write(str(save_path), data, sr)

        # Drop any stale cache entry so the next synthesis reloads the file.
        _prompt_cache.pop(name, None)

        logger.info(f"LuxTTS voice saved: {name} from {audio_path}")
        return json.dumps({
            "saved": True,
            "name": name,
            "path": str(save_path),
        })
    except Exception as e:
        return json.dumps({"error": f"Voice save failed: {e}"})

386 

387 

def luxtts_benchmark(
    text: str = "Hello, this is a benchmark test for measuring text to speech performance.",
    device: Optional[str] = None,
    voice_audio: Optional[str] = None,
    num_runs: int = 3,
) -> str:
    """Benchmark LuxTTS performance on the current hardware.

    Args:
        text: Text to synthesize for benchmarking.
        device: Ignored (kept for API compat).
        voice_audio: Reference voice audio path or voice name.
        num_runs: Number of benchmark runs (default 3).

    Returns:
        JSON with timing statistics, RTF, device info, or an 'error' key.
    """
    try:
        engine = _load_engine()

        prompt = _get_prompt(voice_audio)
        if prompt is None:
            return json.dumps({"error": "voice_audio required for benchmark"})

        prompt_samples, prompt_sr = prompt
        prompt_text = "This is a sample of my voice."

        # One warmup run so one-time session setup is excluded from timings.
        engine.generate(text, prompt_text, prompt_samples, prompt_sr, speed=1.0, num_steps=4)

        times = []
        durations = []
        for _ in range(num_runs):
            started = time.time()
            audio = engine.generate(text, prompt_text, prompt_samples, prompt_sr, speed=1.0, num_steps=4)
            times.append(time.time() - started)
            durations.append(len(audio.samples) / audio.sample_rate)

        avg_time = sum(times) / len(times)
        avg_duration = sum(durations) / len(durations)
        avg_rtf = avg_time / avg_duration if avg_duration > 0 else 0

        stats = {
            "engine": "zipvoice-sherpa-onnx",
            "device": "cpu",
            "num_runs": num_runs,
            "text_length": len(text),
            "avg_gen_time_ms": round(avg_time * 1000, 1),
            "min_gen_time_ms": round(min(times) * 1000, 1),
            "max_gen_time_ms": round(max(times) * 1000, 1),
            "avg_audio_duration_s": round(avg_duration, 2),
            "avg_rtf": round(avg_rtf, 4),
            "avg_realtime_factor": round(1.0 / avg_rtf, 1) if avg_rtf > 0 else 0,
            "sample_rate": SAMPLE_RATE,
        }
        return json.dumps(stats)
    except ImportError as e:
        return json.dumps({"error": f"sherpa-onnx not available: {e}"})
    except Exception as e:
        return json.dumps({"error": f"Benchmark failed: {e}"})

448 

449 

def unload_luxtts():
    """Release the cached LuxTTS engine and prompt cache to free memory."""
    import gc

    global _tts_engine
    _tts_engine = None
    _prompt_cache.clear()
    # Force a collection so the ONNX sessions are reclaimed promptly.
    gc.collect()
    logger.info("LuxTTS engine unloaded")

459 

460 

461# ═══════════════════════════════════════════════════════════════ 

462# Service tool registration 

463# ═══════════════════════════════════════════════════════════════ 

464 

class LuxTTSTool:
    """Register LuxTTS as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Register LuxTTS functions with service_tool_registry.

        Builds a ServiceToolInfo describing the in-process endpoints and
        installs it directly into the registry. Returns True on success.
        """
        tool_info = ServiceToolInfo(
            name="luxtts",
            description=(
                "High-quality voice cloning TTS via ZipVoice-Distill (sherpa-onnx). "
                "Zero-shot voice cloning from 3s audio. 24kHz Vocos vocoder. "
                "CPU ONNX inference. Apache 2.0 license."
            ),
            base_url="inprocess://luxtts",
            endpoints={
                "synthesize": {
                    "path": "/synthesize",
                    "method": "POST",
                    "description": (
                        "Synthesize text to speech with voice cloning. "
                        "Input: text, voice_audio (path to reference .wav), "
                        "num_steps (3-4). "
                        # Fixed: the engine outputs 24kHz (Vocos vocoder), not 48kHz.
                        "Returns 24kHz WAV."
                    ),
                    "params_schema": {
                        "text": {"type": "string", "description": "Text to speak"},
                        "voice_audio": {"type": "string", "description": "Path to reference voice audio"},
                        # Fixed: device is accepted but ignored by luxtts_synthesize (CPU-only).
                        "device": {"type": "string", "description": "Ignored; CPU-only (kept for API compat)"},
                        "num_steps": {"type": "integer", "description": "Diffusion steps (3-4 optimal)"},
                    },
                },
                "list_voices": {
                    "path": "/voices",
                    "method": "GET",
                    "description": "List saved voice references.",
                    "params_schema": {},
                },
                "clone_voice": {
                    "path": "/clone",
                    "method": "POST",
                    "description": "Save a voice reference for reuse (3+ seconds audio).",
                    "params_schema": {
                        "audio_path": {"type": "string", "description": "Path to audio sample"},
                        "name": {"type": "string", "description": "Name for the voice"},
                    },
                },
                "benchmark": {
                    "path": "/benchmark",
                    "method": "POST",
                    "description": "Run performance benchmark on current hardware.",
                    "params_schema": {
                        "text": {"type": "string", "description": "Text to benchmark with"},
                        "device": {"type": "string", "description": "Ignored; CPU-only (kept for API compat)"},
                    },
                },
            },
            health_endpoint="/health",
            # Fixed tags: engine is 24kHz and CPU-only (was "48khz"/"gpu",
            # contradicting SAMPLE_RATE = 24000 and the CPU provider).
            tags=["tts", "speech", "voice-cloning", "luxtts", "24khz", "cpu"],
            timeout=60,
        )
        # In-process tool: mark healthy immediately and register directly.
        tool_info.is_healthy = True
        service_tool_registry._tools["luxtts"] = tool_info
        return True

527 return True