Coverage for integrations / service_tools / luxtts_tool.py: 49.2%
189 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
"""
LuxTTS tool — high-quality voice cloning TTS via ZipVoice-Distill (sherpa-onnx).

Benefits:
  - Zero-shot voice cloning from 3+ seconds of audio
  - 4-step distilled flow-matching diffusion
  - 24kHz output via Vocos vocoder (ONNX INT8)
  - Runs on CPU (sherpa-onnx), no GPU required
  - ~130MB models, auto-downloaded from GitHub releases
  - espeak-ng G2P for multilingual phonemization (EN + ZH)
  - Apache 2.0 license

Models: sherpa-onnx-zipvoice-distill-int8-zh-en-emilia + vocos_24khz.onnx

Fallback chain: LuxTTS (sherpa-onnx) → Pocket TTS → espeak-ng

Public API:
    luxtts_synthesize(text, voice_audio, output_path, ...) → JSON
    luxtts_list_voices() → JSON
    luxtts_clone_voice(audio_path, name) → JSON
    luxtts_benchmark(text, ...) → JSON
    unload_luxtts() → None
"""
25import json
26import logging
27import os
28import time
29import wave as wave_mod
30from pathlib import Path
31from typing import Optional
33import numpy as np
35from .registry import ServiceToolInfo, service_tool_registry
37logger = logging.getLogger(__name__)
# ═══════════════════════════════════════════════════════════════
# Constants
# ═══════════════════════════════════════════════════════════════
# Output sample rate in Hz, reported in the JSON payloads of this module.
SAMPLE_RATE = 24_000

# Base name of the ZipVoice model archive; also the directory it extracts to.
MODEL_TARBALL = "sherpa-onnx-zipvoice-distill-int8-zh-en-emilia"

# Download location of the model archive (bzip2-compressed tarball).
MODEL_DOWNLOAD_URL = "".join((
    "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/",
    MODEL_TARBALL,
    ".tar.bz2",
))

# Download location of the Vocos vocoder ONNX file.
VOCODER_URL = "".join((
    "https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/",
    "vocos_24khz.onnx",
))
# ═══════════════════════════════════════════════════════════════
# Cached engine (singleton)
# ═══════════════════════════════════════════════════════════════
# Engine instance shared by all calls; stays None until _load_engine() builds it.
_tts_engine = None

# voice_name -> (samples, sample_rate)
_prompt_cache = dict()
# ═══════════════════════════════════════════════════════════════
# Directory helpers
# ═══════════════════════════════════════════════════════════════
def _get_tts_dir() -> Path:
    """Return the LuxTTS model/output storage directory, creating it if needed.

    The directory comes from ``model_storage.get_tool_dir("luxtts")`` when the
    sibling ``model_storage`` module can be imported and the call succeeds.
    Otherwise it falls back to ``~/.hevolve/models/luxtts``.

    Returns:
        Path of the storage directory (guaranteed to exist on return).
    """
    try:
        from .model_storage import model_storage
        # Coerce to Path so a str (or other path-like) return value cannot
        # break the mkdir() call below; a None result falls through to the
        # fallback because Path(None) raises TypeError.
        tts_dir = Path(model_storage.get_tool_dir("luxtts"))
    except Exception:
        # Deliberate best-effort. ImportError is an Exception subclass, so a
        # single clause covers both a missing module and a failing lookup.
        tts_dir = Path(os.path.expanduser("~/.hevolve/models/luxtts"))
    tts_dir.mkdir(parents=True, exist_ok=True)
    return tts_dir
def _get_output_dir() -> Path:
    """Return the ``output`` folder inside the TTS directory, creating it on demand."""
    target = _get_tts_dir().joinpath("output")
    target.mkdir(parents=True, exist_ok=True)
    return target
def _get_voices_dir() -> Path:
    """Return the ``voices`` folder inside the TTS directory, creating it on demand."""
    target = _get_tts_dir().joinpath("voices")
    target.mkdir(parents=True, exist_ok=True)
    return target
def _get_model_dir() -> Path:
    """Return the path the model archive extracts to (not created here)."""
    base_dir = _get_tts_dir()
    return base_dir.joinpath(MODEL_TARBALL)
# ═══════════════════════════════════════════════════════════════
# Model download & engine init
# ═══════════════════════════════════════════════════════════════
def _download_to(url: str, dest: Path) -> None:
    """Fetch *url* into *dest*, publishing the file only once it is complete."""
    import urllib.request

    partial = dest.with_name(dest.name + ".part")
    try:
        urllib.request.urlretrieve(url, str(partial))
        # Atomic rename: an interrupted download never leaves a truncated file
        # at *dest*, where the exists() checks would mistake it for a good one.
        os.replace(str(partial), str(dest))
    finally:
        # Nothing to remove after a successful rename; cleans up after a failure.
        partial.unlink(missing_ok=True)


def _check_tar_members(tar, root: str) -> None:
    """Raise ValueError if any archive member or link target resolves outside *root*."""
    def _inside(path: str) -> bool:
        return path == root or path.startswith(root + os.sep)

    for member in tar.getmembers():
        dest = os.path.realpath(os.path.join(root, member.name))
        if not _inside(dest):
            raise ValueError(f"Unsafe tar member: {member.name}")
        if member.issym():
            # Symlink targets are interpreted relative to the link's directory.
            link = os.path.join(os.path.dirname(dest), member.linkname)
        elif member.islnk():
            # Hard-link targets are interpreted relative to the archive root.
            link = os.path.join(root, member.linkname)
        else:
            continue
        if not _inside(os.path.realpath(link)):
            raise ValueError(f"Unsafe tar member: {member.name}")


def _ensure_models() -> Path:
    """Download the ZipVoice models and the vocoder if they are not cached.

    The model archive is downloaded and extracted into the TTS directory when
    ``encoder.int8.onnx`` is missing from the model directory; the vocoder is
    downloaded when ``vocos_24khz.onnx`` is missing. Downloads are written to
    a temporary ``.part`` file and renamed on completion.

    Returns:
        Path of the model directory.

    Raises:
        ValueError: If the archive contains a member that would be extracted
            outside the TTS directory.
        Exception: Download and extraction errors propagate to the caller.
    """
    tts_dir = _get_tts_dir()
    model_dir = _get_model_dir()
    encoder_path = model_dir / "encoder.int8.onnx"
    vocoder_path = tts_dir / "vocos_24khz.onnx"

    if encoder_path.exists() and vocoder_path.exists():
        return model_dir

    import tarfile

    # Download and extract model tarball
    if not encoder_path.exists():
        tarball_path = tts_dir / f"{MODEL_TARBALL}.tar.bz2"
        if not tarball_path.exists():
            logger.info("Downloading ZipVoice models (~109MB)...")
            _download_to(MODEL_DOWNLOAD_URL, tarball_path)
            logger.info("Download complete.")

        logger.info("Extracting models...")
        root = os.path.realpath(str(tts_dir))
        try:
            with tarfile.open(str(tarball_path), 'r:bz2') as tar:
                # CVE-2007-4559: validate member paths before extraction
                _check_tar_members(tar, root)
                if hasattr(tarfile, "data_filter"):
                    # Extraction filters are available: let tarfile enforce
                    # the "data" policy in addition to the check above.
                    tar.extractall(str(tts_dir), filter="data")
                else:
                    tar.extractall(str(tts_dir))
        except (tarfile.ReadError, EOFError):
            # A corrupt or truncated archive would otherwise be reused on
            # every later call; remove it so the next call downloads again.
            tarball_path.unlink(missing_ok=True)
            raise
        tarball_path.unlink(missing_ok=True)
        logger.info("Models extracted.")

    # Download vocoder
    if not vocoder_path.exists():
        logger.info("Downloading Vocos vocoder (~54MB)...")
        _download_to(VOCODER_URL, vocoder_path)
        logger.info("Vocoder downloaded.")

    return model_dir
def _load_engine():
    """Load the sherpa-onnx TTS engine (lazy, cached in ``_tts_engine``).

    The first call imports ``sherpa_onnx``, makes sure the model files are
    present, builds the ZipVoice configuration and creates the engine. Later
    calls return the cached instance.

    The thread count is read from the ``LUXTTS_CPU_THREADS`` environment
    variable (default 4); a non-numeric or non-positive value falls back to 4
    instead of aborting the load.

    Returns:
        The ``sherpa_onnx.OfflineTts`` instance.

    Raises:
        ImportError: If ``sherpa_onnx`` is not installed.
        RuntimeError: If the TTS configuration fails validation.
    """
    global _tts_engine

    if _tts_engine is not None:
        return _tts_engine

    import sherpa_onnx

    model_dir = _ensure_models()
    vocoder_path = _get_tts_dir() / "vocos_24khz.onnx"

    raw_threads = os.environ.get('LUXTTS_CPU_THREADS', '4')
    try:
        num_threads = int(raw_threads)
    except ValueError:
        num_threads = 0
    if num_threads < 1:
        logger.warning(
            "Invalid LUXTTS_CPU_THREADS=%r; falling back to 4 threads", raw_threads
        )
        num_threads = 4

    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            zipvoice=sherpa_onnx.OfflineTtsZipvoiceModelConfig(
                tokens=str(model_dir / "tokens.txt"),
                encoder=str(model_dir / "encoder.int8.onnx"),
                decoder=str(model_dir / "decoder.int8.onnx"),
                data_dir=str(model_dir / "espeak-ng-data"),
                lexicon=str(model_dir / "lexicon.txt"),
                vocoder=str(vocoder_path),
                feat_scale=0.15,
                t_shift=0.4,
                target_rms=0.1,
                guidance_scale=1.2,
            ),
            provider='cpu',
            debug=False,
            num_threads=num_threads,
        ),
        max_num_sentences=1,
    )

    if not tts_config.validate():
        raise RuntimeError("ZipVoice TTS config validation failed")

    logger.info(f"Loading ZipVoice TTS engine ({num_threads} threads)...")
    _tts_engine = sherpa_onnx.OfflineTts(tts_config)
    logger.info("ZipVoice TTS engine ready.")
    return _tts_engine
def _read_prompt_wav(wav_path: str):
    """Read a mono 16-bit PCM WAV file as float32 samples for a sherpa-onnx prompt.

    Args:
        wav_path: Path of the WAV file to read.

    Returns:
        ``(samples, sample_rate)`` where ``samples`` is a 1-D float32 NumPy
        array scaled by 1/32768 and ``sample_rate`` is the file's frame rate.

    Raises:
        ValueError: If the file is not mono or not 16-bit. Raised explicitly
            rather than via ``assert`` so the check survives ``python -O``.
    """
    with wave_mod.open(wav_path, "rb") as wav:
        channels = wav.getnchannels()
        sample_width = wav.getsampwidth()
        if channels != 1:
            raise ValueError(f"Expected mono, got {channels} channels")
        if sample_width != 2:
            raise ValueError(f"Expected 16-bit, got {sample_width * 8}-bit")
        sample_rate = wav.getframerate()
        raw = wav.readframes(wav.getnframes())

    # WAV PCM data is little-endian; say so explicitly instead of relying on
    # the host byte order.
    samples = np.frombuffer(raw, dtype="<i2").astype(np.float32) / 32768.0
    return samples, sample_rate
def _get_prompt(voice: Optional[str]):
    """Resolve a voice name or WAV path to a ``(samples, sample_rate)`` tuple.

    Lookup order:
      1. a cached entry for ``voice``;
      2. ``<voices dir>/<voice>.wav``;
      3. ``voice`` itself, if it is an existing file path;
      4. ``<voices dir>/default.wav`` (cached under the key ``"default"``).

    Args:
        voice: Saved voice name, a WAV file path, or None for the default voice.

    Returns:
        ``(samples, sample_rate)``, or None when no reference audio is found.
    """
    if voice and voice in _prompt_cache:
        return _prompt_cache[voice]

    # Find the WAV file
    wav_path = None
    if voice:
        # Check voices directory
        saved = _get_voices_dir() / f"{voice}.wav"
        if saved.exists():
            wav_path = str(saved)
        elif os.path.isfile(voice):
            wav_path = voice

    if wav_path is None:
        # Try default voice
        default = _get_voices_dir() / "default.wav"
        if not default.exists():
            return None
        # The fallback is stored under "default", but the lookup at the top
        # only checks the requested name, so without this check every call
        # with voice=None (or an unknown name) re-read the file from disk.
        if "default" in _prompt_cache:
            return _prompt_cache["default"]
        wav_path = str(default)
        voice = "default"

    samples, sr = _read_prompt_wav(wav_path)
    _prompt_cache[voice] = (samples, sr)
    return samples, sr
# ═══════════════════════════════════════════════════════════════
# Public API
# ═══════════════════════════════════════════════════════════════
def luxtts_synthesize(
    text: str,
    voice_audio: Optional[str] = None,
    output_path: Optional[str] = None,
    device: Optional[str] = None,
    num_steps: int = 4,
    speed: float = 1.0,
    rms: float = 0.01,
) -> str:
    """Synthesize text to speech using ZipVoice (sherpa-onnx).

    Args:
        text: Text to synthesize.
        voice_audio: Path to reference voice audio (.wav, mono 16-bit, 3+ seconds).
            Or name of a previously cloned voice.
        output_path: Optional output .wav path. Auto-generated if None; the
            generated name is derived from the full text, the voice,
            ``num_steps`` and ``speed``.
        device: Ignored (sherpa-onnx uses CPU; kept for API compat).
        num_steps: Diffusion steps (3-4 optimal). Default 4. Must be >= 1.
        speed: Playback speed. Default 1.0. Must be a finite number > 0.
        rms: Ignored (sherpa-onnx handles internally; kept for API compat).

    Returns:
        JSON string. On success it has 'path', 'duration', 'sample_rate',
        'voice', 'engine', 'device', 'num_steps', 'latency_ms', 'rtf' and
        'realtime_factor'. On failure it has a single 'error' key; this
        function reports problems through that key rather than raising.
    """
    if not text or not text.strip():
        return json.dumps({"error": "Text is required"})

    # Tool callers may hand over strings; coerce once and reject values the
    # engine cannot use, before any model is downloaded or loaded.
    try:
        num_steps = int(num_steps)
        speed = float(speed)
    except (TypeError, ValueError):
        return json.dumps({"error": "num_steps must be an integer and speed must be a number"})
    if num_steps < 1:
        return json.dumps({"error": "num_steps must be >= 1"})
    if not 0 < speed < float("inf"):
        return json.dumps({"error": "speed must be a finite number > 0"})

    try:
        if output_path is None:
            import hashlib
            # Hash everything that influences the audio. Hashing only a text
            # prefix made long texts with the same opening overwrite each
            # other's output file.
            key = f"{text}:{voice_audio or 'default'}:{num_steps}:{speed}"
            digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:12]
            output_path = str(_get_output_dir() / f"luxtts_{digest}.wav")
        else:
            # Make sure the directory of a caller-supplied path exists.
            os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

        engine = _load_engine()

        # Resolve voice prompt
        prompt = _get_prompt(voice_audio)
        if prompt is None:
            return json.dumps({
                "error": "voice_audio required — provide a .wav reference (mono 16-bit, 3+ seconds)"
            })

        prompt_samples, prompt_sr = prompt
        # Use a generic prompt text (sherpa-onnx needs it for alignment)
        prompt_text = "This is a sample of my voice."

        # perf_counter is monotonic, so the latency cannot go negative or
        # jump when the wall clock is adjusted.
        t0 = time.perf_counter()
        audio = engine.generate(
            text,
            prompt_text,
            prompt_samples,
            prompt_sr,
            speed=speed,
            num_steps=num_steps,
        )
        gen_time = time.perf_counter() - t0

        if len(audio.samples) == 0:
            return json.dumps({"error": "TTS generation produced no audio"})

        import soundfile as sf
        sf.write(output_path, audio.samples, samplerate=audio.sample_rate, subtype='PCM_16')

        duration = len(audio.samples) / audio.sample_rate
        rtf = gen_time / duration if duration > 0 else 0

        return json.dumps({
            "path": output_path,
            "duration": round(duration, 2),
            "sample_rate": audio.sample_rate,
            "voice": voice_audio or "default",
            "engine": "zipvoice-sherpa-onnx",
            "device": "cpu",
            "num_steps": num_steps,
            "latency_ms": round(gen_time * 1000, 1),
            "rtf": round(rtf, 4),
            "realtime_factor": round(1.0 / rtf, 1) if rtf > 0 else 0,
        })
    except ImportError as e:
        if getattr(e, "name", None) == "soundfile":
            # The WAV writer is missing, not the engine; say so.
            logger.info(f"soundfile not installed: {e}")
            return json.dumps({"error": f"soundfile not available (pip install soundfile): {e}"})
        logger.info(f"sherpa-onnx not installed: {e}")
        return json.dumps({"error": f"sherpa-onnx not available (pip install sherpa-onnx): {e}"})
    except Exception as e:
        logger.warning(f"LuxTTS synthesis failed: {e}")
        return json.dumps({"error": f"LuxTTS synthesis failed: {e}"})
def luxtts_list_voices() -> str:
    """Enumerate the saved voice references and report engine availability.

    Returns:
        JSON string with 'voices' (one entry per ``*.wav`` file in the voices
        directory, sorted), 'count', 'engine', 'device' and 'sample_rate'.
    """
    voices_dir = _get_voices_dir()
    wav_files = sorted(voices_dir.glob("*.wav")) if voices_dir.exists() else []
    voices = [
        {
            "id": wav.stem,
            "name": wav.stem.replace("-", " ").replace("_", " ").title(),
            "type": "cloned",
            "format": "wav",
            "path": str(wav),
        }
        for wav in wav_files
    ]

    # The engine counts as available when the sherpa_onnx module imports.
    try:
        import sherpa_onnx  # noqa: F401
    except ImportError:
        engine_name = "not_installed"
    else:
        engine_name = "zipvoice-sherpa-onnx"

    payload = {
        "voices": voices,
        "count": len(voices),
        "engine": engine_name,
        "device": "cpu",
        "sample_rate": SAMPLE_RATE,
    }
    return json.dumps(payload)
def luxtts_clone_voice(audio_path: str, name: str) -> str:
    """Save a voice reference audio for LuxTTS voice cloning.

    LuxTTS encodes the reference at synthesis time, so this just stores the
    audio in the voices directory for reuse. ``.wav`` input is copied
    unchanged; any other format is converted with ``soundfile``, downmixed to
    mono and written as 16-bit PCM, which is what the prompt reader in this
    module requires.

    Args:
        audio_path: Path to .wav/.mp3 audio (3+ seconds of clear speech).
        name: Name to save the voice as. It is stripped, lower-cased and
            spaces become dashes. Names containing path separators are
            rejected so the file cannot land outside the voices directory.

    Returns:
        JSON with 'saved', 'name', 'path' on success, or 'error' on failure.
    """
    if not audio_path or not os.path.isfile(audio_path):
        return json.dumps({"error": "Valid audio_path required"})
    if not name or not name.strip():
        return json.dumps({"error": "Voice name required"})

    name = name.strip().lower().replace(" ", "-")
    # The name becomes a file name; "../x" or "a/b" would escape the voices
    # directory, so refuse anything that is not a bare file name.
    if "/" in name or "\\" in name or os.path.basename(name) != name:
        return json.dumps({"error": "Voice name must not contain path separators"})

    try:
        save_path = _get_voices_dir() / f"{name}.wav"

        # If not WAV, convert using soundfile
        if not audio_path.lower().endswith('.wav'):
            import soundfile as sf
            data, sr = sf.read(audio_path)
            if getattr(data, "ndim", 1) > 1:
                # Multi-channel audio is returned as (frames, channels);
                # average the channels to get the mono signal synthesis needs.
                data = data.mean(axis=1)
            sf.write(str(save_path), data, sr, subtype='PCM_16')
        elif not (save_path.exists() and os.path.samefile(audio_path, str(save_path))):
            # Copying a file onto itself raises SameFileError; re-saving a
            # voice from its own stored file is simply a no-op.
            import shutil
            shutil.copy2(audio_path, str(save_path))

        # Clear cache entry so it reloads
        _prompt_cache.pop(name, None)

        logger.info(f"LuxTTS voice saved: {name} from {audio_path}")
        return json.dumps({
            "saved": True,
            "name": name,
            "path": str(save_path),
        })
    except Exception as e:
        logger.warning(f"LuxTTS voice save failed: {e}")
        return json.dumps({"error": f"Voice save failed: {e}"})
def luxtts_benchmark(
    text: str = "Hello, this is a benchmark test for measuring text to speech performance.",
    device: Optional[str] = None,
    voice_audio: Optional[str] = None,
    num_runs: int = 3,
) -> str:
    """Benchmark LuxTTS performance on the current hardware.

    Runs one untimed warmup generation followed by ``num_runs`` timed
    generations (speed 1.0, 4 steps) and reports aggregate timings.

    Args:
        text: Text to synthesize for benchmarking.
        device: Ignored (kept for API compat).
        voice_audio: Reference voice audio path or voice name.
        num_runs: Number of benchmark runs (default 3). Must be >= 1.

    Returns:
        JSON with timing statistics, RTF, device info, or an 'error' key on
        failure.
    """
    # Validate before loading the engine: zero runs used to load the model,
    # run the warmup and then fail with "division by zero".
    try:
        num_runs = int(num_runs)
    except (TypeError, ValueError):
        return json.dumps({"error": "num_runs must be an integer"})
    if num_runs < 1:
        return json.dumps({"error": "num_runs must be >= 1"})

    try:
        engine = _load_engine()

        prompt = _get_prompt(voice_audio)
        if prompt is None:
            return json.dumps({"error": "voice_audio required for benchmark"})

        prompt_samples, prompt_sr = prompt
        prompt_text = "This is a sample of my voice."

        # Warmup
        engine.generate(text, prompt_text, prompt_samples, prompt_sr, speed=1.0, num_steps=4)

        times = []
        durations = []
        sample_rate = SAMPLE_RATE
        for _ in range(num_runs):
            # perf_counter is monotonic and unaffected by wall-clock changes.
            t0 = time.perf_counter()
            audio = engine.generate(text, prompt_text, prompt_samples, prompt_sr, speed=1.0, num_steps=4)
            times.append(time.perf_counter() - t0)
            durations.append(len(audio.samples) / audio.sample_rate)
            # Report the rate the engine actually produced.
            sample_rate = audio.sample_rate

        avg_time = sum(times) / len(times)
        avg_duration = sum(durations) / len(durations)
        avg_rtf = avg_time / avg_duration if avg_duration > 0 else 0

        return json.dumps({
            "engine": "zipvoice-sherpa-onnx",
            "device": "cpu",
            "num_runs": num_runs,
            "text_length": len(text),
            "avg_gen_time_ms": round(avg_time * 1000, 1),
            "min_gen_time_ms": round(min(times) * 1000, 1),
            "max_gen_time_ms": round(max(times) * 1000, 1),
            "avg_audio_duration_s": round(avg_duration, 2),
            "avg_rtf": round(avg_rtf, 4),
            "avg_realtime_factor": round(1.0 / avg_rtf, 1) if avg_rtf > 0 else 0,
            "sample_rate": sample_rate,
        })
    except ImportError as e:
        logger.info(f"sherpa-onnx not installed: {e}")
        return json.dumps({"error": f"sherpa-onnx not available: {e}"})
    except Exception as e:
        logger.warning(f"LuxTTS benchmark failed: {e}")
        return json.dumps({"error": f"Benchmark failed: {e}"})
def unload_luxtts():
    """Release the cached engine and prompt data, then run a garbage collection."""
    global _tts_engine
    import gc

    # Drop both module-level references before collecting.
    _prompt_cache.clear()
    _tts_engine = None

    gc.collect()
    logger.info("LuxTTS engine unloaded")
# ═══════════════════════════════════════════════════════════════
# Service tool registration
# ═══════════════════════════════════════════════════════════════
class LuxTTSTool:
    """Register LuxTTS as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Register the LuxTTS tool metadata with ``service_tool_registry``.

        Builds a ``ServiceToolInfo`` describing the synthesize, list_voices,
        clone_voice and benchmark endpoints, marks it healthy and stores it
        under the key ``"luxtts"``.

        The descriptions and tags state what the module does: CPU inference
        and 24 kHz output. Each schema lists the parameters the matching
        function accepts.

        Returns:
            True once the tool info has been stored.
        """
        tool_info = ServiceToolInfo(
            name="luxtts",
            description=(
                "High-quality voice cloning TTS via ZipVoice-Distill (sherpa-onnx). "
                "Zero-shot voice cloning from 3s audio. 24kHz Vocos vocoder. "
                "CPU ONNX inference. Apache 2.0 license."
            ),
            base_url="inprocess://luxtts",
            endpoints={
                "synthesize": {
                    "path": "/synthesize",
                    "method": "POST",
                    "description": (
                        "Synthesize text to speech with voice cloning. "
                        "Input: text, voice_audio (path to reference .wav), "
                        "output_path, num_steps (3-4), speed. "
                        "Returns 24kHz WAV."
                    ),
                    "params_schema": {
                        "text": {"type": "string", "description": "Text to speak"},
                        "voice_audio": {"type": "string", "description": "Path to reference voice audio"},
                        "output_path": {"type": "string", "description": "Output .wav path (auto-generated if omitted)"},
                        "device": {"type": "string", "description": "Ignored; inference runs on CPU"},
                        "num_steps": {"type": "integer", "description": "Diffusion steps (3-4 optimal)"},
                        "speed": {"type": "number", "description": "Playback speed (default 1.0)"},
                    },
                },
                "list_voices": {
                    "path": "/voices",
                    "method": "GET",
                    "description": "List saved voice references.",
                    "params_schema": {},
                },
                "clone_voice": {
                    "path": "/clone",
                    "method": "POST",
                    "description": "Save a voice reference for reuse (3+ seconds audio).",
                    "params_schema": {
                        "audio_path": {"type": "string", "description": "Path to audio sample"},
                        "name": {"type": "string", "description": "Name for the voice"},
                    },
                },
                "benchmark": {
                    "path": "/benchmark",
                    "method": "POST",
                    "description": "Run performance benchmark on current hardware.",
                    "params_schema": {
                        "text": {"type": "string", "description": "Text to benchmark with"},
                        "device": {"type": "string", "description": "Ignored; inference runs on CPU"},
                        "voice_audio": {"type": "string", "description": "Reference voice audio path or voice name"},
                        "num_runs": {"type": "integer", "description": "Number of benchmark runs (default 3)"},
                    },
                },
            },
            health_endpoint="/health",
            tags=["tts", "speech", "voice-cloning", "luxtts", "24khz", "cpu"],
            timeout=60,
        )
        tool_info.is_healthy = True
        # Stored directly in the registry's ``_tools`` mapping, as before.
        service_tool_registry._tools["luxtts"] = tool_info
        return True