Coverage for integrations/service_tools/kokoro_tool.py: 37.5% (48 statements)
1"""
2Kokoro TTS tool — 82M-parameter English-first voice, CPU or GPU.
4Kokoro (https://huggingface.co/hexgrad/Kokoro-82M) is a tiny neural
5TTS model that fits between Piper (fast but robotic) and the big
6voice-clone engines (F5, Chatterbox, CosyVoice3). It runs at ~1x real
7time on CPU with better quality than Piper's best voices, so it's the
8right second rung on the English fallback ladder:
10 chatterbox_turbo (GPU) → kokoro (CPU/GPU) → piper (CPU bundled)
12Why it lives here instead of in Nunba:
13 - It's a neural model; we isolate it in a subprocess the same way
14 the other TTS engines are isolated via gpu_worker so a crash can't
15 take down the main process.
16 - Nunba's tts_engine.py routes to this via the shared
17 `_SubprocessTTSBackend` adapter — no parallel in-process impl.
19VRAM: ~200MB if GPU, else CPU-only.
20Requires: pip install kokoro (from hexgrad/kokoro)
22Public API (parent side):
23 kokoro_synthesize(text, language, voice, output_path, speed) → JSON
24 unload_kokoro() → None
25"""

from typing import Optional

from integrations.service_tools.gpu_worker import ToolWorker


# ── Worker callbacks (run in subprocess) ──────────────────────────

# Default voice — Kokoro ships multiple English voices (af_bella,
# af_sarah, af_sky, af_nicole, am_adam, am_michael, bf_emma, bf_isabella,
# bm_george, bm_lewis, ...). 'af_bella' is a clean, neutral US female
# voice that matches the default feel of Piper's Lessac high-quality
# model, so the fallback ladder stays tonally consistent when Kokoro
# takes over from chatterbox_turbo.
_DEFAULT_VOICE = 'af_bella'


def _load():
    """Load Kokoro once at subprocess startup (~3-5s).

    Uses the GPU if CUDA is available, otherwise the CPU. This is the
    right place to pay the one-off warm-up cost — subsequent synth
    calls amortise it over the life of the worker process. On a modest
    consumer CPU the warm 82M model produces ~1x real-time speech,
    which beats every realtime-capable voice-clone engine at the same
    quality level.
    """
    import torch
    try:
        from kokoro import KPipeline
    except ImportError as e:
        raise ImportError(
            "kokoro package not installed. "
            "Install with: pip install kokoro"
        ) from e

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # lang_code='a' = American English, 'b' = British English.
    # The worker is registered once with lang_code='a'; multi-accent
    # support would spawn a second worker — out of scope for the MVP.
    pipeline = KPipeline(lang_code='a', device=device)
    return {'pipeline': pipeline, 'device': device}


def _synthesize(model, req: dict) -> dict:
    """Run one synthesis request inside the worker.

    Accumulates the generator output into a single waveform and
    writes it to output_path as WAV via soundfile. Kokoro returns
    PCM at 24 kHz by default — matching the other neural engines.
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    voice = req.get('voice') or _DEFAULT_VOICE
    # Kokoro's `speed` arg is a speech-rate multiplier: 1.0 is
    # natural speed, >1 speeds up, <1 slows down. Forwarded from the
    # parent so synthesize_text(..., speed=0.9) reaches Kokoro.
    speed = float(req.get('speed') or 1.0)

    import numpy as np
    import soundfile as sf

    pipeline = model['pipeline']
    # KPipeline returns a generator of (gs, ps, audio) tuples — one
    # per sentence. Concatenate into a single numpy array so we write
    # a single WAV. KPipeline does its own sentence splitting, so
    # long prompts work without any chunking on our side.
    audio_segments = []
    sample_rate = 24000
    for _gs, _ps, audio in pipeline(text, voice=voice, speed=speed):
        if audio is None:
            continue
        # Some versions return torch tensors, others numpy arrays.
        if hasattr(audio, 'cpu'):
            audio = audio.cpu().numpy()
        audio_segments.append(audio)

    if not audio_segments:
        return {'error': 'Kokoro returned no audio'}

    full_wave = np.concatenate(audio_segments)
    sf.write(output_path, full_wave, sample_rate)

    return {
        'path': output_path,
        'duration': round(len(full_wave) / sample_rate, 2),
        'sample_rate': sample_rate,
        'engine': 'kokoro',
        'device': model['device'],
        'voice': voice,
    }
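

# Request sketch (worker side): a minimal dict `_synthesize` accepts.
# The keys mirror the `req.get(...)` calls above; everything except
# 'text' and 'output_path' is optional:
#
#     _synthesize(model, {
#         'text': 'Hello from Kokoro.',
#         'output_path': '/tmp/kokoro.wav',
#         'voice': 'am_adam',   # optional, defaults to _DEFAULT_VOICE
#         'speed': 1.0,         # optional speech-rate multiplier
#     })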


# ── Parent-side: one ToolWorker instance ─────────────────────────

_tool = ToolWorker(
    tool_name='kokoro',
    tool_module='integrations.service_tools.kokoro_tool',
    vram_budget='tts_kokoro',
    output_subdir='kokoro/output',
    engine='kokoro',
    startup_timeout=30.0,
    request_timeout=45.0,
)


def kokoro_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
    speed: float = 1.0,
) -> str:
    """Synthesize speech using Kokoro 82M (CPU or GPU subprocess).

    Args:
        text: Text to speak.
        language: ISO code — 'en' for now (Kokoro supports other
            lang_codes, but each needs its own pipeline).
        voice: Optional voice preset (e.g. 'af_bella', 'am_adam').
            Defaults to 'af_bella'.
        output_path: Where to write the WAV.
        speed: Speed multiplier passed through to KPipeline.

    Returns JSON. On subprocess crash the response contains
    `{"error": ..., "transient": true}` so the caller can fall back
    to the next engine in the English chain (piper).
    """
    return _tool.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        extra_request={'speed': speed} if speed != 1.0 else None,
    )
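

# Caller sketch (parent side). The JSON fields checked here are the
# ones documented above; the exact payload shape comes from ToolWorker,
# so treat this as illustrative rather than authoritative:
#
#     import json
#     result = json.loads(kokoro_synthesize(
#         'Hello from Kokoro.', output_path='/tmp/kokoro.wav', speed=0.9))
#     if result.get('error') and result.get('transient'):
#         ...  # fall back to piper, the next rung on the English ladder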


def unload_kokoro():
    """Stop the Kokoro worker subprocess and free its memory."""
    _tool.stop()


class KokoroTool:
    """Register Kokoro as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        from .registry import ServiceToolInfo, service_tool_registry
        tool_info = ServiceToolInfo(
            name="kokoro",
            description=(
                "Kokoro 82M: small neural English TTS. "
                "Runs at ~1x real-time on CPU, ~0.1x (roughly 10x "
                "faster than real-time) on GPU. Quality sits between "
                "Piper and the big voice-clone engines. "
                "Requires: pip install kokoro"
            ),
            base_url="inprocess://kokoro",
            endpoints={
                "synthesize": {
                    "path": "/synthesize",
                    "method": "POST",
                    "description": "Synthesize with Kokoro (English, CPU or GPU).",
                    "params_schema": {
                        "text": {"type": "string"},
                        "language": {"type": "string"},
                        "voice": {"type": "string", "description": "Voice preset name"},
                        "speed": {"type": "number"},
                    },
                },
            },
            tags=["tts", "speech", "english", "small-model", "kokoro"],
            timeout=45,
        )
        tool_info.is_healthy = True
        service_tool_registry._tools["kokoro"] = tool_info
        return True
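

# Registration sketch (assumed call site): the registry presumably
# invokes this once at startup, mirroring the other service tools:
#
#     KokoroTool.register_functions()   # returns True on success
#     # service_tool_registry._tools['kokoro'] is now populated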

# NOTE: no `if __name__ == '__main__':` block here. The centralized
# dispatcher at integrations.service_tools.gpu_worker imports this
# module and calls `_load` / `_synthesize` directly when spawned.
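#
# Worker-loop sketch: how the dispatcher presumably drives this
# module. `_load` / `_synthesize` are the real callbacks above;
# `next_request()` and `send_response()` are hypothetical stand-ins
# for gpu_worker's actual IPC transport:
#
#     model = _load()                     # once, at subprocess startup
#     while True:
#         req = next_request()            # hypothetical receive
#         send_response(_synthesize(model, req))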