Coverage for integrations/service_tools/kokoro_tool.py: 37.5% (48 statements)


1""" 

2Kokoro TTS tool — 82M-parameter English-first voice, CPU or GPU. 

3 

4Kokoro (https://huggingface.co/hexgrad/Kokoro-82M) is a tiny neural 

5TTS model that fits between Piper (fast but robotic) and the big 

6voice-clone engines (F5, Chatterbox, CosyVoice3). It runs at ~1x real 

7time on CPU with better quality than Piper's best voices, so it's the 

8right second rung on the English fallback ladder: 

9 

10 chatterbox_turbo (GPU) → kokoro (CPU/GPU) → piper (CPU bundled) 

11 

12Why it lives here instead of in Nunba: 

13 - It's a neural model; we isolate it in a subprocess the same way 

14 the other TTS engines are isolated via gpu_worker so a crash can't 

15 take down the main process. 

16 - Nunba's tts_engine.py routes to this via the shared 

17 `_SubprocessTTSBackend` adapter — no parallel in-process impl. 

18 

19VRAM: ~200MB if GPU, else CPU-only. 

20Requires: pip install kokoro (from hexgrad/kokoro) 

21 

22Public API (parent side): 

23 kokoro_synthesize(text, language, voice, output_path, speed) → JSON 

24 unload_kokoro() → None 

25""" 

from typing import Optional

from integrations.service_tools.gpu_worker import ToolWorker


# ── Worker callbacks (run in subprocess) ──────────────────────────

# Default voice — Kokoro ships multiple English voices (af_bella,
# af_sarah, af_sky, af_nicole, am_adam, am_michael, bf_emma, bf_isabella,
# bm_george, bm_lewis, ...). 'af_bella' is a clean, neutral US female
# voice that matches the default feel of Piper's Lessac high-quality
# model, so the fallback ladder stays tonally consistent when Kokoro
# takes over from chatterbox_turbo.
_DEFAULT_VOICE = 'af_bella'


def _load():
    """Load Kokoro once at subprocess startup (~3-5s).

    Uses the GPU if CUDA is available, otherwise CPU. This is the right
    place to burn the one warm-up cost — subsequent synth calls amortise
    it over the life of the worker process. On a modest consumer CPU the
    warm 82M model produces ~1x real-time speech, which beats every
    realtime-capable voice-clone engine at the same quality level.
    """
    import torch
    try:
        from kokoro import KPipeline
    except ImportError as e:
        raise ImportError(
            "kokoro package not installed. "
            "Install with: pip install kokoro"
        ) from e

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # lang_code='a' = American English, 'b' = British English.
    # The worker is registered once with lang_code='a'; multi-accent
    # support would spawn a second worker — out of scope for MVP.
    pipeline = KPipeline(lang_code='a', device=device)
    return {'pipeline': pipeline, 'device': device}


def _synthesize(model, req: dict) -> dict:
    """Run one synthesis request inside the worker.

    Accumulates the generator output into a single waveform and writes
    it to output_path as WAV via soundfile. Kokoro returns PCM at
    24 kHz by default — matching the other neural engines.
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    voice = req.get('voice') or _DEFAULT_VOICE
    # Kokoro's `speed` arg is a speech-rate multiplier: 1.0 is natural
    # speed, >1 speeds up, <1 slows down. Forward it from the parent so
    # synthesize_text(..., speed=0.9) reaches Kokoro.
    speed = float(req.get('speed') or 1.0)

    import numpy as np
    import soundfile as sf

    pipeline = model['pipeline']
    # KPipeline returns a generator of (gs, ps, audio) tuples — one
    # per sentence. Concatenate into a single numpy array so we write
    # a single WAV. KPipeline itself does sentence splitting, so
    # long prompts work without us doing our own chunking.
    audio_segments = []
    sample_rate = 24000
    for _gs, _ps, audio in pipeline(text, voice=voice, speed=speed):
        if audio is None:
            continue
        # Some versions return torch tensors, others numpy arrays.
        if hasattr(audio, 'cpu'):
            audio = audio.cpu().numpy()
        audio_segments.append(audio)

    if not audio_segments:
        return {'error': 'Kokoro returned no audio'}

    full_wave = np.concatenate(audio_segments)
    sf.write(output_path, full_wave, sample_rate)

    return {
        'path': output_path,
        'duration': round(len(full_wave) / sample_rate, 2),
        'sample_rate': sample_rate,
        'engine': 'kokoro',
        'device': model['device'],
        'voice': voice,
    }
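
# Worker-side contract, roughly (a sketch; per the NOTE at the bottom of
# this module, the gpu_worker dispatcher imports this module and calls
# _load / _synthesize directly — the exact call sites live there, and the
# output path below is only an example):
#
#   model = _load()                                    # once at startup
#   resp = _synthesize(model, {
#       'text': 'Testing one two three.',
#       'output_path': '/tmp/kokoro_worker_demo.wav',
#       'voice': 'af_bella',
#       'speed': 1.0,
#   })
#   # resp is either {'error': ...} or the metadata dict returned above.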


# ── Parent-side: one ToolWorker instance ─────────────────────────

_tool = ToolWorker(
    tool_name='kokoro',
    tool_module='integrations.service_tools.kokoro_tool',
    vram_budget='tts_kokoro',
    output_subdir='kokoro/output',
    engine='kokoro',
    startup_timeout=30.0,
    request_timeout=45.0,
)


def kokoro_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
    speed: float = 1.0,
) -> str:
    """Synthesize speech using Kokoro 82M (CPU or GPU subprocess).

    Args:
        text: Text to speak.
        language: ISO code — 'en' for now (Kokoro supports other
            lang_codes but each needs its own pipeline).
        voice: Optional voice preset (e.g. 'af_bella', 'am_adam').
            Defaults to 'af_bella'.
        output_path: Where to write the WAV.
        speed: Speed multiplier passed through to KPipeline.

    Returns JSON. On subprocess crash the response contains
    `{"error": ..., "transient": true}` so the caller can fall back
    to the next engine in the English chain (piper).
    """
    return _tool.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        extra_request={'speed': speed} if speed != 1.0 else None,
    )
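
# Fallback sketch (illustrative; the real routing lives in Nunba's
# tts_engine.py, and piper_synthesize is only a stand-in name for whatever
# the next rung in the English chain exposes):
#
#   import json
#   result = json.loads(kokoro_synthesize(
#       "Hi there.", output_path="/tmp/say.wav"))
#   if result.get("error") and result.get("transient"):
#       result = json.loads(piper_synthesize(
#           "Hi there.", output_path="/tmp/say.wav"))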


def unload_kokoro():
    """Stop the Kokoro worker subprocess and free its memory."""
    _tool.stop()


class KokoroTool:
    """Register Kokoro as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        from .registry import ServiceToolInfo, service_tool_registry
        tool_info = ServiceToolInfo(
            name="kokoro",
            description=(
                "Kokoro 82M: small neural English TTS. "
                "Runs on CPU at ~1x real-time or GPU at ~0.1x real-time. "
                "Quality sits between Piper and the big voice-clone engines. "
                "Requires: pip install kokoro"
            ),
            base_url="inprocess://kokoro",
            endpoints={
                "synthesize": {
                    "path": "/synthesize",
                    "method": "POST",
                    "description": "Synthesize with Kokoro (English, CPU or GPU).",
                    "params_schema": {
                        "text": {"type": "string"},
                        "language": {"type": "string"},
                        "voice": {"type": "string", "description": "Voice preset name"},
                        "speed": {"type": "number"},
                    },
                },
            },
            tags=["tts", "speech", "english", "small-model", "kokoro"],
            timeout=45,
        )
        tool_info.is_healthy = True
        service_tool_registry._tools["kokoro"] = tool_info
        return True
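
# Registration sketch (illustrative; where register_functions() actually
# gets called is up to the registry bootstrap, outside this module, and the
# absolute import path below is inferred from the relative import above):
#
#   KokoroTool.register_functions()
#   from integrations.service_tools.registry import service_tool_registry
#   assert service_tool_registry._tools["kokoro"].is_healthy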


# NOTE: no `if __name__ == '__main__':` block here. The centralized
# dispatcher at integrations.service_tools.gpu_worker imports this
# module and calls `_load` / `_synthesize` directly when spawned.