Coverage for integrations/service_tools/kokoro_tool.py: 37.5% (48 statements)


1""" 

2Kokoro TTS tool — 82M-parameter English-first voice, CPU or GPU. 

3 

4Kokoro (https://huggingface.co/hexgrad/Kokoro-82M) is a tiny neural 

5TTS model that fits between Piper (fast but robotic) and the big 

6voice-clone engines (F5, Chatterbox, CosyVoice3). It runs at ~1x real 

7time on CPU with better quality than Piper's best voices, so it's the 

8right second rung on the English fallback ladder: 

9 

10 chatterbox_turbo (GPU) → kokoro (CPU/GPU) → piper (CPU bundled) 

11 

12Why it lives here instead of in Nunba: 

13 - It's a neural model; we isolate it in a subprocess the same way 

14 the other TTS engines are isolated via gpu_worker so a crash can't 

15 take down the main process. 

16 - Nunba's tts_engine.py routes to this via the shared 

17 `_SubprocessTTSBackend` adapter — no parallel in-process impl. 

18 

19VRAM: ~200MB if GPU, else CPU-only. 

20Requires: pip install kokoro (from hexgrad/kokoro) 

21 

22Public API (parent side): 

23 kokoro_synthesize(text, language, voice, output_path, speed) → JSON 

24 unload_kokoro() → None 

25""" 

from typing import Optional

from integrations.service_tools.gpu_worker import ToolWorker


# ── Worker callbacks (run in subprocess) ──────────────────────────

# Default voice — Kokoro ships multiple English voices (af_bella,
# af_sarah, af_sky, af_nicole, am_adam, am_michael, bf_emma, bf_isabella,
# bm_george, bm_lewis, ...). 'af_bella' is a clean, neutral US female
# voice that matches the default feel of Piper's Lessac high-quality
# model, so the fallback ladder stays tonally consistent when Kokoro
# takes over from chatterbox_turbo.
_DEFAULT_VOICE = 'af_bella'


def _load():
    """Load Kokoro once at subprocess startup (~3-5s).

    Uses the GPU if CUDA is available, otherwise CPU. This is the right
    place to burn the one warm-up cost — subsequent synth calls amortise
    it over the life of the worker process. On a modest consumer CPU the
    warm 82M model produces ~1x real-time speech, which beats every
    realtime-capable voice-clone engine at the same quality level.
    """
    import torch
    try:
        from kokoro import KPipeline
    except ImportError as e:
        raise ImportError(
            "kokoro package not installed. "
            "Install with: pip install kokoro"
        ) from e

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # lang_code='a' = American English, 'b' = British English.
    # The worker is registered once with lang_code='a'; multi-accent
    # support would spawn a second worker — out of scope for MVP.
    pipeline = KPipeline(lang_code='a', device=device)
    return {'pipeline': pipeline, 'device': device}


def _synthesize(model, req: dict) -> dict:
    """Run one synthesis request inside the worker.

    Accumulates the generator output into a single waveform and writes
    it to output_path as WAV via soundfile. Kokoro returns PCM at
    24 kHz by default — matching the other neural engines.
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}

    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    voice = req.get('voice') or _DEFAULT_VOICE
    # Kokoro's `speed` arg is a speech-rate multiplier: 1.0 is natural
    # speed, >1 speeds up, <1 slows down. Forward it from the parent so
    # synthesize_text(..., speed=0.9) reaches Kokoro.
    speed = float(req.get('speed') or 1.0)

    import numpy as np
    import soundfile as sf

    pipeline = model['pipeline']
    # KPipeline returns a generator of (gs, ps, audio) tuples — one
    # per sentence. Concatenate into a single numpy array so we write
    # a single WAV. KPipeline itself does sentence splitting, so
    # long prompts work without us doing our own chunking.
    audio_segments = []
    sample_rate = 24000
    for _gs, _ps, audio in pipeline(text, voice=voice, speed=speed):
        if audio is None:
            continue
        # Some versions return torch tensors, others numpy arrays.
        if hasattr(audio, 'cpu'):
            audio = audio.cpu().numpy()
        audio_segments.append(audio)

    if not audio_segments:
        return {'error': 'Kokoro returned no audio'}

    full_wave = np.concatenate(audio_segments)
    sf.write(output_path, full_wave, sample_rate)

    return {
        'path': output_path,
        'duration': round(len(full_wave) / sample_rate, 2),
        'sample_rate': sample_rate,
        'engine': 'kokoro',
        'device': model['device'],
        'voice': voice,
    }
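
# Worker-side contract, roughly (a sketch; per the NOTE at the bottom of
# this module, the gpu_worker dispatcher imports this module and calls
# _load / _synthesize directly — the exact call sites live there, and the
# output path below is only an example):
#
#   model = _load()                                    # once at startup
#   resp = _synthesize(model, {
#       'text': 'Testing one two three.',
#       'output_path': '/tmp/kokoro_worker_demo.wav',
#       'voice': 'af_bella',
#       'speed': 1.0,
#   })
#   # resp is either {'error': ...} or the metadata dict returned above.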


# ── Parent-side: one ToolWorker instance ─────────────────────────

_tool = ToolWorker(
    tool_name='kokoro',
    tool_module='integrations.service_tools.kokoro_tool',
    vram_budget='tts_kokoro',
    output_subdir='kokoro/output',
    engine='kokoro',
    startup_timeout=30.0,
    request_timeout=45.0,
)


def kokoro_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
    speed: float = 1.0,
) -> str:
    """Synthesize speech using Kokoro 82M (CPU or GPU subprocess).

    Args:
        text: Text to speak.
        language: ISO code — 'en' for now (Kokoro supports other
            lang_codes but each needs its own pipeline).
        voice: Optional voice preset (e.g. 'af_bella', 'am_adam').
            Defaults to 'af_bella'.
        output_path: Where to write the WAV.
        speed: Speed multiplier passed through to KPipeline.

    Returns JSON. On subprocess crash the response contains
    `{"error": ..., "transient": true}` so the caller can fall back
    to the next engine in the English chain (piper).
    """
    return _tool.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        extra_request={'speed': speed} if speed != 1.0 else None,
    )
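
# Fallback sketch (illustrative; the real routing lives in Nunba's
# tts_engine.py, and piper_synthesize is only a stand-in name for whatever
# the next rung in the English chain exposes):
#
#   import json
#   result = json.loads(kokoro_synthesize(
#       "Hi there.", output_path="/tmp/say.wav"))
#   if result.get("error") and result.get("transient"):
#       result = json.loads(piper_synthesize(
#           "Hi there.", output_path="/tmp/say.wav"))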


def unload_kokoro():
    """Stop the Kokoro worker subprocess and free its memory."""
    _tool.stop()


class KokoroTool:
    """Register Kokoro as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        from .registry import ServiceToolInfo, service_tool_registry
        tool_info = ServiceToolInfo(
            name="kokoro",
            description=(
                "Kokoro 82M: small neural English TTS. "
                "Runs on CPU at ~1x real-time or GPU at ~0.1x real-time. "
                "Quality sits between Piper and the big voice-clone engines. "
                "Requires: pip install kokoro"
            ),
            base_url="inprocess://kokoro",
            endpoints={
                "synthesize": {
                    "path": "/synthesize",
                    "method": "POST",
                    "description": "Synthesize with Kokoro (English, CPU or GPU).",
                    "params_schema": {
                        "text": {"type": "string"},
                        "language": {"type": "string"},
                        "voice": {"type": "string", "description": "Voice preset name"},
                        "speed": {"type": "number"},
                    },
                },
            },
            tags=["tts", "speech", "english", "small-model", "kokoro"],
            timeout=45,
        )
        tool_info.is_healthy = True
        service_tool_registry._tools["kokoro"] = tool_info
        return True
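
# Registration sketch (illustrative; where register_functions() actually
# gets called is up to the registry bootstrap, outside this module, and the
# absolute import path below is inferred from the relative import above):
#
#   KokoroTool.register_functions()
#   from integrations.service_tools.registry import service_tool_registry
#   assert service_tool_registry._tools["kokoro"].is_healthy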


# NOTE: no `if __name__ == '__main__':` block here. The centralized
# dispatcher at integrations.service_tools.gpu_worker imports this
# module and calls `_load` / `_synthesize` directly when spawned.