Coverage for integrations / service_tools / omnivoice_tool.py: 41.1%

107 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2OmniVoice TTS tool — 646 languages, zero-shot voice cloning (GPU). 

3 

4Backbone: Qwen3-0.6B + diffusion decoder. Trained on 581k hours spanning 

5646 languages — covers every Indic script with substantially more hours 

6per language than Indic-Parler (e.g. Tamil 423h vs parler's ~10h), and 

7also covers English, Mandarin, Japanese, Korean, European, Arabic, 

8African, and low-resource tongues. Apache 2.0. 

9 

10VRAM: stubbed at 3.0 GB until first real load. vram_manager's 

11record_actual_usage() auto-tightens the budget from the worker's 

12'__WORKER_VRAM_GB__' telemetry marker on startup. 

13 

14Requires: pip install omnivoice torch soundfile 

15Model: HF hub 'k2-fsa/OmniVoice' (~1.5 GB safetensors) 

16 

17Public API (parent): 

18 omnivoice_synthesize(text, language, voice, output_path) -> JSON 

19 unload_omnivoice() -> None 

20 

21Worker entry (via dispatcher): 

22 python -m integrations.service_tools.gpu_worker \\ 

23 integrations.service_tools.omnivoice_tool 

24 

25SUBPROCESS ISOLATED: model + tokenizer live in the worker. Parent 

26forwards requests through ToolWorker. 

27""" 

28 

29from __future__ import annotations 

30 

31import os 

32import re 

33from typing import Optional 

34 

35from integrations.service_tools.gpu_worker import ToolWorker 

36 

# OmniVoice outputs at 24 kHz (matches every other neural TTS in this
# registry — simplifies audio concatenation in the Nunba chat pipeline).
SAMPLE_RATE = 24000

# Hugging Face repo id — 646-language 0.6B checkpoint (~1.5 GB safetensors,
# downloaded by _load() on first use).
HF_MODEL_ID = 'k2-fsa/OmniVoice'

# Sentence-chunking thresholds — diffusion decoders trail off at long
# contexts; chunk + concat with a short gap for clean prosody.
_INTER_SENTENCE_GAP_S = 0.12  # silence inserted between sentence chunks (seconds)
_END_PAD_S = 0.4  # trailing silence appended to every clip (seconds)
_PEAK_TARGET_DB = -1.0  # peak-normalisation target in dBFS (applied in _synthesize)
_SPLIT_THRESHOLD_CHARS = 120  # only split texts longer than this — OmniVoice handles longer spans than parler
_MIN_CHUNK_CHARS = 20  # a chunk shorter than this keeps absorbing the following fragment
_TAIL_MERGE_CHARS = 15  # a final chunk shorter than this is merged back into the previous one

# Extensions we recognise as reference audio file paths (voice cloning);
# anything else passed in `voice` is treated as a free-form descriptor
# (passed to the model's `instruct` argument for voice design).
_AUDIO_SUFFIXES = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')

57 

58 

59# ─── Helpers ──────────────────────────────────────────────────────────── 

60 

61def _split_sentences(text: str) -> list: 

62 """Split at Latin + Indic sentence boundaries, protect '...', merge 

63 shorts. Same logic as indic_parler_tool._split_sentences — duplicated 

64 here deliberately to keep worker modules self-contained (no shared 

65 runtime state between sibling TTS workers).""" 

66 protected = text.replace('...', '\x00ELLIPSIS\x00') 

67 parts = re.split(r'(?<=[^\.\s])[.?!।৷]\s+', protected) 

68 parts = [p.replace('\x00ELLIPSIS\x00', '...') for p in parts] 

69 merged = [] 

70 for p in parts: 

71 p = p.strip() 

72 if not p: 

73 continue 

74 if merged and len(merged[-1]) < _MIN_CHUNK_CHARS: 

75 merged[-1] = merged[-1] + ' ' + p 

76 else: 

77 merged.append(p) 

78 if len(merged) > 1 and len(merged[-1]) < _TAIL_MERGE_CHARS: 

79 merged[-2] = merged[-2] + ' ' + merged[-1] 

80 merged.pop() 

81 return merged if len(merged) > 1 else [text] 

82 

83 

84def _is_audio_path(voice: Optional[str]) -> bool: 

85 if not voice: 

86 return False 

87 v = voice.strip() 

88 if v.lower().endswith(_AUDIO_SUFFIXES): 

89 return True 

90 # Windows-absolute (C:\..) or POSIX-absolute (/..) that exists on disk 

91 if os.path.isabs(v) and os.path.isfile(v): 

92 return True 

93 return False 

94 

95 

96# ─── Worker-side callbacks (run in subprocess) ────────────────────────── 

97 

98def _load(): 

99 """Load OmniVoice on GPU (fp16). 

100 

101 If the official `omnivoice` package isn't installed we fall back to 

102 a clear error — the worker exits with code 2 and the parent returns 

103 'insufficient compute' so the router demotes to the next engine in 

104 LANG_ENGINE_PREFERENCE (indic_parler for Indic, chatterbox_ml for 

105 others, espeak as final fallback). 

106 """ 

107 import torch 

108 try: 

109 from omnivoice import OmniVoice # type: ignore 

110 except ImportError as e: 

111 raise ImportError( 

112 "omnivoice package not installed. " 

113 "Install with: pip install omnivoice" 

114 ) from e 

115 

116 model = OmniVoice.from_pretrained( 

117 HF_MODEL_ID, 

118 device_map='cuda:0', 

119 dtype=torch.float16, 

120 ) 

121 return { 

122 'model': model, 

123 'sample_rate': SAMPLE_RATE, 

124 } 

125 

126 

127def _generate_chunk(state: dict, text: str, voice: Optional[str]): 

128 """One chunk through OmniVoice. Returns np.ndarray (float32).""" 

129 model = state['model'] 

130 kwargs = {'text': text} 

131 if voice: 

132 if _is_audio_path(voice): 

133 kwargs['ref_audio'] = voice 

134 # OmniVoice auto-transcribes the reference if ref_text is 

135 # omitted — wrapped in try/except inside model.generate. 

136 kwargs['ref_text'] = '' 

137 else: 

138 # Free-form speaker descriptor → voice-design path 

139 kwargs['instruct'] = voice 

140 

141 audio = model.generate(**kwargs) 

142 # OmniVoice returns a list of np.ndarray; take the first clip 

143 if isinstance(audio, (list, tuple)): 

144 audio = audio[0] 

145 import numpy as np 

146 if hasattr(audio, 'detach'): 

147 audio = audio.detach().cpu().float().numpy() 

148 return np.asarray(audio, dtype='float32').squeeze() 

149 

150 

def _synthesize(state, req: dict) -> dict:
    """Worker-side synthesis: chunk → generate → concat → normalise → write.

    req keys:
        text (str, required), output_path (str, required),
        language (str, metadata only — carried through to the result),
        voice (str | None, see _is_audio_path for semantics).

    Returns a result dict (path/duration/sample_rate/engine/...) or
    {'error': ...} when required fields are missing.

    Fix vs previous revision: _generate_chunk ends with .squeeze(), so a
    degenerate single-sample clip comes back as a 0-d array — len() on it
    raises TypeError and np.concatenate rejects it. All chunk handling now
    goes through np.atleast_1d and uses .size instead of len().
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}
    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    import numpy as np
    import soundfile as sf

    language = req.get('language', 'en')
    voice = req.get('voice')
    sr = state['sample_rate']

    # Only split long inputs — diffusion decoders trail off at long
    # contexts (see _SPLIT_THRESHOLD_CHARS).
    if len(text) > _SPLIT_THRESHOLD_CHARS:
        sentences = _split_sentences(text)
    else:
        sentences = [text]

    if len(sentences) == 1:
        audio = np.atleast_1d(_generate_chunk(state, text, voice))
    else:
        gap = np.zeros(int(sr * _INTER_SENTENCE_GAP_S), dtype=np.float32)
        chunks = []
        for i, sent in enumerate(sentences):
            chunk_audio = _generate_chunk(state, sent, voice)
            if chunk_audio is None:
                continue
            chunk_audio = np.atleast_1d(chunk_audio)
            if chunk_audio.size > 0:
                chunks.append(chunk_audio)
                # Short silence between sentences, but not after the last.
                if i < len(sentences) - 1:
                    chunks.append(gap)
        audio = np.concatenate(chunks) if chunks else np.zeros(1, dtype=np.float32)

    # Trailing pad so playback doesn't clip the final phoneme.
    end_pad = np.zeros(int(sr * _END_PAD_S), dtype=np.float32)
    audio = np.concatenate([audio, end_pad])

    # Peak-normalise to _PEAK_TARGET_DB dBFS; skip pure silence.
    peak = float(np.abs(audio).max())
    if peak > 0:
        target_peak = 10 ** (_PEAK_TARGET_DB / 20.0)
        audio = audio * (target_peak / peak)

    sf.write(output_path, audio, sr)

    return {
        'path': output_path,
        'duration': round(len(audio) / sr, 2),
        'sample_rate': sr,
        'engine': 'omnivoice',
        'device': 'cuda',
        'language': language,
        'voice': voice or 'default',
    }

203 

204 

205# ─── Parent-side: ToolWorker instance ─────────────────────────────────── 

206 

# Parent-side proxy: spawns and talks to the GPU worker subprocess that
# hosts the model. The worker imports this module via the dispatcher and
# calls _load/_synthesize (see NOTE at end of file).
_tool = ToolWorker(
    tool_name='omnivoice',
    tool_module='integrations.service_tools.omnivoice_tool',
    vram_budget='tts_omnivoice',  # budget key — stubbed at 3.0 GB per module docstring
    output_subdir='omnivoice/output',
    engine='omnivoice',
    startup_timeout=180.0,  # first run downloads ~1.5 GB checkpoint
    request_timeout=120.0,  # per-request cap, seconds
)

216 

217 

def omnivoice_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize with OmniVoice (646 languages, zero-shot clone, GPU).

    `voice` semantics:
      - Path to a .wav/.mp3/.flac reference → voice cloning (ref_audio)
      - Free-form descriptor (e.g. "female, low pitch, british accent")
        → voice design (instruct)
      - None / 'default' → default speaker

    Language is auto-detected by the model from the input text; the
    `language` argument is carried through as metadata for logging and
    the downstream router.

    Returns the worker's JSON result (forwarded by ToolWorker).
    """
    request = dict(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        default_sample_rate=SAMPLE_RATE,
    )
    return _tool.synthesize(**request)

243 

244 

def unload_omnivoice():
    """Stop the OmniVoice worker subprocess and free its VRAM.

    Delegates to ToolWorker.stop() on the module-level _tool instance.
    """
    _tool.stop()

248 

249 

class OmniVoiceTool:
    """Register OmniVoice as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Build the ServiceToolInfo for OmniVoice and insert it into the
        service tool registry. Returns True."""
        from .registry import ServiceToolInfo, service_tool_registry

        voice_param = {
            "type": "string",
            "description": (
                "Reference audio path (.wav/.mp3/.flac) "
                "for cloning, OR a descriptor string "
                "for voice design"
            ),
        }
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": (
                "Synthesize with OmniVoice (646 langs, voice "
                "cloning, GPU)."
            ),
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
                "voice": voice_param,
            },
        }

        info = ServiceToolInfo(
            name="omnivoice",
            description=(
                "OmniVoice TTS: 646 languages (every Indic script, "
                "zh/ja/ko, European, Arabic, low-resource). "
                "Zero-shot voice cloning from 3-10 s reference. "
                "Qwen3-0.6B + diffusion (Apache 2.0). ~2-3 GB VRAM. "
                "Requires: pip install omnivoice"
            ),
            base_url="inprocess://omnivoice",
            endpoints={"synthesize": synthesize_endpoint},
            tags=[
                "tts", "speech", "gpu", "multilingual", "universal",
                "voice-clone", "indic",
            ],
            timeout=60,
        )
        # Mark healthy up-front and write straight into the registry's
        # private table (no health probe for in-process tools).
        info.is_healthy = True
        service_tool_registry._tools["omnivoice"] = info
        return True

297 

298# NOTE: no `if __name__ == '__main__':` block — the centralized 

299# dispatcher (gpu_worker) imports this module and calls _load/_synthesize.