Coverage for integrations / service_tools / omnivoice_tool.py: 41.1%

107 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2OmniVoice TTS tool — 646 languages, zero-shot voice cloning (GPU). 

3 

4Backbone: Qwen3-0.6B + diffusion decoder. Trained on 581k hours spanning 

5646 languages — covers every Indic script with substantially more hours 

6per language than Indic-Parler (e.g. Tamil 423h vs parler's ~10h), and 

7also covers English, Mandarin, Japanese, Korean, European, Arabic, 

8African, and low-resource tongues. Apache 2.0. 

9 

10VRAM: stubbed at 3.0 GB until first real load. vram_manager's 

11record_actual_usage() auto-tightens the budget from the worker's 

12'__WORKER_VRAM_GB__' telemetry marker on startup. 

13 

14Requires: pip install omnivoice torch soundfile 

15Model: HF hub 'k2-fsa/OmniVoice' (~1.5 GB safetensors) 

16 

17Public API (parent): 

18 omnivoice_synthesize(text, language, voice, output_path) -> JSON 

19 unload_omnivoice() -> None 

20 

21Worker entry (via dispatcher): 

22 python -m integrations.service_tools.gpu_worker \\ 

23 integrations.service_tools.omnivoice_tool 

24 

25SUBPROCESS ISOLATED: model + tokenizer live in the worker. Parent 

26forwards requests through ToolWorker. 

27""" 

28 

29from __future__ import annotations 

30 

31import os 

32import re 

33from typing import Optional 

34 

35from integrations.service_tools.gpu_worker import ToolWorker 

36 

# OmniVoice outputs at 24 kHz (matches every other neural TTS in this
# registry — simplifies audio concatenation in the Nunba chat pipeline).
SAMPLE_RATE = 24000

# Hugging Face repo id — 646-language 0.6B checkpoint (~1.5 GB safetensors,
# downloaded by _load() on first use).
HF_MODEL_ID = 'k2-fsa/OmniVoice'

# Sentence-chunking thresholds — diffusion decoders trail off at long
# contexts; chunk + concat with a short gap for clean prosody.
_INTER_SENTENCE_GAP_S = 0.12  # silence inserted between sentence chunks (seconds)
_END_PAD_S = 0.4  # trailing silence appended to every clip (seconds)
_PEAK_TARGET_DB = -1.0  # peak-normalisation target in dBFS (applied in _synthesize)
_SPLIT_THRESHOLD_CHARS = 120  # only split texts longer than this — OmniVoice handles longer spans than parler
_MIN_CHUNK_CHARS = 20  # a chunk shorter than this keeps absorbing the following fragment
_TAIL_MERGE_CHARS = 15  # a final chunk shorter than this is merged back into the previous one

# Extensions we recognise as reference audio file paths (voice cloning);
# anything else passed in `voice` is treated as a free-form descriptor
# (passed to the model's `instruct` argument for voice design).
_AUDIO_SUFFIXES = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')

57 

58 

59# ─── Helpers ──────────────────────────────────────────────────────────── 

60 

61def _split_sentences(text: str) -> list: 

62 """Split at Latin + Indic sentence boundaries, protect '...', merge 

63 shorts. Same logic as indic_parler_tool._split_sentences — duplicated 

64 here deliberately to keep worker modules self-contained (no shared 

65 runtime state between sibling TTS workers).""" 

66 protected = text.replace('...', '\x00ELLIPSIS\x00') 

67 parts = re.split(r'(?<=[^\.\s])[.?!।৷]\s+', protected) 

68 parts = [p.replace('\x00ELLIPSIS\x00', '...') for p in parts] 

69 merged = [] 

70 for p in parts: 

71 p = p.strip() 

72 if not p: 

73 continue 

74 if merged and len(merged[-1]) < _MIN_CHUNK_CHARS: 

75 merged[-1] = merged[-1] + ' ' + p 

76 else: 

77 merged.append(p) 

78 if len(merged) > 1 and len(merged[-1]) < _TAIL_MERGE_CHARS: 

79 merged[-2] = merged[-2] + ' ' + merged[-1] 

80 merged.pop() 

81 return merged if len(merged) > 1 else [text] 

82 

83 

84def _is_audio_path(voice: Optional[str]) -> bool: 

85 if not voice: 

86 return False 

87 v = voice.strip() 

88 if v.lower().endswith(_AUDIO_SUFFIXES): 

89 return True 

90 # Windows-absolute (C:\..) or POSIX-absolute (/..) that exists on disk 

91 if os.path.isabs(v) and os.path.isfile(v): 

92 return True 

93 return False 

94 

95 

96# ─── Worker-side callbacks (run in subprocess) ────────────────────────── 

97 

98def _load(): 

99 """Load OmniVoice on GPU (fp16). 

100 

101 If the official `omnivoice` package isn't installed we fall back to 

102 a clear error — the worker exits with code 2 and the parent returns 

103 'insufficient compute' so the router demotes to the next engine in 

104 LANG_ENGINE_PREFERENCE (indic_parler for Indic, chatterbox_ml for 

105 others, espeak as final fallback). 

106 """ 

107 import torch 

108 try: 

109 from omnivoice import OmniVoice # type: ignore 

110 except ImportError as e: 

111 raise ImportError( 

112 "omnivoice package not installed. " 

113 "Install with: pip install omnivoice" 

114 ) from e 

115 

116 model = OmniVoice.from_pretrained( 

117 HF_MODEL_ID, 

118 device_map='cuda:0', 

119 dtype=torch.float16, 

120 ) 

121 return { 

122 'model': model, 

123 'sample_rate': SAMPLE_RATE, 

124 } 

125 

126 

127def _generate_chunk(state: dict, text: str, voice: Optional[str]): 

128 """One chunk through OmniVoice. Returns np.ndarray (float32).""" 

129 model = state['model'] 

130 kwargs = {'text': text} 

131 if voice: 

132 if _is_audio_path(voice): 

133 kwargs['ref_audio'] = voice 

134 # OmniVoice auto-transcribes the reference if ref_text is 

135 # omitted — wrapped in try/except inside model.generate. 

136 kwargs['ref_text'] = '' 

137 else: 

138 # Free-form speaker descriptor → voice-design path 

139 kwargs['instruct'] = voice 

140 

141 audio = model.generate(**kwargs) 

142 # OmniVoice returns a list of np.ndarray; take the first clip 

143 if isinstance(audio, (list, tuple)): 

144 audio = audio[0] 

145 import numpy as np 

146 if hasattr(audio, 'detach'): 

147 audio = audio.detach().cpu().float().numpy() 

148 return np.asarray(audio, dtype='float32').squeeze() 

149 

150 

def _synthesize(state, req: dict) -> dict:
    """Worker-side synthesis: chunk → generate → concat → normalise → write.

    req keys:
        text (str, required), output_path (str, required),
        language (str, metadata only — carried through to the result),
        voice (str | None, see _is_audio_path for semantics).

    Returns a result dict (path/duration/sample_rate/engine/...) or
    {'error': ...} when required fields are missing.

    Fix vs previous revision: _generate_chunk ends with .squeeze(), so a
    degenerate single-sample clip comes back as a 0-d array — len() on it
    raises TypeError and np.concatenate rejects it. All chunk handling now
    goes through np.atleast_1d and uses .size instead of len().
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}
    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    import numpy as np
    import soundfile as sf

    language = req.get('language', 'en')
    voice = req.get('voice')
    sr = state['sample_rate']

    # Only split long inputs — diffusion decoders trail off at long
    # contexts (see _SPLIT_THRESHOLD_CHARS).
    if len(text) > _SPLIT_THRESHOLD_CHARS:
        sentences = _split_sentences(text)
    else:
        sentences = [text]

    if len(sentences) == 1:
        audio = np.atleast_1d(_generate_chunk(state, text, voice))
    else:
        gap = np.zeros(int(sr * _INTER_SENTENCE_GAP_S), dtype=np.float32)
        chunks = []
        for i, sent in enumerate(sentences):
            chunk_audio = _generate_chunk(state, sent, voice)
            if chunk_audio is None:
                continue
            chunk_audio = np.atleast_1d(chunk_audio)
            if chunk_audio.size > 0:
                chunks.append(chunk_audio)
                # Short silence between sentences, but not after the last.
                if i < len(sentences) - 1:
                    chunks.append(gap)
        audio = np.concatenate(chunks) if chunks else np.zeros(1, dtype=np.float32)

    # Trailing pad so playback doesn't clip the final phoneme.
    end_pad = np.zeros(int(sr * _END_PAD_S), dtype=np.float32)
    audio = np.concatenate([audio, end_pad])

    # Peak-normalise to _PEAK_TARGET_DB dBFS; skip pure silence.
    peak = float(np.abs(audio).max())
    if peak > 0:
        target_peak = 10 ** (_PEAK_TARGET_DB / 20.0)
        audio = audio * (target_peak / peak)

    sf.write(output_path, audio, sr)

    return {
        'path': output_path,
        'duration': round(len(audio) / sr, 2),
        'sample_rate': sr,
        'engine': 'omnivoice',
        'device': 'cuda',
        'language': language,
        'voice': voice or 'default',
    }

203 

204 

205# ─── Parent-side: ToolWorker instance ─────────────────────────────────── 

206 

# Parent-side proxy: spawns and talks to the GPU worker subprocess that
# hosts the model. The worker imports this module via the dispatcher and
# calls _load/_synthesize (see NOTE at end of file).
_tool = ToolWorker(
    tool_name='omnivoice',
    tool_module='integrations.service_tools.omnivoice_tool',
    vram_budget='tts_omnivoice',  # budget key — stubbed at 3.0 GB per module docstring
    output_subdir='omnivoice/output',
    engine='omnivoice',
    startup_timeout=180.0,  # first run downloads ~1.5 GB checkpoint
    request_timeout=120.0,  # per-request cap, seconds
)

216 

217 

def omnivoice_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize with OmniVoice (646 languages, zero-shot clone, GPU).

    `voice` semantics:
      - Path to a .wav/.mp3/.flac reference → voice cloning (ref_audio)
      - Free-form descriptor (e.g. "female, low pitch, british accent")
        → voice design (instruct)
      - None / 'default' → default speaker

    Language is auto-detected by the model from the input text; the
    `language` argument is carried through as metadata for logging and
    the downstream router.

    Returns the worker's JSON result (forwarded by ToolWorker).
    """
    request = dict(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        default_sample_rate=SAMPLE_RATE,
    )
    return _tool.synthesize(**request)

243 

244 

def unload_omnivoice():
    """Stop the OmniVoice worker subprocess and free its VRAM.

    Delegates to ToolWorker.stop() on the module-level _tool instance.
    """
    _tool.stop()

248 

249 

class OmniVoiceTool:
    """Register OmniVoice as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Build the ServiceToolInfo for OmniVoice and insert it into the
        service tool registry. Returns True."""
        from .registry import ServiceToolInfo, service_tool_registry

        voice_param = {
            "type": "string",
            "description": (
                "Reference audio path (.wav/.mp3/.flac) "
                "for cloning, OR a descriptor string "
                "for voice design"
            ),
        }
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": (
                "Synthesize with OmniVoice (646 langs, voice "
                "cloning, GPU)."
            ),
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
                "voice": voice_param,
            },
        }

        info = ServiceToolInfo(
            name="omnivoice",
            description=(
                "OmniVoice TTS: 646 languages (every Indic script, "
                "zh/ja/ko, European, Arabic, low-resource). "
                "Zero-shot voice cloning from 3-10 s reference. "
                "Qwen3-0.6B + diffusion (Apache 2.0). ~2-3 GB VRAM. "
                "Requires: pip install omnivoice"
            ),
            base_url="inprocess://omnivoice",
            endpoints={"synthesize": synthesize_endpoint},
            tags=[
                "tts", "speech", "gpu", "multilingual", "universal",
                "voice-clone", "indic",
            ],
            timeout=60,
        )
        # Mark healthy up-front and write straight into the registry's
        # private table (no health probe for in-process tools).
        info.is_healthy = True
        service_tool_registry._tools["omnivoice"] = info
        return True

297 

298# NOTE: no `if __name__ == '__main__':` block — the centralized 

299# dispatcher (gpu_worker) imports this module and calls _load/_synthesize.