Coverage for integrations / service_tools / omnivoice_tool.py: 41.1%
107 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2OmniVoice TTS tool — 646 languages, zero-shot voice cloning (GPU).
4Backbone: Qwen3-0.6B + diffusion decoder. Trained on 581k hours spanning
5646 languages — covers every Indic script with substantially more hours
6per language than Indic-Parler (e.g. Tamil 423h vs parler's ~10h), and
7also covers English, Mandarin, Japanese, Korean, European, Arabic,
8African, and low-resource tongues. Apache 2.0.
10VRAM: stubbed at 3.0 GB until first real load. vram_manager's
11record_actual_usage() auto-tightens the budget from the worker's
12'__WORKER_VRAM_GB__' telemetry marker on startup.
14Requires: pip install omnivoice torch soundfile
15Model: HF hub 'k2-fsa/OmniVoice' (~1.5 GB safetensors)
17Public API (parent):
18 omnivoice_synthesize(text, language, voice, output_path) -> JSON
19 unload_omnivoice() -> None
21Worker entry (via dispatcher):
22 python -m integrations.service_tools.gpu_worker \\
23 integrations.service_tools.omnivoice_tool
25SUBPROCESS ISOLATED: model + tokenizer live in the worker. Parent
26forwards requests through ToolWorker.
27"""
29from __future__ import annotations
31import os
32import re
33from typing import Optional
35from integrations.service_tools.gpu_worker import ToolWorker
# OmniVoice outputs at 24 kHz (matches every other neural TTS in this
# registry — simplifies audio concatenation in the Nunba chat pipeline).
SAMPLE_RATE = 24000

# Hugging Face repo id — 646-language 0.6B checkpoint
HF_MODEL_ID = 'k2-fsa/OmniVoice'

# Sentence-chunking thresholds — diffusion decoders trail off at long
# contexts; chunk + concat with a short gap for clean prosody.
_INTER_SENTENCE_GAP_S = 0.12   # silence inserted between sentence chunks (seconds)
_END_PAD_S = 0.4               # trailing silence appended to every clip (seconds)
_PEAK_TARGET_DB = -1.0         # peak-normalisation target (dBFS)
_SPLIT_THRESHOLD_CHARS = 120   # OmniVoice handles longer spans than parler
_MIN_CHUNK_CHARS = 20          # a chunk shorter than this absorbs the next one
_TAIL_MERGE_CHARS = 15         # a final chunk shorter than this merges backwards

# Extensions we recognise as reference audio file paths (voice cloning);
# anything else passed in `voice` is treated as a free-form descriptor
# (passed to the model's `instruct` argument for voice design).
_AUDIO_SUFFIXES = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')
59# ─── Helpers ────────────────────────────────────────────────────────────
61def _split_sentences(text: str) -> list:
62 """Split at Latin + Indic sentence boundaries, protect '...', merge
63 shorts. Same logic as indic_parler_tool._split_sentences — duplicated
64 here deliberately to keep worker modules self-contained (no shared
65 runtime state between sibling TTS workers)."""
66 protected = text.replace('...', '\x00ELLIPSIS\x00')
67 parts = re.split(r'(?<=[^\.\s])[.?!।৷]\s+', protected)
68 parts = [p.replace('\x00ELLIPSIS\x00', '...') for p in parts]
69 merged = []
70 for p in parts:
71 p = p.strip()
72 if not p:
73 continue
74 if merged and len(merged[-1]) < _MIN_CHUNK_CHARS:
75 merged[-1] = merged[-1] + ' ' + p
76 else:
77 merged.append(p)
78 if len(merged) > 1 and len(merged[-1]) < _TAIL_MERGE_CHARS:
79 merged[-2] = merged[-2] + ' ' + merged[-1]
80 merged.pop()
81 return merged if len(merged) > 1 else [text]
84def _is_audio_path(voice: Optional[str]) -> bool:
85 if not voice:
86 return False
87 v = voice.strip()
88 if v.lower().endswith(_AUDIO_SUFFIXES):
89 return True
90 # Windows-absolute (C:\..) or POSIX-absolute (/..) that exists on disk
91 if os.path.isabs(v) and os.path.isfile(v):
92 return True
93 return False
96# ─── Worker-side callbacks (run in subprocess) ──────────────────────────
98def _load():
99 """Load OmniVoice on GPU (fp16).
101 If the official `omnivoice` package isn't installed we fall back to
102 a clear error — the worker exits with code 2 and the parent returns
103 'insufficient compute' so the router demotes to the next engine in
104 LANG_ENGINE_PREFERENCE (indic_parler for Indic, chatterbox_ml for
105 others, espeak as final fallback).
106 """
107 import torch
108 try:
109 from omnivoice import OmniVoice # type: ignore
110 except ImportError as e:
111 raise ImportError(
112 "omnivoice package not installed. "
113 "Install with: pip install omnivoice"
114 ) from e
116 model = OmniVoice.from_pretrained(
117 HF_MODEL_ID,
118 device_map='cuda:0',
119 dtype=torch.float16,
120 )
121 return {
122 'model': model,
123 'sample_rate': SAMPLE_RATE,
124 }
127def _generate_chunk(state: dict, text: str, voice: Optional[str]):
128 """One chunk through OmniVoice. Returns np.ndarray (float32)."""
129 model = state['model']
130 kwargs = {'text': text}
131 if voice:
132 if _is_audio_path(voice):
133 kwargs['ref_audio'] = voice
134 # OmniVoice auto-transcribes the reference if ref_text is
135 # omitted — wrapped in try/except inside model.generate.
136 kwargs['ref_text'] = ''
137 else:
138 # Free-form speaker descriptor → voice-design path
139 kwargs['instruct'] = voice
141 audio = model.generate(**kwargs)
142 # OmniVoice returns a list of np.ndarray; take the first clip
143 if isinstance(audio, (list, tuple)):
144 audio = audio[0]
145 import numpy as np
146 if hasattr(audio, 'detach'):
147 audio = audio.detach().cpu().float().numpy()
148 return np.asarray(audio, dtype='float32').squeeze()
def _synthesize(state, req: dict) -> dict:
    """Worker-side synthesis entry point.

    Expected ``req`` keys:
        text         -- required; non-blank input text
        output_path  -- required; where the wav is written
        language     -- metadata only (model auto-detects language)
        voice        -- optional ref-audio path or free-form descriptor

    Returns a result dict (path/duration/sample_rate/...) on success,
    or ``{'error': ...}`` for invalid requests.
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}
    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    import numpy as np
    import soundfile as sf

    language = req.get('language', 'en')
    voice = req.get('voice')
    sr = state['sample_rate']

    # Long inputs are chunked — diffusion decoders trail off at long contexts.
    if len(text) > _SPLIT_THRESHOLD_CHARS:
        sentences = _split_sentences(text)
    else:
        sentences = [text]

    if len(sentences) == 1:
        audio = _generate_chunk(state, text, voice)
    else:
        gap = np.zeros(int(sr * _INTER_SENTENCE_GAP_S), dtype=np.float32)
        chunks = []
        for sent in sentences:
            chunk_audio = _generate_chunk(state, sent, voice)
            if chunk_audio is None or len(chunk_audio) == 0:
                continue  # drop empty generations entirely
            # FIX: insert the gap only *between* kept chunks. Previously a
            # gap was appended after every non-final sentence even when its
            # generation came back empty, yielding leading/doubled gaps and
            # making the empty-`chunks` fallback below unreachable.
            if chunks:
                chunks.append(gap)
            chunks.append(chunk_audio)
        audio = np.concatenate(chunks) if chunks else np.zeros(1, dtype=np.float32)

    # Trailing pad so players don't clip the final phoneme.
    end_pad = np.zeros(int(sr * _END_PAD_S), dtype=np.float32)
    audio = np.concatenate([audio, end_pad])

    # Peak-normalise to _PEAK_TARGET_DB dBFS (skip all-silence output).
    peak = float(np.abs(audio).max())
    if peak > 0:
        target_peak = 10 ** (_PEAK_TARGET_DB / 20.0)
        audio = audio * (target_peak / peak)

    sf.write(output_path, audio, sr)

    return {
        'path': output_path,
        'duration': round(len(audio) / sr, 2),
        'sample_rate': sr,
        'engine': 'omnivoice',
        'device': 'cuda',
        'language': language,
        'voice': voice or 'default',
    }
205# ─── Parent-side: ToolWorker instance ───────────────────────────────────
# Parent-side handle: owns the GPU worker subprocess and forwards
# synthesize() requests to it (see module docstring for the split).
_tool = ToolWorker(
    tool_name='omnivoice',
    tool_module='integrations.service_tools.omnivoice_tool',
    vram_budget='tts_omnivoice',       # budget key; auto-tightened from worker telemetry
    output_subdir='omnivoice/output',  # where synthesized wavs land
    engine='omnivoice',
    startup_timeout=180.0,  # first run downloads ~1.5 GB checkpoint
    request_timeout=120.0,
)
def omnivoice_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize speech with OmniVoice (646 languages, zero-shot clone, GPU).

    How `voice` is interpreted:
      * path ending in .wav/.mp3/.flac/... → voice cloning (ref_audio)
      * free-form descriptor (e.g. "female, low pitch, british accent")
        → voice design (instruct)
      * None / 'default' → default speaker

    The model auto-detects the language from the text itself; `language`
    rides along as metadata for logging and the downstream router.
    Returns the JSON result string produced by the worker.
    """
    request = dict(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        default_sample_rate=SAMPLE_RATE,
    )
    return _tool.synthesize(**request)
def unload_omnivoice():
    """Stop the OmniVoice worker subprocess and free its VRAM.

    Delegates to ToolWorker.stop(); returns None.
    """
    _tool.stop()
class OmniVoiceTool:
    """Register OmniVoice as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Insert this tool's metadata into the service-tool registry.

        Builds a ServiceToolInfo describing the single /synthesize
        endpoint, force-marks it healthy, and stores it under the key
        "omnivoice". Returns True.

        NOTE(review): this writes straight into the registry's private
        `_tools` dict and bypasses any health check — confirm the
        registry exposes no public registration API before changing.
        """
        from .registry import ServiceToolInfo, service_tool_registry
        tool_info = ServiceToolInfo(
            name="omnivoice",
            description=(
                "OmniVoice TTS: 646 languages (every Indic script, "
                "zh/ja/ko, European, Arabic, low-resource). "
                "Zero-shot voice cloning from 3-10 s reference. "
                "Qwen3-0.6B + diffusion (Apache 2.0). ~2-3 GB VRAM. "
                "Requires: pip install omnivoice"
            ),
            # "inprocess://" scheme marks this as a local (non-HTTP) tool.
            base_url="inprocess://omnivoice",
            endpoints={
                "synthesize": {
                    "path": "/synthesize",
                    "method": "POST",
                    "description": (
                        "Synthesize with OmniVoice (646 langs, voice "
                        "cloning, GPU)."
                    ),
                    "params_schema": {
                        "text": {"type": "string"},
                        "language": {"type": "string"},
                        "voice": {
                            "type": "string",
                            "description": (
                                "Reference audio path (.wav/.mp3/.flac) "
                                "for cloning, OR a descriptor string "
                                "for voice design"
                            ),
                        },
                    },
                },
            },
            tags=[
                "tts", "speech", "gpu", "multilingual", "universal",
                "voice-clone", "indic",
            ],
            timeout=60,  # seconds; per-request budget used by the registry
        )
        tool_info.is_healthy = True
        service_tool_registry._tools["omnivoice"] = tool_info
        return True
298# NOTE: no `if __name__ == '__main__':` block — the centralized
299# dispatcher (gpu_worker) imports this module and calls _load/_synthesize.