Coverage for integrations/service_tools/indic_parler_tool.py: 32.7% (98 statements)
1"""
2Indic Parler TTS tool — 22 Indian languages + English (GPU).
4Supports: hi, ta, te, bn, gu, kn, ml, mr, or, pa, ur, as, bho, doi,
5 kok, mai, mni, ne, sa, sat, sd, en
6VRAM: 1.8GB model size, 2GB recommended.
7Requires: pip install indic-parler-tts
9SUBPROCESS ISOLATED: this module is BOTH the parent-side tool AND the
10worker subprocess entry point. Model + tokenizer live in the worker;
11the parent just forwards requests.
13Public API (parent):
14 indic_parler_synthesize(text, language, voice, output_path) → JSON
15 unload_indic_parler() → None
17Worker entry:
18 python -m integrations.service_tools.indic_parler_tool
19"""

import re
from typing import Optional

from integrations.service_tools.gpu_worker import ToolWorker

# Fallback sample rate used by the parent-side default_sample_rate kwarg
# when the worker response doesn't carry one. The real value comes from
# model.config.sampling_rate at runtime (44100 for Indic Parler TTS).
SAMPLE_RATE = 44100

# Recommended voices per language (from Nunba's _LazyIndicParler.SPEAKERS).
# Indic Parler's output character depends heavily on which named speaker
# appears in the description — wrong language-speaker pairing gives poor
# pronunciation.
_SPEAKERS = {
    'ta': 'Jaya', 'hi': 'Divya', 'bn': 'Aditi', 'te': 'Lalitha',
    'kn': 'Anu', 'ml': 'Anjali', 'gu': 'Neha', 'mr': 'Sunita',
    'as': 'Sita', 'ur': 'Divya', 'ne': 'Amrita', 'or': 'Debjani',
    'sa': 'Aryan', 'mai': 'Aditi', 'mni': 'Laishram', 'sd': 'Divya',
    'kok': 'Sunita', 'brx': 'Maya', 'doi': 'Karan', 'sat': 'Maya',
    'pa': 'Divya', 'en': 'Divya',
}

# Tuning constants — match Nunba's _LazyIndicParler exactly so output is
# acoustically identical after the port.
_INTER_SENTENCE_GAP_S = 0.15
_END_PAD_S = 0.5
_PEAK_TARGET_DB = -1.0
_SPLIT_THRESHOLD_CHARS = 80
_MIN_CHUNK_CHARS = 20   # merge any sub-20-char fragment into its neighbor
_TAIL_MERGE_CHARS = 15  # merge any sub-15-char trailing fragment backwards
_MAX_NEW_TOKENS_MIN = 3000
_MAX_NEW_TOKENS_MAX = 8000
_MAX_NEW_TOKENS_PER_CHAR = 50


def _build_description(language: str) -> str:
    """Build a style description with the recommended speaker for the language."""
    speaker = _SPEAKERS.get(language, 'Divya')
    return (
        f"{speaker} speaks with a confident, clear and expressive voice "
        f"at a moderate pace. The recording is of very high quality with no "
        f"background noise, the speaker's voice is loud, clear and very "
        f"close to the microphone."
    )
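
# Illustrative output (derived from the template above, not executed):
# _build_description('ta') returns
#   "Jaya speaks with a confident, clear and expressive voice at a moderate
#    pace. The recording is of very high quality with no background noise,
#    the speaker's voice is loud, clear and very close to the microphone."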


def _split_sentences(text: str) -> list:
    """Split text at real sentence boundaries (not mid-ellipsis).

    Handles Latin + Indic punctuation (. ? ! । ৷). Protects "..." so
    ellipses don't trigger splits. Merges fragments shorter than
    _MIN_CHUNK_CHARS into their neighbor, and pulls any trailing
    fragment shorter than _TAIL_MERGE_CHARS back into the previous chunk.
    """
    protected = text.replace('...', '\x00ELLIPSIS\x00')
    parts = re.split(r'(?<=[^\.\s])[.?!।৷]\s+', protected)
    parts = [p.replace('\x00ELLIPSIS\x00', '...') for p in parts]
    merged = []
    for p in parts:
        p = p.strip()
        if not p:
            continue
        if merged and len(merged[-1]) < _MIN_CHUNK_CHARS:
            merged[-1] = merged[-1] + ' ' + p
        else:
            merged.append(p)
    if len(merged) > 1 and len(merged[-1]) < _TAIL_MERGE_CHARS:
        merged[-2] = merged[-2] + ' ' + merged[-1]
        merged.pop()
    return merged if len(merged) > 1 else [text]
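
# Illustrative behaviour (hypothetical input): for
#   "Pack my box with five dozen liquor jugs. How vexingly quick daft zebras
#    jump! Ok."
# the splitter returns two chunks: the first sentence on its own, and the
# second sentence with the short "Ok." tail merged back in. Note that the
# sentence-final punctuation at each split point is consumed by the regex.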


# ── Worker callbacks (run in subprocess) ──────────────────────────

def _load():
    """Load Indic Parler TTS + both tokenizers.

    Ported from Nunba's _LazyIndicParler:
    - Loads ParlerTTSForConditionalGeneration on CUDA
    - Uses TWO tokenizers: one for the prompt text, one for the
      description. The description tokenizer comes from the model's
      own text encoder (different vocab from the prompt tokenizer).
    """
    from parler_tts import ParlerTTSForConditionalGeneration
    from transformers import AutoTokenizer

    model = ParlerTTSForConditionalGeneration.from_pretrained(
        'ai4bharat/indic-parler-tts',
    ).to('cuda')
    # Prompt tokenizer: lowercases Indic text, matches the model's decoder.
    prompt_tokenizer = AutoTokenizer.from_pretrained(
        'ai4bharat/indic-parler-tts',
    )
    # Description tokenizer: matches the model's text encoder (different
    # vocab — English-centric since descriptions are always English).
    desc_tokenizer = AutoTokenizer.from_pretrained(
        model.config.text_encoder._name_or_path,
    )
    return {
        'model': model,
        'prompt_tokenizer': prompt_tokenizer,
        'desc_tokenizer': desc_tokenizer,
        'sample_rate': model.config.sampling_rate,
    }
128def _generate_chunk(state: dict, text: str, language: str):
129 """Generate audio for one text chunk; returns a 1-D numpy float32 array."""
130 import torch
132 model = state['model']
133 prompt_tokenizer = state['prompt_tokenizer']
134 desc_tokenizer = state['desc_tokenizer']
136 description = _build_description(language)
137 desc_inputs = desc_tokenizer(description, return_tensors='pt').to('cuda')
138 prompt_inputs = prompt_tokenizer(text, return_tensors='pt').to('cuda')
139 max_tokens = max(
140 _MAX_NEW_TOKENS_MIN,
141 min(_MAX_NEW_TOKENS_MAX, len(text) * _MAX_NEW_TOKENS_PER_CHAR),
142 )
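    # Worked example of the clamp (illustrative; the numbers follow directly
    # from the constants above): a 40-char chunk gives 40 * 50 = 2000, raised
    # to the 3000-token floor; a 200-char chunk gives 200 * 50 = 10000, capped
    # at the 8000-token ceiling; an 80-char chunk lands in between at 4000.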

    with torch.no_grad():
        generation = model.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
            max_new_tokens=max_tokens,
        )
    return generation.cpu().float().numpy().squeeze()


def _synthesize(state, req: dict) -> dict:
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}
    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    import numpy as np
    import soundfile as sf

    language = req.get('language', 'hi')
    sr = state['sample_rate']

    # Split long text to prevent Indic Parler's tendency to clip long
    # utterances' tails. Threshold 80 chars matches Nunba's tuning.
    if len(text) > _SPLIT_THRESHOLD_CHARS:
        sentences = _split_sentences(text)
    else:
        sentences = [text]

    if len(sentences) == 1:
        audio = _generate_chunk(state, text, language)
    else:
        gap = np.zeros(int(sr * _INTER_SENTENCE_GAP_S), dtype=np.float32)
        chunks = []
        for i, sent in enumerate(sentences):
            chunk_audio = _generate_chunk(state, sent, language)
            if chunk_audio is not None and len(chunk_audio) > 0:
                chunks.append(chunk_audio)
                if i < len(sentences) - 1:
                    chunks.append(gap)
        audio = np.concatenate(chunks) if chunks else np.zeros(1, dtype=np.float32)

    # Pad trailing silence to prevent chopped endings
    end_pad = np.zeros(int(sr * _END_PAD_S), dtype=np.float32)
    audio = np.concatenate([audio, end_pad])

    # Peak-normalize to the target dBFS
    peak = float(np.abs(audio).max())
    if peak > 0:
        target_peak = 10 ** (_PEAK_TARGET_DB / 20.0)
        audio = audio * (target_peak / peak)
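
    # (For the default _PEAK_TARGET_DB of -1.0 dBFS this scales the peak to
    # 10 ** (-1.0 / 20) ≈ 0.891 of full scale.)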

    sf.write(output_path, audio, sr)

    return {
        'path': output_path,
        'duration': round(len(audio) / sr, 2),
        'sample_rate': sr,
        'engine': 'indic-parler-tts',
        'device': 'cuda',
        'language': language,
        'voice': f"{_SPEAKERS.get(language, 'Divya')} ({language})",
    }
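

# Illustrative shape of the worker response (hypothetical values; the real
# path and duration depend on the request):
#   {'path': '/tmp/out.wav', 'duration': 3.42, 'sample_rate': 44100,
#    'engine': 'indic-parler-tts', 'device': 'cuda', 'language': 'hi',
#    'voice': 'Divya (hi)'}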


# ── Parent-side: ToolWorker instance ─────────────────────────────

_tool = ToolWorker(
    tool_name='indic_parler',
    tool_module='integrations.service_tools.indic_parler_tool',
    vram_budget='tts_indic_parler',
    output_subdir='indic_parler/output',
    engine='indic-parler-tts',
    startup_timeout=120.0,
    request_timeout=120.0,
)


def indic_parler_synthesize(
    text: str,
    language: str = 'hi',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize with Indic Parler TTS (22 Indic languages, GPU subprocess).

    `voice` here is a style description (e.g. "A female speaker with
    calm tone"), not a reference audio path. Indic Parler uses
    text-conditioned styles, not voice cloning.
    """
    return _tool.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        default_sample_rate=SAMPLE_RATE,
    )
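

# Minimal usage sketch (parent side; the output path below is hypothetical):
#
#   result = indic_parler_synthesize(
#       text='नमस्ते, आप कैसे हैं?',
#       language='hi',
#       output_path='/tmp/greeting.wav',
#   )
#   # `result` is the worker's JSON-encoded response, shaped as shown above.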


def unload_indic_parler():
    """Stop the Indic Parler worker subprocess and free its VRAM."""
    _tool.stop()


class IndicParlerTool:
    """Register Indic Parler as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        from .registry import ServiceToolInfo, service_tool_registry
        tool_info = ServiceToolInfo(
            name="indic_parler",
            description=(
                "Indic Parler TTS: 22 Indian languages + English. "
                "Style-conditioned synthesis (no voice cloning). "
                "1.8GB VRAM. Requires: pip install indic-parler-tts"
            ),
            base_url="inprocess://indic_parler",
            endpoints={
                "synthesize": {
                    "path": "/synthesize",
                    "method": "POST",
                    "description": "Synthesize with Indic Parler TTS (22 Indic languages, GPU).",
                    "params_schema": {
                        "text": {"type": "string"},
                        "language": {"type": "string"},
                        "voice": {"type": "string", "description": "Style description text"},
                    },
                },
            },
            tags=["tts", "speech", "gpu", "indic", "multilingual"],
            timeout=60,
        )
        tool_info.is_healthy = True
        service_tool_registry._tools["indic_parler"] = tool_info
        return True
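

# Registration sketch (illustrative, assuming the registry is used as shown
# in register_functions above): call IndicParlerTool.register_functions()
# once at startup, after which the tool is visible as
# service_tool_registry._tools['indic_parler'].
#
#   IndicParlerTool.register_functions()
#   assert service_tool_registry._tools['indic_parler'].is_healthy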


# NOTE: no `if __name__ == '__main__':` block — the centralized
# dispatcher (gpu_worker) imports this module and calls _load/_synthesize.