Coverage for integrations/service_tools/indic_parler_tool.py: 32.7% (98 statements)
1"""
2Indic Parler TTS tool — 22 Indian languages + English (GPU).
4Supports: hi, ta, te, bn, gu, kn, ml, mr, or, pa, ur, as, bho, doi,
5 kok, mai, mni, ne, sa, sat, sd, en
6VRAM: 1.8GB model size, 2GB recommended.
7Requires: pip install indic-parler-tts
9SUBPROCESS ISOLATED: this module is BOTH the parent-side tool AND the
10worker subprocess entry point. Model + tokenizer live in the worker;
11the parent just forwards requests.
13Public API (parent):
14 indic_parler_synthesize(text, language, voice, output_path) → JSON
15 unload_indic_parler() → None
17Worker entry:
18 python -m integrations.service_tools.indic_parler_tool
19"""

import re
from typing import Optional

from integrations.service_tools.gpu_worker import ToolWorker

# Fallback sample rate used by the parent-side default_sample_rate kwarg
# when the worker response doesn't carry one. The real value comes from
# model.config.sampling_rate at runtime (44100 for Indic Parler TTS).
SAMPLE_RATE = 44100

# Recommended voices per language (from Nunba's _LazyIndicParler.SPEAKERS).
# Indic Parler's output character depends heavily on which named speaker
# appears in the description — wrong language-speaker pairing gives poor
# pronunciation.
_SPEAKERS = {
    'ta': 'Jaya', 'hi': 'Divya', 'bn': 'Aditi', 'te': 'Lalitha',
    'kn': 'Anu', 'ml': 'Anjali', 'gu': 'Neha', 'mr': 'Sunita',
    'as': 'Sita', 'ur': 'Divya', 'ne': 'Amrita', 'or': 'Debjani',
    'sa': 'Aryan', 'mai': 'Aditi', 'mni': 'Laishram', 'sd': 'Divya',
    'kok': 'Sunita', 'brx': 'Maya', 'doi': 'Karan', 'sat': 'Maya',
    'pa': 'Divya', 'en': 'Divya',
}

# Tuning constants — match Nunba's _LazyIndicParler exactly so output is
# acoustically identical after the port.
_INTER_SENTENCE_GAP_S = 0.15
_END_PAD_S = 0.5
_PEAK_TARGET_DB = -1.0
_SPLIT_THRESHOLD_CHARS = 80
_MIN_CHUNK_CHARS = 20   # merge any sub-20-char fragment into its neighbor
_TAIL_MERGE_CHARS = 15  # merge any sub-15-char trailing fragment backwards
_MAX_NEW_TOKENS_MIN = 3000
_MAX_NEW_TOKENS_MAX = 8000
_MAX_NEW_TOKENS_PER_CHAR = 50


def _build_description(language: str) -> str:
    """Build a style description with the recommended speaker for the language."""
    speaker = _SPEAKERS.get(language, 'Divya')
    return (
        f"{speaker} speaks with a confident, clear and expressive voice "
        f"at a moderate pace. The recording is of very high quality with no "
        f"background noise, the speaker's voice is loud, clear and very "
        f"close to the microphone."
    )
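
# Illustrative output (derived from the template above, not executed):
# _build_description('ta') returns
#   "Jaya speaks with a confident, clear and expressive voice at a moderate
#    pace. The recording is of very high quality with no background noise,
#    the speaker's voice is loud, clear and very close to the microphone."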


def _split_sentences(text: str) -> list:
    """Split text at real sentence boundaries (not mid-ellipsis).

    Handles Latin + Indic punctuation (. ? ! । ৷). Protects "..." so
    ellipses don't trigger splits. Merges fragments shorter than
    _MIN_CHUNK_CHARS into their neighbor, and pulls any trailing
    fragment shorter than _TAIL_MERGE_CHARS back into the previous chunk.
    """
    protected = text.replace('...', '\x00ELLIPSIS\x00')
    parts = re.split(r'(?<=[^\.\s])[.?!।৷]\s+', protected)
    parts = [p.replace('\x00ELLIPSIS\x00', '...') for p in parts]
    merged = []
    for p in parts:
        p = p.strip()
        if not p:
            continue
        if merged and len(merged[-1]) < _MIN_CHUNK_CHARS:
            merged[-1] = merged[-1] + ' ' + p
        else:
            merged.append(p)
    if len(merged) > 1 and len(merged[-1]) < _TAIL_MERGE_CHARS:
        merged[-2] = merged[-2] + ' ' + merged[-1]
        merged.pop()
    return merged if len(merged) > 1 else [text]
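
# Illustrative behaviour (hypothetical input): for
#   "Pack my box with five dozen liquor jugs. How vexingly quick daft zebras
#    jump! Ok."
# the splitter returns two chunks: the first sentence on its own, and the
# second sentence with the short "Ok." tail merged back in. Note that the
# sentence-final punctuation at each split point is consumed by the regex.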


# ── Worker callbacks (run in subprocess) ──────────────────────────

def _load():
    """Load Indic Parler TTS + both tokenizers.

    Ported from Nunba's _LazyIndicParler:
    - Loads ParlerTTSForConditionalGeneration on CUDA
    - Uses TWO tokenizers: one for the prompt text, one for the
      description. The description tokenizer comes from the model's
      own text encoder (different vocab from the prompt tokenizer).
    """
    from parler_tts import ParlerTTSForConditionalGeneration
    from transformers import AutoTokenizer

    model = ParlerTTSForConditionalGeneration.from_pretrained(
        'ai4bharat/indic-parler-tts',
    ).to('cuda')
    # Prompt tokenizer: lowercases Indic text, matches the model's decoder.
    prompt_tokenizer = AutoTokenizer.from_pretrained(
        'ai4bharat/indic-parler-tts',
    )
    # Description tokenizer: matches the model's text encoder (different
    # vocab — English-centric since descriptions are always English).
    desc_tokenizer = AutoTokenizer.from_pretrained(
        model.config.text_encoder._name_or_path,
    )
    return {
        'model': model,
        'prompt_tokenizer': prompt_tokenizer,
        'desc_tokenizer': desc_tokenizer,
        'sample_rate': model.config.sampling_rate,
    }
128def _generate_chunk(state: dict, text: str, language: str):
129 """Generate audio for one text chunk; returns a 1-D numpy float32 array."""
130 import torch
132 model = state['model']
133 prompt_tokenizer = state['prompt_tokenizer']
134 desc_tokenizer = state['desc_tokenizer']
136 description = _build_description(language)
137 desc_inputs = desc_tokenizer(description, return_tensors='pt').to('cuda')
138 prompt_inputs = prompt_tokenizer(text, return_tensors='pt').to('cuda')
139 max_tokens = max(
140 _MAX_NEW_TOKENS_MIN,
141 min(_MAX_NEW_TOKENS_MAX, len(text) * _MAX_NEW_TOKENS_PER_CHAR),
142 )
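    # Worked example of the clamp (illustrative; the numbers follow directly
    # from the constants above): a 40-char chunk gives 40 * 50 = 2000, raised
    # to the 3000-token floor; a 200-char chunk gives 200 * 50 = 10000, capped
    # at the 8000-token ceiling; an 80-char chunk lands in between at 4000.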

    with torch.no_grad():
        generation = model.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
            max_new_tokens=max_tokens,
        )
    return generation.cpu().float().numpy().squeeze()


def _synthesize(state, req: dict) -> dict:
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}
    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    import numpy as np
    import soundfile as sf

    language = req.get('language', 'hi')
    sr = state['sample_rate']

    # Split long text to prevent Indic Parler's tendency to clip long
    # utterances' tails. Threshold 80 chars matches Nunba's tuning.
    if len(text) > _SPLIT_THRESHOLD_CHARS:
        sentences = _split_sentences(text)
    else:
        sentences = [text]

    if len(sentences) == 1:
        audio = _generate_chunk(state, text, language)
    else:
        gap = np.zeros(int(sr * _INTER_SENTENCE_GAP_S), dtype=np.float32)
        chunks = []
        for i, sent in enumerate(sentences):
            chunk_audio = _generate_chunk(state, sent, language)
            if chunk_audio is not None and len(chunk_audio) > 0:
                chunks.append(chunk_audio)
                if i < len(sentences) - 1:
                    chunks.append(gap)
        audio = np.concatenate(chunks) if chunks else np.zeros(1, dtype=np.float32)

    # Pad trailing silence to prevent chopped endings
    end_pad = np.zeros(int(sr * _END_PAD_S), dtype=np.float32)
    audio = np.concatenate([audio, end_pad])

    # Peak-normalize to the target dBFS
    peak = float(np.abs(audio).max())
    if peak > 0:
        target_peak = 10 ** (_PEAK_TARGET_DB / 20.0)
        audio = audio * (target_peak / peak)
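
    # (For the default _PEAK_TARGET_DB of -1.0 dBFS this scales the peak to
    # 10 ** (-1.0 / 20) ≈ 0.891 of full scale.)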

    sf.write(output_path, audio, sr)

    return {
        'path': output_path,
        'duration': round(len(audio) / sr, 2),
        'sample_rate': sr,
        'engine': 'indic-parler-tts',
        'device': 'cuda',
        'language': language,
        'voice': f"{_SPEAKERS.get(language, 'Divya')} ({language})",
    }
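

# Illustrative shape of the worker response (hypothetical values; the real
# path and duration depend on the request):
#   {'path': '/tmp/out.wav', 'duration': 3.42, 'sample_rate': 44100,
#    'engine': 'indic-parler-tts', 'device': 'cuda', 'language': 'hi',
#    'voice': 'Divya (hi)'}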


# ── Parent-side: ToolWorker instance ─────────────────────────────

_tool = ToolWorker(
    tool_name='indic_parler',
    tool_module='integrations.service_tools.indic_parler_tool',
    vram_budget='tts_indic_parler',
    output_subdir='indic_parler/output',
    engine='indic-parler-tts',
    startup_timeout=120.0,
    request_timeout=120.0,
)


def indic_parler_synthesize(
    text: str,
    language: str = 'hi',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize with Indic Parler TTS (22 Indic languages, GPU subprocess).

    `voice` here is a style description (e.g. "A female speaker with
    calm tone"), not a reference audio path. Indic Parler uses
    text-conditioned styles, not voice cloning.
    """
    return _tool.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        default_sample_rate=SAMPLE_RATE,
    )
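

# Minimal usage sketch (parent side; the output path below is hypothetical):
#
#   result = indic_parler_synthesize(
#       text='नमस्ते, आप कैसे हैं?',
#       language='hi',
#       output_path='/tmp/greeting.wav',
#   )
#   # `result` is the worker's JSON-encoded response, shaped as shown above.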


def unload_indic_parler():
    """Stop the Indic Parler worker subprocess and free its VRAM."""
    _tool.stop()


class IndicParlerTool:
    """Register Indic Parler as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        from .registry import ServiceToolInfo, service_tool_registry
        tool_info = ServiceToolInfo(
            name="indic_parler",
            description=(
                "Indic Parler TTS: 22 Indian languages + English. "
                "Style-conditioned synthesis (no voice cloning). "
                "1.8GB VRAM. Requires: pip install indic-parler-tts"
            ),
            base_url="inprocess://indic_parler",
            endpoints={
                "synthesize": {
                    "path": "/synthesize",
                    "method": "POST",
                    "description": "Synthesize with Indic Parler TTS (22 Indic languages, GPU).",
                    "params_schema": {
                        "text": {"type": "string"},
                        "language": {"type": "string"},
                        "voice": {"type": "string", "description": "Style description text"},
                    },
                },
            },
            tags=["tts", "speech", "gpu", "indic", "multilingual"],
            timeout=60,
        )
        tool_info.is_healthy = True
        service_tool_registry._tools["indic_parler"] = tool_info
        return True
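

# Registration sketch (illustrative, assuming the registry is used as shown
# in register_functions above): call IndicParlerTool.register_functions()
# once at startup, after which the tool is visible as
# service_tool_registry._tools['indic_parler'].
#
#   IndicParlerTool.register_functions()
#   assert service_tool_registry._tools['indic_parler'].is_healthy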


# NOTE: no `if __name__ == '__main__':` block — the centralized
# dispatcher (gpu_worker) imports this module and calls _load/_synthesize.