Coverage for integrations / service_tools / indic_parler_tool.py: 32.7%

98 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Indic Parler TTS tool — 22 Indian languages + English (GPU). 

3 

4Supports: hi, ta, te, bn, gu, kn, ml, mr, or, pa, ur, as, bho, doi, 

5 kok, mai, mni, ne, sa, sat, sd, en 

6VRAM: 1.8GB model size, 2GB recommended. 

7Requires: pip install indic-parler-tts 

8 

9SUBPROCESS ISOLATED: this module is BOTH the parent-side tool AND the 

10worker subprocess entry point. Model + tokenizer live in the worker; 

11the parent just forwards requests. 

12 

13Public API (parent): 

14 indic_parler_synthesize(text, language, voice, output_path) → JSON 

15 unload_indic_parler() → None 

16 

17Worker entry: 

18 python -m integrations.service_tools.indic_parler_tool 

19""" 

20 

21import re 

22from typing import Optional 

23 

24from integrations.service_tools.gpu_worker import ToolWorker 

25 

# Fallback sample rate for the parent-side default_sample_rate kwarg
# when a worker response carries none. The authoritative value comes
# from model.config.sampling_rate at runtime (44100 for Indic Parler TTS).
SAMPLE_RATE = 44100

# Recommended named speaker per language code (ported from Nunba's
# _LazyIndicParler.SPEAKERS). Indic Parler's output character depends
# heavily on which named speaker the description mentions; a mismatched
# language/speaker pairing yields poor pronunciation.
# NOTE(review): the module docstring lists 'bho' among supported
# languages, but this table carries 'brx' and no 'bho' — confirm which
# code is correct.
_SPEAKERS = {
    'ta': 'Jaya',
    'hi': 'Divya',
    'bn': 'Aditi',
    'te': 'Lalitha',
    'kn': 'Anu',
    'ml': 'Anjali',
    'gu': 'Neha',
    'mr': 'Sunita',
    'as': 'Sita',
    'ur': 'Divya',
    'ne': 'Amrita',
    'or': 'Debjani',
    'sa': 'Aryan',
    'mai': 'Aditi',
    'mni': 'Laishram',
    'sd': 'Divya',
    'kok': 'Sunita',
    'brx': 'Maya',
    'doi': 'Karan',
    'sat': 'Maya',
    'pa': 'Divya',
    'en': 'Divya',
}

# Tuning constants — numerically identical to Nunba's _LazyIndicParler
# so the ported output is acoustically unchanged.
_INTER_SENTENCE_GAP_S = 0.15   # silence inserted between sentence chunks
_END_PAD_S = 0.5               # trailing silence appended to every clip
_PEAK_TARGET_DB = -1.0         # peak-normalization target (dBFS)
_SPLIT_THRESHOLD_CHARS = 80    # split text longer than this into sentences
_MIN_CHUNK_CHARS = 20          # merge any sub-20-char fragment into neighbor
_TAIL_MERGE_CHARS = 15         # fold a short trailing fragment backwards
_MAX_NEW_TOKENS_MIN = 3000     # lower clamp on the generation token budget
_MAX_NEW_TOKENS_MAX = 8000     # upper clamp on the generation token budget
_MAX_NEW_TOKENS_PER_CHAR = 50  # per-character token budget before clamping

55 

56 

def _build_description(language: str) -> str:
    """Return the English style prompt naming the recommended speaker.

    Falls back to 'Divya' for any language code without a dedicated
    entry in _SPEAKERS.
    """
    who = _SPEAKERS.get(language, 'Divya')
    return (
        f"{who} speaks with a confident, clear and expressive voice "
        "at a moderate pace. The recording is of very high quality with no "
        "background noise, the speaker's voice is loud, clear and very "
        "close to the microphone."
    )

66 

67 

def _split_sentences(text: str) -> list:
    """Split text at genuine sentence boundaries (never mid-ellipsis).

    Handles Latin and Indic terminators (. ? ! । ৷). "..." is shielded
    behind a sentinel so ellipses can't trigger a split. A chunk shorter
    than _MIN_CHUNK_CHARS absorbs the piece that follows it, and a
    trailing piece shorter than _TAIL_MERGE_CHARS is folded back into
    the previous chunk. Returns [text] when no real split survives.
    """
    sentinel = '\x00ELLIPSIS\x00'
    shielded = text.replace('...', sentinel)
    pieces = [
        part.replace(sentinel, '...')
        for part in re.split(r'(?<=[^\.\s])[.?!।৷]\s+', shielded)
    ]
    chunks = []
    for piece in pieces:
        piece = piece.strip()
        if not piece:
            continue
        # A too-short previous chunk swallows this piece instead of
        # standing alone.
        if chunks and len(chunks[-1]) < _MIN_CHUNK_CHARS:
            chunks[-1] = f'{chunks[-1]} {piece}'
        else:
            chunks.append(piece)
    if len(chunks) > 1 and len(chunks[-1]) < _TAIL_MERGE_CHARS:
        tail = chunks.pop()
        chunks[-1] = f'{chunks[-1]} {tail}'
    return chunks if len(chunks) > 1 else [text]

92 

93 

94# ── Worker callbacks (run in subprocess) ────────────────────────── 

95 

def _load():
    """Worker-side loader: Indic Parler model on CUDA plus two tokenizers.

    Port of Nunba's _LazyIndicParler setup. Two tokenizers are needed
    because the spoken prompt text and the English style description go
    through different components with different vocabularies: the prompt
    uses the model repo's own tokenizer, while the description uses the
    tokenizer of the model's text encoder.
    Returns the state dict shared by the worker callbacks.
    """
    from parler_tts import ParlerTTSForConditionalGeneration
    from transformers import AutoTokenizer

    repo = 'ai4bharat/indic-parler-tts'
    model = ParlerTTSForConditionalGeneration.from_pretrained(repo).to('cuda')
    # Prompt tokenizer: lowercases Indic text, matches the model's decoder.
    prompt_tok = AutoTokenizer.from_pretrained(repo)
    # Description tokenizer: matches the model's text encoder (different,
    # English-centric vocab — descriptions are always English).
    desc_tok = AutoTokenizer.from_pretrained(
        model.config.text_encoder._name_or_path,
    )
    return {
        'model': model,
        'prompt_tokenizer': prompt_tok,
        'desc_tokenizer': desc_tok,
        'sample_rate': model.config.sampling_rate,
    }

126 

127 

def _generate_chunk(state: dict, text: str, language: str):
    """Generate audio for one text chunk; returns a 1-D numpy float32 array.

    The token budget scales with text length (_MAX_NEW_TOKENS_PER_CHAR
    per character) and is clamped to the
    [_MAX_NEW_TOKENS_MIN, _MAX_NEW_TOKENS_MAX] range.
    """
    import torch

    desc = _build_description(language)
    desc_ids = state['desc_tokenizer'](desc, return_tensors='pt').to('cuda')
    prompt_ids = state['prompt_tokenizer'](text, return_tensors='pt').to('cuda')

    # Clamp the per-character budget into the allowed range.
    budget = len(text) * _MAX_NEW_TOKENS_PER_CHAR
    max_tokens = min(max(budget, _MAX_NEW_TOKENS_MIN), _MAX_NEW_TOKENS_MAX)

    with torch.no_grad():
        out = state['model'].generate(
            input_ids=desc_ids.input_ids,
            attention_mask=desc_ids.attention_mask,
            prompt_input_ids=prompt_ids.input_ids,
            prompt_attention_mask=prompt_ids.attention_mask,
            max_new_tokens=max_tokens,
        )
    return out.cpu().float().numpy().squeeze()

153 

154 

def _synthesize(state, req: dict) -> dict:
    """Worker-side request handler: render req['text'] to a WAV on disk.

    Long inputs are split into sentence chunks (Indic Parler tends to
    clip the tails of long utterances), joined with short silences,
    end-padded, peak-normalized, then written to req['output_path'].
    Returns a result dict, or {'error': ...} on bad input.

    NOTE(review): a 'voice' key in req is never consulted here — the
    style description always comes from _build_description(language).
    Confirm whether that is intentional.
    """
    text = req.get('text', '')
    if not text or not text.strip():
        return {'error': 'Text is required'}
    output_path = req.get('output_path')
    if not output_path:
        return {'error': 'output_path is required'}

    import numpy as np
    import soundfile as sf

    language = req.get('language', 'hi')
    sr = state['sample_rate']

    # Split long text to avoid clipped tails; 80-char threshold matches
    # Nunba's tuning.
    pieces = (
        _split_sentences(text) if len(text) > _SPLIT_THRESHOLD_CHARS
        else [text]
    )

    if len(pieces) == 1:
        audio = _generate_chunk(state, text, language)
    else:
        silence = np.zeros(int(sr * _INTER_SENTENCE_GAP_S), dtype=np.float32)
        rendered = []
        last = len(pieces) - 1
        for idx, piece in enumerate(pieces):
            chunk = _generate_chunk(state, piece, language)
            if chunk is not None and len(chunk) > 0:
                rendered.append(chunk)
                # Inter-sentence gap everywhere except after the final piece.
                if idx < last:
                    rendered.append(silence)
        audio = (
            np.concatenate(rendered) if rendered
            else np.zeros(1, dtype=np.float32)
        )

    # Trailing pad guards against chopped endings.
    end_pad = np.zeros(int(sr * _END_PAD_S), dtype=np.float32)
    audio = np.concatenate([audio, end_pad])

    # Peak-normalize to _PEAK_TARGET_DB dBFS (skip pure silence).
    peak = float(np.abs(audio).max())
    if peak > 0:
        audio = audio * ((10 ** (_PEAK_TARGET_DB / 20.0)) / peak)

    sf.write(output_path, audio, sr)

    return {
        'path': output_path,
        'duration': round(len(audio) / sr, 2),
        'sample_rate': sr,
        'engine': 'indic-parler-tts',
        'device': 'cuda',
        'language': language,
        'voice': f"{_SPEAKERS.get(language, 'Divya')} ({language})",
    }

210 

211 

212# ── Parent-side: ToolWorker instance ───────────────────────────── 

213 

# Parent-side singleton: owns the GPU worker subprocess lifecycle and
# proxies synthesize requests to it. The worker re-imports this module
# (tool_module) and calls _load/_synthesize there.
# NOTE(review): 120s timeouts presumably cover first-call model
# download/load — confirm against ToolWorker's semantics.
_tool = ToolWorker(
    tool_name='indic_parler',
    tool_module='integrations.service_tools.indic_parler_tool',
    vram_budget='tts_indic_parler',
    output_subdir='indic_parler/output',
    engine='indic-parler-tts',
    startup_timeout=120.0,
    request_timeout=120.0,
)

223 

224 

def indic_parler_synthesize(
    text: str,
    language: str = 'hi',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
) -> str:
    """Synthesize with Indic Parler TTS (22 Indic languages, GPU subprocess).

    `voice` is a textual style description (e.g. "A female speaker with
    calm tone"), not a reference audio path — Indic Parler conditions on
    described styles rather than cloning voices. Delegates to the
    worker via _tool and returns its JSON response.
    """
    request = dict(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        default_sample_rate=SAMPLE_RATE,
    )
    return _tool.synthesize(**request)

244 

245 

def unload_indic_parler():
    """Stop the Indic Parler worker subprocess and free its VRAM.

    Delegates to the module-level ToolWorker; no return value.
    """
    _tool.stop()

249 

250 

class IndicParlerTool:
    """Register Indic Parler as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Insert this tool's metadata into the service-tool registry.

        Builds a ServiceToolInfo describing the single /synthesize
        endpoint, marks it healthy, and installs it under the
        'indic_parler' key. Always returns True.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        synthesize_spec = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with Indic Parler TTS (22 Indic languages, GPU).",
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
                "voice": {"type": "string", "description": "Style description text"},
            },
        }
        info = ServiceToolInfo(
            name="indic_parler",
            description=(
                "Indic Parler TTS: 22 Indian languages + English. "
                "Style-conditioned synthesis (no voice cloning). "
                "1.8GB VRAM. Requires: pip install indic-parler-tts"
            ),
            base_url="inprocess://indic_parler",
            endpoints={"synthesize": synthesize_spec},
            tags=["tts", "speech", "gpu", "indic", "multilingual"],
            timeout=60,
        )
        info.is_healthy = True
        service_tool_registry._tools["indic_parler"] = info
        return True

283 

284# NOTE: no `if __name__ == '__main__':` block — the centralized 

285# dispatcher (gpu_worker) imports this module and calls _load/_synthesize.