Coverage for integrations / channels / media / tts_text_normalizer.py: 75.8%

157 statements  

coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2TTS Text Normalizer — expand numbers, currency, units, URLs to spoken form. 

3 

4Why this exists: 

5 Modern diffusion-token TTS (OmniVoice, F5, CosyVoice, Indic-Parler) cannot 

6 pronounce tokens like "Rs.200", "12.5%", "2:30 PM", "https://x.com", 

7 "Dr.", "kg". They either skip them or produce garbage. Text MUST be 

8 normalized to its spoken form BEFORE hitting the synthesizer. 

9 

10Single converging path: 

11 Called ONCE from tts_router.synthesize() right after language detection. 

12 Every TTS engine in the registry benefits — we do not duplicate this 

13 logic per-engine. 

14 

15Two-stage strategy: 

16 1) Rule pass (fast, offline, deterministic) — num2words + regex for 

17 currency, percent, time, URLs, emails. <1 ms for short utterances. 

18 2) LLM fallback (slow, online) — only if rule pass leaves residual 

19 unspeakable tokens. Calls model_bus_service.infer() with the local 

20 0.8B model. 2 s timeout → falls back to rule output. 

21 

22Cache: 

23 (sha256(text), lang) → normalized_text, persisted at 

24 ~/.hevolve/cache/tts_normalize/ so repeated phrases never re-run 

25 num2words or hit the LLM. 

26""" 

27 

from __future__ import annotations

import hashlib
import json
import logging
import os
import re
import time
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# ─── Configuration ─────────────────────────────────────────────────────

# num2words built-in language support (as of 0.5.x). Missing Indic langs
# (hi, ta, bn, ur, pa, or, as, ml, mr — partially covered; sa never) will
# fall through to LLM normalization when num2words raises NotImplementedError.
NUM2WORDS_LANGS = frozenset({
    'ar', 'az', 'be', 'bg', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en',
    'eo', 'es', 'et', 'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'hy', 'id',
    'is', 'it', 'ja', 'kn', 'ko', 'kz', 'lt', 'lv', 'nl', 'no', 'pl',
    'pt', 'ro', 'ru', 'sk', 'sl', 'sr', 'sv', 'te', 'th', 'tr', 'uk',
    'vi', 'zh',
    # Added in recent forks — check at call-time since versions vary
    'hi', 'ta', 'gu', 'mr',
})

# Currency symbol → language-keyed spoken form. Symbols that make it past
# the regex rule pass get replaced with the target-language word; unknown
# languages fall back to English.
CURRENCY_WORDS: dict[str, dict[str, str]] = {
    '$': {'en': 'dollars', 'hi': 'डॉलर', 'ta': 'டாலர்', 'default': 'dollars'},
    '₹': {'en': 'rupees', 'hi': 'रुपये', 'ta': 'ரூபாய்', 'default': 'rupees'},
    'Rs': {'en': 'rupees', 'hi': 'रुपये', 'ta': 'ரூபாய்', 'default': 'rupees'},
    '€': {'en': 'euros', 'hi': 'यूरो', 'ta': 'யூரோ', 'default': 'euros'},
    '£': {'en': 'pounds', 'hi': 'पाउंड', 'ta': 'பவுண்ட்', 'default': 'pounds'},
    '¥': {'en': 'yen', 'default': 'yen'},
    '₩': {'en': 'won', 'default': 'won'},
    '₽': {'en': 'rubles', 'default': 'rubles'},
}

# Percent (per-mille '‰' is not handled; the rule pass only matches '%')
PERCENT_WORDS: dict[str, str] = {
    'en': 'percent', 'hi': 'प्रतिशत', 'ta': 'சதவீதம்',
    'bn': 'শতাংশ', 'te': 'శాతం', 'ml': 'ശതമാനം', 'kn': 'ಶೇಕಡಾ',
    'mr': 'टक्के', 'gu': 'ટકા', 'pa': 'ਪ੍ਰਤੀਸ਼ਤ', 'ur': 'فیصد',
    'default': 'percent',
}

78# "at" and "dot" words for email/URL spelling 

79AT_DOT_WORDS: dict[str, tuple[str, str]] = { 

80 'en': ('at', 'dot'), 

81 'hi': ('ऐट', 'डॉट'), 

82 'ta': ('அட்', 'டாட்'), 

83 'default': ('at', 'dot'), 

84} 

85 

LINK_WORDS: dict[str, str] = {
    'en': 'link', 'hi': 'लिंक', 'ta': 'இணைப்பு',
    'bn': 'লিঙ্ক', 'te': 'లింక్',
    'default': 'link',
}

# Residual-token detector: any Latin digit, or any all-caps acronym of
# 3+ characters, left after the rule pass triggers the LLM fallback.
# Indic-script digits are deliberately not matched; they stay unchanged
# because TTS pronounces Devanagari / Tamil / Bengali digits natively.
_RESIDUAL_PATTERN = re.compile(
    r'[0-9]|(?:\b[A-Z]{3,}\b)'
)

_LLM_TIMEOUT_SEC = 2.0
_CACHE_TTL_DAYS = 30

# ─── Cache ─────────────────────────────────────────────────────────────

def _cache_dir() -> Path:
    base = Path(
        os.environ.get('HEVOLVE_CACHE_DIR')
        or (Path.home() / '.hevolve' / 'cache' / 'tts_normalize')
    )
    base.mkdir(parents=True, exist_ok=True)
    return base

def _cache_key(text: str, lang: str) -> str:
    h = hashlib.sha256(f'{lang}\x00{text}'.encode('utf-8')).hexdigest()
    return h[:32]

def _cache_get(text: str, lang: str) -> Optional[str]:
    try:
        p = _cache_dir() / f'{_cache_key(text, lang)}.json'
        if not p.exists():
            return None
        age = time.time() - p.stat().st_mtime
        if age > _CACHE_TTL_DAYS * 86400:
            p.unlink(missing_ok=True)
            return None
        return json.loads(p.read_text(encoding='utf-8')).get('normalized')
    except Exception:
        return None

def _cache_put(text: str, lang: str, normalized: str) -> None:
    try:
        p = _cache_dir() / f'{_cache_key(text, lang)}.json'
        tmp = p.with_suffix('.json.tmp')
        tmp.write_text(
            json.dumps({
                'lang': lang,
                'text': text[:200],  # truncated preview only; full text in hash
                'normalized': normalized,
                'ts': time.time(),
            }, ensure_ascii=False),
            encoding='utf-8',
        )
        tmp.replace(p)
    except Exception as e:
        logger.debug(f'tts_normalize cache write failed: {e}')
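# Illustrative note (not in the original module): a cache entry written by
# _cache_put() for ("Pay Rs.200", "en") would look roughly like the JSON
# below; the filename is the first 32 hex chars of sha256("en\x00Pay Rs.200").
# The exact "normalized" value depends on the installed num2words version.
#
#   ~/.hevolve/cache/tts_normalize/<32-hex-chars>.json
#   {"lang": "en", "text": "Pay Rs.200",
#    "normalized": "Pay two hundred rupees", "ts": 1747000000.0}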

# ─── Rule pass ─────────────────────────────────────────────────────────

def _currency_word(symbol: str, lang: str) -> str:
    # Normalize "Rs." / "Rs " / "Rs" to the registry key "Rs"
    key = symbol.rstrip('. ').strip()
    entry = CURRENCY_WORDS.get(key) or CURRENCY_WORDS.get(symbol) or {}
    return entry.get(lang) or entry.get('default') or symbol


def _percent_word(lang: str) -> str:
    return PERCENT_WORDS.get(lang) or PERCENT_WORDS['default']


def _link_word(lang: str) -> str:
    return LINK_WORDS.get(lang) or LINK_WORDS['default']


def _at_dot(lang: str) -> tuple[str, str]:
    return AT_DOT_WORDS.get(lang) or AT_DOT_WORDS['default']
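# Illustrative lookups (not in the original module); behaviour follows
# directly from the tables above:
#   _currency_word('Rs.', 'hi')  -> 'रुपये'    ("Rs." is stripped to the key "Rs")
#   _currency_word('$', 'fr')    -> 'dollars' (unknown lang falls back to 'default')
#   _percent_word('ta')          -> 'சதவீதம்'
#   _at_dot('en')                -> ('at', 'dot')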

def _num_to_words(n: float, lang: str) -> Optional[str]:
    """Convert number to spoken words in target language via num2words.

    Returns None if unsupported.
    """
    if lang not in NUM2WORDS_LANGS:
        return None
    try:
        from num2words import num2words  # type: ignore
    except ImportError:
        return None
    try:
        # num2words doesn't always error on unsupported langs; try and catch
        return num2words(n, lang=lang)
    except (NotImplementedError, KeyError, ValueError):
        return None
    except Exception as e:
        logger.debug(f'num2words failed for {lang}: {e}')
        return None
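# Illustrative behaviour (not in the original module), assuming num2words is
# installed and supports the requested language:
#   _num_to_words(200, 'en')  -> 'two hundred'
#   _num_to_words(12.5, 'en') -> 'twelve point five'
#   _num_to_words(7, 'sa')    -> None   # 'sa' is not in NUM2WORDS_LANGS
# Exact wording for non-English languages varies across num2words releases.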

def _expand_currency_number(match: re.Match, lang: str) -> str:
    """Replace <symbol><amount> with <amount-words> <currency-word>.

    E.g. Rs.200 → "two hundred rupees" (en) or "दो सौ रुपये" (hi).
    """
    symbol = match.group('sym')
    amount_str = match.group('amt').replace(',', '')
    try:
        amount = float(amount_str) if '.' in amount_str else int(amount_str)
    except ValueError:
        return match.group(0)
    words = _num_to_words(amount, lang)
    if words is None:
        words = _num_to_words(amount, 'en') or str(amount)
    return f'{words} {_currency_word(symbol, lang)}'

def _expand_standalone_number(match: re.Match, lang: str) -> str:
    """Replace bare numbers with words. Falls through to original on failure."""
    raw = match.group(0).replace(',', '')
    try:
        n = float(raw) if '.' in raw else int(raw)
    except ValueError:
        return match.group(0)
    return _num_to_words(n, lang) or _num_to_words(n, 'en') or raw

def _expand_percent(match: re.Match, lang: str) -> str:
    """12.5% → 'twelve point five percent' (or target-lang equivalent)."""
    num_part = match.group('num').replace(',', '')
    try:
        n = float(num_part) if '.' in num_part else int(num_part)
    except ValueError:
        return match.group(0)
    words = _num_to_words(n, lang) or _num_to_words(n, 'en') or num_part
    return f'{words} {_percent_word(lang)}'

def _expand_url(match: re.Match, lang: str) -> str:
    return _link_word(lang)

def _expand_email(match: re.Match, lang: str) -> str:
    at, dot = _at_dot(lang)
    user = match.group('user')
    domain = match.group('domain')
    # Spell the domain dots too ("example.com" → "example dot com")
    domain_spoken = domain.replace('.', f' {dot} ')
    return f'{user} {at} {domain_spoken}'
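# Illustrative expansion (not in the original module): for "alice@example.com"
# with lang='en', the email regex in rule_normalize() captures user='alice',
# domain='example.com', and this helper returns 'alice at example dot com'.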

def _expand_time(match: re.Match, lang: str) -> str:
    """2:30 PM → 'two thirty PM' (EN only — other langs fall through)."""
    if lang != 'en':
        return match.group(0)
    h = int(match.group('h'))
    m = int(match.group('m'))
    ampm = match.group('ampm') or ''
    h_words = _num_to_words(h, 'en') or str(h)
    if m == 0:
        return f'{h_words} o clock{" " + ampm if ampm else ""}'
    m_words = _num_to_words(m, 'en') or str(m)
    return f'{h_words} {m_words}{" " + ampm if ampm else ""}'
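# Illustrative expansions (not in the original module), assuming num2words is
# installed:
#   "2:30 PM" -> 'two thirty PM'
#   "7:00 am" -> 'seven o clock am'
#   "14:05"   -> 'fourteen five'   # minutes are not spoken as "oh five"
# For non-English langs this helper returns the match unchanged.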

def rule_normalize(text: str, lang: str) -> str:
    """Apply all regex-based normalizations. Deterministic, <1 ms."""
    out = text

    # Emails FIRST (before URL matcher eats the @-host)
    out = re.sub(
        r'(?P<user>[\w.+\-]+)@(?P<domain>[\w.\-]+\.[A-Za-z]{2,})',
        lambda m: _expand_email(m, lang),
        out,
    )

    # URLs
    out = re.sub(
        r'https?://\S+',
        lambda m: _expand_url(m, lang),
        out,
    )

    # Currency: $100 / ₹200 / Rs.200 / €50.25 — optional dot/comma in amount
    out = re.sub(
        r'(?P<sym>\$|₹|€|£|¥|₩|₽|Rs\.?)\s?(?P<amt>[\d,]+(?:\.\d+)?)',
        lambda m: _expand_currency_number(m, lang),
        out,
    )

    # Percent
    out = re.sub(
        r'(?P<num>[\d,]+(?:\.\d+)?)\s?%',
        lambda m: _expand_percent(m, lang),
        out,
    )

    # Time HH:MM with optional AM/PM
    out = re.sub(
        r'\b(?P<h>\d{1,2}):(?P<m>\d{2})\s?(?P<ampm>AM|PM|am|pm)?\b',
        lambda m: _expand_time(m, lang),
        out,
    )

    # Standalone numbers LAST — catches any residual bare digits.
    # Only target ASCII digits; Indic digits (০-৯ ०-९ etc.) stay intact.
    out = re.sub(
        r'\b\d[\d,]*(?:\.\d+)?\b',
        lambda m: _expand_standalone_number(m, lang),
        out,
    )

    return out
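# Illustrative end-to-end rule pass (not in the original module), lang='en',
# num2words installed:
#   rule_normalize('Pay Rs.200 (12.5% off) at https://x.com by 2:30 PM', 'en')
#   -> 'Pay two hundred rupees (twelve point five percent off) at link by two thirty PM'
# Exact wording depends on the num2words version; residual acronyms such as
# "GPU" are left for the LLM fallback below.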

# ─── LLM fallback ──────────────────────────────────────────────────────

def _has_residual_tokens(text: str) -> bool:
    """True if rule-normalized text still contains digits or unspoken
    acronyms that TTS would stumble on.
    """
    return bool(_RESIDUAL_PATTERN.search(text))
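# Illustrative checks (not in the original module), following _RESIDUAL_PATTERN:
#   _has_residual_tokens('two hundred rupees')      -> False
#   _has_residual_tokens('meeting at 2')            -> True   # bare ASCII digit
#   _has_residual_tokens('the GPU driver crashed')  -> True   # 3+ letter acronym
#   _has_residual_tokens('आज ५ बजे')                 -> False  # Devanagari digit, ignored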

def _get_model_bus():
    """Resolve the local ModelBus singleton.

    Tries legacy module-level accessors first (used by several other
    call sites), then falls back to constructing ModelBusService lazily.
    Returns None if the model_bus module is absent or construction fails
    — caller MUST handle None gracefully.
    """
    try:
        from integrations.agent_engine import model_bus_service as _mbs
    except ImportError:
        return None
    # Prefer whatever accessor the rest of the codebase agrees on
    for attr in ('get_model_bus', 'get_model_bus_service'):
        fn = getattr(_mbs, attr, None)
        if callable(fn):
            try:
                return fn()
            except Exception as e:
                logger.debug(f'{attr}() failed: {e}')
    # Fallback: construct directly — cheap if already a singleton internally
    try:
        return _mbs.ModelBusService()
    except Exception as e:
        logger.debug(f'ModelBusService() failed: {e}')
        return None

def _llm_normalize(text: str, lang: str) -> Optional[str]:
    """Ask the local 0.8B model to expand remaining unspeakable tokens.

    Returns None on timeout, error, or backend absence.
    """
    bus = _get_model_bus()
    if bus is None:
        return None

    prompt = (
        f'Normalize the following text for text-to-speech in {lang}.\n'
        f'Expand numbers, currency, units, dates, and abbreviations into '
        f'their spoken form in {lang}. Keep all other content unchanged. '
        f'Return ONLY the normalized text, no commentary.\n\n'
        f'Text: {text}\n'
        f'Normalized:'
    )

    try:
        t0 = time.time()
        result = bus.infer(
            model_type='llm',
            prompt=prompt,
            options={'max_tokens': min(512, len(text) * 3), 'temperature': 0.0},
        )
        if (time.time() - t0) > _LLM_TIMEOUT_SEC:
            logger.debug(f'tts_normalize LLM call exceeded {_LLM_TIMEOUT_SEC}s — using rule output')
            return None
        if isinstance(result, dict) and 'response' in result:
            response = result['response'].strip()
            # Guard: LLM sometimes echoes the prompt; strip "Normalized:" prefix
            if response.lower().startswith('normalized:'):
                response = response[len('normalized:'):].strip()
            return response or None
        return None
    except Exception as e:
        logger.debug(f'tts_normalize LLM fallback failed: {e}')
        return None
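# Illustrative note (not in the original module): _llm_normalize() only
# understands dict results of the shape sketched below; anything else (a
# string, None, or a dict without 'response') makes it return None and the
# caller keeps the rule-pass output. The real ModelBusService contract may
# carry additional fields.
#
#   {'response': 'two hundred rupees by two thirty PM'}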

# ─── Public API ────────────────────────────────────────────────────────

def normalize_for_tts(
    text: str,
    lang: str = 'en',
    use_llm: bool = True,
) -> str:
    """Normalize text so TTS can pronounce everything.

    Pipeline:
        1. Cache lookup → return if hit
        2. Rule pass (num2words + regex)
        3. If residual digits / acronyms remain AND use_llm → LLM pass
        4. Cache the final result

    Args:
        text: Raw text (may contain currency, numbers, URLs, acronyms)
        lang: Target ISO 639-1 language code
        use_llm: If False, skip LLM fallback (faster, but residual tokens
            may remain). Use when caller is latency-sensitive.

    Returns:
        Normalized text safe for any TTS engine.
    """
    if not text or not text.strip():
        return text

    lang = (lang or 'en').lower().split('-')[0].split('_')[0]

    cached = _cache_get(text, lang)
    if cached is not None:
        return cached

    normalized = rule_normalize(text, lang)

    if use_llm and _has_residual_tokens(normalized):
        llm_out = _llm_normalize(normalized, lang)
        if llm_out:
            normalized = llm_out

    _cache_put(text, lang, normalized)
    return normalized
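# Illustrative usage (not part of the original module): a minimal offline
# smoke test, assuming num2words is installed. use_llm=False keeps the run
# fully local so the ModelBus backend is never touched; results are written
# to the on-disk cache like any other call.
if __name__ == '__main__':
    for sample, sample_lang in [
        ('Pay Rs.200 (12.5% off) by 2:30 PM', 'en'),
        ('Write to support@example.com or visit https://example.com', 'en'),
    ]:
        print(normalize_for_tts(sample, lang=sample_lang, use_llm=False))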