Coverage for integrations / channels / media / tts_text_normalizer.py: 75.8%

157 statements  

coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2TTS Text Normalizer — expand numbers, currency, units, URLs to spoken form. 

3 

4Why this exists: 

5 Modern diffusion-token TTS (OmniVoice, F5, CosyVoice, Indic-Parler) cannot 

6 pronounce tokens like "Rs.200", "12.5%", "2:30 PM", "https://x.com", 

7 "Dr.", "kg". They either skip them or produce garbage. Text MUST be 

8 normalized to its spoken form BEFORE hitting the synthesizer. 

9 

10Single converging path: 

11 Called ONCE from tts_router.synthesize() right after language detection. 

12 Every TTS engine in the registry benefits — we do not duplicate this 

13 logic per-engine. 

14 

15Two-stage strategy: 

16 1) Rule pass (fast, offline, deterministic) — num2words + regex for 

17 currency, percent, time, URLs, emails. <1 ms for short utterances. 

18 2) LLM fallback (slow, online) — only if rule pass leaves residual 

19 unspeakable tokens. Calls model_bus_service.infer() with the local 

20 0.8B model. 2 s timeout → falls back to rule output. 

21 

22Cache: 

23 (sha256(text), lang) → normalized_text, persisted at 

24 ~/.hevolve/cache/tts_normalize/ so repeated phrases never re-run 

25 num2words or hit the LLM. 

26""" 

27 

from __future__ import annotations

import hashlib
import json
import logging
import os
import re
import time
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# ─── Configuration ─────────────────────────────────────────────────────

# num2words built-in language support (as of 0.5.x). Missing Indic langs
# (hi, ta, bn, ur, pa, or, as, ml, mr — partially covered; sa never) will
# fall through to LLM normalization when num2words raises NotImplementedError.
NUM2WORDS_LANGS = frozenset({
    'ar', 'az', 'be', 'bg', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en',
    'eo', 'es', 'et', 'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'hy', 'id',
    'is', 'it', 'ja', 'kn', 'ko', 'kz', 'lt', 'lv', 'nl', 'no', 'pl',
    'pt', 'ro', 'ru', 'sk', 'sl', 'sr', 'sv', 'te', 'th', 'tr', 'uk',
    'vi', 'zh',
    # Added in recent forks — check at call-time since versions vary
    'hi', 'ta', 'gu', 'mr',
})

# Currency symbol → language-keyed spoken form. Symbols that make it past
# the regex rule pass get replaced with the target-language word; unknown
# languages fall back to English.
CURRENCY_WORDS: dict[str, dict[str, str]] = {
    '$': {'en': 'dollars', 'hi': 'डॉलर', 'ta': 'டாலர்', 'default': 'dollars'},
    '₹': {'en': 'rupees', 'hi': 'रुपये', 'ta': 'ரூபாய்', 'default': 'rupees'},
    'Rs': {'en': 'rupees', 'hi': 'रुपये', 'ta': 'ரூபாய்', 'default': 'rupees'},
    '€': {'en': 'euros', 'hi': 'यूरो', 'ta': 'யூரோ', 'default': 'euros'},
    '£': {'en': 'pounds', 'hi': 'पाउंड', 'ta': 'பவுண்ட்', 'default': 'pounds'},
    '¥': {'en': 'yen', 'default': 'yen'},
    '₩': {'en': 'won', 'default': 'won'},
    '₽': {'en': 'rubles', 'default': 'rubles'},
}

# Percent (per-mille '‰' is not handled; the rule pass only matches '%')
PERCENT_WORDS: dict[str, str] = {
    'en': 'percent', 'hi': 'प्रतिशत', 'ta': 'சதவீதம்',
    'bn': 'শতাংশ', 'te': 'శాతం', 'ml': 'ശതമാനം', 'kn': 'ಶೇಕಡಾ',
    'mr': 'टक्के', 'gu': 'ટકા', 'pa': 'ਪ੍ਰਤੀਸ਼ਤ', 'ur': 'فیصد',
    'default': 'percent',
}

78# "at" and "dot" words for email/URL spelling 

79AT_DOT_WORDS: dict[str, tuple[str, str]] = { 

80 'en': ('at', 'dot'), 

81 'hi': ('ऐट', 'डॉट'), 

82 'ta': ('அட்', 'டாட்'), 

83 'default': ('at', 'dot'), 

84} 

85 

LINK_WORDS: dict[str, str] = {
    'en': 'link', 'hi': 'लिंक', 'ta': 'இணைப்பு',
    'bn': 'লিঙ্ক', 'te': 'లింక్',
    'default': 'link',
}

# Residual-token detector: any Latin digit, or any all-caps acronym of
# 3+ characters, left after the rule pass triggers the LLM fallback.
# Indic-script digits are deliberately not matched; they stay unchanged
# because TTS pronounces Devanagari / Tamil / Bengali digits natively.
_RESIDUAL_PATTERN = re.compile(
    r'[0-9]|(?:\b[A-Z]{3,}\b)'
)

_LLM_TIMEOUT_SEC = 2.0
_CACHE_TTL_DAYS = 30

# ─── Cache ─────────────────────────────────────────────────────────────

def _cache_dir() -> Path:
    base = Path(
        os.environ.get('HEVOLVE_CACHE_DIR')
        or (Path.home() / '.hevolve' / 'cache' / 'tts_normalize')
    )
    base.mkdir(parents=True, exist_ok=True)
    return base

def _cache_key(text: str, lang: str) -> str:
    h = hashlib.sha256(f'{lang}\x00{text}'.encode('utf-8')).hexdigest()
    return h[:32]

def _cache_get(text: str, lang: str) -> Optional[str]:
    try:
        p = _cache_dir() / f'{_cache_key(text, lang)}.json'
        if not p.exists():
            return None
        age = time.time() - p.stat().st_mtime
        if age > _CACHE_TTL_DAYS * 86400:
            p.unlink(missing_ok=True)
            return None
        return json.loads(p.read_text(encoding='utf-8')).get('normalized')
    except Exception:
        return None

def _cache_put(text: str, lang: str, normalized: str) -> None:
    try:
        p = _cache_dir() / f'{_cache_key(text, lang)}.json'
        tmp = p.with_suffix('.json.tmp')
        tmp.write_text(
            json.dumps({
                'lang': lang,
                'text': text[:200],  # truncated preview only; full text in hash
                'normalized': normalized,
                'ts': time.time(),
            }, ensure_ascii=False),
            encoding='utf-8',
        )
        tmp.replace(p)
    except Exception as e:
        logger.debug(f'tts_normalize cache write failed: {e}')
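# Illustrative note (not in the original module): a cache entry written by
# _cache_put() for ("Pay Rs.200", "en") would look roughly like the JSON
# below; the filename is the first 32 hex chars of sha256("en\x00Pay Rs.200").
# The exact "normalized" value depends on the installed num2words version.
#
#   ~/.hevolve/cache/tts_normalize/<32-hex-chars>.json
#   {"lang": "en", "text": "Pay Rs.200",
#    "normalized": "Pay two hundred rupees", "ts": 1747000000.0}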

# ─── Rule pass ─────────────────────────────────────────────────────────

def _currency_word(symbol: str, lang: str) -> str:
    # Normalize "Rs." / "Rs " / "Rs" to the registry key "Rs"
    key = symbol.rstrip('. ').strip()
    entry = CURRENCY_WORDS.get(key) or CURRENCY_WORDS.get(symbol) or {}
    return entry.get(lang) or entry.get('default') or symbol


def _percent_word(lang: str) -> str:
    return PERCENT_WORDS.get(lang) or PERCENT_WORDS['default']


def _link_word(lang: str) -> str:
    return LINK_WORDS.get(lang) or LINK_WORDS['default']


def _at_dot(lang: str) -> tuple[str, str]:
    return AT_DOT_WORDS.get(lang) or AT_DOT_WORDS['default']
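# Illustrative lookups (not in the original module); behaviour follows
# directly from the tables above:
#   _currency_word('Rs.', 'hi')  -> 'रुपये'    ("Rs." is stripped to the key "Rs")
#   _currency_word('$', 'fr')    -> 'dollars' (unknown lang falls back to 'default')
#   _percent_word('ta')          -> 'சதவீதம்'
#   _at_dot('en')                -> ('at', 'dot')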

def _num_to_words(n: float, lang: str) -> Optional[str]:
    """Convert number to spoken words in target language via num2words.

    Returns None if unsupported.
    """
    if lang not in NUM2WORDS_LANGS:
        return None
    try:
        from num2words import num2words  # type: ignore
    except ImportError:
        return None
    try:
        # num2words doesn't always error on unsupported langs; try and catch
        return num2words(n, lang=lang)
    except (NotImplementedError, KeyError, ValueError):
        return None
    except Exception as e:
        logger.debug(f'num2words failed for {lang}: {e}')
        return None
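# Illustrative behaviour (not in the original module), assuming num2words is
# installed and supports the requested language:
#   _num_to_words(200, 'en')  -> 'two hundred'
#   _num_to_words(12.5, 'en') -> 'twelve point five'
#   _num_to_words(7, 'sa')    -> None   # 'sa' is not in NUM2WORDS_LANGS
# Exact wording for non-English languages varies across num2words releases.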

def _expand_currency_number(match: re.Match, lang: str) -> str:
    """Replace <symbol><amount> with <amount-words> <currency-word>.

    E.g. Rs.200 → "two hundred rupees" (en) or "दो सौ रुपये" (hi).
    """
    symbol = match.group('sym')
    amount_str = match.group('amt').replace(',', '')
    try:
        amount = float(amount_str) if '.' in amount_str else int(amount_str)
    except ValueError:
        return match.group(0)
    words = _num_to_words(amount, lang)
    if words is None:
        words = _num_to_words(amount, 'en') or str(amount)
    return f'{words} {_currency_word(symbol, lang)}'

def _expand_standalone_number(match: re.Match, lang: str) -> str:
    """Replace bare numbers with words. Falls through to original on failure."""
    raw = match.group(0).replace(',', '')
    try:
        n = float(raw) if '.' in raw else int(raw)
    except ValueError:
        return match.group(0)
    return _num_to_words(n, lang) or _num_to_words(n, 'en') or raw

def _expand_percent(match: re.Match, lang: str) -> str:
    """12.5% → 'twelve point five percent' (or target-lang equivalent)."""
    num_part = match.group('num').replace(',', '')
    try:
        n = float(num_part) if '.' in num_part else int(num_part)
    except ValueError:
        return match.group(0)
    words = _num_to_words(n, lang) or _num_to_words(n, 'en') or num_part
    return f'{words} {_percent_word(lang)}'

def _expand_url(match: re.Match, lang: str) -> str:
    return _link_word(lang)

def _expand_email(match: re.Match, lang: str) -> str:
    at, dot = _at_dot(lang)
    user = match.group('user')
    domain = match.group('domain')
    # Spell the domain dots too ("example.com" → "example dot com")
    domain_spoken = domain.replace('.', f' {dot} ')
    return f'{user} {at} {domain_spoken}'
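# Illustrative expansion (not in the original module): for "alice@example.com"
# with lang='en', the email regex in rule_normalize() captures user='alice',
# domain='example.com', and this helper returns 'alice at example dot com'.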

def _expand_time(match: re.Match, lang: str) -> str:
    """2:30 PM → 'two thirty PM' (EN only — other langs fall through)."""
    if lang != 'en':
        return match.group(0)
    h = int(match.group('h'))
    m = int(match.group('m'))
    ampm = match.group('ampm') or ''
    h_words = _num_to_words(h, 'en') or str(h)
    if m == 0:
        return f'{h_words} o clock{" " + ampm if ampm else ""}'
    m_words = _num_to_words(m, 'en') or str(m)
    return f'{h_words} {m_words}{" " + ampm if ampm else ""}'
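# Illustrative expansions (not in the original module), assuming num2words is
# installed:
#   "2:30 PM" -> 'two thirty PM'
#   "7:00 am" -> 'seven o clock am'
#   "14:05"   -> 'fourteen five'   # minutes are not spoken as "oh five"
# For non-English langs this helper returns the match unchanged.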

def rule_normalize(text: str, lang: str) -> str:
    """Apply all regex-based normalizations. Deterministic, <1 ms."""
    out = text

    # Emails FIRST (before URL matcher eats the @-host)
    out = re.sub(
        r'(?P<user>[\w.+\-]+)@(?P<domain>[\w.\-]+\.[A-Za-z]{2,})',
        lambda m: _expand_email(m, lang),
        out,
    )

    # URLs
    out = re.sub(
        r'https?://\S+',
        lambda m: _expand_url(m, lang),
        out,
    )

    # Currency: $100 / ₹200 / Rs.200 / €50.25 — optional dot/comma in amount
    out = re.sub(
        r'(?P<sym>\$|₹|€|£|¥|₩|₽|Rs\.?)\s?(?P<amt>[\d,]+(?:\.\d+)?)',
        lambda m: _expand_currency_number(m, lang),
        out,
    )

    # Percent
    out = re.sub(
        r'(?P<num>[\d,]+(?:\.\d+)?)\s?%',
        lambda m: _expand_percent(m, lang),
        out,
    )

    # Time HH:MM with optional AM/PM
    out = re.sub(
        r'\b(?P<h>\d{1,2}):(?P<m>\d{2})\s?(?P<ampm>AM|PM|am|pm)?\b',
        lambda m: _expand_time(m, lang),
        out,
    )

    # Standalone numbers LAST — catches any residual bare digits.
    # Only target ASCII digits; Indic digits (০-৯ ०-९ etc.) stay intact.
    out = re.sub(
        r'\b\d[\d,]*(?:\.\d+)?\b',
        lambda m: _expand_standalone_number(m, lang),
        out,
    )

    return out
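# Illustrative end-to-end rule pass (not in the original module), lang='en',
# num2words installed:
#   rule_normalize('Pay Rs.200 (12.5% off) at https://x.com by 2:30 PM', 'en')
#   -> 'Pay two hundred rupees (twelve point five percent off) at link by two thirty PM'
# Exact wording depends on the num2words version; residual acronyms such as
# "GPU" are left for the LLM fallback below.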

# ─── LLM fallback ──────────────────────────────────────────────────────

def _has_residual_tokens(text: str) -> bool:
    """True if rule-normalized text still contains digits or unspoken
    acronyms that TTS would stumble on.
    """
    return bool(_RESIDUAL_PATTERN.search(text))
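# Illustrative checks (not in the original module), following _RESIDUAL_PATTERN:
#   _has_residual_tokens('two hundred rupees')      -> False
#   _has_residual_tokens('meeting at 2')            -> True   # bare ASCII digit
#   _has_residual_tokens('the GPU driver crashed')  -> True   # 3+ letter acronym
#   _has_residual_tokens('आज ५ बजे')                 -> False  # Devanagari digit, ignored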

def _get_model_bus():
    """Resolve the local ModelBus singleton.

    Tries legacy module-level accessors first (used by several other
    call sites), then falls back to constructing ModelBusService lazily.
    Returns None if the model_bus module is absent or construction fails
    — caller MUST handle None gracefully.
    """
    try:
        from integrations.agent_engine import model_bus_service as _mbs
    except ImportError:
        return None
    # Prefer whatever accessor the rest of the codebase agrees on
    for attr in ('get_model_bus', 'get_model_bus_service'):
        fn = getattr(_mbs, attr, None)
        if callable(fn):
            try:
                return fn()
            except Exception as e:
                logger.debug(f'{attr}() failed: {e}')
    # Fallback: construct directly — cheap if already a singleton internally
    try:
        return _mbs.ModelBusService()
    except Exception as e:
        logger.debug(f'ModelBusService() failed: {e}')
        return None

def _llm_normalize(text: str, lang: str) -> Optional[str]:
    """Ask the local 0.8B model to expand remaining unspeakable tokens.

    Returns None on timeout, error, or backend absence.
    """
    bus = _get_model_bus()
    if bus is None:
        return None

    prompt = (
        f'Normalize the following text for text-to-speech in {lang}.\n'
        f'Expand numbers, currency, units, dates, and abbreviations into '
        f'their spoken form in {lang}. Keep all other content unchanged. '
        f'Return ONLY the normalized text, no commentary.\n\n'
        f'Text: {text}\n'
        f'Normalized:'
    )

    try:
        t0 = time.time()
        result = bus.infer(
            model_type='llm',
            prompt=prompt,
            options={'max_tokens': min(512, len(text) * 3), 'temperature': 0.0},
        )
        if (time.time() - t0) > _LLM_TIMEOUT_SEC:
            logger.debug(f'tts_normalize LLM call exceeded {_LLM_TIMEOUT_SEC}s — using rule output')
            return None
        if isinstance(result, dict) and 'response' in result:
            response = result['response'].strip()
            # Guard: LLM sometimes echoes the prompt; strip "Normalized:" prefix
            if response.lower().startswith('normalized:'):
                response = response[len('normalized:'):].strip()
            return response or None
        return None
    except Exception as e:
        logger.debug(f'tts_normalize LLM fallback failed: {e}')
        return None
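# Illustrative note (not in the original module): _llm_normalize() only
# understands dict results of the shape sketched below; anything else (a
# string, None, or a dict without 'response') makes it return None and the
# caller keeps the rule-pass output. The real ModelBusService contract may
# carry additional fields.
#
#   {'response': 'two hundred rupees by two thirty PM'}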

# ─── Public API ────────────────────────────────────────────────────────

def normalize_for_tts(
    text: str,
    lang: str = 'en',
    use_llm: bool = True,
) -> str:
    """Normalize text so TTS can pronounce everything.

    Pipeline:
        1. Cache lookup → return if hit
        2. Rule pass (num2words + regex)
        3. If residual digits / acronyms remain AND use_llm → LLM pass
        4. Cache the final result

    Args:
        text: Raw text (may contain currency, numbers, URLs, acronyms)
        lang: Target ISO 639-1 language code
        use_llm: If False, skip LLM fallback (faster, but residual tokens
            may remain). Use when caller is latency-sensitive.

    Returns:
        Normalized text safe for any TTS engine.
    """
    if not text or not text.strip():
        return text

    lang = (lang or 'en').lower().split('-')[0].split('_')[0]

    cached = _cache_get(text, lang)
    if cached is not None:
        return cached

    normalized = rule_normalize(text, lang)

    if use_llm and _has_residual_tokens(normalized):
        llm_out = _llm_normalize(normalized, lang)
        if llm_out:
            normalized = llm_out

    _cache_put(text, lang, normalized)
    return normalized
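# Illustrative usage (not part of the original module): a minimal offline
# smoke test, assuming num2words is installed. use_llm=False keeps the run
# fully local so the ModelBus backend is never touched; results are written
# to the on-disk cache like any other call.
if __name__ == '__main__':
    for sample, sample_lang in [
        ('Pay Rs.200 (12.5% off) by 2:30 PM', 'en'),
        ('Write to support@example.com or visit https://example.com', 'en'),
    ]:
        print(normalize_for_tts(sample, lang=sample_lang, use_llm=False))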