1"""
2TTS Text Normalizer — expand numbers, currency, units, URLs to spoken form.
4Why this exists:
5 Modern diffusion-token TTS (OmniVoice, F5, CosyVoice, Indic-Parler) cannot
6 pronounce tokens like "Rs.200", "12.5%", "2:30 PM", "https://x.com",
7 "Dr.", "kg". They either skip them or produce garbage. Text MUST be
8 normalized to its spoken form BEFORE hitting the synthesizer.
10Single converging path:
11 Called ONCE from tts_router.synthesize() right after language detection.
12 Every TTS engine in the registry benefits — we do not duplicate this
13 logic per-engine.
15Two-stage strategy:
16 1) Rule pass (fast, offline, deterministic) — num2words + regex for
17 currency, percent, time, URLs, emails. <1 ms for short utterances.
18 2) LLM fallback (slow, online) — only if rule pass leaves residual
19 unspeakable tokens. Calls model_bus_service.infer() with the local
20 0.8B model. 2 s timeout → falls back to rule output.
22Cache:
23 (sha256(text), lang) → normalized_text, persisted at
24 ~/.hevolve/cache/tts_normalize/ so repeated phrases never re-run
25 num2words or hit the LLM.
26"""

from __future__ import annotations

import hashlib
import json
import logging
import os
import re
import time
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# ─── Configuration ─────────────────────────────────────────────────────

# num2words built-in language support (as of 0.5.x). Missing Indic langs
# (hi, ta, bn, ur, pa, or, as, ml, mr — partially covered; sa never) will
# fall through to LLM normalization when num2words raises NotImplementedError.
NUM2WORDS_LANGS = frozenset({
    'ar', 'az', 'be', 'bg', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en',
    'eo', 'es', 'et', 'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'hy', 'id',
    'is', 'it', 'ja', 'kn', 'ko', 'kz', 'lt', 'lv', 'nl', 'no', 'pl',
    'pt', 'ro', 'ru', 'sk', 'sl', 'sr', 'sv', 'te', 'th', 'tr', 'uk',
    'vi', 'zh',
    # Added in recent forks — check at call-time since versions vary
    'hi', 'ta', 'gu', 'mr',
})
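
# Call-time probe sketch (illustrative): membership above is necessary but
# not sufficient, since num2words coverage varies by version/fork.
# _num_to_words() below relies on the same try/except:
#   from num2words import num2words
#   try:
#       num2words(1, lang='hi')
#   except NotImplementedError:
#       ...  # treat 'hi' as unsupported; the LLM fallback handles it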

# Currency symbol → language-keyed spoken form. Symbols that make it past
# the regex rule pass get replaced with the target-language word; unknown
# languages fall back to English.
CURRENCY_WORDS: dict[str, dict[str, str]] = {
    '$': {'en': 'dollars', 'hi': 'डॉलर', 'ta': 'டாலர்', 'default': 'dollars'},
    '₹': {'en': 'rupees', 'hi': 'रुपये', 'ta': 'ரூபாய்', 'default': 'rupees'},
    'Rs': {'en': 'rupees', 'hi': 'रुपये', 'ta': 'ரூபாய்', 'default': 'rupees'},
    '€': {'en': 'euros', 'hi': 'यूरो', 'ta': 'யூரோ', 'default': 'euros'},
    '£': {'en': 'pounds', 'hi': 'पाउंड', 'ta': 'பவுண்ட்', 'default': 'pounds'},
    '¥': {'en': 'yen', 'default': 'yen'},
    '₩': {'en': 'won', 'default': 'won'},
    '₽': {'en': 'rubles', 'default': 'rubles'},
}

# Percent words (the table holds percent only; there are no per-mille entries)
71PERCENT_WORDS: dict[str, str] = {
72 'en': 'percent', 'hi': 'प्रतिशत', 'ta': 'சதவீதம்',
73 'bn': 'শতাংশ', 'te': 'శాతం', 'ml': 'ശതമാനം', 'kn': 'ಶೇಕಡಾ',
74 'mr': 'टक्के', 'gu': 'ટકા', 'pa': 'ਪ੍ਰਤੀਸ਼ਤ', 'ur': 'فیصد',
75 'default': 'percent',
76}
78# "at" and "dot" words for email/URL spelling
79AT_DOT_WORDS: dict[str, tuple[str, str]] = {
80 'en': ('at', 'dot'),
81 'hi': ('ऐट', 'डॉट'),
82 'ta': ('அட்', 'டாட்'),
83 'default': ('at', 'dot'),
84}
86LINK_WORDS: dict[str, str] = {
87 'en': 'link', 'hi': 'लिंक', 'ta': 'இணைப்பு',
88 'bn': 'লিঙ্ক', 'te': 'లింక్',
89 'default': 'link',
90}

# Residual-token detector: any ASCII digit or uppercase-only 3+ char
# acronym left after the rule pass triggers LLM fallback. Indic digits
# are deliberately not matched — they stay unchanged (TTS pronounces
# Devanagari / Tamil / Bengali digits natively).
_RESIDUAL_PATTERN = re.compile(
    r'[0-9]|(?:\b[A-Z]{3,}\b)'
)

_LLM_TIMEOUT_SEC = 2.0
_CACHE_TTL_DAYS = 30


# ─── Cache ─────────────────────────────────────────────────────────────

def _cache_dir() -> Path:
    base = Path(
        os.environ.get('HEVOLVE_CACHE_DIR')
        or (Path.home() / '.hevolve' / 'cache' / 'tts_normalize')
    )
    base.mkdir(parents=True, exist_ok=True)
    return base


def _cache_key(text: str, lang: str) -> str:
    h = hashlib.sha256(f'{lang}\x00{text}'.encode('utf-8')).hexdigest()
    return h[:32]


def _cache_get(text: str, lang: str) -> Optional[str]:
    try:
        p = _cache_dir() / f'{_cache_key(text, lang)}.json'
        if not p.exists():
            return None
        age = time.time() - p.stat().st_mtime
        if age > _CACHE_TTL_DAYS * 86400:
            p.unlink(missing_ok=True)
            return None
        return json.loads(p.read_text(encoding='utf-8')).get('normalized')
    except Exception:
        return None


def _cache_put(text: str, lang: str, normalized: str) -> None:
    try:
        p = _cache_dir() / f'{_cache_key(text, lang)}.json'
        tmp = p.with_suffix('.json.tmp')
        tmp.write_text(
            json.dumps({
                'lang': lang,
                'text': text[:200],  # truncated preview only; full text in hash
                'normalized': normalized,
                'ts': time.time(),
            }, ensure_ascii=False),
            encoding='utf-8',
        )
        tmp.replace(p)
    except Exception as e:
        logger.debug(f'tts_normalize cache write failed: {e}')
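
# Example on-disk payload (illustrative values) at
# ~/.hevolve/cache/tts_normalize/<sha256-prefix>.json:
#   {"lang": "en", "text": "Pay Rs.200",
#    "normalized": "Pay two hundred rupees", "ts": 1715400000.0}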


# ─── Rule pass ─────────────────────────────────────────────────────────

def _currency_word(symbol: str, lang: str) -> str:
    # Normalize "Rs." / "Rs " / "Rs" to the registry key "Rs"
    key = symbol.rstrip('. ').strip()
    entry = CURRENCY_WORDS.get(key) or CURRENCY_WORDS.get(symbol) or {}
    return entry.get(lang) or entry.get('default') or symbol
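
# e.g. _currency_word('Rs.', 'hi') → 'रुपये'; _currency_word('₹', 'fr') → 'rupees'
#      (no 'fr' entry in CURRENCY_WORDS, so the 'default' English word is used)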


def _percent_word(lang: str) -> str:
    return PERCENT_WORDS.get(lang) or PERCENT_WORDS['default']


def _link_word(lang: str) -> str:
    return LINK_WORDS.get(lang) or LINK_WORDS['default']


def _at_dot(lang: str) -> tuple[str, str]:
    return AT_DOT_WORDS.get(lang) or AT_DOT_WORDS['default']


def _num_to_words(n: float, lang: str) -> Optional[str]:
    """Convert number to spoken words in target language via num2words.
    Returns None if unsupported.
    """
    if lang not in NUM2WORDS_LANGS:
        return None
    try:
        from num2words import num2words  # type: ignore
    except ImportError:
        return None
    try:
        # num2words doesn't always error on unsupported langs; try and catch
        return num2words(n, lang=lang)
    except (NotImplementedError, KeyError, ValueError):
        return None
    except Exception as e:
        logger.debug(f'num2words failed for {lang}: {e}')
        return None


def _expand_currency_number(match: re.Match, lang: str) -> str:
    """Replace <symbol><amount> with <amount-words> <currency-word>.
    E.g. Rs.200 → "two hundred rupees" (en) or "दो सौ रुपये" (hi).
    """
    symbol = match.group('sym')
    amount_str = match.group('amt').replace(',', '')
    try:
        amount = float(amount_str) if '.' in amount_str else int(amount_str)
    except ValueError:
        return match.group(0)
    words = _num_to_words(amount, lang)
    if words is None:
        words = _num_to_words(amount, 'en') or str(amount)
    return f'{words} {_currency_word(symbol, lang)}'


def _expand_standalone_number(match: re.Match, lang: str) -> str:
    """Replace bare numbers with words. Falls through to original on failure."""
    raw = match.group(0).replace(',', '')
    try:
        n = float(raw) if '.' in raw else int(raw)
    except ValueError:
        return match.group(0)
    return _num_to_words(n, lang) or _num_to_words(n, 'en') or raw


def _expand_percent(match: re.Match, lang: str) -> str:
    """12.5% → 'twelve point five percent' (or target-lang equivalent)."""
    num_part = match.group('num').replace(',', '')
    try:
        n = float(num_part) if '.' in num_part else int(num_part)
    except ValueError:
        return match.group(0)
    words = _num_to_words(n, lang) or _num_to_words(n, 'en') or num_part
    return f'{words} {_percent_word(lang)}'


def _expand_url(match: re.Match, lang: str) -> str:
    return _link_word(lang)


def _expand_email(match: re.Match, lang: str) -> str:
    at, dot = _at_dot(lang)
    user = match.group('user')
    domain = match.group('domain')
    # Spell the domain dots too ("example.com" → "example dot com")
    domain_spoken = domain.replace('.', f' {dot} ')
    return f'{user} {at} {domain_spoken}'
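
# e.g. "support@example.co.in" → "support at example dot co dot in" (en)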


def _expand_time(match: re.Match, lang: str) -> str:
    """2:30 PM → 'two thirty PM' (EN only — other langs fall through)."""
    if lang != 'en':
        return match.group(0)
    h = int(match.group('h'))
    m = int(match.group('m'))
    ampm = match.group('ampm') or ''
    h_words = _num_to_words(h, 'en') or str(h)
    if m == 0:
        return f'{h_words} o clock{" " + ampm if ampm else ""}'
    m_words = _num_to_words(m, 'en') or str(m)
    return f'{h_words} {m_words}{" " + ampm if ampm else ""}'
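
# e.g. "2:30 PM" → "two thirty PM"; "3:00" → "three o clock"
#      (non-'en' languages fall through unchanged)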


def rule_normalize(text: str, lang: str) -> str:
    """Apply all regex-based normalizations. Deterministic, <1 ms."""
    out = text

    # Emails FIRST (before the URL matcher eats the @-host)
    out = re.sub(
        r'(?P<user>[\w.+\-]+)@(?P<domain>[\w.\-]+\.[A-Za-z]{2,})',
        lambda m: _expand_email(m, lang),
        out,
    )

    # URLs
    out = re.sub(
        r'https?://\S+',
        lambda m: _expand_url(m, lang),
        out,
    )

    # Currency: $100 / ₹200 / Rs.200 / €50.25 — optional dot/comma in amount
    out = re.sub(
        r'(?P<sym>\$|₹|€|£|¥|₩|₽|Rs\.?)\s?(?P<amt>[\d,]+(?:\.\d+)?)',
        lambda m: _expand_currency_number(m, lang),
        out,
    )

    # Percent
    out = re.sub(
        r'(?P<num>[\d,]+(?:\.\d+)?)\s?%',
        lambda m: _expand_percent(m, lang),
        out,
    )

    # Time HH:MM with optional AM/PM
    out = re.sub(
        r'\b(?P<h>\d{1,2}):(?P<m>\d{2})\s?(?P<ampm>AM|PM|am|pm)?\b',
        lambda m: _expand_time(m, lang),
        out,
    )

    # Standalone numbers LAST — catches any residual bare digits.
    # Only target ASCII digits; Indic digits (০-৯ ०-९ etc.) stay intact.
    out = re.sub(
        r'\b\d[\d,]*(?:\.\d+)?\b',
        lambda m: _expand_standalone_number(m, lang),
        out,
    )

    return out
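
# Worked example (en; number words are whatever num2words emits):
#   rule_normalize("Offer: 12.5% off, was $40, mail hi@shop.com", "en")
#   → "Offer: twelve point five percent off, was forty dollars,
#      mail hi at shop dot com"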


# ─── LLM fallback ──────────────────────────────────────────────────────

def _has_residual_tokens(text: str) -> bool:
    """True if rule-normalized text still contains digits or unspoken
    acronyms that TTS would stumble on.
    """
    return bool(_RESIDUAL_PATTERN.search(text))
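
# e.g. _has_residual_tokens('call NASA at 5') → True (digit + 3+ char acronym)
#      _has_residual_tokens('call nasa at five') → False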


def _get_model_bus():
    """Resolve the local ModelBus singleton.

    Tries legacy module-level accessors first (used by several other
    call sites), then falls back to constructing ModelBusService lazily.
    Returns None if the model_bus module is absent or construction fails
    — caller MUST handle None gracefully.
    """
    try:
        from integrations.agent_engine import model_bus_service as _mbs
    except ImportError:
        return None
    # Prefer whatever accessor the rest of the codebase agrees on
    for attr in ('get_model_bus', 'get_model_bus_service'):
        fn = getattr(_mbs, attr, None)
        if callable(fn):
            try:
                return fn()
            except Exception as e:
                logger.debug(f'{attr}() failed: {e}')
    # Fallback: construct directly — cheap if already a singleton internally
    try:
        return _mbs.ModelBusService()
    except Exception as e:
        logger.debug(f'ModelBusService() failed: {e}')
        return None


def _llm_normalize(text: str, lang: str) -> Optional[str]:
    """Ask the local 0.8B model to expand remaining unspeakable tokens.
    Returns None on timeout, error, or backend absence.
    """
    bus = _get_model_bus()
    if bus is None:
        return None

    prompt = (
        f'Normalize the following text for text-to-speech in {lang}.\n'
        f'Expand numbers, currency, units, dates, and abbreviations into '
        f'their spoken form in {lang}. Keep all other content unchanged. '
        f'Return ONLY the normalized text, no commentary.\n\n'
        f'Text: {text}\n'
        f'Normalized:'
    )

    try:
        t0 = time.time()
        result = bus.infer(
            model_type='llm',
            prompt=prompt,
            options={'max_tokens': min(512, len(text) * 3), 'temperature': 0.0},
        )
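        # NOTE: the elapsed-time check below is a post-hoc guard, not a hard
        # timeout; bus.infer() itself may block past the budget, and a late
        # result is simply discarded in favour of the rule output.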
        if (time.time() - t0) > _LLM_TIMEOUT_SEC:
            logger.debug(f'tts_normalize LLM call exceeded {_LLM_TIMEOUT_SEC}s — using rule output')
            return None
        if isinstance(result, dict) and 'response' in result:
            response = result['response'].strip()
            # Guard: LLM sometimes echoes the prompt; strip "Normalized:" prefix
            if response.lower().startswith('normalized:'):
                response = response[len('normalized:'):].strip()
            return response or None
        return None
    except Exception as e:
        logger.debug(f'tts_normalize LLM fallback failed: {e}')
        return None


# ─── Public API ────────────────────────────────────────────────────────

def normalize_for_tts(
    text: str,
    lang: str = 'en',
    use_llm: bool = True,
) -> str:
    """Normalize text so TTS can pronounce everything.

    Pipeline:
        1. Cache lookup → return if hit
        2. Rule pass (num2words + regex)
        3. If residual digits / acronyms remain AND use_llm → LLM pass
        4. Cache the final result

    Args:
        text: Raw text (may contain currency, numbers, URLs, acronyms)
        lang: Target ISO 639-1 language code
        use_llm: If False, skip LLM fallback (faster, but residual tokens
            may remain). Use when caller is latency-sensitive.

    Returns:
        Normalized text safe for any TTS engine.
    """
    if not text or not text.strip():
        return text

    lang = (lang or 'en').lower().split('-')[0].split('_')[0]

    cached = _cache_get(text, lang)
    if cached is not None:
        return cached

    normalized = rule_normalize(text, lang)

    if use_llm and _has_residual_tokens(normalized):
        llm_out = _llm_normalize(normalized, lang)
        if llm_out:
            normalized = llm_out

    _cache_put(text, lang, normalized)
    return normalized
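

if __name__ == '__main__':
    # Minimal smoke demo (illustrative; exact wording depends on the installed
    # num2words version, and the LLM fallback is skipped to stay offline):
    for sample in ('Pay Rs.200 by 2:30 PM', '12.5% off at https://x.com'):
        print(sample, '→', normalize_for_tts(sample, lang='en', use_llm=False))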