Coverage for integrations / service_tools / f5_tts_tool.py: 55.9%
34 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
"""
F5-TTS tool — flow-matching voice cloning (English + Chinese, GPU).

VRAM: 1.3GB model size, 2GB recommended.
Requires: pip install f5-tts

SUBPROCESS ISOLATED: this module is BOTH the parent-side tool AND the
worker subprocess entry point. When imported normally, `f5_synthesize()`
dispatches to a subprocess running this same module via `python -m`.
CUDA OOM or any C-level crash only kills the subprocess — the parent
receives `{"error": ..., "transient": true}` and can fall back to Piper.

Public API (parent side):
    f5_synthesize(text, language, voice, output_path) → JSON
    unload_f5_tts() → None

Worker entry (child side):
    python -m integrations.service_tools.f5_tts_tool
"""
21from typing import Optional
23from integrations.service_tools.gpu_worker import ToolWorker
# ── Worker callbacks (run in subprocess) ──────────────────────────
#
# Ported from Nunba's tts/tts_engine.py::_LazyF5 — this module is now
# the SINGLE source of truth for F5-TTS synthesis. Nunba's TTSEngine
# routes here via a subprocess adapter; no more parallel in-process
# implementation.

import os

# Default reference voice for F5 voice cloning. Same path Nunba used
# historically, so existing users' ref audio keeps working. If the file
# does not exist, _synthesize() silently skips it and lets F5 pick a
# voice, so this is a soft default — never a hard requirement.
_DEFAULT_REF_VOICE = os.path.join(os.path.expanduser('~'), 'Downloads', 'Lily.mp3')
def _load():
    """Load the F5-TTS model once at subprocess startup (~40s).

    Instantiates F5TTS_v1_Base pinned to CUDA — this worker only exists
    because the parent decided it wants GPU F5. A missing/unusable CUDA
    device makes the load raise, and the parent (Nunba TTSEngine /
    HARTOS tool caller) falls back to Piper.
    """
    # Imported lazily so merely importing this module on the parent side
    # never pulls in the heavy f5_tts package.
    from f5_tts.api import F5TTS

    model = F5TTS(model='F5TTS_v1_Base', device='cuda')
    return model
53def _synthesize(model, req: dict) -> dict:
54 """Run one synthesis request inside the worker.
56 Writes directly to output_path via F5's file_wave= arg (avoids a
57 second soundfile.write pass).
58 """
59 text = req.get('text', '')
60 if not text or not text.strip():
61 return {'error': 'Text is required'}
63 output_path = req.get('output_path')
64 if not output_path:
65 return {'error': 'output_path is required'}
67 # Resolve reference voice: request override → default Lily.mp3 → empty
68 # string (F5 auto-picks a voice).
69 ref_voice = req.get('voice')
70 if not ref_voice and os.path.isfile(_DEFAULT_REF_VOICE):
71 ref_voice = _DEFAULT_REF_VOICE
72 ref_voice = ref_voice or ''
74 # Speed is forwarded from the adapter so synthesize_text(..., speed=0.8)
75 # reaches F5's infer() — preserves behavior of the old _LazyF5 class.
76 speed = float(req.get('speed') or 1.0)
78 wav, sr, _ = model.infer(
79 ref_file=ref_voice,
80 ref_text='', # empty = auto-transcribe, cached by F5
81 gen_text=text,
82 file_wave=output_path, # writes WAV directly
83 speed=speed,
84 )
86 return {
87 'path': output_path,
88 'duration': round(len(wav) / sr, 2),
89 'sample_rate': sr,
90 'engine': 'f5-tts',
91 'device': 'cuda',
92 'voice': ref_voice or 'default',
93 }
# ── Parent-side: one ToolWorker instance ─────────────────────────

# Module-level singleton handle for the F5 worker. Creating it here does
# not necessarily spawn the subprocess — presumably ToolWorker spawns
# lazily on first request; confirm in gpu_worker.
_tool = ToolWorker(
    tool_name='f5_tts',
    # Spawned via `python -m <tool_module>` — points back at THIS module
    # so the child process runs the _load/_synthesize callbacks above.
    tool_module='integrations.service_tools.f5_tts_tool',
    vram_budget='tts_f5',            # VRAM budget key (1.3GB model, 2GB recommended)
    output_subdir='f5_tts/output',   # default location when no output_path given
    engine='f5-tts',
    startup_timeout=90.0,    # model load takes ~40s; leave headroom
    request_timeout=120.0,   # per-synthesis-request budget
)
def f5_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
    speed: float = 1.0,
) -> str:
    """Synthesize speech with F5-TTS in the GPU worker subprocess.

    Args:
        text: text to speak.
        language: language code forwarded to the worker.
        voice: optional reference-audio path for voice cloning.
        output_path: optional explicit WAV destination.
        speed: multiplier forwarded to F5's infer(); 1.0 = normal,
            >1 = faster, <1 = slower (legacy _LazyF5 behavior).

    Returns JSON. On subprocess crash the response contains
    `{"error": ..., "transient": true}` so the caller can fall back.
    """
    # Only ship a speed override when it deviates from the default.
    extra = None if speed == 1.0 else {'speed': speed}
    return _tool.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        extra_request=extra,
    )
def unload_f5_tts():
    """Stop the F5 worker subprocess and free its VRAM.

    Delegates entirely to ToolWorker.stop() — presumably safe to call
    when no worker is running; confirm in gpu_worker.
    """
    _tool.stop()
class F5TTSTool:
    """Register F5-TTS as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Insert the f5_tts entry into the service tool registry.

        Returns True on success (registration is unconditional).
        """
        from .registry import ServiceToolInfo, service_tool_registry

        # Single endpoint: POST /synthesize with text/language/voice.
        synth_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with F5-TTS (English + Chinese, GPU).",
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
                "voice": {"type": "string", "description": "Reference audio path"},
            },
        }

        info = ServiceToolInfo(
            name="f5_tts",
            description=(
                "F5-TTS: flow-matching voice cloning. "
                "English + Chinese, 1.3GB VRAM. "
                "Requires: pip install f5-tts"
            ),
            base_url="inprocess://f5_tts",
            endpoints={"synthesize": synth_endpoint},
            tags=["tts", "speech", "voice-cloning", "gpu", "f5"],
            timeout=60,
        )
        # Marked healthy at registration time (no probe is performed here).
        info.is_healthy = True
        # NOTE(review): writes the registry's private _tools dict directly
        # instead of a public register() API — confirm registry contract.
        service_tool_registry._tools["f5_tts"] = info
        return True
173# NOTE: no `if __name__ == '__main__':` block here. The centralized
174# dispatcher at integrations.service_tools.gpu_worker imports this
175# module and calls `_load` / `_synthesize` directly when spawned.