Coverage for integrations / service_tools / f5_tts_tool.py: 55.9%

34 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2F5-TTS tool — flow-matching voice cloning (English + Chinese, GPU). 

3 

4VRAM: 1.3GB model size, 2GB recommended. 

5Requires: pip install f5-tts 

6 

7SUBPROCESS ISOLATED: this module is BOTH the parent-side tool AND the 

8worker subprocess entry point. When imported normally, `f5_synthesize()` 

9dispatches to a subprocess running this same module via `python -m`. 

10CUDA OOM or any C-level crash only kills the subprocess — the parent 

11receives `{"error": ..., "transient": true}` and can fall back to Piper. 

12 

13Public API (parent side): 

14 f5_synthesize(text, language, voice, output_path) → JSON 

15 unload_f5_tts() → None 

16 

17Worker entry (child side): 

18 python -m integrations.service_tools.f5_tts_tool 

19""" 

20 

21from typing import Optional 

22 

23from integrations.service_tools.gpu_worker import ToolWorker 

24 

25# ── Worker callbacks (run in subprocess) ────────────────────────── 

26# 

27# Ported from Nunba's tts/tts_engine.py::_LazyF5 — this module is now 

28# the SINGLE source of truth for F5-TTS synthesis. Nunba's TTSEngine 

29# routes here via a subprocess adapter; no more parallel in-process 

30# implementation. 

31 

32 

33import os 

34 

35 

# Default reference voice for F5 voice cloning. Same path Nunba used
# historically, so existing users' ref audio keeps working.
# NOTE(review): the path is resolved once at import time, but existence
# is re-checked per request in _synthesize(), so the file may appear later.
_DEFAULT_REF_VOICE = os.path.join(os.path.expanduser('~'), 'Downloads', 'Lily.mp3')

39 

40 

def _load():
    """Load the F5-TTS model once at subprocess startup (~40s).

    Instantiates F5TTS_v1_Base pinned to CUDA — this worker only exists
    because the parent explicitly decided it wants GPU F5. If CUDA is
    unavailable the constructor raises, the worker dies, and the parent
    (Nunba TTSEngine / HARTOS tool caller) falls back to Piper.
    """
    # Import lazily so merely importing this module never pulls in the
    # heavyweight f5_tts package on the parent side.
    from f5_tts.api import F5TTS

    model = F5TTS(model='F5TTS_v1_Base', device='cuda')
    return model

51 

52 

53def _synthesize(model, req: dict) -> dict: 

54 """Run one synthesis request inside the worker. 

55 

56 Writes directly to output_path via F5's file_wave= arg (avoids a 

57 second soundfile.write pass). 

58 """ 

59 text = req.get('text', '') 

60 if not text or not text.strip(): 

61 return {'error': 'Text is required'} 

62 

63 output_path = req.get('output_path') 

64 if not output_path: 

65 return {'error': 'output_path is required'} 

66 

67 # Resolve reference voice: request override → default Lily.mp3 → empty 

68 # string (F5 auto-picks a voice). 

69 ref_voice = req.get('voice') 

70 if not ref_voice and os.path.isfile(_DEFAULT_REF_VOICE): 

71 ref_voice = _DEFAULT_REF_VOICE 

72 ref_voice = ref_voice or '' 

73 

74 # Speed is forwarded from the adapter so synthesize_text(..., speed=0.8) 

75 # reaches F5's infer() — preserves behavior of the old _LazyF5 class. 

76 speed = float(req.get('speed') or 1.0) 

77 

78 wav, sr, _ = model.infer( 

79 ref_file=ref_voice, 

80 ref_text='', # empty = auto-transcribe, cached by F5 

81 gen_text=text, 

82 file_wave=output_path, # writes WAV directly 

83 speed=speed, 

84 ) 

85 

86 return { 

87 'path': output_path, 

88 'duration': round(len(wav) / sr, 2), 

89 'sample_rate': sr, 

90 'engine': 'f5-tts', 

91 'device': 'cuda', 

92 'voice': ref_voice or 'default', 

93 } 

94 

95 

# ── Parent-side: one ToolWorker instance ─────────────────────────

# Single module-level worker handle. It spawns/manages the subprocess
# that re-imports this module and runs _load/_synthesize (see module
# docstring and the NOTE at the bottom of the file).
_tool = ToolWorker(
    tool_name='f5_tts',
    # Child side runs `python -m` on this exact module path.
    tool_module='integrations.service_tools.f5_tts_tool',
    vram_budget='tts_f5',  # budget key — ~1.3GB model, 2GB recommended (module docstring)
    output_subdir='f5_tts/output',
    engine='f5-tts',
    startup_timeout=90.0,  # _load takes ~40s; generous headroom for cold starts
    request_timeout=120.0,  # presumably per-request cap — confirm in ToolWorker
)

107 

108 

def f5_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
    speed: float = 1.0,
) -> str:
    """Synthesize speech using F5-TTS (GPU subprocess).

    Args:
        text: Text to speak.
        language: Language code forwarded to the worker.
        voice: Optional reference-audio path for voice cloning.
        output_path: Optional explicit output WAV path.
        speed: Speed multiplier passed through to F5's infer() call.
            1.0 = normal, >1 = faster, <1 = slower. Preserved from the
            legacy _LazyF5 behavior.

    Returns:
        JSON string. On subprocess crash the response contains
        `{"error": ..., "transient": true}` so the caller can fall back.
    """
    # Only ship `speed` across the pipe when it deviates from the default.
    extra = {'speed': speed} if speed != 1.0 else None
    return _tool.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        extra_request=extra,
    )

133 

134 

def unload_f5_tts():
    """Stop the F5 worker subprocess and free its VRAM.

    Delegates to the module-level ToolWorker. A later f5_synthesize()
    presumably respawns the worker on demand — confirm in ToolWorker.
    """
    _tool.stop()

138 

139 

class F5TTSTool:
    """Register F5-TTS as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Build the tool descriptor and insert it into the registry.

        Imported lazily so module import stays free of registry side
        effects; always returns True.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        # Single endpoint: POST /synthesize on the in-process transport.
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with F5-TTS (English + Chinese, GPU).",
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
                "voice": {"type": "string", "description": "Reference audio path"},
            },
        }
        info = ServiceToolInfo(
            name="f5_tts",
            description=(
                "F5-TTS: flow-matching voice cloning. "
                "English + Chinese, 1.3GB VRAM. "
                "Requires: pip install f5-tts"
            ),
            base_url="inprocess://f5_tts",
            endpoints={"synthesize": synthesize_endpoint},
            tags=["tts", "speech", "voice-cloning", "gpu", "f5"],
            timeout=60,
        )
        info.is_healthy = True
        # NOTE(review): writes into the registry's private _tools dict
        # directly rather than through a register() API — matches how
        # this file already did it; confirm that is the intended pattern.
        service_tool_registry._tools["f5_tts"] = info
        return True

172 

173# NOTE: no `if __name__ == '__main__':` block here. The centralized 

174# dispatcher at integrations.service_tools.gpu_worker imports this 

175# module and calls `_load` / `_synthesize` directly when spawned.