Coverage for integrations / service_tools / f5_tts_tool.py: 55.9%

34 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2F5-TTS tool — flow-matching voice cloning (English + Chinese, GPU). 

3 

4VRAM: 1.3GB model size, 2GB recommended. 

5Requires: pip install f5-tts 

6 

7SUBPROCESS ISOLATED: this module is BOTH the parent-side tool AND the 

8worker subprocess entry point. When imported normally, `f5_synthesize()` 

9dispatches to a subprocess running this same module via `python -m`. 

10CUDA OOM or any C-level crash only kills the subprocess — the parent 

11receives `{"error": ..., "transient": true}` and can fall back to Piper. 

12 

13Public API (parent side): 

14 f5_synthesize(text, language, voice, output_path) → JSON 

15 unload_f5_tts() → None 

16 

17Worker entry (child side): 

18 python -m integrations.service_tools.f5_tts_tool 

19""" 

20 

21from typing import Optional 

22 

23from integrations.service_tools.gpu_worker import ToolWorker 

24 

25# ── Worker callbacks (run in subprocess) ────────────────────────── 

26# 

27# Ported from Nunba's tts/tts_engine.py::_LazyF5 — this module is now 

28# the SINGLE source of truth for F5-TTS synthesis. Nunba's TTSEngine 

29# routes here via a subprocess adapter; no more parallel in-process 

30# implementation. 

31 

32 

33import os 

34 

35 

# Default reference voice for F5 voice cloning. Same path Nunba used
# historically, so existing users' ref audio keeps working.
# NOTE(review): the path is resolved once at import time, but existence
# is re-checked per request in _synthesize(), so the file may appear later.
_DEFAULT_REF_VOICE = os.path.join(os.path.expanduser('~'), 'Downloads', 'Lily.mp3')

39 

40 

def _load():
    """Load the F5-TTS model once at subprocess startup (~40s).

    Instantiates F5TTS_v1_Base pinned to CUDA — this worker only exists
    because the parent explicitly decided it wants GPU F5. If CUDA is
    unavailable the constructor raises, the worker dies, and the parent
    (Nunba TTSEngine / HARTOS tool caller) falls back to Piper.
    """
    # Import lazily so merely importing this module never pulls in the
    # heavyweight f5_tts package on the parent side.
    from f5_tts.api import F5TTS

    model = F5TTS(model='F5TTS_v1_Base', device='cuda')
    return model

51 

52 

53def _synthesize(model, req: dict) -> dict: 

54 """Run one synthesis request inside the worker. 

55 

56 Writes directly to output_path via F5's file_wave= arg (avoids a 

57 second soundfile.write pass). 

58 """ 

59 text = req.get('text', '') 

60 if not text or not text.strip(): 

61 return {'error': 'Text is required'} 

62 

63 output_path = req.get('output_path') 

64 if not output_path: 

65 return {'error': 'output_path is required'} 

66 

67 # Resolve reference voice: request override → default Lily.mp3 → empty 

68 # string (F5 auto-picks a voice). 

69 ref_voice = req.get('voice') 

70 if not ref_voice and os.path.isfile(_DEFAULT_REF_VOICE): 

71 ref_voice = _DEFAULT_REF_VOICE 

72 ref_voice = ref_voice or '' 

73 

74 # Speed is forwarded from the adapter so synthesize_text(..., speed=0.8) 

75 # reaches F5's infer() — preserves behavior of the old _LazyF5 class. 

76 speed = float(req.get('speed') or 1.0) 

77 

78 wav, sr, _ = model.infer( 

79 ref_file=ref_voice, 

80 ref_text='', # empty = auto-transcribe, cached by F5 

81 gen_text=text, 

82 file_wave=output_path, # writes WAV directly 

83 speed=speed, 

84 ) 

85 

86 return { 

87 'path': output_path, 

88 'duration': round(len(wav) / sr, 2), 

89 'sample_rate': sr, 

90 'engine': 'f5-tts', 

91 'device': 'cuda', 

92 'voice': ref_voice or 'default', 

93 } 

94 

95 

# ── Parent-side: one ToolWorker instance ─────────────────────────

# Single module-level worker handle. It spawns/manages the subprocess
# that re-imports this module and runs _load/_synthesize (see module
# docstring and the NOTE at the bottom of the file).
_tool = ToolWorker(
    tool_name='f5_tts',
    # Child side runs `python -m` on this exact module path.
    tool_module='integrations.service_tools.f5_tts_tool',
    vram_budget='tts_f5',  # budget key — ~1.3GB model, 2GB recommended (module docstring)
    output_subdir='f5_tts/output',
    engine='f5-tts',
    startup_timeout=90.0,  # _load takes ~40s; generous headroom for cold starts
    request_timeout=120.0,  # presumably per-request cap — confirm in ToolWorker
)

107 

108 

def f5_synthesize(
    text: str,
    language: str = 'en',
    voice: Optional[str] = None,
    output_path: Optional[str] = None,
    speed: float = 1.0,
) -> str:
    """Synthesize speech using F5-TTS (GPU subprocess).

    Args:
        text: Text to speak.
        language: Language code forwarded to the worker.
        voice: Optional reference-audio path for voice cloning.
        output_path: Optional explicit output WAV path.
        speed: Speed multiplier passed through to F5's infer() call.
            1.0 = normal, >1 = faster, <1 = slower. Preserved from the
            legacy _LazyF5 behavior.

    Returns:
        JSON string. On subprocess crash the response contains
        `{"error": ..., "transient": true}` so the caller can fall back.
    """
    # Only ship `speed` across the pipe when it deviates from the default.
    extra = {'speed': speed} if speed != 1.0 else None
    return _tool.synthesize(
        text=text,
        language=language,
        voice=voice,
        output_path=output_path,
        extra_request=extra,
    )

133 

134 

def unload_f5_tts():
    """Stop the F5 worker subprocess and free its VRAM.

    Delegates to the module-level ToolWorker. A later f5_synthesize()
    presumably respawns the worker on demand — confirm in ToolWorker.
    """
    _tool.stop()

138 

139 

class F5TTSTool:
    """Register F5-TTS as an in-process service tool."""

    @classmethod
    def register_functions(cls):
        """Build the tool descriptor and insert it into the registry.

        Imported lazily so module import stays free of registry side
        effects; always returns True.
        """
        from .registry import ServiceToolInfo, service_tool_registry

        # Single endpoint: POST /synthesize on the in-process transport.
        synthesize_endpoint = {
            "path": "/synthesize",
            "method": "POST",
            "description": "Synthesize with F5-TTS (English + Chinese, GPU).",
            "params_schema": {
                "text": {"type": "string"},
                "language": {"type": "string"},
                "voice": {"type": "string", "description": "Reference audio path"},
            },
        }
        info = ServiceToolInfo(
            name="f5_tts",
            description=(
                "F5-TTS: flow-matching voice cloning. "
                "English + Chinese, 1.3GB VRAM. "
                "Requires: pip install f5-tts"
            ),
            base_url="inprocess://f5_tts",
            endpoints={"synthesize": synthesize_endpoint},
            tags=["tts", "speech", "voice-cloning", "gpu", "f5"],
            timeout=60,
        )
        info.is_healthy = True
        # NOTE(review): writes into the registry's private _tools dict
        # directly rather than through a register() API — matches how
        # this file already did it; confirm that is the intended pattern.
        service_tool_registry._tools["f5_tts"] = info
        return True

172 

173# NOTE: no `if __name__ == '__main__':` block here. The centralized 

174# dispatcher at integrations.service_tools.gpu_worker imports this 

175# module and calls `_load` / `_synthesize` directly when spawned.