Coverage for integrations/service_tools/llamacpp_manager.py: 0.0%

368 statements  

coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2llama.cpp Server Manager -- lifecycle management for local LLM inference. 

3 

4Manages llama-server (or llama-cpp-python) processes: 

5 - Auto-downloads llama.cpp release binaries if not found 

6 - Starts server with optimal settings for detected hardware 

7 - Health monitoring and auto-restart 

8 - Model hot-swap (stop -> load new GGUF -> start) 

9 - Graceful shutdown 

10 

11Standalone mode: HARTOS manages its own llama.cpp (not waiting for Nunba). 

12Bundled mode: Defers to Nunba's llama.cpp server. 

13 

14Usage: 

15 from integrations.service_tools.llamacpp_manager import get_llamacpp_manager 

16 

17 mgr = get_llamacpp_manager() 

18 mgr.start('/path/to/model.gguf') 

19 print(mgr.health()) 

20 mgr.swap_model('/path/to/other.gguf') 

21 mgr.stop() 

22""" 

23 

24import logging 

25import os 

26import platform 

27import shutil 

28import stat 

29import subprocess 

30import sys 

31import threading 

32import time 

33import zipfile 

34from pathlib import Path 

35from typing import Any, Dict, Optional 

36 

37logger = logging.getLogger(__name__) 

38 

39# Default directories 

40_HEVOLVE_HOME = Path.home() / '.hevolve' 

41_BIN_DIR = _HEVOLVE_HOME / 'bin' 

42_MODELS_DIR = _HEVOLVE_HOME / 'models' 

43 

44# Health check timing 

45_HEALTH_START_TIMEOUT = 30 # Max seconds to wait for server on start 

46_HEALTH_POLL_INTERVAL = 0.5 # Initial poll interval (seconds) 

47_HEALTH_POLL_MAX_INTERVAL = 2.0 # Max poll interval (exponential backoff cap) 

48_HEALTH_CHECK_TIMEOUT = 3 # HTTP timeout for a single health check (seconds) 

49 

50# Process shutdown 

51_STOP_GRACE_PERIOD = 5 # Seconds to wait after terminate() before kill() 

52 

53# GitHub release 

54_GITHUB_RELEASE_API = 'https://api.github.com/repos/ggml-org/llama.cpp/releases/latest' 

55 

56# Platform binary name patterns for GitHub release assets 

57_PLATFORM_ASSET_PATTERNS = { 

58 ('Windows', 'AMD64'): 'win-amd64', 

59 ('Windows', 'x86_64'): 'win-amd64', 

60 ('Linux', 'x86_64'): 'ubuntu-x64', 

61 ('Linux', 'aarch64'): 'ubuntu-arm64', 

62 ('Darwin', 'x86_64'): 'macos-x64', 

63 ('Darwin', 'arm64'): 'macos-arm64', 

64} 

65 

66 

67def _get_platform_key() -> str: 

68 """Return the platform asset key for the current system.""" 

69 system = platform.system() 

70 machine = platform.machine() 

71 return _PLATFORM_ASSET_PATTERNS.get((system, machine), '') 
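
# Example: on 64-bit Windows, platform.system() == 'Windows' and
# platform.machine() == 'AMD64', so _get_platform_key() returns 'win-amd64',
# which later matches release assets named like 'llama-<tag>-bin-win-amd64.zip'.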



def _server_binary_name() -> str:
    """Return the expected server binary filename for this OS."""
    if sys.platform == 'win32':
        return 'llama-server.exe'
    return 'llama-server'


def _http_get(url: str, timeout: int = _HEALTH_CHECK_TIMEOUT) -> Any:
    """Perform an HTTP GET, preferring pooled session, falling back to urllib.

    Returns the parsed JSON body on success, or None on failure.
    """
    # Try pooled session first (avoids new TCP connection)
    try:
        from core.http_pool import pooled_get
        resp = pooled_get(url, timeout=(timeout, timeout))
        resp.raise_for_status()
        return resp.json()
    except Exception:
        pass

    # Fallback: stdlib urllib (zero dependencies)
    try:
        import json
        import urllib.request
        req = urllib.request.Request(url, headers={'Accept': 'application/json'})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode('utf-8'))
    except Exception:
        return None


def _http_get_raw(url: str, timeout: int = 30) -> Optional[bytes]:
    """Download raw bytes from a URL. Returns bytes or None."""
    try:
        import urllib.request
        req = urllib.request.Request(url, headers={
            'Accept': 'application/octet-stream',
            'User-Agent': 'HARTOS-LlamaCppManager/1.0',
        })
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read()
    except Exception as exc:
        logger.error(f"Download failed for {url}: {exc}")
        return None


def _http_get_json(url: str, timeout: int = 15) -> Optional[Dict]:
    """Fetch JSON from a URL using urllib (for GitHub API). Returns dict or None."""
    try:
        import json
        import urllib.request
        req = urllib.request.Request(url, headers={
            'Accept': 'application/vnd.github+json',
            'User-Agent': 'HARTOS-LlamaCppManager/1.0',
        })
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode('utf-8'))
    except Exception as exc:
        logger.error(f"GitHub API request failed: {exc}")
        return None


class LlamaCppManager:
    """Manages a llama-server process for local LLM inference.

    Thread-safe: all mutating operations (start/stop/swap) are guarded by a Lock.
    """

    def __init__(self):
        self._process: Optional[subprocess.Popen] = None
        self._current_model: Optional[str] = None
        self._port: int = 8080
        self._lock = threading.Lock()
        self._server_binary: Optional[Path] = None

    # ── Public API ───────────────────────────────────────────────

    def start(self, model_path: str, port: int = 8080, **kwargs) -> bool:
        """Start llama-server with the given GGUF model.

        Auto-detects hardware and selects optimal parameters (GPU layers,
        context size, thread count, flash attention).

        Args:
            model_path: Absolute path to a .gguf model file.
            port: Port to listen on (default 8080, from port_registry 'llm').
            **kwargs: Additional overrides for server params (n_gpu_layers,
                ctx_size, threads, flash_attn, etc.).

        Returns:
            True if server started and health check passed, False otherwise.
        """
        with self._lock:
            return self._start_locked(model_path, port, **kwargs)

    def stop(self) -> bool:
        """Gracefully stop the managed llama-server process.

        Sends terminate signal, waits up to 5 seconds, then force-kills
        if the process has not exited.

        Returns:
            True if the process was stopped (or was not running), False on error.
        """
        with self._lock:
            return self._stop_locked()

    def is_running(self) -> bool:
        """Check if the managed server process is alive AND responding to health checks.

        Returns:
            True if the process is running and /health returns successfully.
        """
        if self._process is None:
            return False
        if self._process.poll() is not None:
            # Process has exited
            logger.warning(
                f"llama-server process exited with code {self._process.returncode}")
            self._process = None
            return False
        # Process alive -- verify health endpoint
        return self._check_health()

    def health(self) -> Dict:
        """Query the llama-server /health endpoint.

        Returns:
            Parsed JSON from /health on success, or an error dict.
        """
        if self._process is None:
            return {'status': 'not_running', 'error': 'No managed server process'}

        url = f'http://127.0.0.1:{self._port}/health'
        result = _http_get(url, timeout=_HEALTH_CHECK_TIMEOUT)
        if result is not None:
            return result
        return {
            'status': 'error',
            'error': 'Health endpoint did not respond',
            'port': self._port,
            'model': self._current_model,
        }
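
    # Note: in recent llama.cpp builds the /health endpoint typically answers
    # with a small JSON object such as {"status": "ok"} once the model has
    # loaded; the exact fields vary by server version, so callers should treat
    # the payload as informational rather than a stable schema.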


    def swap_model(self, new_model_path: str, **kwargs) -> bool:
        """Hot-swap: stop the current model and start with a new one.

        Args:
            new_model_path: Path to the new .gguf model file.
            **kwargs: Additional server param overrides.

        Returns:
            True if the new model started successfully.
        """
        with self._lock:
            port = self._port
            logger.info(
                f"Swapping model: {self._current_model} -> {new_model_path}")
            self._stop_locked()
            return self._start_locked(new_model_path, port, **kwargs)

    def get_server_binary(self) -> Optional[Path]:
        """Locate the llama-server binary on this system.

        Search order:
        1. Cached result from a previous call
        2. System PATH (llama-server, llama-cpp-server)
        3. ~/.hevolve/bin/llama-server[.exe]

        Returns:
            Path to the binary, or None if not found.
        """
        if self._server_binary and self._server_binary.exists():
            return self._server_binary

        binary_name = _server_binary_name()

        # 1. Check PATH
        for name in ('llama-server', 'llama-cpp-server'):
            if sys.platform == 'win32':
                name += '.exe'
            found = shutil.which(name)
            if found:
                self._server_binary = Path(found)
                logger.info(f"Found llama-server on PATH: {self._server_binary}")
                return self._server_binary

        # 2. Check ~/.hevolve/bin/
        local_bin = _BIN_DIR / binary_name
        if local_bin.exists():
            self._server_binary = local_bin
            logger.info(f"Found llama-server at: {self._server_binary}")
            return self._server_binary

        logger.info("llama-server binary not found on this system")
        return None

    def download_server(self) -> Optional[Path]:
        """Download the latest llama.cpp release binary from GitHub.

        Detects the current platform, downloads the appropriate archive,
        extracts llama-server to ~/.hevolve/bin/, and makes it executable.

        Returns:
            Path to the downloaded binary, or None on failure.
        """
        platform_key = _get_platform_key()
        if not platform_key:
            logger.error(
                f"Unsupported platform: {platform.system()} {platform.machine()}")
            return None

        # Fetch latest release metadata
        logger.info("Fetching latest llama.cpp release from GitHub...")
        release = _http_get_json(_GITHUB_RELEASE_API)
        if not release:
            logger.error("Failed to fetch release info from GitHub")
            return None

        tag = release.get('tag_name', 'unknown')
        assets = release.get('assets', [])
        logger.info(f"Latest release: {tag} ({len(assets)} assets)")

        # Find matching asset
        target_asset = None
        for asset in assets:
            name = asset.get('name', '')
            # Match pattern: llama-{tag}-bin-{platform_key}.zip
            if platform_key in name and name.endswith('.zip'):
                target_asset = asset
                break

        if not target_asset:
            # Broader search: any zip containing the platform key
            for asset in assets:
                name = asset.get('name', '')
                if platform_key in name and ('.zip' in name or '.tar.gz' in name):
                    target_asset = asset
                    break

        if not target_asset:
            logger.error(
                f"No matching asset found for platform '{platform_key}' "
                f"in release {tag}. Available: "
                f"{[a['name'] for a in assets[:10]]}")
            return None

        download_url = target_asset.get('browser_download_url', '')
        asset_name = target_asset.get('name', '')
        asset_size = target_asset.get('size', 0)
        logger.info(
            f"Downloading: {asset_name} ({asset_size / 1024 / 1024:.1f} MB)")

        # Download
        data = _http_get_raw(download_url, timeout=300)
        if not data:
            return None

        # Extract
        _BIN_DIR.mkdir(parents=True, exist_ok=True)
        archive_path = _BIN_DIR / asset_name

        try:
            archive_path.write_bytes(data)
            binary_name = _server_binary_name()
            extracted_binary = None

            if asset_name.endswith('.zip'):
                with zipfile.ZipFile(archive_path, 'r') as zf:
                    # Find llama-server in the archive
                    for entry in zf.namelist():
                        basename = Path(entry).name
                        if basename == binary_name:
                            # Extract this single file to _BIN_DIR
                            source = zf.open(entry)
                            target = _BIN_DIR / binary_name
                            target.write_bytes(source.read())
                            source.close()
                            extracted_binary = target
                            break

                    if not extracted_binary:
                        # Extract all, then look for the binary
                        zf.extractall(_BIN_DIR)
                        for p in _BIN_DIR.rglob(binary_name):
                            extracted_binary = p
                            break
            else:
                # .tar.gz
                import tarfile
                with tarfile.open(archive_path, 'r:gz') as tf:
                    for member in tf.getmembers():
                        if Path(member.name).name == binary_name:
                            tf.extract(member, _BIN_DIR)
                            extracted_binary = _BIN_DIR / member.name
                            break
                    if not extracted_binary:
                        tf.extractall(_BIN_DIR)
                        for p in _BIN_DIR.rglob(binary_name):
                            extracted_binary = p
                            break

            # Clean up archive
            archive_path.unlink(missing_ok=True)

            if not extracted_binary or not extracted_binary.exists():
                logger.error(
                    f"Could not find {binary_name} in downloaded archive")
                return None

            # Move to canonical location if nested
            canonical = _BIN_DIR / binary_name
            if extracted_binary != canonical:
                shutil.move(str(extracted_binary), str(canonical))
                extracted_binary = canonical

            # Make executable (Unix)
            if sys.platform != 'win32':
                extracted_binary.chmod(
                    extracted_binary.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)

            self._server_binary = extracted_binary
            logger.info(f"llama-server installed at: {extracted_binary}")
            return extracted_binary

        except Exception as exc:
            logger.error(f"Failed to extract llama-server: {exc}")
            archive_path.unlink(missing_ok=True)
            return None

    def get_optimal_params(self, model_path: str) -> Dict[str, Any]:
        """Calculate optimal llama-server parameters based on hardware.

        Examines the GGUF file size as a proxy for model weight size, then
        checks available VRAM via vram_manager to decide GPU offload depth,
        context length, and threading.

        Args:
            model_path: Path to the .gguf model file.

        Returns:
            Dict with keys: n_gpu_layers, ctx_size, threads, flash_attn,
            host, port, and any additional flags.
        """
        params: Dict[str, Any] = {
            'n_gpu_layers': 0,
            'ctx_size': 4096,
            'threads': max(1, (os.cpu_count() or 4) // 2),
            'flash_attn': False,
            'host': '127.0.0.1',
            'port': self._port,
        }

        # Estimate model size from file
        model_size_gb = 0.0
        try:
            model_size_gb = os.path.getsize(model_path) / (1024 ** 3)
            logger.info(f"Model file size: {model_size_gb:.2f} GB")
        except OSError:
            logger.warning(f"Cannot stat model file: {model_path}")

        # Query GPU via vram_manager
        gpu_info = self._get_gpu_info()
        free_vram = gpu_info.get('free_gb', 0.0)
        cuda_available = gpu_info.get('cuda_available', False)
        total_vram = gpu_info.get('total_gb', 0.0)

        if cuda_available and free_vram > 0:
            if model_size_gb > 0 and free_vram >= model_size_gb * 1.1:
                # Enough VRAM to fit the entire model + overhead
                params['n_gpu_layers'] = -1  # All layers on GPU
                logger.info(
                    f"Full GPU offload: {free_vram:.1f} GB free >= "
                    f"{model_size_gb:.1f} GB model")
            elif model_size_gb > 0:
                # Partial offload: estimate fraction of layers that fit
                # Typical GGUF has ~32-80 layers; use ratio as heuristic
                ratio = free_vram / model_size_gb
                # Clamp to reasonable range
                estimated_layers = max(1, int(ratio * 40))  # assume ~40 layers
                params['n_gpu_layers'] = estimated_layers
                logger.info(
                    f"Partial GPU offload: {estimated_layers} layers "
                    f"({free_vram:.1f} GB free / {model_size_gb:.1f} GB model)")
            else:
                # Unknown model size, try full offload
                params['n_gpu_layers'] = -1

            # Context size: balance between LLM capability and leaving VRAM for TTS.
            # Reserve ~3GB for TTS (Indic Parler ~1.2GB model + ~2GB inference).
            # KV cache memory ≈ ctx * layers * hidden_dim * 2 * 2 bytes (FP16).
            # Qwen 4B: 32 layers × 2560 dim × 2 heads × 2 bytes ≈ 0.3MB per 1K ctx.
            vram_after_model = free_vram - model_size_gb
            tts_reserve_gb = 3.0  # Reserve for GPU TTS (Indic Parler, F5, etc.)
            vram_for_ctx = vram_after_model - tts_reserve_gb
            if vram_for_ctx >= 3.0:
                params['ctx_size'] = 10240  # 10K — good balance
            elif vram_for_ctx >= 1.5:
                params['ctx_size'] = 8192
            elif vram_for_ctx >= 0.5:
                params['ctx_size'] = 4096
            else:
                params['ctx_size'] = 2048

            # Flash attention: available on modern NVIDIA GPUs (Ampere+)
            # Heuristic: if GPU name contains known architectures
            gpu_name = (gpu_info.get('name') or '').lower()
            # Ampere: RTX 30xx, A100, etc. Hopper: H100. Ada: RTX 40xx
            flash_capable_keywords = [
                'rtx 30', 'rtx 40', 'rtx 50', 'a100', 'a10', 'h100',
                'l40', 'rtx a', 'geforce 30', 'geforce 40',
            ]
            if any(kw in gpu_name for kw in flash_capable_keywords):
                params['flash_attn'] = True
                logger.info(f"Enabling flash attention for {gpu_info.get('name')}")

        else:
            # CPU-only mode
            params['n_gpu_layers'] = 0
            params['ctx_size'] = 2048  # Conservative for CPU
            # Use more threads on CPU-only
            params['threads'] = max(1, (os.cpu_count() or 4) - 1)
            logger.info("CPU-only mode: no GPU available")

        # ── ResourceGovernor cap: leave headroom for the rest of the OS ──
        # Never use ALL cores — reserve 25% for foreground apps.
        total_cores = os.cpu_count() or 4
        max_threads = max(1, int(total_cores * 0.75))
        if params['threads'] > max_threads:
            logger.info("Capping threads %d → %d (75%% of %d cores)",
                        params['threads'], max_threads, total_cores)
            params['threads'] = max_threads


        # Cap context size based on available RAM (avoid low-memory warnings).
        # Check the tighter threshold first so a machine with under 2 GB free
        # gets the 2048 cap rather than stopping at 4096.
        try:
            import psutil
            avail_gb = psutil.virtual_memory().available / (1024**3)
            if avail_gb < 2.0 and params['ctx_size'] > 2048:
                params['ctx_size'] = 2048
                logger.info("Capping ctx_size to 2048 (only %.1fGB RAM available)", avail_gb)
            elif avail_gb < 4.0 and params['ctx_size'] > 4096:
                params['ctx_size'] = 4096
                logger.info("Capping ctx_size to 4096 (only %.1fGB RAM available)", avail_gb)
        except ImportError:
            pass

        return params
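
    # Illustrative only (not executed): for a hypothetical machine with a CUDA
    # GPU reporting 8.0 GB free VRAM, a 2.5 GB GGUF file, 8 CPU cores, and at
    # least 4 GB of free system RAM, the logic above would yield roughly:
    #
    #   {'n_gpu_layers': -1,     # 8.0 >= 2.5 * 1.1 -> full offload
    #    'ctx_size': 8192,       # 8.0 - 2.5 - 3.0 = 2.5 GB left for KV cache
    #    'threads': 4,           # half of 8 cores, under the 75% cap of 6
    #    'flash_attn': False,    # unless the GPU name matches the keyword list
    #    'host': '127.0.0.1', 'port': 8080}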


    @property
    def current_model(self) -> Optional[str]:
        """Return the path of the currently loaded model, or None."""
        return self._current_model

    @property
    def port(self) -> int:
        """Return the port the server is (or will be) running on."""
        return self._port

    # ── Private Implementation ───────────────────────────────────

    def _start_locked(self, model_path: str, port: int, **kwargs) -> bool:
        """Start the server (caller must hold self._lock)."""
        if self._process is not None and self._process.poll() is None:
            logger.warning(
                "Server already running (PID %d) -- stop first or use swap_model()",
                self._process.pid)
            return False

        # Validate model file
        if not os.path.isfile(model_path):
            logger.error(f"Model file not found: {model_path}")
            return False

        # Find or download binary
        binary = self.get_server_binary()
        if binary is None:
            logger.info("llama-server not found, attempting download...")
            binary = self.download_server()
        if binary is None:
            logger.error(
                "Cannot start: llama-server binary not available. "
                "Install manually or check network.")
            return False

        self._port = port

        # Calculate params
        params = self.get_optimal_params(model_path)
        # Apply user overrides
        params.update(kwargs)
        params['port'] = port

        # Build command
        cmd = [
            str(binary),
            '--model', str(model_path),
            '--host', str(params.get('host', '0.0.0.0')),
            '--port', str(params['port']),
            '--ctx-size', str(params.get('ctx_size', 4096)),
            '--threads', str(params.get('threads', 2)),
            '--n-gpu-layers', str(params.get('n_gpu_layers', 0)),
        ]

        if params.get('flash_attn'):
            cmd.append('--flash-attn')

        # Pass through any extra CLI flags
        extra_args = params.get('extra_args', [])
        if extra_args:
            cmd.extend(extra_args)
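
        # For example (illustrative values only), the assembled command might be:
        #   llama-server --model /path/to/model.gguf --host 127.0.0.1 --port 8080
        #     --ctx-size 8192 --threads 4 --n-gpu-layers -1 --flash-attn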


        logger.info(f"Starting llama-server: {' '.join(cmd)}")

        # Platform-specific subprocess options
        popen_kwargs: Dict[str, Any] = {
            'stdout': subprocess.PIPE,
            'stderr': subprocess.PIPE,
        }

        if sys.platform == 'win32':
            # Hide the console window on Windows
            si = subprocess.STARTUPINFO()
            si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            si.wShowWindow = 0  # SW_HIDE
            popen_kwargs['startupinfo'] = si
            popen_kwargs['creationflags'] = subprocess.CREATE_NO_WINDOW

        try:
            self._process = subprocess.Popen(cmd, **popen_kwargs)
            logger.info(f"llama-server started (PID {self._process.pid})")
        except FileNotFoundError:
            logger.error(f"Binary not found or not executable: {binary}")
            self._process = None
            return False
        except PermissionError:
            logger.error(f"Permission denied executing: {binary}")
            self._process = None
            return False
        except OSError as exc:
            logger.error(f"Failed to start llama-server: {exc}")
            self._process = None
            return False

        # Wait for health endpoint with exponential backoff
        self._current_model = model_path
        if self._wait_for_health():
            logger.info(
                f"llama-server ready on port {port} "
                f"(model: {os.path.basename(model_path)})")
            return True
        else:
            logger.error(
                f"llama-server health check failed after {_HEALTH_START_TIMEOUT}s "
                "-- stopping process")
            self._stop_locked()
            return False

    def _stop_locked(self) -> bool:
        """Stop the server (caller must hold self._lock)."""
        if self._process is None:
            logger.debug("No server process to stop")
            return True

        pid = self._process.pid
        logger.info(f"Stopping llama-server (PID {pid})...")

        try:
            # Graceful shutdown: terminate (SIGTERM on Unix, TerminateProcess on Windows)
            self._process.terminate()

            try:
                self._process.wait(timeout=_STOP_GRACE_PERIOD)
                logger.info(f"llama-server (PID {pid}) terminated gracefully")
            except subprocess.TimeoutExpired:
                # Force kill
                logger.warning(
                    f"llama-server (PID {pid}) did not exit in "
                    f"{_STOP_GRACE_PERIOD}s -- force killing")
                self._process.kill()
                self._process.wait(timeout=5)
                logger.info(f"llama-server (PID {pid}) killed")

        except ProcessLookupError:
            logger.debug(f"Process {pid} already exited")
        except OSError as exc:
            logger.error(f"Error stopping llama-server (PID {pid}): {exc}")
            return False
        finally:
            self._process = None
            self._current_model = None

        return True

    def _check_health(self) -> bool:
        """Single health check against the /health endpoint."""
        url = f'http://127.0.0.1:{self._port}/health'
        result = _http_get(url, timeout=_HEALTH_CHECK_TIMEOUT)
        return result is not None

    def _wait_for_health(self) -> bool:
        """Wait for the server health endpoint with exponential backoff.

        Polls GET /health for up to _HEALTH_START_TIMEOUT seconds.
        """
        deadline = time.monotonic() + _HEALTH_START_TIMEOUT
        interval = _HEALTH_POLL_INTERVAL

        while time.monotonic() < deadline:
            # Check if the process died
            if self._process is not None and self._process.poll() is not None:
                rc = self._process.returncode
                logger.error(f"llama-server exited prematurely (code {rc})")
                # Try to read stderr for diagnostics
                try:
                    stderr = self._process.stderr.read().decode('utf-8', errors='replace')
                    if stderr:
                        logger.error(f"llama-server stderr: {stderr[:1000]}")
                except Exception:
                    pass
                return False

            if self._check_health():
                return True

            time.sleep(interval)
            interval = min(interval * 1.5, _HEALTH_POLL_MAX_INTERVAL)

        return False
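
    # With the defaults above, the poll interval grows 0.5 -> 0.75 -> 1.125 ->
    # 1.69 -> 2.0 s and then stays capped at 2.0 s until the 30 s deadline, so
    # the endpoint is polled roughly 15-18 times (fewer if each check itself
    # takes time) before the manager gives up.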


    @staticmethod
    def _get_gpu_info() -> Dict:
        """Query GPU info via vram_manager singleton."""
        try:
            from .vram_manager import vram_manager
            return vram_manager.detect_gpu()
        except Exception as exc:
            logger.debug(f"vram_manager unavailable: {exc}")
            return {
                'name': None,
                'total_gb': 0.0,
                'free_gb': 0.0,
                'cuda_available': False,
            }


# ── Module-level Singleton ───────────────────────────────────────

_manager: Optional[LlamaCppManager] = None
_manager_lock = threading.Lock()
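
# get_llamacpp_manager() below uses double-checked locking: the unlocked fast
# path returns an existing instance immediately, and only the first caller
# (or concurrent first callers) takes _manager_lock to construct the singleton.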



def get_llamacpp_manager() -> LlamaCppManager:
    """Return the global LlamaCppManager singleton (thread-safe)."""
    global _manager
    if _manager is not None:
        return _manager

    with _manager_lock:
        if _manager is not None:
            return _manager
        _manager = LlamaCppManager()
        return _manager