Coverage for integrations/service_tools/llamacpp_manager.py: 0.0%

368 statements  

coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2llama.cpp Server Manager -- lifecycle management for local LLM inference. 

3 

4Manages llama-server (or llama-cpp-python) processes: 

5 - Auto-downloads llama.cpp release binaries if not found 

6 - Starts server with optimal settings for detected hardware 

7 - Health monitoring and auto-restart 

8 - Model hot-swap (stop -> load new GGUF -> start) 

9 - Graceful shutdown 

10 

11Standalone mode: HARTOS manages its own llama.cpp (not waiting for Nunba). 

12Bundled mode: Defers to Nunba's llama.cpp server. 

13 

14Usage: 

15 from integrations.service_tools.llamacpp_manager import get_llamacpp_manager 

16 

17 mgr = get_llamacpp_manager() 

18 mgr.start('/path/to/model.gguf') 

19 print(mgr.health()) 

20 mgr.swap_model('/path/to/other.gguf') 

21 mgr.stop() 

22""" 

23 

24import logging 

25import os 

26import platform 

27import shutil 

28import stat 

29import subprocess 

30import sys 

31import threading 

32import time 

33import zipfile 

34from pathlib import Path 

35from typing import Any, Dict, Optional 

36 

37logger = logging.getLogger(__name__) 

38 

39# Default directories 

40_HEVOLVE_HOME = Path.home() / '.hevolve' 

41_BIN_DIR = _HEVOLVE_HOME / 'bin' 

42_MODELS_DIR = _HEVOLVE_HOME / 'models' 

43 

44# Health check timing 

45_HEALTH_START_TIMEOUT = 30 # Max seconds to wait for server on start 

46_HEALTH_POLL_INTERVAL = 0.5 # Initial poll interval (seconds) 

47_HEALTH_POLL_MAX_INTERVAL = 2.0 # Max poll interval (exponential backoff cap) 

48_HEALTH_CHECK_TIMEOUT = 3 # HTTP timeout for a single health check (seconds) 

49 

50# Process shutdown 

51_STOP_GRACE_PERIOD = 5 # Seconds to wait after terminate() before kill() 

52 

53# GitHub release 

54_GITHUB_RELEASE_API = 'https://api.github.com/repos/ggml-org/llama.cpp/releases/latest' 

55 

56# Platform binary name patterns for GitHub release assets 

57_PLATFORM_ASSET_PATTERNS = { 

58 ('Windows', 'AMD64'): 'win-amd64', 

59 ('Windows', 'x86_64'): 'win-amd64', 

60 ('Linux', 'x86_64'): 'ubuntu-x64', 

61 ('Linux', 'aarch64'): 'ubuntu-arm64', 

62 ('Darwin', 'x86_64'): 'macos-x64', 

63 ('Darwin', 'arm64'): 'macos-arm64', 

64} 

65 

66 

67def _get_platform_key() -> str: 

68 """Return the platform asset key for the current system.""" 

69 system = platform.system() 

70 machine = platform.machine() 

71 return _PLATFORM_ASSET_PATTERNS.get((system, machine), '') 
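
# Example: on 64-bit Windows, platform.system() == 'Windows' and
# platform.machine() == 'AMD64', so _get_platform_key() returns 'win-amd64',
# which later matches release assets named like 'llama-<tag>-bin-win-amd64.zip'.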



def _server_binary_name() -> str:
    """Return the expected server binary filename for this OS."""
    if sys.platform == 'win32':
        return 'llama-server.exe'
    return 'llama-server'


def _http_get(url: str, timeout: int = _HEALTH_CHECK_TIMEOUT) -> Any:
    """Perform an HTTP GET, preferring pooled session, falling back to urllib.

    Returns the parsed JSON body on success, or None on failure.
    """
    # Try pooled session first (avoids new TCP connection)
    try:
        from core.http_pool import pooled_get
        resp = pooled_get(url, timeout=(timeout, timeout))
        resp.raise_for_status()
        return resp.json()
    except Exception:
        pass

    # Fallback: stdlib urllib (zero dependencies)
    try:
        import json
        import urllib.request
        req = urllib.request.Request(url, headers={'Accept': 'application/json'})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode('utf-8'))
    except Exception:
        return None


def _http_get_raw(url: str, timeout: int = 30) -> Optional[bytes]:
    """Download raw bytes from a URL. Returns bytes or None."""
    try:
        import urllib.request
        req = urllib.request.Request(url, headers={
            'Accept': 'application/octet-stream',
            'User-Agent': 'HARTOS-LlamaCppManager/1.0',
        })
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read()
    except Exception as exc:
        logger.error(f"Download failed for {url}: {exc}")
        return None


def _http_get_json(url: str, timeout: int = 15) -> Optional[Dict]:
    """Fetch JSON from a URL using urllib (for GitHub API). Returns dict or None."""
    try:
        import json
        import urllib.request
        req = urllib.request.Request(url, headers={
            'Accept': 'application/vnd.github+json',
            'User-Agent': 'HARTOS-LlamaCppManager/1.0',
        })
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode('utf-8'))
    except Exception as exc:
        logger.error(f"GitHub API request failed: {exc}")
        return None


class LlamaCppManager:
    """Manages a llama-server process for local LLM inference.

    Thread-safe: all mutating operations (start/stop/swap) are guarded by a Lock.
    """

    def __init__(self):
        self._process: Optional[subprocess.Popen] = None
        self._current_model: Optional[str] = None
        self._port: int = 8080
        self._lock = threading.Lock()
        self._server_binary: Optional[Path] = None

    # ── Public API ───────────────────────────────────────────────

    def start(self, model_path: str, port: int = 8080, **kwargs) -> bool:
        """Start llama-server with the given GGUF model.

        Auto-detects hardware and selects optimal parameters (GPU layers,
        context size, thread count, flash attention).

        Args:
            model_path: Absolute path to a .gguf model file.
            port: Port to listen on (default 8080, from port_registry 'llm').
            **kwargs: Additional overrides for server params (n_gpu_layers,
                ctx_size, threads, flash_attn, etc.).

        Returns:
            True if server started and health check passed, False otherwise.
        """
        with self._lock:
            return self._start_locked(model_path, port, **kwargs)

    def stop(self) -> bool:
        """Gracefully stop the managed llama-server process.

        Sends terminate signal, waits up to 5 seconds, then force-kills
        if the process has not exited.

        Returns:
            True if the process was stopped (or was not running), False on error.
        """
        with self._lock:
            return self._stop_locked()

    def is_running(self) -> bool:
        """Check if the managed server process is alive AND responding to health checks.

        Returns:
            True if the process is running and /health returns successfully.
        """
        if self._process is None:
            return False
        if self._process.poll() is not None:
            # Process has exited
            logger.warning(
                f"llama-server process exited with code {self._process.returncode}")
            self._process = None
            return False
        # Process alive -- verify health endpoint
        return self._check_health()

    def health(self) -> Dict:
        """Query the llama-server /health endpoint.

        Returns:
            Parsed JSON from /health on success, or an error dict.
        """
        if self._process is None:
            return {'status': 'not_running', 'error': 'No managed server process'}

        url = f'http://127.0.0.1:{self._port}/health'
        result = _http_get(url, timeout=_HEALTH_CHECK_TIMEOUT)
        if result is not None:
            return result
        return {
            'status': 'error',
            'error': 'Health endpoint did not respond',
            'port': self._port,
            'model': self._current_model,
        }
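
    # Note: in recent llama.cpp builds the /health endpoint typically answers
    # with a small JSON object such as {"status": "ok"} once the model has
    # loaded; the exact fields vary by server version, so callers should treat
    # the payload as informational rather than a stable schema.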


    def swap_model(self, new_model_path: str, **kwargs) -> bool:
        """Hot-swap: stop the current model and start with a new one.

        Args:
            new_model_path: Path to the new .gguf model file.
            **kwargs: Additional server param overrides.

        Returns:
            True if the new model started successfully.
        """
        with self._lock:
            port = self._port
            logger.info(
                f"Swapping model: {self._current_model} -> {new_model_path}")
            self._stop_locked()
            return self._start_locked(new_model_path, port, **kwargs)

    def get_server_binary(self) -> Optional[Path]:
        """Locate the llama-server binary on this system.

        Search order:
        1. Cached result from a previous call
        2. System PATH (llama-server, llama-cpp-server)
        3. ~/.hevolve/bin/llama-server[.exe]

        Returns:
            Path to the binary, or None if not found.
        """
        if self._server_binary and self._server_binary.exists():
            return self._server_binary

        binary_name = _server_binary_name()

        # 1. Check PATH
        for name in ('llama-server', 'llama-cpp-server'):
            if sys.platform == 'win32':
                name += '.exe'
            found = shutil.which(name)
            if found:
                self._server_binary = Path(found)
                logger.info(f"Found llama-server on PATH: {self._server_binary}")
                return self._server_binary

        # 2. Check ~/.hevolve/bin/
        local_bin = _BIN_DIR / binary_name
        if local_bin.exists():
            self._server_binary = local_bin
            logger.info(f"Found llama-server at: {self._server_binary}")
            return self._server_binary

        logger.info("llama-server binary not found on this system")
        return None

    def download_server(self) -> Optional[Path]:
        """Download the latest llama.cpp release binary from GitHub.

        Detects the current platform, downloads the appropriate archive,
        extracts llama-server to ~/.hevolve/bin/, and makes it executable.

        Returns:
            Path to the downloaded binary, or None on failure.
        """
        platform_key = _get_platform_key()
        if not platform_key:
            logger.error(
                f"Unsupported platform: {platform.system()} {platform.machine()}")
            return None

        # Fetch latest release metadata
        logger.info("Fetching latest llama.cpp release from GitHub...")
        release = _http_get_json(_GITHUB_RELEASE_API)
        if not release:
            logger.error("Failed to fetch release info from GitHub")
            return None

        tag = release.get('tag_name', 'unknown')
        assets = release.get('assets', [])
        logger.info(f"Latest release: {tag} ({len(assets)} assets)")

        # Find matching asset
        target_asset = None
        for asset in assets:
            name = asset.get('name', '')
            # Match pattern: llama-{tag}-bin-{platform_key}.zip
            if platform_key in name and name.endswith('.zip'):
                target_asset = asset
                break

        if not target_asset:
            # Broader search: any zip containing the platform key
            for asset in assets:
                name = asset.get('name', '')
                if platform_key in name and ('.zip' in name or '.tar.gz' in name):
                    target_asset = asset
                    break

        if not target_asset:
            logger.error(
                f"No matching asset found for platform '{platform_key}' "
                f"in release {tag}. Available: "
                f"{[a['name'] for a in assets[:10]]}")
            return None

        download_url = target_asset.get('browser_download_url', '')
        asset_name = target_asset.get('name', '')
        asset_size = target_asset.get('size', 0)
        logger.info(
            f"Downloading: {asset_name} ({asset_size / 1024 / 1024:.1f} MB)")

        # Download
        data = _http_get_raw(download_url, timeout=300)
        if not data:
            return None

        # Extract
        _BIN_DIR.mkdir(parents=True, exist_ok=True)
        archive_path = _BIN_DIR / asset_name

        try:
            archive_path.write_bytes(data)
            binary_name = _server_binary_name()
            extracted_binary = None

            if asset_name.endswith('.zip'):
                with zipfile.ZipFile(archive_path, 'r') as zf:
                    # Find llama-server in the archive
                    for entry in zf.namelist():
                        basename = Path(entry).name
                        if basename == binary_name:
                            # Extract this single file to _BIN_DIR
                            source = zf.open(entry)
                            target = _BIN_DIR / binary_name
                            target.write_bytes(source.read())
                            source.close()
                            extracted_binary = target
                            break

                    if not extracted_binary:
                        # Extract all, then look for the binary
                        zf.extractall(_BIN_DIR)
                        for p in _BIN_DIR.rglob(binary_name):
                            extracted_binary = p
                            break
            else:
                # .tar.gz
                import tarfile
                with tarfile.open(archive_path, 'r:gz') as tf:
                    for member in tf.getmembers():
                        if Path(member.name).name == binary_name:
                            tf.extract(member, _BIN_DIR)
                            extracted_binary = _BIN_DIR / member.name
                            break
                    if not extracted_binary:
                        tf.extractall(_BIN_DIR)
                        for p in _BIN_DIR.rglob(binary_name):
                            extracted_binary = p
                            break

            # Clean up archive
            archive_path.unlink(missing_ok=True)

            if not extracted_binary or not extracted_binary.exists():
                logger.error(
                    f"Could not find {binary_name} in downloaded archive")
                return None

            # Move to canonical location if nested
            canonical = _BIN_DIR / binary_name
            if extracted_binary != canonical:
                shutil.move(str(extracted_binary), str(canonical))
                extracted_binary = canonical

            # Make executable (Unix)
            if sys.platform != 'win32':
                extracted_binary.chmod(
                    extracted_binary.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)

            self._server_binary = extracted_binary
            logger.info(f"llama-server installed at: {extracted_binary}")
            return extracted_binary

        except Exception as exc:
            logger.error(f"Failed to extract llama-server: {exc}")
            archive_path.unlink(missing_ok=True)
            return None

    def get_optimal_params(self, model_path: str) -> Dict[str, Any]:
        """Calculate optimal llama-server parameters based on hardware.

        Examines the GGUF file size as a proxy for model weight size, then
        checks available VRAM via vram_manager to decide GPU offload depth,
        context length, and threading.

        Args:
            model_path: Path to the .gguf model file.

        Returns:
            Dict with keys: n_gpu_layers, ctx_size, threads, flash_attn,
            host, port, and any additional flags.
        """
        params: Dict[str, Any] = {
            'n_gpu_layers': 0,
            'ctx_size': 4096,
            'threads': max(1, (os.cpu_count() or 4) // 2),
            'flash_attn': False,
            'host': '127.0.0.1',
            'port': self._port,
        }

        # Estimate model size from file
        model_size_gb = 0.0
        try:
            model_size_gb = os.path.getsize(model_path) / (1024 ** 3)
            logger.info(f"Model file size: {model_size_gb:.2f} GB")
        except OSError:
            logger.warning(f"Cannot stat model file: {model_path}")

        # Query GPU via vram_manager
        gpu_info = self._get_gpu_info()
        free_vram = gpu_info.get('free_gb', 0.0)
        cuda_available = gpu_info.get('cuda_available', False)
        total_vram = gpu_info.get('total_gb', 0.0)

        if cuda_available and free_vram > 0:
            if model_size_gb > 0 and free_vram >= model_size_gb * 1.1:
                # Enough VRAM to fit the entire model + overhead
                params['n_gpu_layers'] = -1  # All layers on GPU
                logger.info(
                    f"Full GPU offload: {free_vram:.1f} GB free >= "
                    f"{model_size_gb:.1f} GB model")
            elif model_size_gb > 0:
                # Partial offload: estimate fraction of layers that fit
                # Typical GGUF has ~32-80 layers; use ratio as heuristic
                ratio = free_vram / model_size_gb
                # Clamp to reasonable range
                estimated_layers = max(1, int(ratio * 40))  # assume ~40 layers
                params['n_gpu_layers'] = estimated_layers
                logger.info(
                    f"Partial GPU offload: {estimated_layers} layers "
                    f"({free_vram:.1f} GB free / {model_size_gb:.1f} GB model)")
            else:
                # Unknown model size, try full offload
                params['n_gpu_layers'] = -1

            # Context size: balance between LLM capability and leaving VRAM for TTS.
            # Reserve ~3GB for TTS (Indic Parler ~1.2GB model + ~2GB inference).
            # KV cache memory ≈ ctx * layers * hidden_dim * 2 * 2 bytes (FP16).
            # Qwen 4B: 32 layers × 2560 dim × 2 heads × 2 bytes ≈ 0.3MB per 1K ctx.
            vram_after_model = free_vram - model_size_gb
            tts_reserve_gb = 3.0  # Reserve for GPU TTS (Indic Parler, F5, etc.)
            vram_for_ctx = vram_after_model - tts_reserve_gb
            if vram_for_ctx >= 3.0:
                params['ctx_size'] = 10240  # 10K — good balance
            elif vram_for_ctx >= 1.5:
                params['ctx_size'] = 8192
            elif vram_for_ctx >= 0.5:
                params['ctx_size'] = 4096
            else:
                params['ctx_size'] = 2048

            # Flash attention: available on modern NVIDIA GPUs (Ampere+)
            # Heuristic: if GPU name contains known architectures
            gpu_name = (gpu_info.get('name') or '').lower()
            # Ampere: RTX 30xx, A100, etc. Hopper: H100. Ada: RTX 40xx
            flash_capable_keywords = [
                'rtx 30', 'rtx 40', 'rtx 50', 'a100', 'a10', 'h100',
                'l40', 'rtx a', 'geforce 30', 'geforce 40',
            ]
            if any(kw in gpu_name for kw in flash_capable_keywords):
                params['flash_attn'] = True
                logger.info(f"Enabling flash attention for {gpu_info.get('name')}")

        else:
            # CPU-only mode
            params['n_gpu_layers'] = 0
            params['ctx_size'] = 2048  # Conservative for CPU
            # Use more threads on CPU-only
            params['threads'] = max(1, (os.cpu_count() or 4) - 1)
            logger.info("CPU-only mode: no GPU available")

        # ── ResourceGovernor cap: leave headroom for the rest of the OS ──
        # Never use ALL cores — reserve 25% for foreground apps.
        total_cores = os.cpu_count() or 4
        max_threads = max(1, int(total_cores * 0.75))
        if params['threads'] > max_threads:
            logger.info("Capping threads %d → %d (75%% of %d cores)",
                        params['threads'], max_threads, total_cores)
            params['threads'] = max_threads


        # Cap context size based on available RAM (avoid low-memory warnings).
        # Check the tighter threshold first so a machine with under 2 GB free
        # gets the 2048 cap rather than stopping at 4096.
        try:
            import psutil
            avail_gb = psutil.virtual_memory().available / (1024**3)
            if avail_gb < 2.0 and params['ctx_size'] > 2048:
                params['ctx_size'] = 2048
                logger.info("Capping ctx_size to 2048 (only %.1fGB RAM available)", avail_gb)
            elif avail_gb < 4.0 and params['ctx_size'] > 4096:
                params['ctx_size'] = 4096
                logger.info("Capping ctx_size to 4096 (only %.1fGB RAM available)", avail_gb)
        except ImportError:
            pass

        return params
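
    # Illustrative only (not executed): for a hypothetical machine with a CUDA
    # GPU reporting 8.0 GB free VRAM, a 2.5 GB GGUF file, 8 CPU cores, and at
    # least 4 GB of free system RAM, the logic above would yield roughly:
    #
    #   {'n_gpu_layers': -1,     # 8.0 >= 2.5 * 1.1 -> full offload
    #    'ctx_size': 8192,       # 8.0 - 2.5 - 3.0 = 2.5 GB left for KV cache
    #    'threads': 4,           # half of 8 cores, under the 75% cap of 6
    #    'flash_attn': False,    # unless the GPU name matches the keyword list
    #    'host': '127.0.0.1', 'port': 8080}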


    @property
    def current_model(self) -> Optional[str]:
        """Return the path of the currently loaded model, or None."""
        return self._current_model

    @property
    def port(self) -> int:
        """Return the port the server is (or will be) running on."""
        return self._port

    # ── Private Implementation ───────────────────────────────────

    def _start_locked(self, model_path: str, port: int, **kwargs) -> bool:
        """Start the server (caller must hold self._lock)."""
        if self._process is not None and self._process.poll() is None:
            logger.warning(
                "Server already running (PID %d) -- stop first or use swap_model()",
                self._process.pid)
            return False

        # Validate model file
        if not os.path.isfile(model_path):
            logger.error(f"Model file not found: {model_path}")
            return False

        # Find or download binary
        binary = self.get_server_binary()
        if binary is None:
            logger.info("llama-server not found, attempting download...")
            binary = self.download_server()
        if binary is None:
            logger.error(
                "Cannot start: llama-server binary not available. "
                "Install manually or check network.")
            return False

        self._port = port

        # Calculate params
        params = self.get_optimal_params(model_path)
        # Apply user overrides
        params.update(kwargs)
        params['port'] = port

        # Build command
        cmd = [
            str(binary),
            '--model', str(model_path),
            '--host', str(params.get('host', '0.0.0.0')),
            '--port', str(params['port']),
            '--ctx-size', str(params.get('ctx_size', 4096)),
            '--threads', str(params.get('threads', 2)),
            '--n-gpu-layers', str(params.get('n_gpu_layers', 0)),
        ]

        if params.get('flash_attn'):
            cmd.append('--flash-attn')

        # Pass through any extra CLI flags
        extra_args = params.get('extra_args', [])
        if extra_args:
            cmd.extend(extra_args)
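
        # For example (illustrative values only), the assembled command might be:
        #   llama-server --model /path/to/model.gguf --host 127.0.0.1 --port 8080
        #     --ctx-size 8192 --threads 4 --n-gpu-layers -1 --flash-attn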


        logger.info(f"Starting llama-server: {' '.join(cmd)}")

        # Platform-specific subprocess options
        popen_kwargs: Dict[str, Any] = {
            'stdout': subprocess.PIPE,
            'stderr': subprocess.PIPE,
        }

        if sys.platform == 'win32':
            # Hide the console window on Windows
            si = subprocess.STARTUPINFO()
            si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            si.wShowWindow = 0  # SW_HIDE
            popen_kwargs['startupinfo'] = si
            popen_kwargs['creationflags'] = subprocess.CREATE_NO_WINDOW

        try:
            self._process = subprocess.Popen(cmd, **popen_kwargs)
            logger.info(f"llama-server started (PID {self._process.pid})")
        except FileNotFoundError:
            logger.error(f"Binary not found or not executable: {binary}")
            self._process = None
            return False
        except PermissionError:
            logger.error(f"Permission denied executing: {binary}")
            self._process = None
            return False
        except OSError as exc:
            logger.error(f"Failed to start llama-server: {exc}")
            self._process = None
            return False

        # Wait for health endpoint with exponential backoff
        self._current_model = model_path
        if self._wait_for_health():
            logger.info(
                f"llama-server ready on port {port} "
                f"(model: {os.path.basename(model_path)})")
            return True
        else:
            logger.error(
                f"llama-server health check failed after {_HEALTH_START_TIMEOUT}s "
                "-- stopping process")
            self._stop_locked()
            return False

    def _stop_locked(self) -> bool:
        """Stop the server (caller must hold self._lock)."""
        if self._process is None:
            logger.debug("No server process to stop")
            return True

        pid = self._process.pid
        logger.info(f"Stopping llama-server (PID {pid})...")

        try:
            # Graceful shutdown: terminate (SIGTERM on Unix, TerminateProcess on Windows)
            self._process.terminate()

            try:
                self._process.wait(timeout=_STOP_GRACE_PERIOD)
                logger.info(f"llama-server (PID {pid}) terminated gracefully")
            except subprocess.TimeoutExpired:
                # Force kill
                logger.warning(
                    f"llama-server (PID {pid}) did not exit in "
                    f"{_STOP_GRACE_PERIOD}s -- force killing")
                self._process.kill()
                self._process.wait(timeout=5)
                logger.info(f"llama-server (PID {pid}) killed")

        except ProcessLookupError:
            logger.debug(f"Process {pid} already exited")
        except OSError as exc:
            logger.error(f"Error stopping llama-server (PID {pid}): {exc}")
            return False
        finally:
            self._process = None
            self._current_model = None

        return True

    def _check_health(self) -> bool:
        """Single health check against the /health endpoint."""
        url = f'http://127.0.0.1:{self._port}/health'
        result = _http_get(url, timeout=_HEALTH_CHECK_TIMEOUT)
        return result is not None

    def _wait_for_health(self) -> bool:
        """Wait for the server health endpoint with exponential backoff.

        Polls GET /health for up to _HEALTH_START_TIMEOUT seconds.
        """
        deadline = time.monotonic() + _HEALTH_START_TIMEOUT
        interval = _HEALTH_POLL_INTERVAL

        while time.monotonic() < deadline:
            # Check if the process died
            if self._process is not None and self._process.poll() is not None:
                rc = self._process.returncode
                logger.error(f"llama-server exited prematurely (code {rc})")
                # Try to read stderr for diagnostics
                try:
                    stderr = self._process.stderr.read().decode('utf-8', errors='replace')
                    if stderr:
                        logger.error(f"llama-server stderr: {stderr[:1000]}")
                except Exception:
                    pass
                return False

            if self._check_health():
                return True

            time.sleep(interval)
            interval = min(interval * 1.5, _HEALTH_POLL_MAX_INTERVAL)

        return False
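
    # With the defaults above, the poll interval grows 0.5 -> 0.75 -> 1.125 ->
    # 1.69 -> 2.0 s and then stays capped at 2.0 s until the 30 s deadline, so
    # the endpoint is polled roughly 15-18 times (fewer if each check itself
    # takes time) before the manager gives up.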


    @staticmethod
    def _get_gpu_info() -> Dict:
        """Query GPU info via vram_manager singleton."""
        try:
            from .vram_manager import vram_manager
            return vram_manager.detect_gpu()
        except Exception as exc:
            logger.debug(f"vram_manager unavailable: {exc}")
            return {
                'name': None,
                'total_gb': 0.0,
                'free_gb': 0.0,
                'cuda_available': False,
            }


# ── Module-level Singleton ───────────────────────────────────────

_manager: Optional[LlamaCppManager] = None
_manager_lock = threading.Lock()
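
# get_llamacpp_manager() below uses double-checked locking: the unlocked fast
# path returns an existing instance immediately, and only the first caller
# (or concurrent first callers) takes _manager_lock to construct the singleton.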



def get_llamacpp_manager() -> LlamaCppManager:
    """Return the global LlamaCppManager singleton (thread-safe)."""
    global _manager
    if _manager is not None:
        return _manager

    with _manager_lock:
        if _manager is not None:
            return _manager
        _manager = LlamaCppManager()
        return _manager