Coverage for integrations / service_tools / llamacpp_manager.py: 0.0%
368 statements
coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2llama.cpp Server Manager -- lifecycle management for local LLM inference.
4Manages llama-server (or llama-cpp-python) processes:
5 - Auto-downloads llama.cpp release binaries if not found
6 - Starts server with optimal settings for detected hardware
7 - Health monitoring and auto-restart
8 - Model hot-swap (stop -> load new GGUF -> start)
9 - Graceful shutdown
11Standalone mode: HARTOS manages its own llama.cpp (not waiting for Nunba).
12Bundled mode: Defers to Nunba's llama.cpp server.
14Usage:
15 from integrations.service_tools.llamacpp_manager import get_llamacpp_manager
17 mgr = get_llamacpp_manager()
18 mgr.start('/path/to/model.gguf')
19 print(mgr.health())
20 mgr.swap_model('/path/to/other.gguf')
21 mgr.stop()
22"""
24import logging
25import os
26import platform
27import shutil
28import stat
29import subprocess
30import sys
31import threading
32import time
33import zipfile
34from pathlib import Path
35from typing import Any, Dict, Optional
37logger = logging.getLogger(__name__)
39# Default directories
40_HEVOLVE_HOME = Path.home() / '.hevolve'
41_BIN_DIR = _HEVOLVE_HOME / 'bin'
42_MODELS_DIR = _HEVOLVE_HOME / 'models'
44# Health check timing
45_HEALTH_START_TIMEOUT = 30 # Max seconds to wait for server on start
46_HEALTH_POLL_INTERVAL = 0.5 # Initial poll interval (seconds)
47_HEALTH_POLL_MAX_INTERVAL = 2.0 # Max poll interval (exponential backoff cap)
48_HEALTH_CHECK_TIMEOUT = 3 # HTTP timeout for a single health check (seconds)
50# Process shutdown
51_STOP_GRACE_PERIOD = 5 # Seconds to wait after terminate() before kill()
53# GitHub release
54_GITHUB_RELEASE_API = 'https://api.github.com/repos/ggml-org/llama.cpp/releases/latest'
56# Platform binary name patterns for GitHub release assets
57_PLATFORM_ASSET_PATTERNS = {
58 ('Windows', 'AMD64'): 'win-amd64',
59 ('Windows', 'x86_64'): 'win-amd64',
60 ('Linux', 'x86_64'): 'ubuntu-x64',
61 ('Linux', 'aarch64'): 'ubuntu-arm64',
62 ('Darwin', 'x86_64'): 'macos-x64',
63 ('Darwin', 'arm64'): 'macos-arm64',
64}
67def _get_platform_key() -> str:
68 """Return the platform asset key for the current system."""
69 system = platform.system()
70 machine = platform.machine()
71 return _PLATFORM_ASSET_PATTERNS.get((system, machine), '')
74def _server_binary_name() -> str:
75 """Return the expected server binary filename for this OS."""
76 if sys.platform == 'win32':
77 return 'llama-server.exe'
78 return 'llama-server'
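
# Illustrative note (added, not from the original source): on a Windows x64
# machine platform.system()/platform.machine() typically report ('Windows',
# 'AMD64'), so _get_platform_key() returns 'win-amd64' and
# _server_binary_name() returns 'llama-server.exe'. An unsupported pair
# returns '' so download_server() can fail fast before hitting GitHub.
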

def _http_get(url: str, timeout: int = _HEALTH_CHECK_TIMEOUT) -> Any:
    """Perform an HTTP GET, preferring pooled session, falling back to urllib.

    Returns the parsed JSON body on success, or None on failure.
    """
    # Try pooled session first (avoids new TCP connection)
    try:
        from core.http_pool import pooled_get
        resp = pooled_get(url, timeout=(timeout, timeout))
        resp.raise_for_status()
        return resp.json()
    except Exception:
        pass

    # Fallback: stdlib urllib (zero dependencies)
    try:
        import json
        import urllib.request
        req = urllib.request.Request(url, headers={'Accept': 'application/json'})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode('utf-8'))
    except Exception:
        return None
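
# Usage sketch (illustrative; URL and port are placeholders): poll a local
# llama-server health endpoint and treat None as "unreachable or not ready":
#
#     status = _http_get('http://127.0.0.1:8080/health')
#     if status is None:
#         ...  # retry later or surface an error
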

def _http_get_raw(url: str, timeout: int = 30) -> Optional[bytes]:
    """Download raw bytes from a URL. Returns bytes or None."""
    try:
        import urllib.request
        req = urllib.request.Request(url, headers={
            'Accept': 'application/octet-stream',
            'User-Agent': 'HARTOS-LlamaCppManager/1.0',
        })
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read()
    except Exception as exc:
        logger.error(f"Download failed for {url}: {exc}")
        return None


def _http_get_json(url: str, timeout: int = 15) -> Optional[Dict]:
    """Fetch JSON from a URL using urllib (for GitHub API). Returns dict or None."""
    try:
        import json
        import urllib.request
        req = urllib.request.Request(url, headers={
            'Accept': 'application/vnd.github+json',
            'User-Agent': 'HARTOS-LlamaCppManager/1.0',
        })
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode('utf-8'))
    except Exception as exc:
        logger.error(f"GitHub API request failed: {exc}")
        return None

class LlamaCppManager:
    """Manages a llama-server process for local LLM inference.

    Thread-safe: all mutating operations (start/stop/swap) are guarded by a Lock.
    """

    def __init__(self):
        self._process: Optional[subprocess.Popen] = None
        self._current_model: Optional[str] = None
        self._port: int = 8080
        self._lock = threading.Lock()
        self._server_binary: Optional[Path] = None

    # ── Public API ───────────────────────────────────────────────

    def start(self, model_path: str, port: int = 8080, **kwargs) -> bool:
        """Start llama-server with the given GGUF model.

        Auto-detects hardware and selects optimal parameters (GPU layers,
        context size, thread count, flash attention).

        Args:
            model_path: Absolute path to a .gguf model file.
            port: Port to listen on (default 8080, from port_registry 'llm').
            **kwargs: Additional overrides for server params (n_gpu_layers,
                ctx_size, threads, flash_attn, etc.).

        Returns:
            True if server started and health check passed, False otherwise.
        """
        with self._lock:
            return self._start_locked(model_path, port, **kwargs)

    def stop(self) -> bool:
        """Gracefully stop the managed llama-server process.

        Sends terminate signal, waits up to 5 seconds, then force-kills
        if the process has not exited.

        Returns:
            True if the process was stopped (or was not running), False on error.
        """
        with self._lock:
            return self._stop_locked()

    def is_running(self) -> bool:
        """Check if the managed server process is alive AND responding to health checks.

        Returns:
            True if the process is running and /health returns successfully.
        """
        if self._process is None:
            return False
        if self._process.poll() is not None:
            # Process has exited
            logger.warning(
                f"llama-server process exited with code {self._process.returncode}")
            self._process = None
            return False
        # Process alive -- verify health endpoint
        return self._check_health()

    def health(self) -> Dict:
        """Query the llama-server /health endpoint.

        Returns:
            Parsed JSON from /health on success, or an error dict.
        """
        if self._process is None:
            return {'status': 'not_running', 'error': 'No managed server process'}

        url = f'http://127.0.0.1:{self._port}/health'
        result = _http_get(url, timeout=_HEALTH_CHECK_TIMEOUT)
        if result is not None:
            return result
        return {
            'status': 'error',
            'error': 'Health endpoint did not respond',
            'port': self._port,
            'model': self._current_model,
        }
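
    # Illustrative note (added): once the model has finished loading, a healthy
    # llama-server typically answers /health with a small JSON body such as
    # {'status': 'ok'}; before that point, or once the process is gone, callers
    # see the 'not_running' / 'error' dicts constructed above instead.
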
    def swap_model(self, new_model_path: str, **kwargs) -> bool:
        """Hot-swap: stop the current model and start with a new one.

        Args:
            new_model_path: Path to the new .gguf model file.
            **kwargs: Additional server param overrides.

        Returns:
            True if the new model started successfully.
        """
        with self._lock:
            port = self._port
            logger.info(
                f"Swapping model: {self._current_model} -> {new_model_path}")
            self._stop_locked()
            return self._start_locked(new_model_path, port, **kwargs)

    def get_server_binary(self) -> Optional[Path]:
        """Locate the llama-server binary on this system.

        Search order:
            1. Cached result from a previous call
            2. System PATH (llama-server, llama-cpp-server)
            3. ~/.hevolve/bin/llama-server[.exe]

        Returns:
            Path to the binary, or None if not found.
        """
        if self._server_binary and self._server_binary.exists():
            return self._server_binary

        binary_name = _server_binary_name()

        # 1. Check PATH
        for name in ('llama-server', 'llama-cpp-server'):
            if sys.platform == 'win32':
                name += '.exe'
            found = shutil.which(name)
            if found:
                self._server_binary = Path(found)
                logger.info(f"Found llama-server on PATH: {self._server_binary}")
                return self._server_binary

        # 2. Check ~/.hevolve/bin/
        local_bin = _BIN_DIR / binary_name
        if local_bin.exists():
            self._server_binary = local_bin
            logger.info(f"Found llama-server at: {self._server_binary}")
            return self._server_binary

        logger.info("llama-server binary not found on this system")
        return None

    def download_server(self) -> Optional[Path]:
        """Download the latest llama.cpp release binary from GitHub.

        Detects the current platform, downloads the appropriate archive,
        extracts llama-server to ~/.hevolve/bin/, and makes it executable.

        Returns:
            Path to the downloaded binary, or None on failure.
        """
        platform_key = _get_platform_key()
        if not platform_key:
            logger.error(
                f"Unsupported platform: {platform.system()} {platform.machine()}")
            return None

        # Fetch latest release metadata
        logger.info("Fetching latest llama.cpp release from GitHub...")
        release = _http_get_json(_GITHUB_RELEASE_API)
        if not release:
            logger.error("Failed to fetch release info from GitHub")
            return None

        tag = release.get('tag_name', 'unknown')
        assets = release.get('assets', [])
        logger.info(f"Latest release: {tag} ({len(assets)} assets)")

        # Find matching asset
        target_asset = None
        for asset in assets:
            name = asset.get('name', '')
            # Match pattern: llama-{tag}-bin-{platform_key}.zip
            if platform_key in name and name.endswith('.zip'):
                target_asset = asset
                break

        if not target_asset:
            # Broader search: any zip containing the platform key
            for asset in assets:
                name = asset.get('name', '')
                if platform_key in name and ('.zip' in name or '.tar.gz' in name):
                    target_asset = asset
                    break

        if not target_asset:
            logger.error(
                f"No matching asset found for platform '{platform_key}' "
                f"in release {tag}. Available: "
                f"{[a['name'] for a in assets[:10]]}")
            return None

        download_url = target_asset.get('browser_download_url', '')
        asset_name = target_asset.get('name', '')
        asset_size = target_asset.get('size', 0)
        logger.info(
            f"Downloading: {asset_name} ({asset_size / 1024 / 1024:.1f} MB)")

        # Download
        data = _http_get_raw(download_url, timeout=300)
        if not data:
            return None

        # Extract
        _BIN_DIR.mkdir(parents=True, exist_ok=True)
        archive_path = _BIN_DIR / asset_name

        try:
            archive_path.write_bytes(data)
            binary_name = _server_binary_name()
            extracted_binary = None

            if asset_name.endswith('.zip'):
                with zipfile.ZipFile(archive_path, 'r') as zf:
                    # Find llama-server in the archive
                    for entry in zf.namelist():
                        basename = Path(entry).name
                        if basename == binary_name:
                            # Extract this single file to _BIN_DIR
                            source = zf.open(entry)
                            target = _BIN_DIR / binary_name
                            target.write_bytes(source.read())
                            source.close()
                            extracted_binary = target
                            break

                    if not extracted_binary:
                        # Extract all, then look for the binary
                        zf.extractall(_BIN_DIR)
                        for p in _BIN_DIR.rglob(binary_name):
                            extracted_binary = p
                            break
            else:
                # .tar.gz
                import tarfile
                with tarfile.open(archive_path, 'r:gz') as tf:
                    for member in tf.getmembers():
                        if Path(member.name).name == binary_name:
                            tf.extract(member, _BIN_DIR)
                            extracted_binary = _BIN_DIR / member.name
                            break
                    if not extracted_binary:
                        tf.extractall(_BIN_DIR)
                        for p in _BIN_DIR.rglob(binary_name):
                            extracted_binary = p
                            break

            # Clean up archive
            archive_path.unlink(missing_ok=True)

            if not extracted_binary or not extracted_binary.exists():
                logger.error(
                    f"Could not find {binary_name} in downloaded archive")
                return None

            # Move to canonical location if nested
            canonical = _BIN_DIR / binary_name
            if extracted_binary != canonical:
                shutil.move(str(extracted_binary), str(canonical))
                extracted_binary = canonical

            # Make executable (Unix)
            if sys.platform != 'win32':
                extracted_binary.chmod(
                    extracted_binary.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)

            self._server_binary = extracted_binary
            logger.info(f"llama-server installed at: {extracted_binary}")
            return extracted_binary

        except Exception as exc:
            logger.error(f"Failed to extract llama-server: {exc}")
            archive_path.unlink(missing_ok=True)
            return None

    def get_optimal_params(self, model_path: str) -> Dict[str, Any]:
        """Calculate optimal llama-server parameters based on hardware.

        Examines the GGUF file size as a proxy for model weight size, then
        checks available VRAM via vram_manager to decide GPU offload depth,
        context length, and threading.

        Args:
            model_path: Path to the .gguf model file.

        Returns:
            Dict with keys: n_gpu_layers, ctx_size, threads, flash_attn,
            host, port, and any additional flags.
        """
        params: Dict[str, Any] = {
            'n_gpu_layers': 0,
            'ctx_size': 4096,
            'threads': max(1, (os.cpu_count() or 4) // 2),
            'flash_attn': False,
            'host': '127.0.0.1',
            'port': self._port,
        }

        # Estimate model size from file
        model_size_gb = 0.0
        try:
            model_size_gb = os.path.getsize(model_path) / (1024 ** 3)
            logger.info(f"Model file size: {model_size_gb:.2f} GB")
        except OSError:
            logger.warning(f"Cannot stat model file: {model_path}")

        # Query GPU via vram_manager
        gpu_info = self._get_gpu_info()
        free_vram = gpu_info.get('free_gb', 0.0)
        cuda_available = gpu_info.get('cuda_available', False)
        total_vram = gpu_info.get('total_gb', 0.0)

        if cuda_available and free_vram > 0:
            if model_size_gb > 0 and free_vram >= model_size_gb * 1.1:
                # Enough VRAM to fit the entire model + overhead
                params['n_gpu_layers'] = -1  # All layers on GPU
                logger.info(
                    f"Full GPU offload: {free_vram:.1f} GB free >= "
                    f"{model_size_gb:.1f} GB model")
            elif model_size_gb > 0:
                # Partial offload: estimate fraction of layers that fit
                # Typical GGUF has ~32-80 layers; use ratio as heuristic
                ratio = free_vram / model_size_gb
                # Clamp to reasonable range
                estimated_layers = max(1, int(ratio * 40))  # assume ~40 layers
                params['n_gpu_layers'] = estimated_layers
                logger.info(
                    f"Partial GPU offload: {estimated_layers} layers "
                    f"({free_vram:.1f} GB free / {model_size_gb:.1f} GB model)")
            else:
                # Unknown model size, try full offload
                params['n_gpu_layers'] = -1

            # Context size: balance between LLM capability and leaving VRAM for TTS.
            # Reserve ~3GB for TTS (Indic Parler ~1.2GB model + ~2GB inference).
            # KV cache memory ≈ ctx * layers * hidden_dim * 2 (K+V) * 2 bytes (FP16).
            # Qwen 4B: 32 layers × 2560 dim × 2 (K+V) × 2 bytes ≈ 0.3 MB per token
            # (roughly 0.3 GB per 1K tokens of context).
            vram_after_model = free_vram - model_size_gb
            tts_reserve_gb = 3.0  # Reserve for GPU TTS (Indic Parler, F5, etc.)
            vram_for_ctx = vram_after_model - tts_reserve_gb
            if vram_for_ctx >= 3.0:
                params['ctx_size'] = 10240  # 10K — good balance
            elif vram_for_ctx >= 1.5:
                params['ctx_size'] = 8192
            elif vram_for_ctx >= 0.5:
                params['ctx_size'] = 4096
            else:
                params['ctx_size'] = 2048

            # Flash attention: available on modern NVIDIA GPUs (Ampere+)
            # Heuristic: if GPU name contains known architectures
            gpu_name = (gpu_info.get('name') or '').lower()
            # Ampere: RTX 30xx, A100, etc. Hopper: H100. Ada: RTX 40xx
            flash_capable_keywords = [
                'rtx 30', 'rtx 40', 'rtx 50', 'a100', 'a10', 'h100',
                'l40', 'rtx a', 'geforce 30', 'geforce 40',
            ]
            if any(kw in gpu_name for kw in flash_capable_keywords):
                params['flash_attn'] = True
                logger.info(f"Enabling flash attention for {gpu_info.get('name')}")

        else:
            # CPU-only mode
            params['n_gpu_layers'] = 0
            params['ctx_size'] = 2048  # Conservative for CPU
            # Use more threads on CPU-only
            params['threads'] = max(1, (os.cpu_count() or 4) - 1)
            logger.info("CPU-only mode: no GPU available")

        # ── ResourceGovernor cap: leave headroom for the rest of the OS ──
        # Never use ALL cores — reserve 25% for foreground apps.
        total_cores = os.cpu_count() or 4
        max_threads = max(1, int(total_cores * 0.75))
        if params['threads'] > max_threads:
            logger.info("Capping threads %d → %d (75%% of %d cores)",
                        params['threads'], max_threads, total_cores)
            params['threads'] = max_threads
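        # Illustrative arithmetic (added): on a 16-core box the GPU path starts
        # with 16 // 2 = 8 threads and the cap is int(16 * 0.75) = 12, so the
        # value is unchanged; the CPU-only path's 15 threads would be cut to 12.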

        # Cap context size based on available RAM (avoid low-memory warnings).
        # Check the stricter threshold first so very low RAM gets the smaller cap.
        try:
            import psutil
            avail_gb = psutil.virtual_memory().available / (1024**3)
            if avail_gb < 2.0 and params['ctx_size'] > 2048:
                params['ctx_size'] = 2048
                logger.info("Capping ctx_size to 2048 (only %.1fGB RAM available)", avail_gb)
            elif avail_gb < 4.0 and params['ctx_size'] > 4096:
                params['ctx_size'] = 4096
                logger.info("Capping ctx_size to 4096 (only %.1fGB RAM available)", avail_gb)
        except ImportError:
            pass

        return params
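
    # Worked example (illustrative, added): with 8.0 GB of free VRAM and a
    # 4.5 GB GGUF, 8.0 >= 4.5 * 1.1 so every layer is offloaded
    # (n_gpu_layers = -1); 8.0 - 4.5 = 3.5 GB remains, minus the 3.0 GB TTS
    # reserve leaves 0.5 GB, which falls in the ">= 0.5" bucket and yields
    # ctx_size = 4096.
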
    @property
    def current_model(self) -> Optional[str]:
        """Return the path of the currently loaded model, or None."""
        return self._current_model

    @property
    def port(self) -> int:
        """Return the port the server is (or will be) running on."""
        return self._port

    # ── Private Implementation ───────────────────────────────────

    def _start_locked(self, model_path: str, port: int, **kwargs) -> bool:
        """Start the server (caller must hold self._lock)."""
        if self._process is not None and self._process.poll() is None:
            logger.warning(
                "Server already running (PID %d) -- stop first or use swap_model()",
                self._process.pid)
            return False

        # Validate model file
        if not os.path.isfile(model_path):
            logger.error(f"Model file not found: {model_path}")
            return False

        # Find or download binary
        binary = self.get_server_binary()
        if binary is None:
            logger.info("llama-server not found, attempting download...")
            binary = self.download_server()
        if binary is None:
            logger.error(
                "Cannot start: llama-server binary not available. "
                "Install manually or check network.")
            return False

        self._port = port

        # Calculate params
        params = self.get_optimal_params(model_path)
        # Apply user overrides
        params.update(kwargs)
        params['port'] = port

        # Build command
        cmd = [
            str(binary),
            '--model', str(model_path),
            '--host', str(params.get('host', '0.0.0.0')),
            '--port', str(params['port']),
            '--ctx-size', str(params.get('ctx_size', 4096)),
            '--threads', str(params.get('threads', 2)),
            '--n-gpu-layers', str(params.get('n_gpu_layers', 0)),
        ]

        if params.get('flash_attn'):
            cmd.append('--flash-attn')

        # Pass through any extra CLI flags
        extra_args = params.get('extra_args', [])
        if extra_args:
            cmd.extend(extra_args)
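
        # Illustrative result (added; paths and values are placeholders): with
        # full GPU offload the assembled command looks roughly like
        #   llama-server --model /path/model.gguf --host 127.0.0.1 --port 8080
        #   --ctx-size 8192 --threads 6 --n-gpu-layers -1 --flash-attn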
        logger.info(f"Starting llama-server: {' '.join(cmd)}")

        # Platform-specific subprocess options
        popen_kwargs: Dict[str, Any] = {
            'stdout': subprocess.PIPE,
            'stderr': subprocess.PIPE,
        }

        if sys.platform == 'win32':
            # Hide the console window on Windows
            si = subprocess.STARTUPINFO()
            si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            si.wShowWindow = 0  # SW_HIDE
            popen_kwargs['startupinfo'] = si
            popen_kwargs['creationflags'] = subprocess.CREATE_NO_WINDOW

        try:
            self._process = subprocess.Popen(cmd, **popen_kwargs)
            logger.info(f"llama-server started (PID {self._process.pid})")
        except FileNotFoundError:
            logger.error(f"Binary not found or not executable: {binary}")
            self._process = None
            return False
        except PermissionError:
            logger.error(f"Permission denied executing: {binary}")
            self._process = None
            return False
        except OSError as exc:
            logger.error(f"Failed to start llama-server: {exc}")
            self._process = None
            return False

        # Wait for health endpoint with exponential backoff
        self._current_model = model_path
        if self._wait_for_health():
            logger.info(
                f"llama-server ready on port {port} "
                f"(model: {os.path.basename(model_path)})")
            return True
        else:
            logger.error(
                f"llama-server health check failed after {_HEALTH_START_TIMEOUT}s "
                "-- stopping process")
            self._stop_locked()
            return False

    def _stop_locked(self) -> bool:
        """Stop the server (caller must hold self._lock)."""
        if self._process is None:
            logger.debug("No server process to stop")
            return True

        pid = self._process.pid
        logger.info(f"Stopping llama-server (PID {pid})...")

        try:
            # Graceful shutdown: terminate (SIGTERM on Unix, TerminateProcess on Windows)
            self._process.terminate()

            try:
                self._process.wait(timeout=_STOP_GRACE_PERIOD)
                logger.info(f"llama-server (PID {pid}) terminated gracefully")
            except subprocess.TimeoutExpired:
                # Force kill
                logger.warning(
                    f"llama-server (PID {pid}) did not exit in "
                    f"{_STOP_GRACE_PERIOD}s -- force killing")
                self._process.kill()
                self._process.wait(timeout=5)
                logger.info(f"llama-server (PID {pid}) killed")

        except ProcessLookupError:
            logger.debug(f"Process {pid} already exited")
        except OSError as exc:
            logger.error(f"Error stopping llama-server (PID {pid}): {exc}")
            return False
        finally:
            self._process = None
            self._current_model = None

        return True

    def _check_health(self) -> bool:
        """Single health check against /health endpoint."""
        url = f'http://127.0.0.1:{self._port}/health'
        result = _http_get(url, timeout=_HEALTH_CHECK_TIMEOUT)
        return result is not None

    def _wait_for_health(self) -> bool:
        """Wait for the server health endpoint with exponential backoff.

        Polls GET /health up to _HEALTH_START_TIMEOUT seconds.
        """
        deadline = time.monotonic() + _HEALTH_START_TIMEOUT
        interval = _HEALTH_POLL_INTERVAL

        while time.monotonic() < deadline:
            # Check if process died
            if self._process is not None and self._process.poll() is not None:
                rc = self._process.returncode
                logger.error(f"llama-server exited prematurely (code {rc})")
                # Try to read stderr for diagnostics
                try:
                    stderr = self._process.stderr.read().decode('utf-8', errors='replace')
                    if stderr:
                        logger.error(f"llama-server stderr: {stderr[:1000]}")
                except Exception:
                    pass
                return False

            if self._check_health():
                return True

            time.sleep(interval)
            interval = min(interval * 1.5, _HEALTH_POLL_MAX_INTERVAL)

        return False
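
    # Illustrative timing (derived from the constants above): the poll interval
    # grows 0.5s -> 0.75s -> 1.125s -> ~1.69s and is then capped at 2.0s, so a
    # slow model load is re-checked roughly every 2 seconds until the 30-second
    # start timeout expires.
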
    @staticmethod
    def _get_gpu_info() -> Dict:
        """Query GPU info via vram_manager singleton."""
        try:
            from .vram_manager import vram_manager
            return vram_manager.detect_gpu()
        except Exception as exc:
            logger.debug(f"vram_manager unavailable: {exc}")
            return {
                'name': None,
                'total_gb': 0.0,
                'free_gb': 0.0,
                'cuda_available': False,
            }


# ── Module-level Singleton ───────────────────────────────────────

_manager: Optional[LlamaCppManager] = None
_manager_lock = threading.Lock()


def get_llamacpp_manager() -> LlamaCppManager:
    """Return the global LlamaCppManager singleton (thread-safe)."""
    global _manager
    if _manager is not None:
        return _manager

    with _manager_lock:
        if _manager is not None:
            return _manager
        _manager = LlamaCppManager()
        return _manager
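

# ── Manual smoke test (illustrative addition, not part of the original module) ──
# A minimal sketch of the lifecycle shown in the module docstring, assuming a
# path to a local GGUF file is supplied on the command line; the model path and
# port are placeholders.
if __name__ == '__main__':
    import argparse

    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='LlamaCppManager smoke test')
    parser.add_argument('model', help='Path to a local .gguf model file')
    parser.add_argument('--port', type=int, default=8080, help='Port for llama-server')
    args = parser.parse_args()

    mgr = get_llamacpp_manager()
    if mgr.start(args.model, port=args.port):
        print('health:', mgr.health())
        mgr.stop()
    else:
        print('llama-server failed to start; see log output above')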