Coverage for core/health_probe.py: 87.3%
55 statements
coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""Canonical runtime-state probes.
3Single source of truth for "is the daemon actually running?", "is the
4LLM server actually reachable?", "is Flask up?". Replaces the
5duplicated, drift-prone probes that previously lived inline in BOTH
6`integrations/mcp/mcp_server.py` and
7`integrations/mcp/mcp_http_bridge.py`.
9Why this module exists (root-cause notes from 2026-05-01 incident):
111. The old `daemon_enabled` probe read
12 ``os.environ.get('HEVOLVE_AGENT_ENGINE_ENABLED', 'false')`` — a
13 *config snapshot*, not the actual thread state. It returned
14 ``'false'`` even when the daemon thread was alive, because the
15 env-var auto-setter at ``integrations/social/__init__.py:348``
16 only runs `if env is None`, leaving any other unset/empty value
17 to default to `'false'`. Probes must read the actual
18 ``agent_daemon._running`` singleton state.
202. The old `llm_server` probe hit
21 ``http://localhost:{get_port('llm')}/health`` (default 8080). On
22 installs where llama-server binds to a non-default port (set via
23 ``HEVOLVE_LOCAL_LLM_URL``, ``LLAMA_CPP_PORT``, or written into
24 ``~/.nunba/llama_config.json:server_port``), this hardcoded URL
25 misses entirely. The canonical resolver
26 ``core.port_registry.get_local_llm_url()`` already walks 7
27 candidate sources and probes each — both MCP probes must route
28 through it instead of duplicating a worse version of the same
29 logic.
31Public API: each `probe_*` function returns a plain dict that the
32MCP tools serialize to JSON. Side-effect free, fast (≤200 ms total
33on a healthy host).
35Per CLAUDE.md DRY gate — no parallel implementations of these probes
36are allowed elsewhere. If you find yourself writing
37``os.environ.get('HEVOLVE_AGENT_ENGINE_ENABLED', ...)`` to figure out
38"is the daemon on?", you are in the wrong place; call
39``probe_agent_daemon()`` instead.
40"""
from __future__ import annotations
import os
from typing import Dict, Any
def probe_agent_daemon() -> Dict[str, Any]:
    """Return the actual agent daemon thread state plus config.

    Reads ``agent_daemon._running`` and ``agent_daemon._thread`` —
    NOT the ``HEVOLVE_AGENT_ENGINE_ENABLED`` env var (which is the
    pre-boot intent, not the live state). Falls back to the env var if
    the daemon module cannot be imported (extreme degraded boot).
    """
    out: Dict[str, Any] = {
        'poll_interval': int(os.environ.get('HEVOLVE_AGENT_POLL_INTERVAL', '30')),
        'max_concurrent': int(os.environ.get('HEVOLVE_AGENT_MAX_CONCURRENT', '10')),
        'speculative_enabled': (
            os.environ.get('HEVOLVE_SPECULATIVE_ENABLED', 'false').lower() == 'true'
        ),
    }
    try:
        from integrations.agent_engine.agent_daemon import agent_daemon
        out['daemon_enabled'] = bool(agent_daemon._running)
        out['daemon_thread_alive'] = bool(
            agent_daemon._thread and agent_daemon._thread.is_alive()
        )
        out['daemon_tick_count'] = int(getattr(agent_daemon, '_tick_count', 0))
    except Exception as e:
        # Degraded fallback — couldn't reach the daemon module at all.
        out['daemon_enabled'] = (
            os.environ.get('HEVOLVE_AGENT_ENGINE_ENABLED', 'false').lower() == 'true'
        )
        out['daemon_thread_alive'] = False
        out['daemon_probe_error'] = str(e)
    return out
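# Example return value on a healthy host (illustrative values only; the keys
# match the dict built above):
#
#     {
#         'poll_interval': 30,
#         'max_concurrent': 10,
#         'speculative_enabled': False,
#         'daemon_enabled': True,
#         'daemon_thread_alive': True,
#         'daemon_tick_count': 1742,
#     }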
def probe_llm(include_models: bool = False) -> Dict[str, Any]:
    """Return live LLM server state via an HTTP-fidelity probe.

    Issues an actual HTTP GET to ``<url>/models`` and checks for a 200
    response. Distinct from the TCP-only ``_probe_llm_endpoint`` in
    ``core.port_registry``, which is the cheap candidate filter for
    ``get_local_llm_url`` — that one stays TCP-only on purpose
    (sub-1 ms per candidate). This probe upgrades to HTTP fidelity
    so a half-loaded llama-server (port bound but model not ready)
    is correctly reported as ``down`` (#459).

    SRP (#458): the default response is a single HTTP request — no
    second-call side effect. Pass ``include_models=True`` when you
    actually need the model-list payload; otherwise the response body
    is discarded.

    Always returns the URL we tried, so debugging is one log line
    instead of "down" with no clue.
    """
    out: Dict[str, Any] = {}
    try:
        from core.port_registry import get_local_llm_url
        url = get_local_llm_url()
        out['url'] = url
    except Exception as e:
        out['status'] = 'probe_error'
        out['error'] = str(e)
        return out
    try:
        from core.http_pool import pooled_get
        # ``get_local_llm_url`` returns the ".../v1" suffix, so /models
        # is the OpenAI-compatible models endpoint. A 200 here proves
        # the LLM is actually serving — port-bound-but-stuck processes
        # return 5xx / connection-error / timeout.
        models_url = url.rstrip('/') + '/models'
        resp = pooled_get(models_url, timeout=2)
        if resp.status_code == 200:
            out['status'] = 'up'
            if include_models:
                try:
                    data = resp.json()
                    out['models'] = [
                        m.get('id', 'unknown')
                        for m in data.get('data', [])
                    ]
                except Exception:
                    pass
        else:
            out['status'] = 'down'
            out['code'] = resp.status_code
    except Exception as e:
        out['status'] = 'down'
        out['error'] = str(e)
    return out
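# Illustrative usage of the ``include_models`` flag (all values are examples,
# not captured output):
#
#     probe_llm()                     # {'url': 'http://localhost:8080/v1', 'status': 'up'}
#     probe_llm(include_models=True)  # adds 'models': ['<model-id>', ...]
#     # half-loaded server            # {'url': ..., 'status': 'down', 'code': 503}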
def probe_nunba_flask() -> Dict[str, Any]:
    """Return Nunba Flask server state.

    Resolves the port via the canonical
    ``core.port_registry.get_port('flask')`` resolver instead of the
    previously hardcoded :5000 literal (#460) — the env override
    ``HART_FLASK_PORT`` is honored automatically.
    """
    out: Dict[str, Any] = {}
    try:
        from core.port_registry import get_port
        port = get_port('flask')
        out['port'] = port
    except Exception as e:
        out['status'] = 'probe_error'
        out['error'] = str(e)
        return out
    try:
        from core.http_pool import pooled_get
        resp = pooled_get(f'http://localhost:{port}/health', timeout=2)
        out['status'] = ('up' if resp.status_code == 200
                         else f'status_{resp.status_code}')
        out['code'] = resp.status_code
    except Exception as e:
        out['status'] = 'down'
        out['error'] = str(e)
    return out
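# Illustrative effect of the env override (assumed behavior, per the
# docstring above):
#
#     HART_FLASK_PORT=5050  ->  probes http://localhost:5050/health
#     (unset)               ->  probes the registry default, e.g. :5000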
def probe_langchain() -> Dict[str, Any]:
    """Return langchain GPT API sidecar state.

    Resolves the port via the canonical
    ``core.port_registry.get_port('langchain')`` resolver instead of
    the previously hardcoded :6778 literal (#460) — the env override
    ``HART_LANGCHAIN_PORT`` is honored automatically.
    """
    out: Dict[str, Any] = {}
    try:
        from core.port_registry import get_port
        port = get_port('langchain')
        out['port'] = port
    except Exception as e:
        out['status'] = 'probe_error'
        out['error'] = str(e)
        return out
    try:
        from core.http_pool import pooled_get
        resp = pooled_get(f'http://localhost:{port}/health', timeout=2)
        out['status'] = 'up' if resp.status_code == 200 else 'error'
        out['code'] = resp.status_code
    except Exception:
        out['status'] = 'down'
    return out
__all__ = [
    'probe_agent_daemon',
    'probe_llm',
    'probe_nunba_flask',
    'probe_langchain',
]
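# Illustrative aggregation sketch (hypothetical; not part of the covered
# module): an MCP "health" tool could combine all four probes into one
# JSON payload along these lines.
if __name__ == '__main__':
    import json
    report = {
        'agent_daemon': probe_agent_daemon(),
        'llm': probe_llm(),
        'nunba_flask': probe_nunba_flask(),
        'langchain': probe_langchain(),
    }
    print(json.dumps(report, indent=2))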