Coverage for core / verified_llm.py: 0.0%
63 statements
1"""Verified-signal LLM health check.
3Why this exists
4---------------
5Shallow-signal health checks lie:
6- /health returns 200 when llama-server is up but no model is loaded
7- /v1/models returns the catalog entry even if inference is broken
8- process-alive is a proxy for "maybe working" — not a guarantee
10This module issues a real inference — POST /v1/chat/completions with
11a minimal prompt — and asserts the response contains non-empty text.
12That is the only "yes it works" signal that matters.
14Symptom class
15-------------
16Mirrors the verified_ready pattern used for TTS (see commit b84437d
17on Nunba: runtime verification of Indic Parler). Same root cause:
18success at one layer does not imply capability at the next layer.
20API
21---
22 is_llm_inference_verified(url, timeout=5.0) -> bool
23 verify_llm(url, timeout=5.0) -> dict # detail diag
25Both accept:
26- url: full http://host:port base; defaults to http://127.0.0.1:8080
27- timeout: total seconds for the HTTP call
29The bool API is a drop-in replacement for is_llm_available().
30The dict API returns {ok, reason, http_status, content_snippet,
31elapsed_ms} for deeper triage.
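
Example
-------
Illustrative call and result (the output values here are made up;
they depend entirely on the server being probed):

    >>> verify_llm("http://127.0.0.1:8080")  # doctest: +SKIP
    {'ok': True, 'reason': 'verified', 'http_status': 200,
     'content_snippet': 'Hello! How can I help?', 'elapsed_ms': 142}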
32"""

from __future__ import annotations

import json
import logging
import time
import urllib.error
import urllib.request
from typing import Any, Dict, Optional

logger = logging.getLogger(__name__)

# Exposed for tests / monkey-patch
DEFAULT_PROBE_PROMPT = "hi"
DEFAULT_BASE_URL = "http://127.0.0.1:8080"
DEFAULT_TIMEOUT = 5.0
DEFAULT_MAX_TOKENS = 4


def _probe_payload(prompt: str, max_tokens: int) -> bytes:
    body = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "stream": False,
    }
    return json.dumps(body).encode("utf-8")
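
# For reference, _probe_payload("hi", 4) serializes to the wire format below
# (json.dumps preserves the dict's insertion order):
#   {"messages": [{"role": "user", "content": "hi"}],
#    "max_tokens": 4, "temperature": 0.0, "stream": false}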


def _extract_content(data: Dict[str, Any]) -> str:
    """Pull text out of an OpenAI-compatible /v1/chat/completions response.

    Handles the two common shapes:
    - {'choices': [{'message': {'content': '...'}}]}
    - {'choices': [{'text': '...'}]}  (legacy /v1/completions)

    Returns '' if no content can be found.
    """
    choices = data.get("choices") or []
    if not choices:
        return ""
    first = choices[0] or {}
    msg = first.get("message") or {}
    content = msg.get("content")
    if isinstance(content, str) and content.strip():
        return content
    # legacy /v1/completions style
    text = first.get("text")
    if isinstance(text, str) and text.strip():
        return text
    return ""
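
# Illustrative inputs and results for _extract_content (a sketch derived from
# the two shapes above, not exhaustive):
#   {"choices": [{"message": {"content": "Hello"}}]}  -> "Hello"
#   {"choices": [{"text": "Hello"}]}                  -> "Hello"  (legacy)
#   {"choices": []} or {}                             -> ""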


def verify_llm(
    url: str = DEFAULT_BASE_URL,
    timeout: float = DEFAULT_TIMEOUT,
    prompt: str = DEFAULT_PROBE_PROMPT,
    max_tokens: int = DEFAULT_MAX_TOKENS,
) -> Dict[str, Any]:
    """Verified-signal health check via real inference.

    Returns dict:
    - ok (bool): True iff the LLM produced non-empty content
    - reason (str): human-readable failure class
    - http_status (int | None): HTTP status code if reachable
    - content_snippet (str): first 64 chars of the reply (on ok=True)
    - elapsed_ms (int): end-to-end time in milliseconds
    """
    endpoint = url.rstrip("/") + "/v1/chat/completions"
    payload = _probe_payload(prompt, max_tokens)
    started = time.monotonic()

    result: Dict[str, Any] = {
        "ok": False,
        "reason": "unknown",
        "http_status": None,
        "content_snippet": "",
        "elapsed_ms": 0,
    }

    try:
        req = urllib.request.Request(
            endpoint,
            data=payload,
            method="POST",
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            result["http_status"] = resp.status
            if resp.status != 200:
                result["reason"] = f"http_{resp.status}"
                return result
            raw = resp.read()
    except urllib.error.HTTPError as e:
        result["http_status"] = e.code
        result["reason"] = f"http_{e.code}"
        return result
    except urllib.error.URLError as e:
        result["reason"] = f"unreachable:{e.reason}"
        return result
    except Exception as e:  # timeout, socket, malformed URL
        result["reason"] = f"exception:{type(e).__name__}"
        return result
    finally:
        result["elapsed_ms"] = int((time.monotonic() - started) * 1000)

    # Parse body
    try:
        data = json.loads(raw)
    except Exception:
        result["reason"] = "malformed_json"
        return result

    content = _extract_content(data)
    if not content:
        result["reason"] = "empty_content"
        return result

    result["ok"] = True
    result["reason"] = "verified"
    result["content_snippet"] = content[:64]
    return result
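
# Illustrative failure shapes from verify_llm (the reason strings come from
# the branches above; the other field values are examples):
#   server down:      {"ok": False, "reason": "unreachable:...", "http_status": None, ...}
#   HTTP 503:         {"ok": False, "reason": "http_503", "http_status": 503, ...}
#   blank completion: {"ok": False, "reason": "empty_content", "http_status": 200, ...}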


def is_llm_inference_verified(
    url: str = DEFAULT_BASE_URL,
    timeout: float = DEFAULT_TIMEOUT,
) -> bool:
    """Boolean drop-in for is_llm_available().

    Returns True iff a real /v1/chat/completions probe produced
    non-empty content within `timeout` seconds.
    """
    try:
        return verify_llm(url=url, timeout=timeout)["ok"]
    except Exception as exc:  # defense-in-depth — never raise
        logger.debug("verify_llm raised unexpectedly: %s", exc)
        return False
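
# Typical call site (a sketch; the caller and the 2-second budget are
# assumptions, not part of this module's contract):
#   if not is_llm_inference_verified(timeout=2.0):
#       raise RuntimeError("LLM backend is not serving inference")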


__all__ = [
    "DEFAULT_BASE_URL",
    "DEFAULT_PROBE_PROMPT",
    "DEFAULT_TIMEOUT",
    "DEFAULT_MAX_TOKENS",
    "is_llm_inference_verified",
    "verify_llm",
]
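

if __name__ == "__main__":  # pragma: no cover
    # Minimal manual probe; a sketch, not part of the public API. The
    # optional argv[1] base URL is an assumption for ad-hoc use:
    #   python verified_llm.py http://127.0.0.1:8080
    import sys

    base_url = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_BASE_URL
    print(json.dumps(verify_llm(url=base_url), indent=2))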