Coverage for core / verified_llm.py: 0.0% (63 statements)


1"""Verified-signal LLM health check. 

2 

3Why this exists 

4--------------- 

5Shallow-signal health checks lie: 

6- /health returns 200 when llama-server is up but no model is loaded 

7- /v1/models returns the catalog entry even if inference is broken 

8- process-alive is a proxy for "maybe working" — not a guarantee 

9 

10This module issues a real inference — POST /v1/chat/completions with 

11a minimal prompt — and asserts the response contains non-empty text. 

12That is the only "yes it works" signal that matters. 

13 

14Symptom class 

15------------- 

16Mirrors the verified_ready pattern used for TTS (see commit b84437d 

17on Nunba: runtime verification of Indic Parler). Same root cause: 

18success at one layer does not imply capability at the next layer. 

19 

20API 

21--- 

22 is_llm_inference_verified(url, timeout=5.0) -> bool 

23 verify_llm(url, timeout=5.0) -> dict # detail diag 

24 

25Both accept: 

26- url: full http://host:port base; defaults to http://127.0.0.1:8080 

27- timeout: total seconds for the HTTP call 

28 

29The bool API is a drop-in replacement for is_llm_available(). 

30The dict API returns {ok, reason, http_status, content_snippet, 

31elapsed_ms} for deeper triage. 

32""" 

from __future__ import annotations

import json
import logging
import time
import urllib.error
import urllib.request
from typing import Any, Dict

logger = logging.getLogger(__name__)

# Exposed for tests / monkey-patch
DEFAULT_PROBE_PROMPT = "hi"
DEFAULT_BASE_URL = "http://127.0.0.1:8080"
DEFAULT_TIMEOUT = 5.0
DEFAULT_MAX_TOKENS = 4
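# The constants above are the probe's canonical defaults for tests. For a fully
# offline unit test, the HTTP layer itself can be faked (pytest sketch;
# `fake_urlopen` is a hypothetical helper, not defined in this module):
#   monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen)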


def _probe_payload(prompt: str, max_tokens: int) -> bytes:
    """Build the JSON body for a minimal /v1/chat/completions probe."""
    body = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "stream": False,
    }
    return json.dumps(body).encode("utf-8")
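# For reference, the wire format of the probe request: _probe_payload("hi", 4)
# serializes to
#   {"messages": [{"role": "user", "content": "hi"}],
#    "max_tokens": 4, "temperature": 0.0, "stream": false}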


def _extract_content(data: Dict[str, Any]) -> str:
    """Pull text out of an OpenAI-compatible /v1/chat/completions response.

    Handles the 2 common shapes:
    - {'choices': [{'message': {'content': '...'}}]}
    - {'choices': [{'text': '...'}]}   (legacy /v1/completions)

    Returns '' if no content can be found.
    """
    choices = data.get("choices") or []
    if not choices:
        return ""
    first = choices[0] or {}
    msg = first.get("message") or {}
    content = msg.get("content")
    if isinstance(content, str) and content.strip():
        return content
    # legacy /v1/completions style
    text = first.get("text")
    if isinstance(text, str) and text.strip():
        return text
    return ""


def verify_llm(
    url: str = DEFAULT_BASE_URL,
    timeout: float = DEFAULT_TIMEOUT,
    prompt: str = DEFAULT_PROBE_PROMPT,
    max_tokens: int = DEFAULT_MAX_TOKENS,
) -> Dict[str, Any]:
    """Verified-signal health check via real inference.

    Returns dict:
    - ok (bool): True iff the LLM produced non-empty content
    - reason (str): human-readable failure class
    - http_status (int | None): HTTP status code if reachable
    - content_snippet (str): first 64 chars of the reply (on ok=True)
    - elapsed_ms (int): end-to-end time
    """
    endpoint = url.rstrip("/") + "/v1/chat/completions"
    payload = _probe_payload(prompt, max_tokens)
    started = time.monotonic()

    result: Dict[str, Any] = {
        "ok": False,
        "reason": "unknown",
        "http_status": None,
        "content_snippet": "",
        "elapsed_ms": 0,
    }

    try:
        req = urllib.request.Request(
            endpoint,
            data=payload,
            method="POST",
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            result["http_status"] = resp.status
            if resp.status != 200:
                result["reason"] = f"http_{resp.status}"
                return result
            raw = resp.read()
    except urllib.error.HTTPError as e:
        result["http_status"] = e.code
        result["reason"] = f"http_{e.code}"
        return result
    except urllib.error.URLError as e:
        result["reason"] = f"unreachable:{e.reason}"
        return result
    except Exception as e:  # timeout, socket, malformed URL
        result["reason"] = f"exception:{type(e).__name__}"
        return result
    finally:
        result["elapsed_ms"] = int((time.monotonic() - started) * 1000)

    # Parse body
    try:
        data = json.loads(raw)
    except Exception:
        result["reason"] = "malformed_json"
        return result

    content = _extract_content(data)
    if not content:
        result["reason"] = "empty_content"
        return result

    result["ok"] = True
    result["reason"] = "verified"
    result["content_snippet"] = content[:64]
    return result
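# Illustrative return values (example figures, not captured output):
#   {"ok": True, "reason": "verified", "http_status": 200,
#    "content_snippet": "Hello! How can I help", "elapsed_ms": 240}
#   {"ok": False, "reason": "unreachable:[Errno 111] Connection refused",
#    "http_status": None, "content_snippet": "", "elapsed_ms": 2}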


def is_llm_inference_verified(
    url: str = DEFAULT_BASE_URL,
    timeout: float = DEFAULT_TIMEOUT,
) -> bool:
    """Boolean drop-in for is_llm_available().

    Returns True iff a real /v1/chat/completions probe produced
    non-empty content within `timeout` seconds.
    """
    try:
        return verify_llm(url=url, timeout=timeout)["ok"]
    except Exception as exc:  # defense-in-depth — never raise
        logger.debug("verify_llm raised unexpectedly: %s", exc)
        return False
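# Call sites can swap it in without renaming (sketch; the original import
# location of is_llm_available is an assumption):
#   from core.verified_llm import is_llm_inference_verified as is_llm_available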


__all__ = [
    "DEFAULT_BASE_URL",
    "DEFAULT_PROBE_PROMPT",
    "DEFAULT_TIMEOUT",
    "DEFAULT_MAX_TOKENS",
    "is_llm_inference_verified",
    "verify_llm",
]
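

# Manual probe sketch, not part of the public API. Running it as
# `python -m core.verified_llm [base_url]` assumes `core` is an importable
# package from the working directory.
if __name__ == "__main__":  # pragma: no cover
    import sys

    base_url = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_BASE_URL
    print(json.dumps(verify_llm(base_url), indent=2))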