Coverage for core / diag.py: 94.9%

39 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2core.diag — canonical thread-stack dump for live-hang diagnosis. 

3 

4WHY THIS EXISTS 

5─────────────── 

6Three call sites used to maintain their own thread-dump implementations: 

7 1. app.py `_dump_all_thread_stacks` (startup watchdog) 

8 2. main.py `/api/admin/diag/thread-dump` admin endpoint 

9 3. HARTOS security/node_watchdog.py (NodeWatchdog FROZEN restart) 

10 

11Sites 2 and 3 each carried fragile `getattr(__main__, '_dump_...')` lookup 

12chains because in frozen mode `app.py` is the entry script and the function 

13isn't reachable by normal import. Whenever the symbol moved or the frozen 

14bundle layout changed, the chain silently fell through to "thread dump 

15unavailable" — defeating the entire diagnostic purpose. 

16 

17This module is the SINGLE canonical implementation. It also publishes 

18itself on `builtins._nunba_dump_threads` so frozen-mode lookups (where 

19neither `core.diag` nor `app` may be import-resolvable from a watchdog 

20spawned by HARTOS) keep working via the same trick `app.py` uses for 

21`_nunba_trace`. 

22 

23WHO CALLS IT 

24──────────── 

25- app.py `_startup_watchdog` (15s into a stalled startup phase) 

26- main.py `/api/admin/diag/thread-dump` (operator on-demand) 

27- HARTOS security/node_watchdog.py (BEFORE killing a frozen daemon) 

28 

29WHERE THE DUMP GOES 

30─────────────────── 

31Two sinks ALWAYS: 

32 - The Python logger (may be delayed if MainThread holds the GIL). 

33 - `_nunba_trace` builtin (flushes immediately to startup_trace.log, 

34 survives GIL-held hangs — the whole reason this module exists). 

35 

36Callers can inject extra sinks (e.g., WAMP publish, metrics counter, 

37crash-reporter breadcrumb) via the `sinks` parameter without monkey-patching. 

38 

39NOT FOR PRODUCTION TELEMETRY 

40──────────────────────────── 

41Stack dumps leak file paths, env hints (cwd in tracebacks), and call-graph 

42shape. Admin endpoint MUST stay behind `require_local_or_token` on flat 

43tier and `require_central` on regional/central tier. 

44""" 

45from __future__ import annotations 

46 

47import builtins 

48import logging 

49import sys 

50import threading 

51import traceback 

52from typing import Callable, List, Optional 

53 

54logger = logging.getLogger(__name__) 

55 

56 

def _trace_sink(payload: str) -> None:
    """Forward *payload* to the startup-trace hook, if one is installed.

    The hook is looked up on every call as the ``_nunba_trace`` attribute of
    the ``builtins`` module. When that attribute is missing (or is ``None``)
    the call is a silent no-op, which covers a watchdog firing before the
    hook has been published as well as stripped-down test environments.

    Any ``Exception`` raised by the hook is suppressed: this channel is the
    last line of defence, so there is nowhere further to report its failure.
    """
    trace_hook = getattr(builtins, '_nunba_trace', None)
    if trace_hook is not None:
        try:
            trace_hook(payload)
        except Exception:
            # Deliberately dropped. Falling back to the logger would only
            # repeat what the caller has already attempted via its own sink.
            pass

74 

75 

def _logger_sink(payload: str) -> None:
    """Emit *payload* at ERROR level through this module's logger.

    Best-effort only: any ``Exception`` raised while logging is suppressed so
    that a misbehaving logging setup cannot interrupt the thread dump.
    Delivery may lag when MainThread is wedged.
    """
    try:
        logger.error(payload)
    except Exception:
        # Nothing sensible can be done about a failing logger from here.
        return

82 

83 

def _emit_to_sinks(
    payload: str,
    extra_sinks: Optional[List[Callable[[str], None]]],
) -> None:
    """Deliver *payload* to the canonical sinks, then to any extra sinks.

    The logger and trace sinks are invoked first and unconditionally. Only
    afterwards is ``extra_sinks`` touched, so neither a raising custom sink
    nor a broken ``extra_sinks`` value (non-iterable, raising iterator, ...)
    can prevent the canonical diagnostics from being written.
    """
    for sink in (_logger_sink, _trace_sink):
        try:
            sink(payload)
        except Exception:
            # A failing sink must not stop the others. The failure is not
            # logged on purpose: that would recurse if the logger is broken.
            pass

    try:
        for sink in extra_sinks or ():
            try:
                sink(payload)
            except Exception:
                pass
    except Exception:
        # `extra_sinks` itself could not be truth-tested or iterated. The
        # canonical sinks have already run, so the dump is not lost.
        pass


def dump_all_thread_stacks(
    reason: str,
    sinks: Optional[List[Callable[[str], None]]] = None,
) -> str:
    """Dump EVERY live thread (including MainThread) with its current Python
    stack frame.

    Args:
        reason: Human-readable why-this-fired (e.g. "Phase 'wmic_probe'
            stuck 30s", "admin diag", "NodeWatchdog FROZEN restart: tts").
            Included as the dump header so cross-referencing logs is easy.
        sinks: Extra one-arg callables that receive the formatted payload
            string. The default [logger, trace] sinks are ALWAYS invoked
            first, in addition to whatever you pass. Use this for WAMP
            publish, metrics, crash-reporter breadcrumb — anything that must
            NOT replace the canonical sinks. Exceptions raised by a sink, or
            by iterating ``sinks`` itself, are suppressed.

    Returns:
        The formatted multi-line dump string (also returned so test code
        can assert against it without monkey-patching the logger). If
        ``sys._current_frames()`` raises, a two-line stub naming the error is
        emitted and returned instead.
    """
    # NOTE(review): whitespace inside the string literals below is kept
    # exactly as it appears in the reviewed listing — confirm intended widths.
    header = f"[THREAD DUMP] {reason}"

    try:
        frames = sys._current_frames()
    except Exception as e:
        # _current_frames is a CPython feature; on a non-CPython runtime
        # it could raise. Emit a stub so the caller still gets feedback.
        payload = f"{header}\n (_current_frames unavailable: {e})"
        _emit_to_sinks(payload, sinks)
        return payload

    # Build the dump payload first — no I/O while collecting frames so we
    # capture a coherent snapshot even if a thread is mid-syscall.
    lines = [header]
    frame = None
    try:
        name_by_id = {t.ident: t.name for t in threading.enumerate()}
        try:
            main_ident = threading.main_thread().ident
        except Exception:
            main_ident = None

        for tid, frame in frames.items():
            tname = name_by_id.get(tid, 'unknown')
            marker = ' [MAIN]' if tid == main_ident else ''
            lines.append(f" ── Thread {tname}{marker} (id={tid}) ──")
            try:
                formatted = traceback.format_stack(frame)
                lines.append(' ' + ' '.join(formatted).rstrip())
            except Exception as fe:
                lines.append(f" (format_stack failed: {fe})")
    finally:
        # `frames` contains this very function's frame, which in turn holds
        # `frames` and `frame` as locals — a reference cycle that would keep
        # every thread's frame (and its locals) alive until the cyclic GC
        # runs. Dropping both references here breaks the cycle.
        frame = None
        frames = None

    payload = '\n'.join(lines)

    # Canonical sinks (logger + trace) run before caller-supplied ones so a
    # buggy custom sink can't suppress diagnostics.
    _emit_to_sinks(payload, sinks)

    return payload

150 

151 

152# ── Builtin publication for frozen-mode cross-module lookup ────────────── 

153# In frozen bundles HARTOS-side watchdogs may not be able to `import core.diag` 

154# because the bundle's importlib-machinery only sees Nunba's own pyc cache 

155# and HARTOS sits in a sibling site-packages. Exposing the function on 

156# `builtins` mirrors how `app.py` publishes `_nunba_trace` — the watchdog 

157# does `getattr(__import__('builtins'), '_nunba_dump_threads', None)` and 

158# gets a working callable regardless of import topology. 

159# 

160# Idempotent: re-publishing during hot-reload / repeated test imports is OK. 

# Publish the dump function on `builtins` under a fixed attribute name so it
# can be fetched with a plain `getattr` even when this module cannot be
# imported. Re-running the assignment simply rebinds the same attribute.
try:
    setattr(builtins, '_nunba_dump_threads', dump_all_thread_stacks)
except Exception:
    # Publication is best-effort; a failure here must not break the import.
    pass


__all__ = ['dump_all_thread_stacks']