Coverage for core / diag.py: 94.9%
39 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
"""
core.diag — canonical thread-stack dump for live-hang diagnosis.

WHY THIS EXISTS
───────────────
Three call sites used to maintain their own thread-dump implementations:
  1. app.py `_dump_all_thread_stacks` (startup watchdog)
  2. main.py `/api/admin/diag/thread-dump` admin endpoint
  3. HARTOS security/node_watchdog.py (NodeWatchdog FROZEN restart)

Sites 2 and 3 each carried fragile `getattr(__main__, '_dump_...')` lookup
chains because in frozen mode `app.py` is the entry script and the function
isn't reachable by normal import. Whenever the symbol moved or the frozen
bundle layout changed, the chain silently fell through to "thread dump
unavailable" — defeating the entire diagnostic purpose.

This module is the SINGLE canonical implementation. It also publishes
itself on `builtins._nunba_dump_threads` so frozen-mode lookups (where
neither `core.diag` nor `app` may be import-resolvable from a watchdog
spawned by HARTOS) keep working via the same trick `app.py` uses for
`_nunba_trace`.

WHO CALLS IT
────────────
- app.py `_startup_watchdog` (15s into a stalled startup phase)
- main.py `/api/admin/diag/thread-dump` (operator on-demand)
- HARTOS security/node_watchdog.py (BEFORE killing a frozen daemon)

WHERE THE DUMP GOES
───────────────────
Two sinks ALWAYS:
  - The Python logger (may be delayed if MainThread holds the GIL).
  - `_nunba_trace` builtin (flushes immediately to startup_trace.log,
    survives GIL-held hangs — the whole reason this module exists).

Callers can inject extra sinks (e.g., WAMP publish, metrics counter,
crash-reporter breadcrumb) via the `sinks` parameter without monkey-patching.

NOT FOR PRODUCTION TELEMETRY
────────────────────────────
Stack dumps leak file paths, env hints (cwd in tracebacks), and call-graph
shape. Admin endpoint MUST stay behind `require_local_or_token` on flat
tier and `require_central` on regional/central tier.
"""
from __future__ import annotations

import builtins
import logging
import sys
import threading
import traceback
from typing import Callable, Iterable, List, Optional
# Module-level logger; one of the two always-on sinks for thread dumps.
logger = logging.getLogger(__name__)
def _trace_sink(payload: str) -> None:
    """Write to the startup trace channel (immediate flush, GIL-resilient).

    The channel is whatever callable is currently stored on
    ``builtins._nunba_trace``; it is looked up on every call. That builtin
    is published by app.py at process boot. If the watchdog runs before
    app.py finished initialising (or in a stripped test environment) the
    attribute is absent and this function is a silent no-op.

    Args:
        payload: The fully formatted text to hand to the trace callable.

    Returns:
        None. Never raises for ``Exception`` subclasses thrown by the
        trace callable.
    """
    trace = getattr(builtins, '_nunba_trace', None)
    if trace is None:
        return
    try:
        trace(payload)
    except Exception:
        # The trace sink itself failing is a last-line-of-defence failure;
        # we cannot recurse into logger.error here because that's what the
        # caller already tried. Swallow.
        pass
def _logger_sink(payload: str) -> None:
    """Write to the Python logger. May be delayed if MainThread is wedged.

    The payload is emitted at ERROR level on this module's ``logger``.
    Any ``Exception`` raised while logging is swallowed so that a broken
    logging setup cannot prevent the remaining sinks from running.

    Args:
        payload: The fully formatted text to log.
    """
    try:
        logger.error(payload)
    except Exception:
        # Best-effort sink: there is nowhere safer left to report this.
        pass
def _emit_to_sinks(
    payload: str,
    sinks: Optional[Iterable[Callable[[str], None]]],
) -> None:
    """Deliver *payload* to the canonical sinks, then to any extra sinks.

    The logger and trace sinks always run first so a buggy custom sink
    cannot suppress diagnostics. Each sink is isolated: an ``Exception``
    raised by one sink does not stop the others.
    """
    for sink in (_logger_sink, _trace_sink, *(sinks or ())):
        try:
            sink(payload)
        except Exception:
            # A failing sink must not stop the others. We deliberately do
            # not log the failure (would recurse if logger sink is broken).
            pass


def dump_all_thread_stacks(
    reason: str,
    sinks: Optional[Iterable[Callable[[str], None]]] = None,
) -> str:
    """Dump EVERY live thread (including MainThread) with its current Python
    stack frame.

    Args:
        reason: Human-readable why-this-fired (e.g. "Phase 'wmic_probe'
            stuck 30s", "admin diag", "NodeWatchdog FROZEN restart: tts").
            Included as the dump header so cross-referencing logs is easy.
        sinks: Extra one-arg callables that receive the formatted payload
            string. Any iterable is accepted. The default [logger, trace]
            sinks are ALWAYS invoked in addition to whatever you pass. Use
            this for WAMP publish, metrics, crash-reporter breadcrumb —
            anything that must NOT replace the canonical sinks.

    Returns:
        The formatted multi-line dump string (also returned so test code
        can assert against it without monkey-patching the logger). If
        ``sys._current_frames()`` raises, a two-line stub naming the error
        is emitted and returned instead.
    """
    header = f"[THREAD DUMP] {reason}"

    # Build the dump payload first — no I/O while collecting frames so we
    # capture a coherent snapshot even if a thread is mid-syscall.
    try:
        frames = sys._current_frames()
    except Exception as e:
        # _current_frames is a CPython feature; on a non-CPython runtime
        # it could raise. Emit a stub so the caller still gets feedback.
        payload = f"{header}\n (_current_frames unavailable: {e})"
        _emit_to_sinks(payload, sinks)
        return payload

    # Map thread idents to names; frames whose ident is not in
    # threading.enumerate() are reported as 'unknown'.
    name_by_id = {t.ident: t.name for t in threading.enumerate()}
    try:
        main_ident = threading.main_thread().ident
    except Exception:
        main_ident = None

    lines = [header]
    for tid, frame in frames.items():
        tname = name_by_id.get(tid, 'unknown')
        marker = ' [MAIN]' if tid == main_ident else ''
        lines.append(f" ── Thread {tname}{marker} (id={tid}) ──")
        try:
            formatted = traceback.format_stack(frame)
            lines.append(' ' + ' '.join(formatted).rstrip())
        except Exception as fe:
            # One unformattable frame must not cost us the other threads.
            lines.append(f" (format_stack failed: {fe})")

    payload = '\n'.join(lines)
    _emit_to_sinks(payload, sinks)
    return payload
# ── Builtin publication for frozen-mode cross-module lookup ──────────────
# In frozen bundles HARTOS-side watchdogs may not be able to `import core.diag`
# because the bundle's importlib-machinery only sees Nunba's own pyc cache
# and HARTOS sits in a sibling site-packages. Exposing the function on
# `builtins` mirrors how `app.py` publishes `_nunba_trace` — the watchdog
# does `getattr(__import__('builtins'), '_nunba_dump_threads', None)` and
# gets a working callable regardless of import topology.
#
# Idempotent: re-publishing during hot-reload / repeated test imports is OK.
try:
    builtins._nunba_dump_threads = dump_all_thread_stacks  # type: ignore[attr-defined]
except Exception:
    # Publication is a convenience; failing to publish must not break import.
    pass


__all__ = ['dump_all_thread_stacks']