Skip to content

Commit 62387b2

Browse files
committed
feat(coral-tpu): TPU health watchdog — detect hangs and silent stalls
Two failure modes are now monitored. (1) invoke() hang: interpreter.invoke() runs on a daemon thread and is joined with a 10s timeout; if it does not return, we emit a tpu_error/invoke_timeout event and exit(1) so Aegis restarts the skill. (2) Silent stall: once at least 5 frames have produced detections, 30 consecutive frames with zero objects trigger a tpu_error/stall event. The stall counter is reset after reporting, so Aegis gets one clear signal per episode rather than a flood of events. Both events include retriable:true so Aegis can auto-restart.
1 parent 18aca72 commit 62387b2

1 file changed

Lines changed: 112 additions & 4 deletions

File tree

  • skills/detection/yolo-detection-2026-coral-tpu/scripts

skills/detection/yolo-detection-2026-coral-tpu/scripts/detect.py

Lines changed: 112 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
import sys
1616
import time
1717
import signal
18+
import threading
1819
from pathlib import Path
20+
from typing import Optional, Tuple, List, Dict, Any
1921

2022
import numpy as np
2123
from PIL import Image
@@ -99,6 +101,75 @@ def emit_and_reset(self):
99101
return stats
100102

101103

104+
class TPUHealthWatchdog:
    """
    Detects two distinct TPU failure modes:

    1. Inference hang: interpreter.invoke() takes longer than `invoke_timeout_s`.
       This happens when the USB connection is lost or the TPU kernel driver locks.
       We run invoke() on a daemon thread and join with a timeout.

    2. Silent stall: The TPU keeps returning results (no hang) but every result
       is empty (0 detections) for `stall_frames` consecutive frames, AFTER the
       skill had at least `min_active_frames` frames with detections earlier.
       This catches thermal throttling where the TPU resets internally.
    """

    def __init__(self, invoke_timeout_s: float = 10, stall_frames: int = 30,
                 min_active_frames: int = 5):
        """
        Args:
            invoke_timeout_s: Seconds to wait for a single invoke() call
                before treating it as a hang.
            stall_frames: Number of consecutive zero-detection frames that
                counts as a stall.
            min_active_frames: Number of frames with at least one detection
                that must have been seen before a stall can be reported.
        """
        self.invoke_timeout_s = invoke_timeout_s
        self.stall_frames = stall_frames
        self.min_active_frames = min_active_frames

        # Length of the current run of frames that produced no detections.
        self._consecutive_zero = 0
        # Lifetime count of frames that produced at least one detection.
        self._total_frames_with_detections = 0
        # Exception raised by invoke() during the most recent run_invoke()
        # call, or None. Only run_invoke() itself writes this attribute.
        self._invoke_exception: Optional[Exception] = None

    def run_invoke(self, interpreter):
        """
        Run interpreter.invoke() with a hard timeout.

        Args:
            interpreter: Any object exposing a zero-argument invoke() method.

        Raises:
            RuntimeError: invoke() did not return within `invoke_timeout_s`.
            Exception: Whatever invoke() itself raised, re-raised unchanged.
        """
        self._invoke_exception = None

        # The outcome lives in a per-call container rather than on `self`:
        # a worker left over from an earlier timed-out call may still finish
        # later, and it must not be able to inject its exception into the
        # result of a subsequent call.
        outcome = {}

        def _invoke():
            try:
                interpreter.invoke()
            except Exception as e:
                outcome["error"] = e

        # Daemon thread: a worker that never returns cannot keep the process
        # alive once the caller decides to exit.
        t = threading.Thread(target=_invoke, daemon=True)
        t.start()
        t.join(timeout=self.invoke_timeout_s)

        if t.is_alive():
            # Thread is still blocked inside invoke() — TPU USB hang.
            # The worker cannot be cancelled; it is simply abandoned.
            raise RuntimeError(
                f"TPU invoke() timed out after {self.invoke_timeout_s}s — "
                "USB connection may be lost or TPU is locked up"
            )

        error = outcome.get("error")
        if error is not None:
            self._invoke_exception = error
            raise error

    def record(self, n_detections) -> Optional[str]:
        """
        Call after each frame with the number of detections it produced.

        Returns:
            "stall" when at least `min_active_frames` frames with detections
            have been seen and the current run of empty frames has reached
            `stall_frames`; otherwise None. Once the threshold is reached,
            "stall" is returned for every further empty frame until
            reset_stall() is called or a frame with detections arrives.
        """
        if n_detections > 0:
            self._total_frames_with_detections += 1
            self._consecutive_zero = 0
            return None

        self._consecutive_zero += 1

        # Only fire the stall alert after the TPU was genuinely producing results
        if (self._total_frames_with_detections >= self.min_active_frames
                and self._consecutive_zero >= self.stall_frames):
            return "stall"

        return None

    def reset_stall(self):
        """Restart the empty-frame run so the next stall needs a full `stall_frames` run again."""
        self._consecutive_zero = 0
171+
172+
102173
class CoralDetector:
103174
"""Edge TPU object detector using ai-edge-litert with libedgetpu delegate."""
104175

@@ -108,6 +179,11 @@ def __init__(self, params):
108179
self.input_size = int(params.get("input_size", 320))
109180
self.interpreter = None
110181
self.tpu_count = 0
182+
self.watchdog = TPUHealthWatchdog(
183+
invoke_timeout_s=10,
184+
stall_frames=30,
185+
min_active_frames=5,
186+
)
111187

112188
# Parse target classes
113189
classes_str = params.get("classes", "person,car,dog,cat")
@@ -201,9 +277,13 @@ def detect_frame(self, frame_path):
201277
input_data = np.expand_dims(np.array(img_resized, dtype=np.uint8), axis=0)
202278
self.interpreter.set_tensor(input_details["index"], input_data)
203279

204-
# Run inference
280+
# Run inference with hard timeout via watchdog
205281
t_pre = time.perf_counter()
206-
self.interpreter.invoke()
282+
try:
283+
self.watchdog.run_invoke(self.interpreter)
284+
except RuntimeError as e:
285+
log(f"TPU invoke() failed: {e}")
286+
return [], {}, "hang"
207287
t_infer = time.perf_counter()
208288

209289
# Parse output tensors (works for both Edge TPU and CPU)
@@ -254,7 +334,10 @@ def detect_frame(self, frame_path):
254334
"total": round((t_post - t0) * 1000, 2),
255335
}
256336

257-
return objects, timings
337+
# Record with watchdog — returns "stall" if TPU has gone silent
338+
health = self.watchdog.record(len(objects))
339+
340+
return objects, timings, health
258341

259342

260343
# ─── Helpers ──────────────────────────────────────────────────────────────────
@@ -347,7 +430,20 @@ def on_signal(sig, frame):
347430
})
348431
continue
349432

350-
objects, timings = detector.detect_frame(frame_path)
433+
objects, timings, health = detector.detect_frame(frame_path)
434+
435+
# Check for TPU hang (invoke timeout)
436+
if health == "hang":
437+
emit_json({
438+
"event": "tpu_error",
439+
"frame_id": frame_id,
440+
"camera_id": camera_id,
441+
"error": "invoke_timeout",
442+
"message": "TPU invoke() timed out — USB connection may be lost",
443+
"retriable": True,
444+
})
445+
# Exit with code 1 so Aegis restarts us
446+
sys.exit(1)
351447

352448
# Emit detections
353449
emit_json({
@@ -358,6 +454,18 @@ def on_signal(sig, frame):
358454
"objects": objects,
359455
})
360456

457+
# Check for silent stall (zero results for too long)
458+
if health == "stall":
459+
emit_json({
460+
"event": "tpu_error",
461+
"frame_id": frame_id,
462+
"camera_id": camera_id,
463+
"error": "stall",
464+
"message": "TPU has returned 0 detections for 30 consecutive frames — possible thermal throttle or silent reset",
465+
"retriable": True,
466+
})
467+
detector.watchdog.reset_stall() # Prevent repeated spam; let Aegis decide to restart
468+
361469
# Track performance
362470
if timings:
363471
perf.record(timings)

0 commit comments

Comments
 (0)