1515import sys
1616import time
1717import signal
18+ import threading
1819from pathlib import Path
20+ from typing import Optional , Tuple , List , Dict , Any
1921
2022import numpy as np
2123from PIL import Image
@@ -99,6 +101,75 @@ def emit_and_reset(self):
99101 return stats
100102
101103
class TPUHealthWatchdog:
    """
    Detects two distinct TPU failure modes:

    1. Inference hang: interpreter.invoke() takes longer than `invoke_timeout_s`.
       This happens when the USB connection is lost or the TPU kernel driver locks.
       We run invoke() on a daemon thread and join with a timeout.

    2. Silent stall: The TPU keeps returning results (no hang) but every result
       is empty (0 detections) for `stall_frames` consecutive frames, AFTER the
       skill had at least `min_active_frames` successful frames earlier.
       This catches thermal throttling where the TPU resets internally.
    """

    def __init__(self, invoke_timeout_s=10, stall_frames=30, min_active_frames=5):
        """
        Args:
            invoke_timeout_s: Seconds to wait for a single invoke() before
                declaring a hang.
            stall_frames: Number of consecutive zero-detection frames that
                triggers a "stall" status.
            min_active_frames: Number of frames with at least one detection
                that must have been seen before a stall can be reported.
        """
        self.invoke_timeout_s = invoke_timeout_s
        self.stall_frames = stall_frames
        self.min_active_frames = min_active_frames

        self._consecutive_zero = 0
        self._total_frames_with_detections = 0
        # Exception raised by the most recent *completed* invoke(), if any.
        # Only ever written from the thread that calls run_invoke().
        self._invoke_exception: Optional[Exception] = None

    def run_invoke(self, interpreter):
        """Run interpreter.invoke() with a hard timeout.

        Args:
            interpreter: Any object exposing a zero-argument ``invoke()`` method.

        Raises:
            RuntimeError: If invoke() has not returned after `invoke_timeout_s`
                seconds. The worker thread is not cancelled; it is a daemon
                thread and is simply abandoned.
            Exception: Whatever invoke() itself raised, re-raised unchanged.
        """
        self._invoke_exception = None

        # The outcome lives in a per-call list rather than on `self`: a worker
        # abandoned by an earlier timed-out call may still finish (and fail)
        # later, and must not be able to clobber the result of this call.
        errors = []

        def _invoke():
            try:
                interpreter.invoke()
            except Exception as e:
                errors.append(e)

        t = threading.Thread(target=_invoke, daemon=True)
        t.start()
        t.join(timeout=self.invoke_timeout_s)

        if t.is_alive():
            # Thread is still blocked inside invoke() — TPU USB hang
            raise RuntimeError(
                f"TPU invoke() timed out after {self.invoke_timeout_s}s — "
                "USB connection may be lost or TPU is locked up"
            )

        if errors:
            self._invoke_exception = errors[0]
            raise errors[0]

    def record(self, n_detections):
        """Call after each frame. Returns a health status string or None.

        Args:
            n_detections: Number of detections produced for the frame.

        Returns:
            "stall" once at least `min_active_frames` frames have had
            detections and the current run of zero-detection frames has
            reached `stall_frames`; otherwise None. "stall" keeps being
            returned on every further empty frame until reset_stall() is
            called or a frame with detections arrives.
        """
        if n_detections > 0:
            self._total_frames_with_detections += 1
            self._consecutive_zero = 0
            return None

        self._consecutive_zero += 1

        # Only fire the stall alert after the TPU was genuinely producing results
        if (self._total_frames_with_detections >= self.min_active_frames
                and self._consecutive_zero >= self.stall_frames):
            return "stall"

        return None

    def reset_stall(self):
        """Restart the zero-detection run so the next stall needs a full `stall_frames` again."""
        self._consecutive_zero = 0
171+
172+
102173class CoralDetector :
103174 """Edge TPU object detector using ai-edge-litert with libedgetpu delegate."""
104175
@@ -108,6 +179,11 @@ def __init__(self, params):
108179 self .input_size = int (params .get ("input_size" , 320 ))
109180 self .interpreter = None
110181 self .tpu_count = 0
182+ self .watchdog = TPUHealthWatchdog (
183+ invoke_timeout_s = 10 ,
184+ stall_frames = 30 ,
185+ min_active_frames = 5 ,
186+ )
111187
112188 # Parse target classes
113189 classes_str = params .get ("classes" , "person,car,dog,cat" )
@@ -201,9 +277,13 @@ def detect_frame(self, frame_path):
201277 input_data = np .expand_dims (np .array (img_resized , dtype = np .uint8 ), axis = 0 )
202278 self .interpreter .set_tensor (input_details ["index" ], input_data )
203279
204- # Run inference
280+ # Run inference with hard timeout via watchdog
205281 t_pre = time .perf_counter ()
206- self .interpreter .invoke ()
282+ try :
283+ self .watchdog .run_invoke (self .interpreter )
284+ except RuntimeError as e :
285+ log (f"TPU invoke() failed: { e } " )
286+ return [], {}, "hang"
207287 t_infer = time .perf_counter ()
208288
209289 # Parse output tensors (works for both Edge TPU and CPU)
@@ -254,7 +334,10 @@ def detect_frame(self, frame_path):
254334 "total" : round ((t_post - t0 ) * 1000 , 2 ),
255335 }
256336
257- return objects , timings
337+ # Record with watchdog — returns "stall" if TPU has gone silent
338+ health = self .watchdog .record (len (objects ))
339+
340+ return objects , timings , health
258341
259342
260343# ─── Helpers ──────────────────────────────────────────────────────────────────
@@ -347,7 +430,20 @@ def on_signal(sig, frame):
347430 })
348431 continue
349432
350- objects , timings = detector .detect_frame (frame_path )
433+ objects , timings , health = detector .detect_frame (frame_path )
434+
435+ # Check for TPU hang (invoke timeout)
436+ if health == "hang" :
437+ emit_json ({
438+ "event" : "tpu_error" ,
439+ "frame_id" : frame_id ,
440+ "camera_id" : camera_id ,
441+ "error" : "invoke_timeout" ,
442+ "message" : "TPU invoke() timed out — USB connection may be lost" ,
443+ "retriable" : True ,
444+ })
445+ # Exit with code 1 so Aegis restarts us
446+ sys .exit (1 )
351447
352448 # Emit detections
353449 emit_json ({
@@ -358,6 +454,18 @@ def on_signal(sig, frame):
358454 "objects" : objects ,
359455 })
360456
457+ # Check for silent stall (zero results for too long)
458+ if health == "stall" :
459+ emit_json ({
460+ "event" : "tpu_error" ,
461+ "frame_id" : frame_id ,
462+ "camera_id" : camera_id ,
463+ "error" : "stall" ,
464+ "message" : "TPU has returned 0 detections for 30 consecutive frames — possible thermal throttle or silent reset" ,
465+ "retriable" : True ,
466+ })
467+ detector .watchdog .reset_stall () # Prevent repeated spam; let Aegis decide to restart
468+
361469 # Track performance
362470 if timings :
363471 perf .record (timings )
0 commit comments