Skip to content

Commit 6e9e952

Browse files
adeebshihadehclaude
andcommitted
Fix infinite loops and timeout handling in SPI and CAN
- SPI _wait_for_ack: default timeout=0 to 500ms, clamp to 100-500ms range - SPI _transfer: default timeout=0 to 500ms, add MAX_TIMEOUT_RETRIES (5) limit - SPI _transfer: add NACK backoff like C++ implementation - SPI _transfer: run recovery logic for ALL exception types (fixes onroad test) - CAN can_recv: max 3 retries instead of infinite loop, return [] on failure - CAN can_send_many: detect no-progress and drop after 3 retries - CAN_SEND_TIMEOUT_MS: changed from 10ms to 5ms to match C++ These changes align Python behavior with the C++ pandad implementation. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 3dd38b7 commit 6e9e952

2 files changed

Lines changed: 58 additions & 6 deletions

File tree

python/__init__.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -692,7 +692,8 @@ def set_uart_callback(self, uart, install):
692692
# The panda will NAK CAN writes when there is CAN congestion.
693693
# libusb will try to send it again, with a max timeout.
694694
# Timeout is in ms. If set to 0, the timeout is infinite.
695-
CAN_SEND_TIMEOUT_MS = 10
695+
CAN_SEND_TIMEOUT_MS = 5 # 5ms like C++ implementation
696+
CAN_MAX_RETRIES = 3
696697

697698
def can_reset_communications(self):
698699
self._handle.controlWrite(Panda.REQUEST_OUT, 0xc0, 0, 0, b'')
@@ -701,8 +702,16 @@ def can_reset_communications(self):
701702
def can_send_many(self, arr, *, fd=False, timeout=CAN_SEND_TIMEOUT_MS):
702703
snds = pack_can_buffer(arr, chunk=(not self.spi), fd=fd)
703704
for tx in snds:
705+
retries = 0
704706
while len(tx) > 0:
705707
bs = self._handle.bulkWrite(3, tx, timeout=timeout)
708+
if bs == 0:
709+
retries += 1
710+
if retries > self.CAN_MAX_RETRIES:
711+
logger.warning("CAN send: no progress after retries, dropping")
712+
break
713+
else:
714+
retries = 0
706715
tx = tx[bs:]
707716

708717
def can_send(self, addr, dat, bus, *, fd=False, timeout=CAN_SEND_TIMEOUT_MS):
@@ -711,13 +720,16 @@ def can_send(self, addr, dat, bus, *, fd=False, timeout=CAN_SEND_TIMEOUT_MS):
711720
@ensure_can_packet_version
712721
def can_recv(self):
713722
dat = bytearray()
714-
while True:
723+
for _ in range(self.CAN_MAX_RETRIES):
715724
try:
716725
dat = self._handle.bulkRead(1, 16384) # Max receive batch size + 2 extra reserve frames
717726
break
718727
except (usb1.USBErrorIO, usb1.USBErrorOverflow):
719728
logger.error("CAN: BAD RECV, RETRYING")
720-
time.sleep(0.1)
729+
time.sleep(0.01) # 10ms sleep, not 100ms
730+
else:
731+
logger.error("CAN: recv failed after retries")
732+
return []
721733
msgs, self.can_rx_overflow_buffer = unpack_can_buffer(self.can_rx_overflow_buffer + dat)
722734
return msgs
723735

python/spi.py

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@
2525
CHECKSUM_START = 0xAB
2626

2727
MIN_ACK_TIMEOUT_MS = 100
28+
MAX_ACK_TIMEOUT_MS = 500 # like C++ SPI_ACK_TIMEOUT
29+
DEFAULT_TIMEOUT_MS = 500 # default when timeout=0
2830
MAX_XFER_RETRY_COUNT = 5
31+
MAX_TIMEOUT_RETRIES = 5 # like C++
2932

3033
SPI_BUF_SIZE = 4096 # from panda/board/drivers/spi.h
3134
XFER_SIZE = SPI_BUF_SIZE - 0x40 # give some room for SPI protocol overhead
@@ -127,10 +130,14 @@ def _calc_checksum(self, data: bytes) -> int:
127130
return cksum
128131

129132
def _wait_for_ack(self, spi, ack_val: int, timeout: int, tx: int, length: int = 1) -> bytes:
130-
timeout_s = max(MIN_ACK_TIMEOUT_MS, timeout) * 1e-3
133+
# Like C++: default timeout=0 to MAX_ACK_TIMEOUT_MS, clamp to valid range
134+
if timeout == 0:
135+
timeout = MAX_ACK_TIMEOUT_MS
136+
timeout = max(MIN_ACK_TIMEOUT_MS, min(timeout, MAX_ACK_TIMEOUT_MS))
137+
timeout_s = timeout * 1e-3
131138

132139
start = time.monotonic()
133-
while (timeout == 0) or ((time.monotonic() - start) < timeout_s):
140+
while (time.monotonic() - start) < timeout_s:
134141
dat = spi.xfer2([tx, ] * length)
135142
if dat[0] == ack_val:
136143
return bytes(dat)
@@ -182,10 +189,27 @@ def _transfer(self, endpoint: int, data, timeout: int, max_rx_len: int = 1000, e
182189
logger.debug("starting transfer: endpoint=%d, max_rx_len=%d", endpoint, max_rx_len)
183190
logger.debug("==============================================")
184191

192+
# Like C++: default timeout=0 to DEFAULT_TIMEOUT_MS for individual ACK waits
193+
if timeout == 0:
194+
timeout = DEFAULT_TIMEOUT_MS
195+
185196
n = 0
197+
timeout_count = 0
198+
nack_count = 0
186199
start_time = time.monotonic()
187200
exc = PandaSpiException()
188-
while (timeout == 0) or (time.monotonic() - start_time) < timeout*1e-3:
201+
202+
# Like C++: retry based on count, not overall time limit
203+
# C++ loops while (ret < 0 && connected && !timed_out) where timed_out = timeout_count > 5
204+
# C++ only exits on consecutive ACK_TIMEOUT, not on NACK (panda is busy, keep trying)
205+
# Safety limit: 30 seconds max to prevent infinite loops (C++ uses 'connected' flag we don't have)
206+
max_time = 30.0 # seconds
207+
while (time.monotonic() - start_time) < max_time:
208+
# Like C++: stop after MAX_TIMEOUT_RETRIES consecutive timeouts
209+
if timeout_count > MAX_TIMEOUT_RETRIES:
210+
logger.warning("SPI: too many timeouts (%d), giving up", timeout_count)
211+
break
212+
189213
n += 1
190214
logger.debug("\ntry #%d", n)
191215
with self.dev.acquire() as spi:
@@ -197,6 +221,21 @@ def _transfer(self, endpoint: int, data, timeout: int, max_rx_len: int = 1000, e
197221
if self.no_retry:
198222
break
199223

224+
# Track specific error types like C++
225+
if isinstance(e, PandaSpiMissingAck):
226+
timeout_count += 1
227+
elif isinstance(e, PandaSpiNackResponse):
228+
nack_count += 1
229+
# Reset timeout count - NACK is not a timeout, panda is responding
230+
timeout_count = 0
231+
# Like C++: backoff on repeated NACKs (panda TX buffer full)
232+
if nack_count > 3:
233+
sleep_us = max(200, min(nack_count * 10, 2000))
234+
time.sleep(sleep_us * 1e-6)
235+
else:
236+
# For other errors (checksum, etc), reset timeout count
237+
timeout_count = 0
238+
200239
# ensure slave is in a consistent state and ready for the next transfer
201240
# (e.g. slave TX buffer isn't stuck full)
202241
nack_cnt = 0
@@ -209,6 +248,7 @@ def _transfer(self, endpoint: int, data, timeout: int, max_rx_len: int = 1000, e
209248
except PandaSpiException:
210249
nack_cnt = 0
211250

251+
logger.error("SPI transfer failed after %d tries, %.2fms", n, (time.monotonic() - start_time) * 1000)
212252
raise exc
213253

214254
def get_protocol_version(self) -> bytes:

0 commit comments

Comments
 (0)