Skip to content

Commit 487557a

Browse files
committed
Move transient-error classification into the SDK; simplify retry loop
Address review feedback on the upload retry: - The retry decision now delegates to APIFailure.is_transient_error() (socketdev>=3.3.0, SocketDev/socket-sdk-python#93), which classifies by the HTTP status code the SDK records when raising. The CLI no longer encodes the SDK's exception hierarchy or parses status codes out of message text, so SDK restructuring can't silently break the classification. - The backoff schedule is now the single source of truth for the loop: FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0, None), where each entry is the wait before the next attempt and the final None re-raises instead of retrying. FULL_SCAN_UPLOAD_MAX_ATTEMPTS is computed from its length. Note: uv.lock is intentionally not regenerated yet - socketdev 3.3.0 must be released to PyPI first (blocked on socket-sdk-python#93).
1 parent 8dc0970 commit 487557a

4 files changed

Lines changed: 81 additions & 89 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,14 @@
1010
jitter). Such failures are intermittent and a retried upload almost always succeeds.
1111
In these failure modes the server never finished reading the request body, so no scan
1212
was created and a retry does not duplicate one; in the rare case where a gateway
13-
timeout races a request the server later
14-
completes, the extra scan is benign and superseded by the retried one (as if the CLI had
15-
run twice).
13+
timeout races a request the server later completes, the extra scan is benign and
14+
superseded by the retried one (as if the CLI had run twice).
1615
Non-transient errors (400/401/403/404/429 and error payloads) are never retried. Each
1716
retry logs a warning explaining what failed and when the next attempt happens.
17+
- Requires `socketdev>=3.3.0`: the SDK now records the HTTP status code on the exceptions
18+
it raises and owns the transient-vs-deterministic classification
19+
(`APIFailure.is_transient_error()`), so the CLI no longer parses status codes out of
20+
exception message text.
1821

1922
## 2.4.7
2023

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ dependencies = [
1616
'GitPython',
1717
'packaging',
1818
'python-dotenv',
19-
"socketdev>=3.2.1,<4.0.0",
19+
"socketdev>=3.3.0,<4.0.0",
2020
"bs4>=0.0.2",
2121
"markdown>=3.10",
2222
"brotli>=1.0.9; platform_python_implementation == 'CPython'",

socketsecurity/core/__init__.py

Lines changed: 20 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,7 @@
1414
if TYPE_CHECKING:
1515
from socketsecurity.config import CliConfig
1616
from socketdev import socketdev
17-
from socketdev.exceptions import (
18-
APIBadGateway,
19-
APIConnectionError,
20-
APIFailure,
21-
APITimeout,
22-
)
17+
from socketdev.exceptions import APIFailure
2318
from socketdev.fullscans import FullScanParams, SocketArtifact
2419
from socketdev.org import Organization
2520
from socketdev.repos import RepositoryInfo
@@ -84,44 +79,18 @@
8479

8580
# Full scan upload retry policy. An upload can fail transiently at the gateway/connection
8681
# level (an HTTP 502/503/504/408, a dropped or reset connection, or a client-side timeout)
87-
# without the server having created the scan. In these failure modes no scan was created,
88-
# so a retry does not duplicate one. (A duplicate is possible only if a gateway timeout
89-
# races a request the server later completes; that is benign - the retried scan supersedes
90-
# the orphaned one, same as running the CLI twice.)
91-
FULL_SCAN_UPLOAD_MAX_ATTEMPTS = 3
92-
# Wait before retry attempt 2 and attempt 3 respectively (plus a little jitter so a fleet of
93-
# CI jobs hitting the same failure doesn't retry in lock-step).
94-
FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0)
82+
# without the server having created the scan; the SDK classifies those failures via
83+
# APIFailure.is_transient_error() (socketdev>=3.3.0). In these failure modes no scan was
84+
# created, so a retry does not duplicate one. (A duplicate is possible only if a gateway
85+
# timeout races a request the server later completes; that is benign - the retried scan
86+
# supersedes the orphaned one, same as running the CLI twice.)
87+
#
88+
# Each schedule entry is the wait before the next attempt once the current one fails (plus
89+
# a little jitter so a fleet of CI jobs hitting the same failure doesn't retry in
90+
# lock-step); the final None means the last attempt's failure is re-raised, not retried.
91+
FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0, None)
92+
FULL_SCAN_UPLOAD_MAX_ATTEMPTS = len(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS)
9593
FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS = 2.0
96-
# Transient gateway/timeout HTTP statuses that the SDK does NOT raise as a dedicated
97-
# exception class (502 has APIBadGateway; 408/503/504 surface as the catch-all APIFailure
98-
# with the status only present in the message text - see _is_transient_full_scan_upload_error).
99-
FULL_SCAN_UPLOAD_RETRYABLE_STATUS_CODES = frozenset({408, 503, 504})
100-
# Matches the status code the SDK embeds in catch-all APIFailure messages
101-
# (socketdev/core/api.py: "Bad Request: HTTP original_status_code:<code> ...").
102-
_API_FAILURE_STATUS_CODE_RE = re.compile(r"original_status_code:(\d{3})")
103-
104-
105-
def _is_transient_full_scan_upload_error(error: Exception) -> bool:
106-
"""Whether a full-scan upload failure is transient and safe to retry.
107-
108-
Transient means the failure happened at the gateway/connection level, normally before the
109-
server finished reading the request body (so no scan was created server-side): HTTP
110-
502/503/504/408, client-side timeouts, and dropped/reset connections. 4xx client errors
111-
(400/401/403/404/429) and success responses carrying an error payload are never retried.
112-
"""
113-
if isinstance(error, (APIBadGateway, APIConnectionError, APITimeout)):
114-
# 502 / connection reset-dropped / request timeout - the SDK raises dedicated classes.
115-
return True
116-
if type(error) is APIFailure:
117-
# The SDK raises 408/503/504 (and every other status without a dedicated class,
118-
# including 400) as the catch-all APIFailure, so match on the exact class plus the
119-
# status code embedded in the message. Subclasses (APIAccessDenied, APIResourceNotFound,
120-
# APIInsufficientQuota, ...) are deliberately excluded - those are never transient.
121-
match = _API_FAILURE_STATUS_CODE_RE.search(str(error))
122-
if match:
123-
return int(match.group(1)) in FULL_SCAN_UPLOAD_RETRYABLE_STATUS_CODES
124-
return False
12594

12695

12796
def _humanize_alert_type(alert_type: str) -> str:
@@ -837,19 +806,19 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths:
837806
# Retry transient gateway/timeout failures (502/503/504/408, dropped connections,
838807
# timeouts) with increasing waits. In these failure modes the server never finished
839808
# reading the request body, so no scan was created and a retry does not duplicate
840-
# one (see the retry-policy comment above FULL_SCAN_UPLOAD_MAX_ATTEMPTS). fullscans.post()
841-
# rebuilds its lazy file loaders from the plain paths in upload_files on every call,
842-
# so simply calling it again per attempt is safe. The loop must stay inside this try
843-
# so the temp .br files (cleaned up in the finally below) outlive every attempt.
844-
for attempt in range(1, FULL_SCAN_UPLOAD_MAX_ATTEMPTS + 1):
809+
# one (see the retry-policy comment above FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS).
810+
# fullscans.post() rebuilds its lazy file loaders from the plain paths in
811+
# upload_files on every call, so simply calling it again per attempt is safe. The
812+
# loop must stay inside this try so the temp .br files (cleaned up in the finally
813+
# below) outlive every attempt.
814+
for attempt, backoff_seconds in enumerate(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS, start=1):
845815
try:
846816
res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths)
847817
break
848818
except APIFailure as error:
849-
if attempt >= FULL_SCAN_UPLOAD_MAX_ATTEMPTS or not _is_transient_full_scan_upload_error(error):
819+
if backoff_seconds is None or not error.is_transient_error():
850820
raise
851-
backoff_index = min(attempt, len(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS)) - 1
852-
wait_seconds = FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS[backoff_index] + random.uniform(
821+
wait_seconds = backoff_seconds + random.uniform(
853822
0, FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS
854823
)
855824
# SDK error messages can span many lines (path + response headers); the

tests/unit/test_full_scan_retry.py

Lines changed: 54 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
33
A `POST /orgs/<org>/full-scans` upload can fail transiently (an HTTP 502/503/504/408, a
44
dropped or reset connection, or a timeout) without the server having created the scan.
5-
`Core.create_full_scan` retries those transient failures; these tests cover the retry
6-
classification, the loop bounds, and that the temporary brotli-compressed facts files
7-
survive until every attempt has finished.
5+
`Core.create_full_scan` retries the failures the SDK classifies as transient
6+
(`APIFailure.is_transient_error()`, socketdev>=3.3.0); these tests cover the retry
7+
decision, the loop bounds, and that the temporary brotli-compressed facts files survive
8+
until every attempt has finished.
89
"""
910

1011
import logging
@@ -26,7 +27,6 @@
2627
SOCKET_FACTS_BROTLI_FILENAME,
2728
SOCKET_FACTS_FILENAME,
2829
Core,
29-
_is_transient_full_scan_upload_error,
3030
)
3131

3232

@@ -46,13 +46,14 @@ def _success_response():
4646
return response
4747

4848

49-
# Catch-all APIFailure messages as the SDK formats them (socketdev/core/api.py); only the
50-
# embedded original_status_code distinguishes a transient 503/504/408 from e.g. a 400.
49+
# Catch-all APIFailure as the SDK raises it for statuses without a dedicated class
50+
# (socketdev/core/api.py); the recorded status_code drives is_transient_error().
5151
def _catch_all_failure(status_code: int) -> APIFailure:
5252
return APIFailure(
5353
f"Bad Request: HTTP original_status_code:{status_code}\n"
5454
f"Path: https://api.socket.dev/v0/orgs/org/full-scans\n\n"
55-
f"Headers:\ncf-ray: abc123"
55+
f"Headers:\ncf-ray: abc123",
56+
status_code=status_code,
5657
)
5758

5859

@@ -121,11 +122,14 @@ def test_upload_raises_after_exhausting_attempts(
121122
)
122123

123124

124-
def test_upload_retries_on_catch_all_503(core_with_mock_sdk, tmp_path, no_sleep):
125+
@pytest.mark.parametrize("status_code", [408, 503, 504])
126+
def test_upload_retries_on_catch_all_transient_statuses(
127+
core_with_mock_sdk, tmp_path, no_sleep, status_code
128+
):
125129
manifest = tmp_path / "package.json"
126130
manifest.write_text("{}")
127131
core_with_mock_sdk.sdk.fullscans.post.side_effect = [
128-
_catch_all_failure(503),
132+
_catch_all_failure(status_code),
129133
_success_response(),
130134
]
131135

@@ -164,13 +168,17 @@ def test_upload_does_not_retry_on_400(core_with_mock_sdk, tmp_path, no_sleep):
164168
no_sleep.assert_not_called()
165169

166170

167-
@pytest.mark.parametrize("error_class", [APIAccessDenied, APIResourceNotFound])
171+
@pytest.mark.parametrize(
172+
"error_class,status_code", [(APIAccessDenied, 401), (APIResourceNotFound, 404)]
173+
)
168174
def test_upload_does_not_retry_on_dedicated_4xx_classes(
169-
core_with_mock_sdk, tmp_path, no_sleep, error_class
175+
core_with_mock_sdk, tmp_path, no_sleep, error_class, status_code
170176
):
171177
manifest = tmp_path / "package.json"
172178
manifest.write_text("{}")
173-
core_with_mock_sdk.sdk.fullscans.post.side_effect = error_class()
179+
core_with_mock_sdk.sdk.fullscans.post.side_effect = error_class(
180+
status_code=status_code
181+
)
174182

175183
with pytest.raises(error_class):
176184
core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
@@ -241,25 +249,37 @@ def test_temp_br_file_cleaned_after_exhausted_retries(
241249
assert not compressed.exists()
242250

243251

244-
def test_transient_classifier():
245-
assert _is_transient_full_scan_upload_error(APIBadGateway())
246-
assert _is_transient_full_scan_upload_error(APIConnectionError())
247-
assert _is_transient_full_scan_upload_error(APITimeout())
248-
assert _is_transient_full_scan_upload_error(_catch_all_failure(408))
249-
assert _is_transient_full_scan_upload_error(_catch_all_failure(503))
250-
assert _is_transient_full_scan_upload_error(_catch_all_failure(504))
251-
252-
assert not _is_transient_full_scan_upload_error(_catch_all_failure(400))
253-
assert not _is_transient_full_scan_upload_error(_catch_all_failure(500))
254-
assert not _is_transient_full_scan_upload_error(
255-
APIFailure()
256-
) # wrapped unexpected error
257-
assert not _is_transient_full_scan_upload_error(APIAccessDenied("denied"))
258-
assert not _is_transient_full_scan_upload_error(APIResourceNotFound())
259-
# Subclasses never match the catch-all branch, even with a retryable-looking message.
260-
assert not _is_transient_full_scan_upload_error(
261-
APIAccessDenied("original_status_code:503")
262-
)
263-
assert not _is_transient_full_scan_upload_error(
264-
ValueError("original_status_code:503")
265-
)
252+
class _StubFailure(APIFailure):
253+
"""An APIFailure whose transience is fixed, regardless of class or status code."""
254+
255+
def __init__(self, transient: bool):
256+
super().__init__("stub failure")
257+
self._transient = transient
258+
259+
def is_transient_error(self) -> bool:
260+
return self._transient
261+
262+
263+
@pytest.mark.parametrize("transient,expected_calls", [(True, 2), (False, 1)])
264+
def test_retry_decision_delegates_to_sdk_classification(
265+
core_with_mock_sdk, tmp_path, no_sleep, transient, expected_calls
266+
):
267+
# The CLI encodes no knowledge of the SDK's exception hierarchy or status codes:
268+
# the retry decision is exactly APIFailure.is_transient_error(). (The transient /
269+
# non-transient truth table itself is tested in the SDK, next to the code that
270+
# raises the exceptions.)
271+
manifest = tmp_path / "package.json"
272+
manifest.write_text("{}")
273+
core_with_mock_sdk.sdk.fullscans.post.side_effect = [
274+
_StubFailure(transient),
275+
_success_response(),
276+
]
277+
278+
if transient:
279+
full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
280+
assert full_scan.id == "scan-1"
281+
else:
282+
with pytest.raises(_StubFailure):
283+
core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
284+
285+
assert core_with_mock_sdk.sdk.fullscans.post.call_count == expected_calls

0 commit comments

Comments
 (0)