Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,21 @@
# Changelog

## 2.4.8

### Fixed: retry transient full-scan upload failures

- The full-scan upload (`POST /orgs/<org>/full-scans`) now retries transient
gateway/connection failures — HTTP 502/503/504/408, dropped or reset connections, and
request timeouts — up to 3 total attempts with increasing waits (~10s, then ~30s, plus
jitter). Such failures are intermittent and a retried upload almost always succeeds.
In these failure modes the server normally has not finished reading the request body, so no
scan was created and a retry does not duplicate one. In the rare case where a gateway
timeout races a request that the server later completes, the extra scan is benign and is
superseded by the retried one (as if the CLI had been run twice).
Non-transient errors (400/401/403/404/429 and error payloads) are never retried. Each
retry logs a warning explaining what failed and when the next attempt happens.

## 2.4.7

### Changed: pin @coana-tech/cli version; auto-update is now opt-in
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "hatchling.build"

[project]
name = "socketsecurity"
version = "2.4.7"
version = "2.4.8"
requires-python = ">= 3.11"
license = {"file" = "LICENSE"}
dependencies = [
Expand Down
2 changes: 1 addition & 1 deletion socketsecurity/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__author__ = 'socket.dev'
__version__ = '2.4.7'
__version__ = '2.4.8'
USER_AGENT = f'SocketPythonCLI/{__version__}'
77 changes: 75 additions & 2 deletions socketsecurity/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import os
import random
import re
import sys
import tarfile
Expand All @@ -13,7 +14,12 @@
if TYPE_CHECKING:
from socketsecurity.config import CliConfig
from socketdev import socketdev
from socketdev.exceptions import APIFailure
from socketdev.exceptions import (
APIBadGateway,
APIConnectionError,
APIFailure,
APITimeout,
)
from socketdev.fullscans import FullScanParams, SocketArtifact
from socketdev.org import Organization
from socketdev.repos import RepositoryInfo
Expand Down Expand Up @@ -76,6 +82,47 @@
TIER1_FINALIZE_MAX_ATTEMPTS = 3
TIER1_FINALIZE_BACKOFF_SECONDS = 1.0

# Full scan upload retry policy. An upload can fail transiently at the gateway/connection
# level (an HTTP 502/503/504/408, a dropped or reset connection, or a client-side timeout).
# Such failures normally happen before the server has created the scan, so a retry does not
# duplicate one. (A duplicate is possible only if a gateway timeout races a request the
# server later completes; that is benign - the retried scan supersedes the orphaned one,
# same as running the CLI twice.)
# Total number of upload attempts, counting the first try.
FULL_SCAN_UPLOAD_MAX_ATTEMPTS = 3
# Wait before attempt 2 and attempt 3 respectively (plus a little jitter so a fleet of
# CI jobs hitting the same failure doesn't retry in lock-step).
FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0)
# Upper bound, in seconds, of the uniform random jitter added on top of each scheduled wait.
FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS = 2.0
# Transient gateway/timeout HTTP statuses that the SDK does NOT raise as a dedicated
# exception class (502 has APIBadGateway; 408/503/504 surface as the catch-all APIFailure
# with the status only present in the message text - see _is_transient_full_scan_upload_error).
FULL_SCAN_UPLOAD_RETRYABLE_STATUS_CODES = frozenset({408, 503, 504})
# Matches the status code the SDK embeds in catch-all APIFailure messages
# (socketdev/core/api.py: "Bad Request: HTTP original_status_code:<code> ...").
# Group 1 captures the three-digit status code.
_API_FAILURE_STATUS_CODE_RE = re.compile(r"original_status_code:(\d{3})")


def _is_transient_full_scan_upload_error(error: Exception) -> bool:
    """Return True when a full-scan upload failure is transient and may be retried.

    A failure is treated as transient in exactly two situations:

    * it is an instance of one of the SDK's dedicated gateway/connection exception
      classes (APIBadGateway, APIConnectionError, APITimeout), or
    * it is the catch-all APIFailure itself (not a subclass) and the status code embedded
      in its message is one of FULL_SCAN_UPLOAD_RETRYABLE_STATUS_CODES.

    Every other error - APIFailure subclasses, APIFailure messages without an embedded
    status code, non-retryable statuses, and unrelated exception types - is reported as
    non-transient.
    """
    # 502 / connection reset-dropped / request timeout: the SDK raises dedicated classes.
    dedicated_transient_classes = (APIBadGateway, APIConnectionError, APITimeout)
    if isinstance(error, dedicated_transient_classes):
        return True

    # Only the exact catch-all class is inspected further. Its subclasses
    # (APIAccessDenied, APIResourceNotFound, APIInsufficientQuota, ...) are deliberately
    # excluded - those are never transient - hence the identity check instead of isinstance.
    if type(error) is not APIFailure:
        return False

    # The SDK raises 408/503/504 (and every other status without a dedicated class,
    # including 400) as APIFailure, so the status has to be recovered from the message.
    status_match = _API_FAILURE_STATUS_CODE_RE.search(str(error))
    if status_match is None:
        return False
    status_code = int(status_match.group(1))
    return status_code in FULL_SCAN_UPLOAD_RETRYABLE_STATUS_CODES


def _humanize_alert_type(alert_type: str) -> str:
"""Convert a camelCase/PascalCase alert type into a Title-Cased label.
Expand Down Expand Up @@ -787,7 +834,33 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths:
# facts file under the per-file upload size cap. See _compress_facts_files_for_upload.
upload_files, compressed_temp_files = self._compress_facts_files_for_upload(files)
try:
res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths)
# Retry transient gateway/timeout failures (502/503/504/408, dropped connections,
# timeouts) with increasing waits. In these failure modes the server never finished
# reading the request body, so no scan was created and a retry does not duplicate
# one (see the retry-policy comment above FULL_SCAN_UPLOAD_MAX_ATTEMPTS). fullscans.post()
# rebuilds its lazy file loaders from the plain paths in upload_files on every call,
# so simply calling it again per attempt is safe. The loop must stay inside this try
# so the temp .br files (cleaned up in the finally below) outlive every attempt.
for attempt in range(1, FULL_SCAN_UPLOAD_MAX_ATTEMPTS + 1):
try:
res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths)
break
except APIFailure as error:
if attempt >= FULL_SCAN_UPLOAD_MAX_ATTEMPTS or not _is_transient_full_scan_upload_error(error):
raise
backoff_index = min(attempt, len(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS)) - 1
wait_seconds = FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS[backoff_index] + random.uniform(
0, FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS
)
# SDK error messages can span many lines (path + response headers); the
Comment thread
BarrensZeppelin marked this conversation as resolved.
Outdated
# first line carries the status, which is all the warning needs.
error_summary = str(error).strip().splitlines()[0] if str(error).strip() else ""
log.warning(
f"Full scan upload failed with {type(error).__name__}({error_summary}), "
f"retrying in {wait_seconds:.0f}s "
f"(attempt {attempt + 1}/{FULL_SCAN_UPLOAD_MAX_ATTEMPTS})"
)
time.sleep(wait_seconds)
finally:
for temp_file in compressed_temp_files:
try:
Expand Down
Loading
Loading