|
1 | 1 | import logging |
2 | 2 | import os |
| 3 | +import random |
3 | 4 | import re |
4 | 5 | import sys |
5 | 6 | import tarfile |
|
13 | 14 | if TYPE_CHECKING: |
14 | 15 | from socketsecurity.config import CliConfig |
15 | 16 | from socketdev import socketdev |
16 | | -from socketdev.exceptions import APIFailure |
| 17 | +from socketdev.exceptions import ( |
| 18 | + APIBadGateway, |
| 19 | + APIConnectionError, |
| 20 | + APIFailure, |
| 21 | + APITimeout, |
| 22 | +) |
17 | 23 | from socketdev.fullscans import FullScanParams, SocketArtifact |
18 | 24 | from socketdev.org import Organization |
19 | 25 | from socketdev.repos import RepositoryInfo |
|
76 | 82 | TIER1_FINALIZE_MAX_ATTEMPTS = 3 |
77 | 83 | TIER1_FINALIZE_BACKOFF_SECONDS = 1.0 |
78 | 84 |
|
| 85 | +# Full scan upload retry policy. Production gateways occasionally drop an upload mid-request |
| 86 | +# (a backend pod stalls and stops reading the body; the client then sees a 502/408 or a reset |
| 87 | +# connection). Those episodes are transient and pod-local: a retried upload routed to another |
| 88 | +# backend almost always succeeds, and because the server never finished reading the request |
| 89 | +# body, no scan was created, so retrying cannot duplicate a scan. |
| 90 | +FULL_SCAN_UPLOAD_MAX_ATTEMPTS = 3 |
| 91 | +# Wait before retry attempt 2 and attempt 3 respectively (plus a little jitter so a fleet of |
| 92 | +# CI jobs hitting the same episode doesn't retry in lock-step). If FULL_SCAN_UPLOAD_MAX_ATTEMPTS is raised beyond len(schedule) + 1, the retry loop clamps the index and reuses the last entry for the extra retries. |
| 93 | +FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0) |
| 94 | +FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS = 2.0 |
| 95 | +# Transient gateway/timeout HTTP statuses that the SDK does NOT raise as a dedicated |
| 96 | +# exception class (502 has APIBadGateway; 408/503/504 surface as the catch-all APIFailure |
| 97 | +# with the status only present in the message text - see _is_transient_full_scan_upload_error). |
| 98 | +FULL_SCAN_UPLOAD_RETRYABLE_STATUS_CODES = frozenset({408, 503, 504}) |
| 99 | +# Matches the status code the SDK embeds in catch-all APIFailure messages |
| 100 | +# (socketdev/core/api.py: "Bad Request: HTTP original_status_code:<code> ..."). |
| 101 | +_API_FAILURE_STATUS_CODE_RE = re.compile(r"original_status_code:(\d{3})") |
| 102 | + |
| 103 | + |
| 104 | +def _is_transient_full_scan_upload_error(error: Exception) -> bool: |
| 105 | + """Whether a full-scan upload failure is transient and safe to retry. |
| 106 | +
|
| 107 | + Transient means the failure happened at the gateway/connection level before the server |
| 108 | + finished reading the request body (so no scan was created server-side): HTTP 502/503/504/408, |
| 109 | + client-side timeouts, and dropped/reset connections. Other 4xx client errors (e.g. 400/401/403/404/429) |
| 110 | + and success responses carrying an error payload are never retried. |
| 111 | + """ |
| 112 | + if isinstance(error, (APIBadGateway, APIConnectionError, APITimeout)): |
| 113 | + # 502 / connection reset or dropped / request timeout - the SDK raises dedicated classes. |
| 114 | + return True |
| 115 | + if type(error) is APIFailure: |
| 116 | + # The SDK raises 408/503/504 (and every other status without a dedicated class, |
| 117 | + # including 400) as the catch-all APIFailure, so match on the exact class plus the |
| 118 | + # status code embedded in the message. Subclasses (APIAccessDenied, APIResourceNotFound, |
| 119 | + # APIInsufficientQuota, ...) are deliberately excluded - those are never transient. |
| 120 | + match = _API_FAILURE_STATUS_CODE_RE.search(str(error)) |
| 121 | + if match: |
| 122 | + return int(match.group(1)) in FULL_SCAN_UPLOAD_RETRYABLE_STATUS_CODES |
| 123 | + return False |
| 124 | + |
79 | 125 |
|
80 | 126 | def _humanize_alert_type(alert_type: str) -> str: |
81 | 127 | """Convert a camelCase/PascalCase alert type into a Title-Cased label. |
@@ -787,7 +833,33 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths: |
787 | 833 | # facts file under the per-file upload size cap. See _compress_facts_files_for_upload. |
788 | 834 | upload_files, compressed_temp_files = self._compress_facts_files_for_upload(files) |
789 | 835 | try: |
790 | | - res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths) |
| 836 | + # Retry transient gateway/timeout failures (502/503/504/408, dropped connections, |
| 837 | + # timeouts) with increasing waits; a stalled backend pod recovers or gets routed |
| 838 | + # around within minutes, and since it never finished reading the request body no |
| 839 | + # scan was created, so a retry cannot duplicate one. fullscans.post() rebuilds its |
| 840 | + # lazy file loaders from the plain paths in upload_files on every call, so simply |
| 841 | + # calling it again per attempt is safe. The loop must stay inside this try so the |
| 842 | + # temp .br files (cleaned up in the finally below) outlive every attempt. |
| 843 | + for attempt in range(1, FULL_SCAN_UPLOAD_MAX_ATTEMPTS + 1): |
| 844 | + try: |
| 845 | + res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths) |
| 846 | + break |
| 847 | + except APIFailure as error: |
| 848 | + if attempt >= FULL_SCAN_UPLOAD_MAX_ATTEMPTS or not _is_transient_full_scan_upload_error(error): |
| 849 | + raise |
| 850 | + backoff_index = min(attempt, len(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS)) - 1 |
| 851 | + wait_seconds = FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS[backoff_index] + random.uniform( |
| 852 | + 0, FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS |
| 853 | + ) |
| 854 | + # SDK error messages can span many lines (path + response headers); the |
| 855 | + # first line carries the status, which is all the warning needs. |
| 856 | + error_summary = str(error).strip().splitlines()[0] if str(error).strip() else "" |
| 857 | + log.warning( |
| 858 | + f"Full scan upload failed with {type(error).__name__}({error_summary}), " |
| 859 | + f"retrying in {wait_seconds:.0f}s " |
| 860 | + f"(attempt {attempt + 1}/{FULL_SCAN_UPLOAD_MAX_ATTEMPTS})" |
| 861 | + ) |
| 862 | + time.sleep(wait_seconds) |
791 | 863 | finally: |
792 | 864 | for temp_file in compressed_temp_files: |
793 | 865 | try: |
|
0 commit comments