Skip to content

Commit b773fb9

Browse files
Merge branch '4.6' into backport/4.6/17133
2 parents abe4d94 + 6d3c240 commit b773fb9

5 files changed

Lines changed: 304 additions & 6 deletions

File tree

ddtrace/testing/internal/http.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131

3232
DEFAULT_TIMEOUT_SECONDS = 15.0
3333
MAX_ATTEMPTS = 5
34+
MAX_RETRY_AFTER_SECONDS = 120.0
3435

3536
log = logging.getLogger(__name__)
3637

@@ -51,6 +52,7 @@ class BackendResult:
5152
parsed_response: t.Any = None
5253
is_gzip_response: bool = False
5354
elapsed_seconds: float = 0.0
55+
retry_after_seconds: t.Optional[float] = None
5456

5557
def on_error_raise_exception(self) -> None:
5658
if self.error_type:
@@ -64,7 +66,13 @@ class Subdomain(str, Enum):
6466
CICOVREPRT = "ci-intake"
6567

6668

67-
RETRIABLE_ERRORS = {ErrorType.TIMEOUT, ErrorType.NETWORK, ErrorType.CODE_5XX, ErrorType.BAD_JSON}
69+
RETRIABLE_ERRORS = {
70+
ErrorType.TIMEOUT,
71+
ErrorType.NETWORK,
72+
ErrorType.CODE_5XX,
73+
ErrorType.BAD_JSON,
74+
ErrorType.RATE_LIMITED,
75+
}
6876

6977

7078
class BackendConnectorSetup:
@@ -286,6 +294,23 @@ def _do_single_request(
286294
result.error_description = f"{result.response.status} {result.response.reason}"
287295
if result.response.status >= 500:
288296
result.error_type = ErrorType.CODE_5XX
297+
elif result.response.status == 429:
298+
result.error_type = ErrorType.RATE_LIMITED
299+
reset_header = result.response.headers.get("X-RateLimit-Reset")
300+
if reset_header is not None:
301+
try:
302+
reset_value = int(reset_header)
303+
now = int(time.time())
304+
if reset_value > now:
305+
# Unix timestamp: wait until that point in time
306+
delay = float(reset_value - now)
307+
else:
308+
# Duration in seconds
309+
delay = float(reset_value)
310+
# Cap to avoid unreasonable waits (e.g. expired timestamp misread as duration)
311+
result.retry_after_seconds = min(delay, MAX_RETRY_AFTER_SECONDS)
312+
except ValueError:
313+
pass # Fall back to exponential backoff in the retry loop
289314
elif result.response.status >= 400:
290315
result.error_type = ErrorType.CODE_4XX
291316
else:
@@ -352,7 +377,10 @@ def request(
352377
)
353378

354379
if result.error_type and result.error_type in RETRIABLE_ERRORS and attempts_so_far < max_attempts:
355-
delay_seconds = random.uniform(0, (1.618 ** (attempts_so_far - 1))) # nosec: B311
380+
if result.retry_after_seconds is not None:
381+
delay_seconds = result.retry_after_seconds
382+
else:
383+
delay_seconds = random.uniform(0, (1.618 ** (attempts_so_far - 1))) # nosec: B311
356384
log.debug(
357385
"Retrying %s %s in %.3f seconds (%d attempts so far)", method, path, delay_seconds, attempts_so_far
358386
)

ddtrace/testing/internal/telemetry.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class ErrorType(str, Enum):
2828
TIMEOUT = "timeout"
2929
NETWORK = "network"
3030
CODE_4XX = "status_code_4xx_response"
31+
RATE_LIMITED = "rate_limited"
3132
CODE_5XX = "status_code_5xx_response"
3233
BAD_JSON = "bad_json"
3334
UNKNOWN = "unknown"
@@ -206,7 +207,7 @@ def record_event_payload_error(self, endpoint: str, error: ErrorType) -> None:
206207
# `endpoint_payload.requests_errors` accepts a different set of error types, so we need to convert them here.
207208
if error == ErrorType.TIMEOUT:
208209
endpoint_error = "timeout"
209-
elif error in (ErrorType.CODE_4XX, ErrorType.CODE_5XX):
210+
elif error in (ErrorType.CODE_4XX, ErrorType.RATE_LIMITED, ErrorType.CODE_5XX):
210211
endpoint_error = "status_code"
211212
else:
212213
endpoint_error = "network"
@@ -313,4 +314,6 @@ def record_request(
313314
self.record_error(error)
314315

315316
def record_error(self, error: ErrorType) -> None:
316-
self.telemetry_api.add_count_metric(self.error, 1, {"error_type": error})
317+
# Map RATE_LIMITED to the same telemetry value as CODE_4XX for cross-language consistency
318+
error_type = ErrorType.CODE_4XX if error == ErrorType.RATE_LIMITED else error
319+
self.telemetry_api.add_count_metric(self.error, 1, {"error_type": error_type})
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
fixes:
3+
- |
4+
CI Visibility: Fixes an issue where HTTP 429 (Too Many Requests) responses from the Datadog
5+
backend were treated as non-retriable errors, causing CI visibility data to be dropped when
6+
the backend applied rate limiting. The backend connector now retries on 429 responses and
7+
respects the ``X-RateLimit-Reset`` header when present to determine the retry delay.

tests/testing/internal/test_http.py

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from ddtrace.testing.internal.errors import SetupError
1212
from ddtrace.testing.internal.http import DEFAULT_TIMEOUT_SECONDS
13+
from ddtrace.testing.internal.http import MAX_RETRY_AFTER_SECONDS
1314
from ddtrace.testing.internal.http import BackendConnector
1415
from ddtrace.testing.internal.http import BackendConnectorAgentlessSetup
1516
from ddtrace.testing.internal.http import BackendConnectorEVPProxySetup
@@ -376,6 +377,238 @@ def test_post_json_unknown_error(self, mock_time: Mock, mock_sleep: Mock, mock_h
376377
call(seconds=0.0, response_bytes=None, compressed_response=False, error=ErrorType.UNKNOWN),
377378
]
378379

380+
@patch("http.client.HTTPSConnection")
381+
@patch("time.sleep")
382+
@patch("time.perf_counter", return_value=0.0)
383+
def test_post_json_rate_limited_retry_then_ok(
384+
self, mock_time: Mock, mock_sleep: Mock, mock_https_connection: Mock
385+
) -> None:
386+
mock_response_429 = Mock()
387+
mock_response_429.headers = {}
388+
mock_response_429.read.return_value = b"Rate limited"
389+
mock_response_429.status = 429
390+
mock_response_429.reason = "Too Many Requests"
391+
392+
mock_response_ok = Mock()
393+
mock_response_ok.headers = {"Content-Length": 14}
394+
mock_response_ok.read.return_value = b'{"answer": 42}'
395+
mock_response_ok.status = 200
396+
397+
mock_conn = Mock()
398+
mock_conn.getresponse.side_effect = [mock_response_429, mock_response_ok]
399+
mock_https_connection.return_value = mock_conn
400+
401+
mock_telemetry = Mock()
402+
403+
connector = BackendConnector(url="https://api.example.com")
404+
result = connector.post_json("/v1/some-endpoint", data={"question": 1}, telemetry=mock_telemetry)
405+
406+
assert mock_conn.request.call_args_list == [
407+
call("POST", "/v1/some-endpoint", body=b'{"question": 1}', headers={"Content-Type": "application/json"}),
408+
call("POST", "/v1/some-endpoint", body=b'{"question": 1}', headers={"Content-Type": "application/json"}),
409+
]
410+
assert len(mock_sleep.call_args_list) == 1
411+
412+
assert result.error_type is None
413+
assert result.parsed_response == {"answer": 42}
414+
415+
assert mock_telemetry.record_request.call_args_list == [
416+
call(seconds=0.0, response_bytes=0, compressed_response=False, error=ErrorType.RATE_LIMITED),
417+
call(seconds=0.0, response_bytes=14, compressed_response=False, error=None),
418+
]
419+
420+
@patch("http.client.HTTPSConnection")
421+
@patch("time.sleep")
422+
@patch("time.perf_counter", return_value=0.0)
423+
def test_post_json_rate_limited_retry_limit(
424+
self, mock_time: Mock, mock_sleep: Mock, mock_https_connection: Mock
425+
) -> None:
426+
mock_response_429 = Mock()
427+
mock_response_429.headers = {}
428+
mock_response_429.read.return_value = b"Rate limited"
429+
mock_response_429.status = 429
430+
mock_response_429.reason = "Too Many Requests"
431+
432+
mock_conn = Mock()
433+
mock_conn.getresponse.return_value = mock_response_429
434+
mock_https_connection.return_value = mock_conn
435+
436+
mock_telemetry = Mock()
437+
438+
connector = BackendConnector(url="https://api.example.com")
439+
result = connector.post_json("/v1/some-endpoint", data={"question": 1}, telemetry=mock_telemetry)
440+
441+
assert mock_conn.request.call_args_list == [
442+
call("POST", "/v1/some-endpoint", body=b'{"question": 1}', headers={"Content-Type": "application/json"}),
443+
call("POST", "/v1/some-endpoint", body=b'{"question": 1}', headers={"Content-Type": "application/json"}),
444+
call("POST", "/v1/some-endpoint", body=b'{"question": 1}', headers={"Content-Type": "application/json"}),
445+
call("POST", "/v1/some-endpoint", body=b'{"question": 1}', headers={"Content-Type": "application/json"}),
446+
call("POST", "/v1/some-endpoint", body=b'{"question": 1}', headers={"Content-Type": "application/json"}),
447+
]
448+
assert len(mock_sleep.call_args_list) == 4
449+
450+
assert result.error_type is ErrorType.RATE_LIMITED
451+
assert result.error_description == "429 Too Many Requests"
452+
453+
assert mock_telemetry.record_request.call_args_list == [
454+
call(seconds=0.0, response_bytes=0, compressed_response=False, error=ErrorType.RATE_LIMITED),
455+
call(seconds=0.0, response_bytes=0, compressed_response=False, error=ErrorType.RATE_LIMITED),
456+
call(seconds=0.0, response_bytes=0, compressed_response=False, error=ErrorType.RATE_LIMITED),
457+
call(seconds=0.0, response_bytes=0, compressed_response=False, error=ErrorType.RATE_LIMITED),
458+
call(seconds=0.0, response_bytes=0, compressed_response=False, error=ErrorType.RATE_LIMITED),
459+
]
460+
461+
@patch("http.client.HTTPSConnection")
462+
@patch("time.sleep")
463+
@patch("time.time", return_value=1700000000)
464+
@patch("time.perf_counter", return_value=0.0)
465+
def test_post_json_rate_limited_uses_header_unix_timestamp(
466+
self, mock_perf: Mock, mock_time: Mock, mock_sleep: Mock, mock_https_connection: Mock
467+
) -> None:
468+
"""When X-RateLimit-Reset is a future Unix timestamp, sleep until that point."""
469+
reset_timestamp = 1700000000 + 60 # 60 seconds in the future
470+
471+
mock_response_429 = Mock()
472+
mock_response_429.headers = {"X-RateLimit-Reset": str(reset_timestamp)}
473+
mock_response_429.read.return_value = b"Rate limited"
474+
mock_response_429.status = 429
475+
mock_response_429.reason = "Too Many Requests"
476+
477+
mock_response_ok = Mock()
478+
mock_response_ok.headers = {"Content-Length": 14}
479+
mock_response_ok.read.return_value = b'{"answer": 42}'
480+
mock_response_ok.status = 200
481+
482+
mock_conn = Mock()
483+
mock_conn.getresponse.side_effect = [mock_response_429, mock_response_ok]
484+
mock_https_connection.return_value = mock_conn
485+
486+
connector = BackendConnector(url="https://api.example.com")
487+
result = connector.post_json("/v1/some-endpoint", data={"question": 1}, telemetry=Mock())
488+
489+
assert result.error_type is None
490+
mock_sleep.assert_called_once_with(60.0)
491+
492+
@patch("http.client.HTTPSConnection")
493+
@patch("time.sleep")
494+
@patch("time.time", return_value=1700000000)
495+
@patch("time.perf_counter", return_value=0.0)
496+
def test_post_json_rate_limited_uses_header_duration(
497+
self, mock_perf: Mock, mock_time: Mock, mock_sleep: Mock, mock_https_connection: Mock
498+
) -> None:
499+
"""When X-RateLimit-Reset is a small value (≤ current time), treat it as a duration in seconds."""
500+
mock_response_429 = Mock()
501+
mock_response_429.headers = {"X-RateLimit-Reset": "30"}
502+
mock_response_429.read.return_value = b"Rate limited"
503+
mock_response_429.status = 429
504+
mock_response_429.reason = "Too Many Requests"
505+
506+
mock_response_ok = Mock()
507+
mock_response_ok.headers = {"Content-Length": 14}
508+
mock_response_ok.read.return_value = b'{"answer": 42}'
509+
mock_response_ok.status = 200
510+
511+
mock_conn = Mock()
512+
mock_conn.getresponse.side_effect = [mock_response_429, mock_response_ok]
513+
mock_https_connection.return_value = mock_conn
514+
515+
connector = BackendConnector(url="https://api.example.com")
516+
result = connector.post_json("/v1/some-endpoint", data={"question": 1}, telemetry=Mock())
517+
518+
assert result.error_type is None
519+
mock_sleep.assert_called_once_with(30.0)
520+
521+
@patch("http.client.HTTPSConnection")
522+
@patch("time.sleep")
523+
@patch("time.time", return_value=1700000000)
524+
@patch("time.perf_counter", return_value=0.0)
525+
def test_post_json_rate_limited_caps_retry_delay(
526+
self, mock_perf: Mock, mock_time: Mock, mock_sleep: Mock, mock_https_connection: Mock
527+
) -> None:
528+
"""Retry delay is capped at 120 seconds to avoid unreasonable waits."""
529+
reset_timestamp = 1700000000 + 600 # 600 seconds in the future, exceeds 120s cap
530+
531+
mock_response_429 = Mock()
532+
mock_response_429.headers = {"X-RateLimit-Reset": str(reset_timestamp)}
533+
mock_response_429.read.return_value = b"Rate limited"
534+
mock_response_429.status = 429
535+
mock_response_429.reason = "Too Many Requests"
536+
537+
mock_response_ok = Mock()
538+
mock_response_ok.headers = {"Content-Length": 14}
539+
mock_response_ok.read.return_value = b'{"answer": 42}'
540+
mock_response_ok.status = 200
541+
542+
mock_conn = Mock()
543+
mock_conn.getresponse.side_effect = [mock_response_429, mock_response_ok]
544+
mock_https_connection.return_value = mock_conn
545+
546+
connector = BackendConnector(url="https://api.example.com")
547+
result = connector.post_json("/v1/some-endpoint", data={"question": 1}, telemetry=Mock())
548+
549+
assert result.error_type is None
550+
mock_sleep.assert_called_once_with(MAX_RETRY_AFTER_SECONDS)
551+
552+
@patch("http.client.HTTPSConnection")
553+
@patch("random.uniform", return_value=0.5)
554+
@patch("time.sleep")
555+
@patch("time.perf_counter", return_value=0.0)
556+
def test_post_json_rate_limited_falls_back_to_exponential_backoff_without_header(
557+
self, mock_perf: Mock, mock_sleep: Mock, mock_uniform: Mock, mock_https_connection: Mock
558+
) -> None:
559+
"""When no X-RateLimit-Reset header is present, exponential backoff is used."""
560+
mock_response_429 = Mock()
561+
mock_response_429.headers = {}
562+
mock_response_429.read.return_value = b"Rate limited"
563+
mock_response_429.status = 429
564+
mock_response_429.reason = "Too Many Requests"
565+
566+
mock_response_ok = Mock()
567+
mock_response_ok.headers = {"Content-Length": 14}
568+
mock_response_ok.read.return_value = b'{"answer": 42}'
569+
mock_response_ok.status = 200
570+
571+
mock_conn = Mock()
572+
mock_conn.getresponse.side_effect = [mock_response_429, mock_response_ok]
573+
mock_https_connection.return_value = mock_conn
574+
575+
connector = BackendConnector(url="https://api.example.com")
576+
result = connector.post_json("/v1/some-endpoint", data={"question": 1}, telemetry=Mock())
577+
578+
assert result.error_type is None
579+
mock_uniform.assert_called_once()
580+
mock_sleep.assert_called_once_with(0.5)
581+
582+
@patch("http.client.HTTPSConnection")
583+
@patch("random.uniform", return_value=0.5)
584+
@patch("time.sleep")
585+
@patch("time.perf_counter", return_value=0.0)
586+
def test_post_json_rate_limited_falls_back_to_exponential_backoff_with_invalid_header(
587+
self, mock_perf: Mock, mock_sleep: Mock, mock_uniform: Mock, mock_https_connection: Mock
588+
) -> None:
589+
"""When X-RateLimit-Reset header is non-numeric, exponential backoff is used."""
590+
mock_response_429 = Mock()
591+
mock_response_429.headers = {"X-RateLimit-Reset": "not-a-number"}
592+
mock_response_429.read.return_value = b"Rate limited"
593+
mock_response_429.status = 429
594+
mock_response_429.reason = "Too Many Requests"
595+
596+
mock_response_ok = Mock()
597+
mock_response_ok.headers = {"Content-Length": 14}
598+
mock_response_ok.read.return_value = b'{"answer": 42}'
599+
mock_response_ok.status = 200
600+
601+
mock_conn = Mock()
602+
mock_conn.getresponse.side_effect = [mock_response_429, mock_response_ok]
603+
mock_https_connection.return_value = mock_conn
604+
605+
connector = BackendConnector(url="https://api.example.com")
606+
result = connector.post_json("/v1/some-endpoint", data={"question": 1}, telemetry=Mock())
607+
608+
assert result.error_type is None
609+
mock_uniform.assert_called_once()
610+
mock_sleep.assert_called_once_with(0.5)
611+
379612
@patch("http.client.HTTPSConnection")
380613
@patch("uuid.uuid4")
381614
def test_post_files_multiple_files(self, mock_uuid: Mock, mock_https_connection: Mock) -> None:

tests/testing/internal/test_telemetry.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ def telemetry_api() -> t.Generator[TelemetryAPI, None, None]:
2626
api = TelemetryAPI(connector_setup=Mock())
2727

2828
mock_writer = Mock()
29-
api.writer = mock_writer
29+
api.writer = mock_writer # type: ignore[assignment]
3030

31-
yield api
31+
yield api # type: ignore[misc]
3232

3333

3434
class TestTelemetry:
@@ -91,6 +91,32 @@ def test_record_request_without_response_bytes(self, telemetry_api: TelemetryAPI
9191
call(CIVISIBILITY, "known_tests.request_ms", 1.41, ()),
9292
]
9393

94+
def test_record_request_rate_limited_maps_to_4xx(self, telemetry_api: TelemetryAPI) -> None:
95+
"""RATE_LIMITED is emitted as status_code_4xx_response for cross-language consistency."""
96+
request_telemetry = telemetry_api.with_request_metric_names(
97+
count="known_tests.request",
98+
duration="known_tests.request_ms",
99+
response_bytes="known_tests.response_bytes",
100+
error="known_tests.request_errors",
101+
)
102+
103+
request_telemetry.record_request(
104+
seconds=1.41,
105+
response_bytes=42,
106+
compressed_response=False,
107+
error=ErrorType.RATE_LIMITED,
108+
)
109+
110+
assert telemetry_api.writer.add_count_metric.call_args_list == [
111+
call(CIVISIBILITY, "known_tests.request", 1, ()),
112+
call(
113+
CIVISIBILITY,
114+
"known_tests.request_errors",
115+
1,
116+
(("error_type", ErrorType.CODE_4XX.value),),
117+
),
118+
]
119+
94120
def test_record_request_without_error(self, telemetry_api: TelemetryAPI) -> None:
95121
request_telemetry = telemetry_api.with_request_metric_names(
96122
count="known_tests.request",
@@ -315,6 +341,7 @@ def test_record_event_payload_ok(self, telemetry_api: TelemetryAPI) -> None:
315341
(ErrorType.NETWORK, "network"),
316342
(ErrorType.CODE_4XX, "status_code"),
317343
(ErrorType.CODE_5XX, "status_code"),
344+
(ErrorType.RATE_LIMITED, "status_code"),
318345
(ErrorType.BAD_JSON, "network"),
319346
(ErrorType.UNKNOWN, "network"),
320347
],

0 commit comments

Comments
 (0)