diff --git a/CHANGELOG.md b/CHANGELOG.md
index 721db3b6127..6214922f29f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,7 +20,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   ([#4907](https://github.com/open-telemetry/opentelemetry-python/issues/4907))
 - Drop Python 3.9 support
   ([#5076](https://github.com/open-telemetry/opentelemetry-python/pull/5076))
-
+- `opentelemetry-exporter-otlp-proto-http`: Add experimental opt-in support for handling HTTP 413 (Payload Too Large) responses in trace and log exporters by splitting the batch in half and retrying each half recursively. Enable via `OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413=true`.
+  ([#5032](https://github.com/open-telemetry/opentelemetry-python/pull/5032))
 
 ## Version 1.41.0/0.62b0 (2026-04-09)
 
diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py
index 1bdb7d228c2..242af726ce4 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py
@@ -31,6 +31,10 @@ def _is_retryable(resp: requests.Response) -> bool:
     return False
 
 
+def _is_payload_too_large(resp: requests.Response) -> bool:
+    return resp.status_code == 413
+
+
 def _load_session_from_envvar(
     cred_envvar: Literal[
         "OTEL_PYTHON_EXPORTER_OTLP_HTTP_LOGS_CREDENTIAL_PROVIDER",
diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py
index 7aea76be8d2..c67856bb132 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py
@@ -31,6 +31,7 @@
     Compression,
 )
 from opentelemetry.exporter.otlp.proto.http._common import (
+    _is_payload_too_large,
     _is_retryable,
     _load_session_from_envvar,
 )
@@ -41,6 +42,7 @@
 )
 from opentelemetry.sdk._shared_internal import DuplicateFilter
 from opentelemetry.sdk.environment_variables import (
+    _OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413,
     _OTEL_PYTHON_EXPORTER_OTLP_HTTP_LOGS_CREDENTIAL_PROVIDER,
     OTEL_EXPORTER_OTLP_CERTIFICATE,
     OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE,
@@ -69,6 +71,7 @@
 DEFAULT_LOGS_EXPORT_PATH = "v1/logs"
 DEFAULT_TIMEOUT = 10  # in seconds
 _MAX_RETRYS = 6
+_MAX_BISECTS = 5
 
 
 class OTLPLogExporter(LogRecordExporter):
@@ -138,6 +141,12 @@ def __init__(
             {"Content-Encoding": self._compression.value}
         )
         self._shutdown = False
+        self._retry_entity_too_large = (
+            environ.get(_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413, "")
+            .strip()
+            .lower()
+            == "true"
+        )
 
     def _export(
         self, serialized_data: bytes, timeout_sec: Optional[float] = None
@@ -183,8 +192,17 @@ def export(
             _logger.warning("Exporter already shutdown, ignoring batch")
             return LogRecordExportResult.FAILURE
 
-        serialized_data = encode_logs(batch).SerializeToString()
         deadline_sec = time() + self._timeout
+        return self._export_batch(batch, deadline_sec, _MAX_BISECTS)
+
+    def _export_batch(
+        self,
+        batch: Sequence[ReadableLogRecord],
+        deadline_sec: float,
+        remaining_bisects: int,
+    ) -> LogRecordExportResult:
+        serialized_data = encode_logs(batch).SerializeToString()
+
         for retry_num in range(_MAX_RETRYS):
             # multiplying by a random number between .8 and 1.2 introduces a +/-20% jitter to each backoff.
             backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2)
@@ -196,12 +214,19 @@ def export(
                 reason = error
                 retryable = isinstance(error, ConnectionError)
                 status_code = None
+                bisectable = False
             else:
                 reason = resp.reason
                 retryable = _is_retryable(resp)
                 status_code = resp.status_code
+                bisectable = (
+                    _is_payload_too_large(resp)
+                    and len(batch) > 1
+                    and remaining_bisects > 0
+                    and self._retry_entity_too_large
+                )
 
-            if not retryable:
+            if not retryable and not bisectable:
                 _logger.error(
                     "Failed to export logs batch code: %s, reason: %s",
                     status_code,
@@ -219,6 +244,25 @@ def export(
                     "max retries or shutdown."
                 )
                 return LogRecordExportResult.FAILURE
+
+            if bisectable:
+                mid = len(batch) // 2
+                _logger.warning(
+                    "Payload too large (%d log records), splitting into two batches",
+                    len(batch),
+                )
+                first = self._export_batch(
+                    list(batch[:mid]),
+                    deadline_sec,
+                    remaining_bisects - 1,
+                )
+                if first != LogRecordExportResult.SUCCESS:
+                    return LogRecordExportResult.FAILURE
+                return self._export_batch(
+                    list(batch[mid:]),
+                    deadline_sec,
+                    remaining_bisects - 1,
+                )
             _logger.warning(
                 "Transient error %s encountered while exporting logs batch, retrying in %.2fs.",
                 reason,
diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py
index d02f94adf05..1edf6e9eecc 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py
@@ -33,10 +33,12 @@
     Compression,
 )
 from opentelemetry.exporter.otlp.proto.http._common import (
+    _is_payload_too_large,
     _is_retryable,
     _load_session_from_envvar,
 )
 from opentelemetry.sdk.environment_variables import (
+    _OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413,
     _OTEL_PYTHON_EXPORTER_OTLP_HTTP_TRACES_CREDENTIAL_PROVIDER,
     OTEL_EXPORTER_OTLP_CERTIFICATE,
     OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE,
@@ -65,6 +67,7 @@
 DEFAULT_TRACES_EXPORT_PATH = "v1/traces"
 DEFAULT_TIMEOUT = 10  # in seconds
 _MAX_RETRYS = 6
+_MAX_BISECTS = 5
 
 
 class OTLPSpanExporter(SpanExporter):
@@ -133,6 +136,12 @@ def __init__(
             {"Content-Encoding": self._compression.value}
         )
         self._shutdown = False
+        self._retry_entity_too_large = (
+            environ.get(_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413, "")
+            .strip()
+            .lower()
+            == "true"
+        )
 
     def _export(
         self, serialized_data: bytes, timeout_sec: Optional[float] = None
@@ -176,8 +185,17 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
             _logger.warning("Exporter already shutdown, ignoring batch")
             return SpanExportResult.FAILURE
 
-        serialized_data = encode_spans(spans).SerializePartialToString()
         deadline_sec = time() + self._timeout
+        return self._export_batch(spans, deadline_sec, _MAX_BISECTS)
+
+    def _export_batch(
+        self,
+        spans: Sequence[ReadableSpan],
+        deadline_sec: float,
+        remaining_bisects: int,
+    ) -> SpanExportResult:
+        serialized_data = encode_spans(spans).SerializePartialToString()
+
         for retry_num in range(_MAX_RETRYS):
             # multiplying by a random number between .8 and 1.2 introduces a +/-20% jitter to each backoff.
             backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2)
@@ -189,12 +207,19 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
                 reason = error
                 retryable = isinstance(error, ConnectionError)
                 status_code = None
+                bisectable = False
             else:
                 reason = resp.reason
                 retryable = _is_retryable(resp)
                 status_code = resp.status_code
+                bisectable = (
+                    _is_payload_too_large(resp)
+                    and len(spans) > 1
+                    and remaining_bisects > 0
+                    and self._retry_entity_too_large
+                )
 
-            if not retryable:
+            if not retryable and not bisectable:
                 _logger.error(
                     "Failed to export span batch code: %s, reason: %s",
                     status_code,
@@ -212,6 +237,25 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
                     "max retries or shutdown."
                 )
                 return SpanExportResult.FAILURE
+
+            if bisectable:
+                mid = len(spans) // 2
+                _logger.warning(
+                    "Payload too large (%d spans), splitting into two batches",
+                    len(spans),
+                )
+                first = self._export_batch(
+                    list(spans[:mid]),
+                    deadline_sec,
+                    remaining_bisects - 1,
+                )
+                if first != SpanExportResult.SUCCESS:
+                    return SpanExportResult.FAILURE
+                return self._export_batch(
+                    list(spans[mid:]),
+                    deadline_sec,
+                    remaining_bisects - 1,
+                )
             _logger.warning(
                 "Transient error %s encountered while exporting span batch, retrying in %.2fs.",
                 reason,
diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py
index c86ac1f6ba1..7fa5f88485b 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py
@@ -35,6 +35,7 @@
     DEFAULT_LOGS_EXPORT_PATH,
     DEFAULT_TIMEOUT,
     OTLPLogExporter,
+    _logger,
 )
 from opentelemetry.exporter.otlp.proto.http.version import __version__
 from opentelemetry.proto.collector.logs.v1.logs_service_pb2 import (
@@ -43,6 +44,7 @@
 from opentelemetry.sdk._logs import ReadWriteLogRecord
 from opentelemetry.sdk._logs.export import LogRecordExportResult
 from opentelemetry.sdk.environment_variables import (
+    _OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413,
     _OTEL_PYTHON_EXPORTER_OTLP_HTTP_LOGS_CREDENTIAL_PROVIDER,
     OTEL_EXPORTER_OTLP_CERTIFICATE,
     OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE,
@@ -77,7 +79,14 @@
 ENV_TIMEOUT = "30"
 
 
-class TestOTLPHTTPLogExporter(unittest.TestCase):
+class TestOTLPHTTPLogExporter(unittest.TestCase):  # pylint: disable=too-many-public-methods
+    def setUp(self):
+        # Reset the DuplicateFilter between tests so that log suppression
+        # from one test does not bleed into the next.
+        for log_filter in _logger.filters:
+            if hasattr(log_filter, "last_log"):
+                log_filter.last_log = None
+
     def test_constructor_default(self):
         exporter = OTLPLogExporter()
 
@@ -525,7 +534,7 @@ def test_timeout_set_correctly(self, mock_post):
         def export_side_effect(*args, **kwargs):
             # Timeout should be set to something slightly less than 400 milliseconds depending on how much time has passed.
-            self.assertAlmostEqual(0.4, kwargs["timeout"], 2)
+            self.assertAlmostEqual(0.4, kwargs["timeout"], 1)
             return resp
 
         mock_post.side_effect = export_side_effect
@@ -562,3 +571,230 @@ def test_shutdown_interrupts_retry_backoff(self, mock_post):
         )
 
         assert after - before < 0.2
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch.object(Session, "post")
+    def test_413_splits_batch_and_succeeds(self, mock_post):
+        """When the backend returns 413, the exporter should split the batch in half and retry each half."""
+        exporter = OTLPLogExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        resp_ok = Response()
+        resp_ok.status_code = 200
+
+        mock_post.side_effect = [resp_413, resp_ok, resp_ok]
+
+        log_data = self._get_sdk_log_data()
+
+        with self.assertLogs(level=WARNING) as warning:
+            result = exporter.export(log_data)
+
+        self.assertEqual(result, LogRecordExportResult.SUCCESS)
+        # 1 initial call (413) + 2 split calls
+        self.assertEqual(mock_post.call_count, 3)
+        self.assertTrue(
+            any(
+                "Payload too large" in record.message
+                for record in warning.records
+            )
+        )
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch.object(Session, "post")
+    def test_413_single_log_returns_failure(self, mock_post):
+        """When a single log record is too large, the exporter should return FAILURE via the non-retryable path."""
+        exporter = OTLPLogExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        mock_post.return_value = resp_413
+
+        log_data = self._get_sdk_log_data()[:1]
+
+        with self.assertLogs(level=WARNING) as warning:
+            result = exporter.export(log_data)
+
+        self.assertEqual(result, LogRecordExportResult.FAILURE)
+        self.assertEqual(mock_post.call_count, 1)
+        self.assertTrue(
+            any(
+                "Failed to export logs batch code: 413" in record.message
+                for record in warning.records
+            )
+        )
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch.object(Session, "post")
+    def test_413_recursive_splitting(self, mock_post):
+        """When a split half still returns 413, the exporter should continue splitting recursively."""
+        exporter = OTLPLogExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        resp_ok = Response()
+        resp_ok.status_code = 200
+
+        log_data = self._get_sdk_log_data()  # returns 3 logs
+
+        # 3 logs: first 413 → split [0],[1,2]
+        #   [0] → ok
+        #   [1,2] → 413 → split [1],[2] → ok, ok
+        mock_post.side_effect = [resp_413, resp_ok, resp_413, resp_ok, resp_ok]
+
+        with self.assertLogs(level=WARNING):
+            result = exporter.export(log_data)
+
+        self.assertEqual(result, LogRecordExportResult.SUCCESS)
+        self.assertEqual(mock_post.call_count, 5)
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch.object(Session, "post")
+    def test_413_partial_failure(self, mock_post):
+        """When the first half fails with a non-retryable error, the second half is not attempted (short-circuit)."""
+        exporter = OTLPLogExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        resp_400 = Response()
+        resp_400.status_code = 400
+        resp_400.reason = "Bad Request"
+
+        log_data = self._get_sdk_log_data()
+
+        # First call returns 413, first half gets 400 → short-circuit
+        mock_post.side_effect = [resp_413, resp_400]
+
+        with self.assertLogs(level=WARNING):
+            result = exporter.export(log_data)
+
+        self.assertEqual(result, LogRecordExportResult.FAILURE)
+        self.assertEqual(mock_post.call_count, 2)
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch(
+        "opentelemetry.exporter.otlp.proto.http._log_exporter.time",
+    )
+    @patch.object(Session, "post")
+    def test_413_deadline_expired_returns_failure(self, mock_post, mock_time):
+        """When a 413 is received but the deadline has expired, return FAILURE without splitting."""
+        # time() calls: export() deadline_sec setup, _export timeout calc,
+        # retry-exit check (deadline_sec - time())
+        mock_time.side_effect = [100.0, 100.0, 110.1]
+        exporter = OTLPLogExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        mock_post.return_value = resp_413
+
+        log_data = self._get_sdk_log_data()
+
+        with self.assertLogs(level=WARNING) as warning:
+            result = exporter.export(log_data)
+
+        self.assertEqual(result, LogRecordExportResult.FAILURE)
+        self.assertEqual(mock_post.call_count, 1)
+        self.assertTrue(
+            any("timeout" in record.message for record in warning.records)
+        )
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch.object(Session, "post")
+    def test_413_max_bisects_exceeded_returns_failure(self, mock_post):
+        """When max bisect depth is exhausted, the exporter should stop splitting and return FAILURE."""
+        exporter = OTLPLogExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        # Always return 413 — should stop after _MAX_BISECTS depth
+        mock_post.return_value = resp_413
+
+        log_data = self._get_sdk_log_data()
+
+        with self.assertLogs(level=WARNING):
+            result = exporter.export(log_data)
+
+        self.assertEqual(result, LogRecordExportResult.FAILURE)
+        # Should not recurse indefinitely — bounded by _MAX_BISECTS
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch.object(Session, "post")
+    def test_413_shutdown_returns_failure(self, mock_post):
+        """When a 413 is received but shutdown is in progress, return FAILURE without splitting."""
+        exporter = OTLPLogExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        mock_post.return_value = resp_413
+
+        log_data = self._get_sdk_log_data()
+
+        # Set shutdown before export
+        exporter._shutdown = True
+
+        with self.assertLogs(level=WARNING):
+            result = exporter.export(log_data)
+
+        # export() itself returns FAILURE early on shutdown
+        self.assertEqual(result, LogRecordExportResult.FAILURE)
+
+    @patch.object(Session, "post")
+    def test_413_without_env_var_does_not_split(self, mock_post):
+        """When the experimental env var is not set, 413 should be treated as a non-retryable error."""
+        exporter = OTLPLogExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        mock_post.return_value = resp_413
+
+        log_data = self._get_sdk_log_data()
+
+        with self.assertLogs(level=WARNING) as warning:
+            result = exporter.export(log_data)
+
+        self.assertEqual(result, LogRecordExportResult.FAILURE)
+        # Should NOT split — only 1 call
+        self.assertEqual(mock_post.call_count, 1)
+        self.assertTrue(
+            any(
+                "Failed to export logs batch code: 413" in record.message
+                for record in warning.records
+            )
+        )
diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py
index 5f61344bbf1..4cc9d0ed417 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py
@@ -33,6 +33,7 @@
 )
 from opentelemetry.exporter.otlp.proto.http.version import __version__
 from opentelemetry.sdk.environment_variables import (
+    _OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413,
     _OTEL_PYTHON_EXPORTER_OTLP_HTTP_TRACES_CREDENTIAL_PROVIDER,
     OTEL_EXPORTER_OTLP_CERTIFICATE,
     OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE,
@@ -72,7 +73,7 @@
 
 
 # pylint: disable=protected-access
-class TestOTLPSpanExporter(unittest.TestCase):
+class TestOTLPSpanExporter(unittest.TestCase):  # pylint: disable=too-many-public-methods
     def test_constructor_default(self):
         exporter = OTLPSpanExporter()
 
@@ -345,7 +346,7 @@ def test_timeout_set_correctly(self, mock_post):
         def export_side_effect(*args, **kwargs):
             # Timeout should be set to something slightly less than 400 milliseconds depending on how much time has passed.
-            self.assertAlmostEqual(0.4, kwargs["timeout"], 2)
+            self.assertAlmostEqual(0.4, kwargs["timeout"], 1)
             return resp
 
         mock_post.side_effect = export_side_effect
@@ -380,3 +381,352 @@ def test_shutdown_interrupts_retry_backoff(self, mock_post):
         )
 
         assert after - before < 0.2
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch.object(Session, "post")
+    def test_413_splits_batch_and_succeeds(self, mock_post):
+        """When the backend returns 413, the exporter should split the batch in half and retry each half."""
+        exporter = OTLPSpanExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        resp_ok = Response()
+        resp_ok.status_code = 200
+
+        # First call returns 413, subsequent calls succeed
+        mock_post.side_effect = [resp_413, resp_ok, resp_ok]
+
+        span1 = _Span(
+            "span1",
+            context=Mock(
+                **{
+                    "trace_state": {"a": "b"},
+                    "span_id": 10217189687419569865,
+                    "trace_id": 67545097771067222548457157018666467027,
+                }
+            ),
+        )
+        span2 = _Span(
+            "span2",
+            context=Mock(
+                **{
+                    "trace_state": {"a": "b"},
+                    "span_id": 10217189687419569866,
+                    "trace_id": 67545097771067222548457157018666467027,
+                }
+            ),
+        )
+
+        with self.assertLogs(level=WARNING) as warning:
+            result = exporter.export([span1, span2])
+
+        self.assertEqual(result, SpanExportResult.SUCCESS)
+        # 1 initial call (413) + 2 split calls (each succeeds)
+        self.assertEqual(mock_post.call_count, 3)
+        self.assertIn(
+            "Payload too large (2 spans), splitting into two batches",
+            warning.records[0].message,
+        )
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch.object(Session, "post")
+    def test_413_single_span_returns_failure(self, mock_post):
+        """When a single span is too large, the exporter should return FAILURE via the non-retryable path."""
+        exporter = OTLPSpanExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        mock_post.return_value = resp_413
+
+        with self.assertLogs(level=WARNING) as warning:
+            result = exporter.export([BASIC_SPAN])
+
+        self.assertEqual(result, SpanExportResult.FAILURE)
+        self.assertEqual(mock_post.call_count, 1)
+        self.assertTrue(
+            any(
+                "Failed to export span batch code: 413" in record.message
+                for record in warning.records
+            )
+        )
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch.object(Session, "post")
+    def test_413_recursive_splitting(self, mock_post):
+        """When a split half still returns 413, the exporter should continue splitting recursively."""
+        exporter = OTLPSpanExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        resp_ok = Response()
+        resp_ok.status_code = 200
+
+        spans = []
+        for idx in range(4):
+            spans.append(
+                _Span(
+                    f"span{idx}",
+                    context=Mock(
+                        **{
+                            "trace_state": {},
+                            "span_id": 10217189687419569865 + idx,
+                            "trace_id": 67545097771067222548457157018666467027,
+                        }
+                    ),
+                )
+            )
+
+        # 4 spans: first 413 → split [0,1],[2,3]
+        #   [0,1] → 413 → split [0],[1] → ok, ok
+        #   [2,3] → ok
+        mock_post.side_effect = [resp_413, resp_413, resp_ok, resp_ok, resp_ok]
+
+        with self.assertLogs(level=WARNING):
+            result = exporter.export(spans)
+
+        self.assertEqual(result, SpanExportResult.SUCCESS)
+        self.assertEqual(mock_post.call_count, 5)
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch.object(Session, "post")
+    def test_413_partial_failure(self, mock_post):
+        """When the first half fails with a non-retryable error, the second half is not attempted (short-circuit)."""
+        exporter = OTLPSpanExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        resp_400 = Response()
+        resp_400.status_code = 400
+        resp_400.reason = "Bad Request"
+
+        span1 = _Span(
+            "span1",
+            context=Mock(
+                **{
+                    "trace_state": {},
+                    "span_id": 10217189687419569865,
+                    "trace_id": 67545097771067222548457157018666467027,
+                }
+            ),
+        )
+        span2 = _Span(
+            "span2",
+            context=Mock(
+                **{
+                    "trace_state": {},
+                    "span_id": 10217189687419569866,
+                    "trace_id": 67545097771067222548457157018666467027,
+                }
+            ),
+        )
+
+        # First call returns 413, first half gets 400 (non-retryable) → short-circuit, second half never attempted
+        mock_post.side_effect = [resp_413, resp_400]
+
+        with self.assertLogs(level=WARNING):
+            result = exporter.export([span1, span2])
+
+        self.assertEqual(result, SpanExportResult.FAILURE)
+        # Only 2 calls: initial 413 + first half 400. Second half never attempted.
+        self.assertEqual(mock_post.call_count, 2)
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch(
+        "opentelemetry.exporter.otlp.proto.http.trace_exporter.time",
+    )
+    @patch.object(Session, "post")
+    def test_413_deadline_expired_returns_failure(self, mock_post, mock_time):
+        """When a 413 is received but the deadline has expired, return FAILURE without splitting."""
+        # time() calls: export() deadline_sec setup, _export timeout calc,
+        # retry-exit check (deadline_sec - time())
+        mock_time.side_effect = [100.0, 100.0, 110.1]
+        exporter = OTLPSpanExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        mock_post.return_value = resp_413
+
+        span1 = _Span(
+            "span1",
+            context=Mock(
+                **{
+                    "trace_state": {},
+                    "span_id": 10217189687419569865,
+                    "trace_id": 67545097771067222548457157018666467027,
+                }
+            ),
+        )
+        span2 = _Span(
+            "span2",
+            context=Mock(
+                **{
+                    "trace_state": {},
+                    "span_id": 10217189687419569866,
+                    "trace_id": 67545097771067222548457157018666467027,
+                }
+            ),
+        )
+
+        with self.assertLogs(level=WARNING) as warning:
+            result = exporter.export([span1, span2])
+
+        self.assertEqual(result, SpanExportResult.FAILURE)
+        self.assertEqual(mock_post.call_count, 1)
+        self.assertTrue(
+            any("timeout" in record.message for record in warning.records)
+        )
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch.object(Session, "post")
+    def test_413_max_bisects_exceeded_returns_failure(self, mock_post):
+        """When max bisect depth is exhausted, the exporter should stop splitting and return FAILURE."""
+        exporter = OTLPSpanExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        # Always return 413 — should stop after _MAX_BISECTS depth
+        mock_post.return_value = resp_413
+
+        # Create a batch large enough to bisect _MAX_BISECTS times (2**5 = 32 items)
+        spans = []
+        for idx in range(32):
+            spans.append(
+                _Span(
+                    f"span{idx}",
+                    context=Mock(
+                        **{
+                            "trace_state": {},
+                            "span_id": 10217189687419569865 + idx,
+                            "trace_id": 67545097771067222548457157018666467027,
+                        }
+                    ),
+                )
+            )
+
+        with self.assertLogs(level=WARNING):
+            result = exporter.export(spans)
+
+        self.assertEqual(result, SpanExportResult.FAILURE)
+        # Should not recurse indefinitely — bounded by _MAX_BISECTS
+        # At depth 5, batch is 1 item → falls through to non-retryable path
+        self.assertLessEqual(mock_post.call_count, 63)
+
+    @patch.dict(
+        "os.environ",
+        {_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
+    )
+    @patch.object(Session, "post")
+    def test_413_shutdown_returns_failure(self, mock_post):
+        """When a 413 is received but shutdown is in progress, return FAILURE without splitting."""
+        exporter = OTLPSpanExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        mock_post.return_value = resp_413
+
+        span1 = _Span(
+            "span1",
+            context=Mock(
+                **{
+                    "trace_state": {},
+                    "span_id": 10217189687419569865,
+                    "trace_id": 67545097771067222548457157018666467027,
+                }
+            ),
+        )
+        span2 = _Span(
+            "span2",
+            context=Mock(
+                **{
+                    "trace_state": {},
+                    "span_id": 10217189687419569866,
+                    "trace_id": 67545097771067222548457157018666467027,
+                }
+            ),
+        )
+
+        # Set shutdown before export
+        exporter._shutdown = True
+
+        with self.assertLogs(level=WARNING):
+            result = exporter.export([span1, span2])
+
+        # export() itself returns FAILURE early on shutdown
+        self.assertEqual(result, SpanExportResult.FAILURE)
+
+    @patch.object(Session, "post")
+    def test_413_without_env_var_does_not_split(self, mock_post):
+        """When the experimental env var is not set, 413 should be treated as a non-retryable error."""
+        exporter = OTLPSpanExporter(timeout=10)
+
+        resp_413 = Response()
+        resp_413.status_code = 413
+        resp_413.reason = "Request Entity Too Large"
+
+        mock_post.return_value = resp_413
+
+        span1 = _Span(
+            "span1",
+            context=Mock(
+                **{
+                    "trace_state": {},
+                    "span_id": 10217189687419569865,
+                    "trace_id": 67545097771067222548457157018666467027,
+                }
+            ),
+        )
+        span2 = _Span(
+            "span2",
+            context=Mock(
+                **{
+                    "trace_state": {},
+                    "span_id": 10217189687419569866,
+                    "trace_id": 67545097771067222548457157018666467027,
+                }
+            ),
+        )
+
+        with self.assertLogs(level=WARNING) as warning:
+            result = exporter.export([span1, span2])
+
+        self.assertEqual(result, SpanExportResult.FAILURE)
+        # Should NOT split — only 1 call
+        self.assertEqual(mock_post.call_count, 1)
+        self.assertTrue(
+            any(
+                "Failed to export span batch code: 413" in record.message
+                for record in warning.records
+            )
+        )
diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py
index 2959163eed8..81923c0d6c2 100644
--- a/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py
+++ b/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py
@@ -838,3 +838,20 @@ def channel_credential_provider() -> grpc.ChannelCredentials:
 This is an experimental environment variable and the name of this variable and
 its behavior can change in a non-backwards compatible way.
 """
+
+_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413 = (
+    "OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413"
+)
+"""
+.. envvar:: OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413
+
+The :envvar:`OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413` environment variable
+enables experimental payload splitting on HTTP 413 (Payload Too Large)
+responses. When set to ``"true"``, the OTLP HTTP exporters will bisect the
+batch and retry each half instead of dropping the entire batch.
+Default: ``"false"``
+
+This is not part of the OpenTelemetry specification. This is an experimental
+environment variable and the name of this variable and its behavior can change
+in a non-backwards compatible way.
+"""
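
Usage sketch: a minimal example of opting in to the behavior this patch adds. The endpoint and the use of a batch processor are placeholders, not part of the patch; the environment variable name, the exporter class, and the `_MAX_BISECTS = 5` bound are taken from the diff above. Since the flag is read once in `__init__`, it must be set before the exporter is constructed.

```python
import os

from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

# Opt in to the experimental bisect-on-413 behavior. The exporter reads
# this flag in __init__, so it only needs to be set before the exporter
# is constructed (not before import).
os.environ["OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413"] = "true"

# Placeholder endpoint: point this at a collector that enforces a request
# size limit to observe the split-and-retry behavior. A batch that keeps
# drawing 413s is bisected at most _MAX_BISECTS (5) times, i.e. into at
# most 2**5 = 32 sub-batches, before the exporter gives up and returns
# FAILURE for whatever could not be delivered.
exporter = OTLPSpanExporter(endpoint="http://localhost:4318/v1/traces")

provider = TracerProvider()
provider.add_span_processor(BatchSpanProcessor(exporter))
```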