Skip to content

Commit 1ffcda9

Browse files
Make 413 payload splitting opt-in via experimental env var
The HTTP 413 payload splitting behavior is not part of the OpenTelemetry specification. Gate it behind the experimental environment variable OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413, which must be set to "true" to enable it (the value is compared case-insensitively after stripping surrounding whitespace). When the variable is unset or holds any other value, 413 responses are treated as non-retryable errors. Also refactors the control flow per review feedback: the bisectable flag is computed alongside retryable, checked after the retry-exit block, and the splitting logic is moved below that block (after line 257 of the original trace exporter and line 264 of the original log exporter).
1 parent ba3fc8d commit 1ffcda9

File tree

6 files changed

+231
-64
lines changed

6 files changed

+231
-64
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1414

1515
- `opentelemetry-sdk`: Add `service` resource detector support to declarative file configuration via `detection_development.detectors[].service`
1616
([#5003](https://github.com/open-telemetry/opentelemetry-python/pull/5003))
17-
- `opentelemetry-exporter-otlp-proto-http`: Handle HTTP 413 (Payload Too Large) responses in trace and log exporters by splitting the batch in half and retrying each half recursively
17+
- `opentelemetry-exporter-otlp-proto-http`: Add experimental opt-in support for handling HTTP 413 (Payload Too Large) responses in trace and log exporters by splitting the batch in half and retrying each half recursively. Enable via `OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413=true`.
1818
([#5032](https://github.com/open-telemetry/opentelemetry-python/pull/5032))
1919

2020
## Version 1.41.0/0.62b0 (2026-04-09)

exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
)
4343
from opentelemetry.sdk._shared_internal import DuplicateFilter
4444
from opentelemetry.sdk.environment_variables import (
45+
_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413,
4546
_OTEL_PYTHON_EXPORTER_OTLP_HTTP_LOGS_CREDENTIAL_PROVIDER,
4647
OTEL_EXPORTER_OTLP_CERTIFICATE,
4748
OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE,
@@ -207,44 +208,24 @@ def _export_batch(
207208
reason = error
208209
retryable = isinstance(error, ConnectionError)
209210
status_code = None
211+
bisectable = False
210212
else:
211213
reason = resp.reason
212214
retryable = _is_retryable(resp)
213215
status_code = resp.status_code
214-
215-
if (
216+
bisectable = (
216217
_is_payload_too_large(resp)
217218
and len(batch) > 1
218219
and remaining_bisects > 0
219-
):
220-
if time() >= deadline_sec or self._shutdown:
221-
_logger.error(
222-
"Payload too large but %s, dropping %d log records",
223-
"shutdown in progress"
224-
if self._shutdown
225-
else "deadline expired",
226-
len(batch),
227-
)
228-
return LogRecordExportResult.FAILURE
229-
mid = len(batch) // 2
230-
_logger.warning(
231-
"Payload too large (%d log records), splitting into two batches",
232-
len(batch),
233-
)
234-
first = self._export_batch(
235-
list(batch[:mid]),
236-
deadline_sec,
237-
remaining_bisects - 1,
238-
)
239-
if first != LogRecordExportResult.SUCCESS:
240-
return LogRecordExportResult.FAILURE
241-
return self._export_batch(
242-
list(batch[mid:]),
243-
deadline_sec,
244-
remaining_bisects - 1,
220+
and environ.get(
221+
_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413, ""
245222
)
223+
.strip()
224+
.lower()
225+
== "true"
226+
)
246227

247-
if not retryable:
228+
if not retryable and not bisectable:
248229
_logger.error(
249230
"Failed to export logs batch code: %s, reason: %s",
250231
status_code,
@@ -262,6 +243,34 @@ def _export_batch(
262243
"max retries or shutdown."
263244
)
264245
return LogRecordExportResult.FAILURE
246+
247+
if bisectable:
248+
if time() >= deadline_sec or self._shutdown:
249+
_logger.error(
250+
"Payload too large but %s, dropping %d log records",
251+
"shutdown in progress"
252+
if self._shutdown
253+
else "deadline expired",
254+
len(batch),
255+
)
256+
return LogRecordExportResult.FAILURE
257+
mid = len(batch) // 2
258+
_logger.warning(
259+
"Payload too large (%d log records), splitting into two batches",
260+
len(batch),
261+
)
262+
first = self._export_batch(
263+
list(batch[:mid]),
264+
deadline_sec,
265+
remaining_bisects - 1,
266+
)
267+
if first != LogRecordExportResult.SUCCESS:
268+
return LogRecordExportResult.FAILURE
269+
return self._export_batch(
270+
list(batch[mid:]),
271+
deadline_sec,
272+
remaining_bisects - 1,
273+
)
265274
_logger.warning(
266275
"Transient error %s encountered while exporting logs batch, retrying in %.2fs.",
267276
reason,

exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
_load_session_from_envvar,
3939
)
4040
from opentelemetry.sdk.environment_variables import (
41+
_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413,
4142
_OTEL_PYTHON_EXPORTER_OTLP_HTTP_TRACES_CREDENTIAL_PROVIDER,
4243
OTEL_EXPORTER_OTLP_CERTIFICATE,
4344
OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE,
@@ -200,44 +201,24 @@ def _export_batch(
200201
reason = error
201202
retryable = isinstance(error, ConnectionError)
202203
status_code = None
204+
bisectable = False
203205
else:
204206
reason = resp.reason
205207
retryable = _is_retryable(resp)
206208
status_code = resp.status_code
207-
208-
if (
209+
bisectable = (
209210
_is_payload_too_large(resp)
210211
and len(spans) > 1
211212
and remaining_bisects > 0
212-
):
213-
if time() >= deadline_sec or self._shutdown:
214-
_logger.error(
215-
"Payload too large but %s, dropping %d spans",
216-
"shutdown in progress"
217-
if self._shutdown
218-
else "deadline expired",
219-
len(spans),
220-
)
221-
return SpanExportResult.FAILURE
222-
mid = len(spans) // 2
223-
_logger.warning(
224-
"Payload too large (%d spans), splitting into two batches",
225-
len(spans),
226-
)
227-
first = self._export_batch(
228-
list(spans[:mid]),
229-
deadline_sec,
230-
remaining_bisects - 1,
231-
)
232-
if first != SpanExportResult.SUCCESS:
233-
return SpanExportResult.FAILURE
234-
return self._export_batch(
235-
list(spans[mid:]),
236-
deadline_sec,
237-
remaining_bisects - 1,
213+
and environ.get(
214+
_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413, ""
238215
)
216+
.strip()
217+
.lower()
218+
== "true"
219+
)
239220

240-
if not retryable:
221+
if not retryable and not bisectable:
241222
_logger.error(
242223
"Failed to export span batch code: %s, reason: %s",
243224
status_code,
@@ -255,6 +236,34 @@ def _export_batch(
255236
"max retries or shutdown."
256237
)
257238
return SpanExportResult.FAILURE
239+
240+
if bisectable:
241+
if time() >= deadline_sec or self._shutdown:
242+
_logger.error(
243+
"Payload too large but %s, dropping %d spans",
244+
"shutdown in progress"
245+
if self._shutdown
246+
else "deadline expired",
247+
len(spans),
248+
)
249+
return SpanExportResult.FAILURE
250+
mid = len(spans) // 2
251+
_logger.warning(
252+
"Payload too large (%d spans), splitting into two batches",
253+
len(spans),
254+
)
255+
first = self._export_batch(
256+
list(spans[:mid]),
257+
deadline_sec,
258+
remaining_bisects - 1,
259+
)
260+
if first != SpanExportResult.SUCCESS:
261+
return SpanExportResult.FAILURE
262+
return self._export_batch(
263+
list(spans[mid:]),
264+
deadline_sec,
265+
remaining_bisects - 1,
266+
)
258267
_logger.warning(
259268
"Transient error %s encountered while exporting span batch, retrying in %.2fs.",
260269
reason,

exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
from opentelemetry.sdk._logs import ReadWriteLogRecord
4444
from opentelemetry.sdk._logs.export import LogRecordExportResult
4545
from opentelemetry.sdk.environment_variables import (
46+
_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413,
4647
_OTEL_PYTHON_EXPORTER_OTLP_HTTP_LOGS_CREDENTIAL_PROVIDER,
4748
OTEL_EXPORTER_OTLP_CERTIFICATE,
4849
OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE,
@@ -563,6 +564,10 @@ def test_shutdown_interrupts_retry_backoff(self, mock_post):
563564

564565
assert after - before < 0.2
565566

567+
@patch.dict(
568+
"os.environ",
569+
{_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
570+
)
566571
@patch.object(Session, "post")
567572
def test_413_splits_batch_and_succeeds(self, mock_post):
568573
"""When backend returns 413, the exporter should split the batch in half and retry each half."""
@@ -592,6 +597,10 @@ def test_413_splits_batch_and_succeeds(self, mock_post):
592597
)
593598
)
594599

600+
@patch.dict(
601+
"os.environ",
602+
{_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
603+
)
595604
@patch.object(Session, "post")
596605
def test_413_single_log_returns_failure(self, mock_post):
597606
"""When a single log record is too large, the exporter should return FAILURE via the non-retryable path."""
@@ -617,6 +626,10 @@ def test_413_single_log_returns_failure(self, mock_post):
617626
)
618627
)
619628

629+
@patch.dict(
630+
"os.environ",
631+
{_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
632+
)
620633
@patch.object(Session, "post")
621634
def test_413_recursive_splitting(self, mock_post):
622635
"""When a split half still returns 413, the exporter should continue splitting recursively."""
@@ -642,6 +655,10 @@ def test_413_recursive_splitting(self, mock_post):
642655
self.assertEqual(result, LogRecordExportResult.SUCCESS)
643656
self.assertEqual(mock_post.call_count, 5)
644657

658+
@patch.dict(
659+
"os.environ",
660+
{_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
661+
)
645662
@patch.object(Session, "post")
646663
def test_413_partial_failure(self, mock_post):
647664
"""When the first half fails with a non-retryable error, the second half is not attempted (short-circuit)."""
@@ -666,14 +683,20 @@ def test_413_partial_failure(self, mock_post):
666683
self.assertEqual(result, LogRecordExportResult.FAILURE)
667684
self.assertEqual(mock_post.call_count, 2)
668685

686+
@patch.dict(
687+
"os.environ",
688+
{_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
689+
)
669690
@patch(
670691
"opentelemetry.exporter.otlp.proto.http._log_exporter.time",
671692
)
672693
@patch.object(Session, "post")
673694
def test_413_deadline_expired_returns_failure(self, mock_post, mock_time):
674695
"""When a 413 is received but the deadline has expired, return FAILURE without splitting."""
675-
mock_time.side_effect = [100.0, 100.0, 100.6]
676-
exporter = OTLPLogExporter(timeout=0.5)
696+
# time() calls: export() deadline_sec setup, _export timeout calc,
697+
# retry-exit check (deadline_sec - time()), bisect deadline check
698+
mock_time.side_effect = [100.0, 100.0, 100.0, 110.1]
699+
exporter = OTLPLogExporter(timeout=10)
677700

678701
resp_413 = Response()
679702
resp_413.status_code = 413
@@ -695,6 +718,10 @@ def test_413_deadline_expired_returns_failure(self, mock_post, mock_time):
695718
)
696719
)
697720

721+
@patch.dict(
722+
"os.environ",
723+
{_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
724+
)
698725
@patch.object(Session, "post")
699726
def test_413_max_bisects_exceeded_returns_failure(self, mock_post):
700727
"""When max bisect depth is exhausted, the exporter should stop splitting and return FAILURE."""
@@ -715,6 +742,10 @@ def test_413_max_bisects_exceeded_returns_failure(self, mock_post):
715742
self.assertEqual(result, LogRecordExportResult.FAILURE)
716743
# Should not recurse indefinitely — bounded by _MAX_BISECTS
717744

745+
@patch.dict(
746+
"os.environ",
747+
{_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413: "true"},
748+
)
718749
@patch.object(Session, "post")
719750
def test_413_shutdown_returns_failure(self, mock_post):
720751
"""When a 413 is received but shutdown is in progress, return FAILURE without splitting."""
@@ -736,3 +767,29 @@ def test_413_shutdown_returns_failure(self, mock_post):
736767

737768
# export() itself returns FAILURE early on shutdown
738769
self.assertEqual(result, LogRecordExportResult.FAILURE)
770+
771+
@patch.object(Session, "post")
772+
def test_413_without_env_var_does_not_split(self, mock_post):
773+
"""When the experimental env var is not set, 413 should be treated as a non-retryable error."""
774+
exporter = OTLPLogExporter(timeout=10)
775+
776+
resp_413 = Response()
777+
resp_413.status_code = 413
778+
resp_413.reason = "Request Entity Too Large"
779+
780+
mock_post.return_value = resp_413
781+
782+
log_data = self._get_sdk_log_data()
783+
784+
with self.assertLogs(level=WARNING) as warning:
785+
result = exporter.export(log_data)
786+
787+
self.assertEqual(result, LogRecordExportResult.FAILURE)
788+
# Should NOT split — only 1 call
789+
self.assertEqual(mock_post.call_count, 1)
790+
self.assertTrue(
791+
any(
792+
"Failed to export logs batch code: 413" in record.message
793+
for record in warning.records
794+
)
795+
)

0 commit comments

Comments
 (0)