Skip to content

Commit ae82b55

Browse files
Handle HTTP 413 responses in OTLP HTTP exporters by splitting and retrying
When a backend returns HTTP 413 (Payload Too Large), the trace and log exporters now split the batch in half and recursively retry each half. This prevents silent data loss when batch sizes exceed backend limits. The splitting logic includes deadline guards to prevent unbounded recursion, short-circuits when the first half fails so no time is wasted on the second half, and drops individual items that are genuinely too large to ever send. Fixes #4533
1 parent 7b38450 commit ae82b55

File tree

5 files changed

+414
-2
lines changed

5 files changed

+414
-2
lines changed

exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ def _is_retryable(resp: requests.Response) -> bool:
3434
return False
3535

3636

37+
def _is_payload_too_large(resp: requests.Response) -> bool:
38+
return resp.status_code == 413
39+
40+
3741
def _load_session_from_envvar(
3842
cred_envvar: Literal[
3943
_OTEL_PYTHON_EXPORTER_OTLP_HTTP_LOGS_CREDENTIAL_PROVIDER,

exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
Compression,
3232
)
3333
from opentelemetry.exporter.otlp.proto.http._common import (
34+
_is_payload_too_large,
3435
_is_retryable,
3536
_load_session_from_envvar,
3637
)
@@ -183,8 +184,14 @@ def export(
183184
_logger.warning("Exporter already shutdown, ignoring batch")
184185
return LogRecordExportResult.FAILURE
185186

186-
serialized_data = encode_logs(batch).SerializeToString()
187187
deadline_sec = time() + self._timeout
188+
return self._export_batch(batch, deadline_sec)
189+
190+
def _export_batch(
191+
self, batch: Sequence[ReadableLogRecord], deadline_sec: float
192+
) -> LogRecordExportResult:
193+
serialized_data = encode_logs(batch).SerializeToString()
194+
188195
for retry_num in range(_MAX_RETRYS):
189196
# multiplying by a random number between .8 and 1.2 introduces a +/-20% jitter to each backoff.
190197
backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2)
@@ -201,6 +208,35 @@ def export(
201208
retryable = _is_retryable(resp)
202209
status_code = resp.status_code
203210

211+
if _is_payload_too_large(resp):
212+
# 413 handling always returns here; will not fall through
213+
# to the 'if not retryable' check below.
214+
if len(batch) <= 1:
215+
_logger.error(
216+
"Single log record exceeds backend payload size limit, dropping log record"
217+
)
218+
return LogRecordExportResult.FAILURE
219+
if time() >= deadline_sec:
220+
_logger.error(
221+
"Payload too large but deadline expired, dropping %d log records",
222+
len(batch),
223+
)
224+
return LogRecordExportResult.FAILURE
225+
mid = len(batch) // 2
226+
_logger.warning(
227+
"Payload too large (%d log records), splitting into two batches",
228+
len(batch),
229+
)
230+
first = self._export_batch(
231+
list(batch[:mid]), deadline_sec
232+
)
233+
if first != LogRecordExportResult.SUCCESS:
234+
return LogRecordExportResult.FAILURE
235+
second = self._export_batch(
236+
list(batch[mid:]), deadline_sec
237+
)
238+
return second
239+
204240
if not retryable:
205241
_logger.error(
206242
"Failed to export logs batch code: %s, reason: %s",

exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
Compression,
3434
)
3535
from opentelemetry.exporter.otlp.proto.http._common import (
36+
_is_payload_too_large,
3637
_is_retryable,
3738
_load_session_from_envvar,
3839
)
@@ -176,8 +177,14 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
176177
_logger.warning("Exporter already shutdown, ignoring batch")
177178
return SpanExportResult.FAILURE
178179

179-
serialized_data = encode_spans(spans).SerializePartialToString()
180180
deadline_sec = time() + self._timeout
181+
return self._export_batch(spans, deadline_sec)
182+
183+
def _export_batch(
184+
self, spans: Sequence[ReadableSpan], deadline_sec: float
185+
) -> SpanExportResult:
186+
serialized_data = encode_spans(spans).SerializePartialToString()
187+
181188
for retry_num in range(_MAX_RETRYS):
182189
# multiplying by a random number between .8 and 1.2 introduces a +/20% jitter to each backoff.
183190
backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2)
@@ -194,6 +201,35 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
194201
retryable = _is_retryable(resp)
195202
status_code = resp.status_code
196203

204+
if _is_payload_too_large(resp):
205+
# 413 handling always returns here; will not fall through
206+
# to the 'if not retryable' check below.
207+
if len(spans) <= 1:
208+
_logger.error(
209+
"Single span exceeds backend payload size limit, dropping span"
210+
)
211+
return SpanExportResult.FAILURE
212+
if time() >= deadline_sec:
213+
_logger.error(
214+
"Payload too large but deadline expired, dropping %d spans",
215+
len(spans),
216+
)
217+
return SpanExportResult.FAILURE
218+
mid = len(spans) // 2
219+
_logger.warning(
220+
"Payload too large (%d spans), splitting into two batches",
221+
len(spans),
222+
)
223+
first = self._export_batch(
224+
list(spans[:mid]), deadline_sec
225+
)
226+
if first != SpanExportResult.SUCCESS:
227+
return SpanExportResult.FAILURE
228+
second = self._export_batch(
229+
list(spans[mid:]), deadline_sec
230+
)
231+
return second
232+
197233
if not retryable:
198234
_logger.error(
199235
"Failed to export span batch code: %s, reason: %s",

exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -562,3 +562,136 @@ def test_shutdown_interrupts_retry_backoff(self, mock_post):
562562
)
563563

564564
assert after - before < 0.2
565+
566+
@patch.object(Session, "post")
567+
def test_413_splits_batch_and_succeeds(self, mock_post):
568+
"""When backend returns 413, the exporter should split the batch in half and retry each half."""
569+
exporter = OTLPLogExporter(timeout=10)
570+
571+
resp_413 = Response()
572+
resp_413.status_code = 413
573+
resp_413.reason = "Request Entity Too Large"
574+
575+
resp_ok = Response()
576+
resp_ok.status_code = 200
577+
578+
mock_post.side_effect = [resp_413, resp_ok, resp_ok]
579+
580+
log_data = self._get_sdk_log_data()
581+
582+
with self.assertLogs(level=WARNING) as warning:
583+
result = exporter.export(log_data)
584+
585+
self.assertEqual(result, LogRecordExportResult.SUCCESS)
586+
# 1 initial call (413) + 2 split calls
587+
self.assertEqual(mock_post.call_count, 3)
588+
self.assertTrue(
589+
any(
590+
"Payload too large" in record.message
591+
for record in warning.records
592+
)
593+
)
594+
595+
@patch.object(Session, "post")
596+
def test_413_single_log_returns_failure(self, mock_post):
597+
"""When a single log record is too large, the exporter should return FAILURE."""
598+
exporter = OTLPLogExporter(timeout=10)
599+
600+
resp_413 = Response()
601+
resp_413.status_code = 413
602+
resp_413.reason = "Request Entity Too Large"
603+
604+
mock_post.return_value = resp_413
605+
606+
log_data = self._get_sdk_log_data()[:1]
607+
608+
with self.assertLogs(level=WARNING) as warning:
609+
result = exporter.export(log_data)
610+
611+
self.assertEqual(result, LogRecordExportResult.FAILURE)
612+
self.assertEqual(mock_post.call_count, 1)
613+
self.assertTrue(
614+
any(
615+
"Single log record exceeds backend payload size limit"
616+
in record.message
617+
for record in warning.records
618+
)
619+
)
620+
621+
@patch.object(Session, "post")
622+
def test_413_recursive_splitting(self, mock_post):
623+
"""When a split half still returns 413, the exporter should continue splitting recursively."""
624+
exporter = OTLPLogExporter(timeout=10)
625+
626+
resp_413 = Response()
627+
resp_413.status_code = 413
628+
resp_413.reason = "Request Entity Too Large"
629+
630+
resp_ok = Response()
631+
resp_ok.status_code = 200
632+
633+
log_data = self._get_sdk_log_data() # returns 3 logs
634+
635+
# 3 logs: first 413 → split [0],[1,2]
636+
# [0] → ok
637+
# [1,2] → 413 → split [1],[2] → ok, ok
638+
mock_post.side_effect = [resp_413, resp_ok, resp_413, resp_ok, resp_ok]
639+
640+
with self.assertLogs(level=WARNING):
641+
result = exporter.export(log_data)
642+
643+
self.assertEqual(result, LogRecordExportResult.SUCCESS)
644+
self.assertEqual(mock_post.call_count, 5)
645+
646+
@patch.object(Session, "post")
647+
def test_413_partial_failure(self, mock_post):
648+
"""When the first half fails with a non-retryable error, the second half is not attempted (short-circuit)."""
649+
exporter = OTLPLogExporter(timeout=10)
650+
651+
resp_413 = Response()
652+
resp_413.status_code = 413
653+
resp_413.reason = "Request Entity Too Large"
654+
655+
resp_400 = Response()
656+
resp_400.status_code = 400
657+
resp_400.reason = "Bad Request"
658+
659+
log_data = self._get_sdk_log_data()
660+
661+
# First call returns 413, first half gets 400 → short-circuit
662+
mock_post.side_effect = [resp_413, resp_400]
663+
664+
with self.assertLogs(level=WARNING):
665+
result = exporter.export(log_data)
666+
667+
self.assertEqual(result, LogRecordExportResult.FAILURE)
668+
self.assertEqual(mock_post.call_count, 2)
669+
670+
@patch(
671+
"opentelemetry.exporter.otlp.proto.http._log_exporter.time",
672+
)
673+
@patch.object(Session, "post")
674+
def test_413_deadline_expired_returns_failure(self, mock_post, mock_time):
675+
"""When a 413 is received but the deadline has expired, return FAILURE without splitting."""
676+
mock_time.side_effect = [100.0, 100.0, 100.6]
677+
exporter = OTLPLogExporter(timeout=0.5)
678+
679+
resp_413 = Response()
680+
resp_413.status_code = 413
681+
resp_413.reason = "Request Entity Too Large"
682+
683+
mock_post.return_value = resp_413
684+
685+
log_data = self._get_sdk_log_data()
686+
687+
with self.assertLogs(level=WARNING) as warning:
688+
result = exporter.export(log_data)
689+
690+
self.assertEqual(result, LogRecordExportResult.FAILURE)
691+
self.assertEqual(mock_post.call_count, 1)
692+
self.assertTrue(
693+
any(
694+
"deadline expired" in record.message
695+
for record in warning.records
696+
)
697+
)

0 commit comments

Comments (0)