Skip to content

Commit 0d1bda1

Browse files
vastinADOT Patch workflow
andauthored
serviceevents: gate error breakdown on faults (5xx) only (#783)
## What Gate the per-error `error_breakdown` (and the `EndpointErrorMetric` `count` data points it feeds) on **faults (5xx) only**, and stop synthesizing an `"UnknownError"` entry when no real exception type was captured. The breakdown is now populated only for a request with `status_code >= 500` **and** a captured exception type. A 4xx still increments the aggregate `errors` counter but produces no breakdown entry; a 5xx that returned a 500 without a captured exception produces no synthetic entry. ## Changes - **Gate `>= 400` → `>= 500`** in `endpoint_collector.py`. - **No synthetic `UnknownError`**: `_extract_error_from_call_path` returns `None` (was a synthetic `{"error_type": "UnknownError", "function_name": "unknown"}` dict) when no real type can be resolved, so callers omit the breakdown. - The collector indexes `error_info` directly (the extractor is the sole producer of a complete-or-`None` dict), so no synthetic default can silently resurface. ## Tests - Fault-only gate (4xx excluded from breakdown, still counted as an error). - 5xx-without-exception omission across Flask / Django / FastAPI. - The `None`-return extractor contract. Lint clean: `black`, `isort`, `flake8`, `codespell` all pass; serviceevents suite green (716 passing). Co-authored-by: ADOT Patch workflow <adot-patch-workflow@github.com>
1 parent e3a2ff7 commit 0d1bda1

6 files changed

Lines changed: 118 additions & 20 deletions

File tree

aws-opentelemetry-distro/src/amazon/opentelemetry/distro/serviceevents/collectors/endpoint_collector.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,17 @@ def record_request(
149149
elif status_code >= 400:
150150
agg["errors"] += 1
151151

152-
# Track error_breakdown if error occurred
153-
if status_code >= 400 and error_info:
154-
error_type = error_info.get("error_type", "UnknownError")
155-
function_name = error_info.get("function_name", "unknown")
152+
# Track error_breakdown only for faults (5xx), matching Java. A 4xx is
153+
# still reflected in the aggregate "errors" counter above, but does not
154+
# produce an EndpointErrorMetric breakdown data point.
155+
if status_code >= 500 and error_info:
156+
# error_info, when present, always carries both keys: the extractor
157+
# (_extract_error_from_call_path) returns either None or a complete
158+
# {error_type, function_name} dict. Index directly so no synthetic
159+
# "UnknownError"/"unknown" default can silently resurface here — the
160+
# very value the extractor's None return was added to eliminate.
161+
error_type = error_info["error_type"]
162+
function_name = error_info["function_name"]
156163

157164
# Create unique key for this error pattern
158165
error_key = f"{error_type}:{function_name}"

aws-opentelemetry-distro/src/amazon/opentelemetry/distro/serviceevents/instrumentation/flask_instrumentation.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,13 @@ def _extract_error_from_call_path(exception, route, method) -> Optional[Dict]:
389389
elif isinstance(exc_data, dict) and exc_data.get("name"):
390390
error_type = exc_data["name"]
391391
else:
392-
error_type = "UnknownError"
392+
# No real error type was captured — neither a passed-in exception nor a
393+
# monitor-recorded one. Return None so callers omit the error breakdown
394+
# entirely, matching Java (whose gate is `statusCode >= 500 && errorType
395+
# != null`). A 5xx with no captured exception (e.g. a handler that returns
396+
# a 500 status without raising) must NOT synthesize an "UnknownError"
397+
# breakdown entry.
398+
return None
393399

394400
# Find the origin function_name.
395401
function_name = "unknown"

aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/serviceevents/collectors/test_endpoint_collector.py

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -213,8 +213,13 @@ def test_different_routes_create_separate_entries(self):
213213

214214
self.assertEqual(len(collector._aggregations), 3)
215215

216-
def test_error_info_not_recorded_below_400(self):
217-
"""Test that error_info is not recorded for status codes below 400."""
216+
def test_error_breakdown_not_recorded_below_500(self):
217+
"""error_breakdown is only populated for faults (5xx), matching Java.
218+
219+
Neither a 2xx nor a 4xx with error_info produces a breakdown entry; the
220+
gate is status_code >= 500. (A 4xx still bumps the aggregate ``errors``
221+
counter, asserted separately in test_4xx_increments_errors.)
222+
"""
218223
collector = EndpointMetricCollector(flush_interval_ms=10000)
219224

220225
collector.record_request(
@@ -224,10 +229,41 @@ def test_error_info_not_recorded_below_400(self):
224229
duration_ns=10_000_000,
225230
error_info={"error_type": "ValueError", "function_name": "func_123"},
226231
)
232+
collector.record_request(
233+
route="/api/users",
234+
method="GET",
235+
status_code=404,
236+
duration_ns=10_000_000,
237+
error_info={"error_type": "NotFoundError", "function_name": "func_456"},
238+
)
227239

228240
operation = list(collector._aggregations.keys())[0]
229241
agg = collector._aggregations[operation]
230-
# error_breakdown should be empty since status < 400
242+
# error_breakdown should be empty since no request was a 5xx fault.
243+
self.assertEqual(len(agg["error_breakdown"]), 0)
244+
245+
def test_error_breakdown_not_recorded_for_5xx_without_error_info(self):
246+
"""A 5xx with error_info=None produces no breakdown entry, matching Java.
247+
248+
The framework hook's _extract_error_from_call_path returns None when no real
249+
error type was captured (e.g. a handler that returns a 500 status without
250+
raising), so record_request is called with error_info=None. The request is
251+
still counted as a fault, but no per-error breakdown is recorded — matching
252+
Java's `statusCode >= 500 && errorType != null` gate.
253+
"""
254+
collector = EndpointMetricCollector(flush_interval_ms=10000)
255+
256+
collector.record_request(
257+
route="/api/users",
258+
method="GET",
259+
status_code=500,
260+
duration_ns=10_000_000,
261+
error_info=None,
262+
)
263+
264+
operation = list(collector._aggregations.keys())[0]
265+
agg = collector._aggregations[operation]
266+
self.assertEqual(agg["faults"], 1)
231267
self.assertEqual(len(agg["error_breakdown"]), 0)
232268

233269
def test_5xx_increments_faults(self):
@@ -406,8 +442,14 @@ def test_no_incidents_defaults(self):
406442
self.assertEqual(event.incident_count, 0)
407443
self.assertEqual(event.incidents_exemplar, [])
408444

409-
def test_error_metrics_emitted_per_error_type(self):
410-
"""collect() emits one EndpointErrorMetric per error type alongside the summary."""
445+
def test_error_metrics_emitted_only_for_faults(self):
446+
"""collect() emits one EndpointErrorMetric per 5xx error type; 4xx is excluded.
447+
448+
Matches Java: the breakdown that feeds EndpointErrorMetric is gated on
449+
status_code >= 500. A 4xx with error_info still increments the aggregate
450+
``errors`` counter on the EndpointSummary, but does not produce a breakdown
451+
data point.
452+
"""
411453
emitter = MagicMock()
412454
collector = EndpointMetricCollector(
413455
flush_interval_ms=10000,
@@ -438,13 +480,12 @@ def test_error_metrics_emitted_per_error_type(self):
438480
# EndpointSummary emitted once.
439481
emitter.emit_endpoint_summary.assert_called_once()
440482

441-
# Per-error-type metrics emitted: one EndpointErrorMetric per error type.
483+
# Only the 5xx fault produces an EndpointErrorMetric; the 4xx is excluded.
442484
emitter.emit_endpoint_error_metrics.assert_called_once()
443485
metrics = emitter.emit_endpoint_error_metrics.call_args[0][0]
444486
by_exception = {m.exception: m for m in metrics}
445-
self.assertEqual(set(by_exception), {"RuntimeError", "ValueError"})
487+
self.assertEqual(set(by_exception), {"RuntimeError"})
446488
self.assertEqual(by_exception["RuntimeError"].count, 3)
447-
self.assertEqual(by_exception["ValueError"].count, 2)
448489
for metric in metrics:
449490
self.assertEqual(metric.telemetry_type, "EndpointErrorMetric")
450491

aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/serviceevents/instrumentation/test_django_instrumentation.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,42 @@ def test_processes_incident_on_500(self, mock_clear):
624624
ec_kwargs = mock_ec.record_request.call_args[1]
625625
self.assertIsNotNone(ec_kwargs["error_info"])
626626

627+
@patch("amazon.opentelemetry.distro.serviceevents.instrumentation.django_instrumentation.clear_current_operation")
628+
def test_processes_500_without_exception_omits_error_info(self, mock_clear):
629+
"""A 500 response with no captured exception produces no error breakdown.
630+
631+
Django shares _extract_error_from_call_path with Flask/FastAPI. When a view
632+
returns a 500 status WITHOUT raising (so _finalize_request gets exc=None) and
633+
the monitor recorded no exception (fresh state, reset in setUp), there is no
634+
real error type to attribute. The extractor must return None and the collector
635+
must receive error_info=None — matching Java's `statusCode >= 500 && errorType
636+
!= null` gate. No synthetic "UnknownError" breakdown is produced. The request
637+
is still recorded as a 500 and the incident is still processed.
638+
"""
639+
mock_ec = MagicMock()
640+
mock_isc = MagicMock()
641+
django_mod._endpoint_collector = mock_ec
642+
django_mod._incident_snapshot_collector = mock_isc
643+
644+
request = self._make_request(method="POST", path="/api/orders", route="api/orders", view_name="create_order")
645+
response = MagicMock()
646+
response.status_code = 500
647+
648+
with patch(
649+
"amazon.opentelemetry.distro.serviceevents.instrumentation.django_instrumentation.time"
650+
) as mock_time:
651+
mock_time.perf_counter_ns.return_value = 1050000000
652+
# exc=None: the view returned a 500 status without raising.
653+
_finalize_request(request, response, None)
654+
655+
# The request is still recorded as a 500 (the aggregate fault counter increments)...
656+
ec_kwargs = mock_ec.record_request.call_args[1]
657+
self.assertEqual(ec_kwargs["status_code"], 500)
658+
# ...but with no captured exception type, no per-error breakdown is attributed.
659+
self.assertIsNone(ec_kwargs["error_info"])
660+
# The incident snapshot is still processed for the 500.
661+
mock_isc.process_potential_incident.assert_called_once()
662+
627663
@patch("amazon.opentelemetry.distro.serviceevents.instrumentation.django_instrumentation.clear_current_operation")
628664
def test_infers_500_from_exception(self, mock_clear):
629665
"""When no response is provided, status 500 is inferred from exception."""

aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/serviceevents/instrumentation/test_fastapi_instrumentation.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,14 @@ async def mock_send(msg):
398398
@patch("amazon.opentelemetry.distro.serviceevents.instrumentation.fastapi_instrumentation.clear_current_operation")
399399
@patch("amazon.opentelemetry.distro.serviceevents.instrumentation.fastapi_instrumentation.set_current_operation")
400400
async def test_middleware_processes_incident_on_500(self, mock_set_op, mock_clear_op):
401-
"""Status 500 triggers incident snapshot processing."""
401+
"""Status 500 triggers incident snapshot processing.
402+
403+
The app sends a 500 status WITHOUT raising and with no monitor-captured
404+
exception, so no real error type is available. error_info is therefore None
405+
(the breakdown is omitted), matching Java's `errorType != null` gate — a 500
406+
without a captured exception produces no per-error breakdown. The incident
407+
snapshot is still processed and the request is still recorded.
408+
"""
402409
mock_ec = MagicMock()
403410
mock_isc = MagicMock()
404411
fastapi_mod._endpoint_collector = mock_ec
@@ -423,9 +430,9 @@ async def mock_app(scope, receive, send):
423430
call_kwargs = mock_isc.process_potential_incident.call_args[1]
424431
self.assertEqual(call_kwargs["status_code"], 500)
425432

426-
# Error info should be passed to endpoint collector
433+
# No exception was captured, so no error breakdown is produced (Java parity).
427434
ec_kwargs = mock_ec.record_request.call_args[1]
428-
self.assertIsNotNone(ec_kwargs["error_info"])
435+
self.assertIsNone(ec_kwargs["error_info"])
429436

430437
@patch("amazon.opentelemetry.distro.serviceevents.instrumentation.fastapi_instrumentation.clear_current_operation")
431438
@patch("amazon.opentelemetry.distro.serviceevents.instrumentation.fastapi_instrumentation.set_current_operation")

aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/serviceevents/instrumentation/test_flask_instrumentation.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,15 +133,16 @@ def test_extract_error_with_call_path(self):
133133
self.assertEqual(result["error_type"], "RuntimeError")
134134
self.assertEqual(result["function_name"], "my_func_123")
135135

136-
def test_extract_error_no_investigation_data(self):
137-
"""Returns default values when no investigation data exists."""
136+
def test_extract_error_returns_none_when_no_error_type_captured(self):
137+
"""Returns None when no real error type can be resolved (no passed-in exception and
138+
no monitor-captured one), matching Java's `errorType != null` gate. A 5xx with no
139+
captured exception must NOT synthesize an "UnknownError" breakdown entry."""
138140
_ServiceEventsMonitorState.get_instance()
139141
# No begin_investigation call, so no inv data
140142

141143
result = _extract_error_from_call_path(None, "/api/test", "GET")
142144

143-
self.assertEqual(result["error_type"], "UnknownError")
144-
self.assertEqual(result["function_name"], "unknown")
145+
self.assertIsNone(result)
145146

146147
def test_extract_error_prefers_captured_exception_origin(self):
147148
"""The function the monitor recorded as the thrower wins over call_path[0]."""

0 commit comments

Comments
 (0)