serviceevents: gate error breakdown on faults (5xx) only (#783)

vastin · ADOT Patch workflow · web-flow · commit 0d1bda17a67e · 2026-06-17T05:42:12.000Z
## What

Gate the per-error `error_breakdown` (and the `EndpointErrorMetric`
`count` data points it feeds) on **faults (5xx) only**, and stop
synthesizing an `"UnknownError"` entry when no real exception type was
captured.

The breakdown is now populated only for a request with `status_code >=
500` **and** a captured exception type. A 4xx still increments the
aggregate `errors` counter but produces no breakdown entry; a 5xx
response with no captured exception produces no synthetic entry.

## Changes

- **Gate `>= 400` → `>= 500`** in `endpoint_collector.py`.
- **No synthetic `UnknownError`**: `_extract_error_from_call_path`
returns `None` (was a synthetic `{"error_type": "UnknownError",
"function_name": "unknown"}` dict) when no real type can be resolved, so
callers omit the breakdown.
- The collector indexes `error_info` directly (the extractor is the sole
producer of a complete-or-`None` dict), so no synthetic default can
silently resurface.

## Tests

- Fault-only gate (4xx excluded from breakdown, still counted as an
error).
- 5xx-without-exception omission across Flask / Django / FastAPI.
- The `None`-return extractor contract.

Lint clean: `black`, `isort`, `flake8`, `codespell` all pass;
serviceevents suite green (716 passing).

Co-authored-by: ADOT Patch workflow <adot-patch-workflow@github.com>
diff --git a/aws-opentelemetry-distro/src/amazon/opentelemetry/distro/serviceevents/collectors/endpoint_collector.py b/aws-opentelemetry-distro/src/amazon/opentelemetry/distro/serviceevents/collectors/endpoint_collector.py
@@ -149,10 +149,17 @@ def record_request(
             elif status_code >= 400:
                 agg["errors"] += 1
 
-            # Track error_breakdown if error occurred
-            if status_code >= 400 and error_info:
-                error_type = error_info.get("error_type", "UnknownError")
-                function_name = error_info.get("function_name", "unknown")
+            # Track error_breakdown only for faults (5xx), matching Java. A 4xx is
+            # still reflected in the aggregate "errors" counter above, but does not
+            # produce an EndpointErrorMetric breakdown data point.
+            if status_code >= 500 and error_info:
+                # error_info, when present, always carries both keys: the extractor
+                # (_extract_error_from_call_path) returns either None or a complete
+                # {error_type, function_name} dict. Index directly so no synthetic
+                # "UnknownError"/"unknown" default can silently resurface here — the
+                # very value the extractor's None return was added to eliminate.
+                error_type = error_info["error_type"]
+                function_name = error_info["function_name"]
 
                 # Create unique key for this error pattern
                 error_key = f"{error_type}:{function_name}"
diff --git a/aws-opentelemetry-distro/src/amazon/opentelemetry/distro/serviceevents/instrumentation/flask_instrumentation.py b/aws-opentelemetry-distro/src/amazon/opentelemetry/distro/serviceevents/instrumentation/flask_instrumentation.py
@@ -389,7 +389,13 @@ def _extract_error_from_call_path(exception, route, method) -> Optional[Dict]:
     elif isinstance(exc_data, dict) and exc_data.get("name"):
         error_type = exc_data["name"]
     else:
-        error_type = "UnknownError"
+        # No real error type was captured — neither a passed-in exception nor a
+        # monitor-recorded one. Return None so callers omit the error breakdown
+        # entirely, matching Java (whose gate is `statusCode >= 500 && errorType
+        # != null`). A 5xx with no captured exception (e.g. a handler that returns
+        # a 500 status without raising) must NOT synthesize an "UnknownError"
+        # breakdown entry.
+        return None
 
     # Find the origin function_name.
     function_name = "unknown"
diff --git a/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/serviceevents/collectors/test_endpoint_collector.py b/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/serviceevents/collectors/test_endpoint_collector.py
@@ -213,8 +213,13 @@ def test_different_routes_create_separate_entries(self):
 
         self.assertEqual(len(collector._aggregations), 3)
 
-    def test_error_info_not_recorded_below_400(self):
-        """Test that error_info is not recorded for status codes below 400."""
+    def test_error_breakdown_not_recorded_below_500(self):
+        """error_breakdown is only populated for faults (5xx), matching Java.
+
+        Neither a 2xx nor a 4xx with error_info produces a breakdown entry; the
+        gate is status_code >= 500. (A 4xx still bumps the aggregate ``errors``
+        counter, asserted separately in test_4xx_increments_errors.)
+        """
         collector = EndpointMetricCollector(flush_interval_ms=10000)
 
         collector.record_request(
@@ -224,10 +229,41 @@ def test_error_info_not_recorded_below_400(self):
             duration_ns=10_000_000,
             error_info={"error_type": "ValueError", "function_name": "func_123"},
         )
+        collector.record_request(
+            route="/api/users",
+            method="GET",
+            status_code=404,
+            duration_ns=10_000_000,
+            error_info={"error_type": "NotFoundError", "function_name": "func_456"},
+        )
 
         operation = list(collector._aggregations.keys())[0]
         agg = collector._aggregations[operation]
-        # error_breakdown should be empty since status < 400
+        # error_breakdown should be empty since no request was a 5xx fault.
+        self.assertEqual(len(agg["error_breakdown"]), 0)
+
+    def test_error_breakdown_not_recorded_for_5xx_without_error_info(self):
+        """A 5xx with error_info=None produces no breakdown entry, matching Java.
+
+        The framework hook's _extract_error_from_call_path returns None when no real
+        error type was captured (e.g. a handler that returns a 500 status without
+        raising), so record_request is called with error_info=None. The request is
+        still counted as a fault, but no per-error breakdown is recorded — matching
+        Java's `statusCode >= 500 && errorType != null` gate.
+        """
+        collector = EndpointMetricCollector(flush_interval_ms=10000)
+
+        collector.record_request(
+            route="/api/users",
+            method="GET",
+            status_code=500,
+            duration_ns=10_000_000,
+            error_info=None,
+        )
+
+        operation = list(collector._aggregations.keys())[0]
+        agg = collector._aggregations[operation]
+        self.assertEqual(agg["faults"], 1)
         self.assertEqual(len(agg["error_breakdown"]), 0)
 
     def test_5xx_increments_faults(self):
@@ -406,8 +442,14 @@ def test_no_incidents_defaults(self):
         self.assertEqual(event.incident_count, 0)
         self.assertEqual(event.incidents_exemplar, [])
 
-    def test_error_metrics_emitted_per_error_type(self):
-        """collect() emits one EndpointErrorMetric per error type alongside the summary."""
+    def test_error_metrics_emitted_only_for_faults(self):
+        """collect() emits one EndpointErrorMetric per 5xx error type; 4xx is excluded.
+
+        Matches Java: the breakdown that feeds EndpointErrorMetric is gated on
+        status_code >= 500. A 4xx with error_info still increments the aggregate
+        ``errors`` counter on the EndpointSummary, but does not produce a breakdown
+        data point.
+        """
         emitter = MagicMock()
         collector = EndpointMetricCollector(
             flush_interval_ms=10000,
@@ -438,13 +480,12 @@ def test_error_metrics_emitted_per_error_type(self):
         # EndpointSummary emitted once.
         emitter.emit_endpoint_summary.assert_called_once()
 
-        # Per-error-type metrics emitted: one EndpointErrorMetric per error type.
+        # Only the 5xx fault produces an EndpointErrorMetric; the 4xx is excluded.
         emitter.emit_endpoint_error_metrics.assert_called_once()
         metrics = emitter.emit_endpoint_error_metrics.call_args[0][0]
         by_exception = {m.exception: m for m in metrics}
-        self.assertEqual(set(by_exception), {"RuntimeError", "ValueError"})
+        self.assertEqual(set(by_exception), {"RuntimeError"})
         self.assertEqual(by_exception["RuntimeError"].count, 3)
-        self.assertEqual(by_exception["ValueError"].count, 2)
         for metric in metrics:
             self.assertEqual(metric.telemetry_type, "EndpointErrorMetric")
 
diff --git a/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/serviceevents/instrumentation/test_django_instrumentation.py b/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/serviceevents/instrumentation/test_django_instrumentation.py
@@ -624,6 +624,42 @@ def test_processes_incident_on_500(self, mock_clear):
         ec_kwargs = mock_ec.record_request.call_args[1]
         self.assertIsNotNone(ec_kwargs["error_info"])
 
+    @patch("amazon.opentelemetry.distro.serviceevents.instrumentation.django_instrumentation.clear_current_operation")
+    def test_processes_500_without_exception_omits_error_info(self, mock_clear):
+        """A 500 response with no captured exception produces no error breakdown.
+
+        Django shares _extract_error_from_call_path with Flask/FastAPI. When a view
+        returns a 500 status WITHOUT raising (so _finalize_request gets exc=None) and
+        the monitor recorded no exception (fresh state, reset in setUp), there is no
+        real error type to attribute. The extractor must return None and the collector
+        must receive error_info=None — matching Java's `statusCode >= 500 && errorType
+        != null` gate. No synthetic "UnknownError" breakdown is produced. The request
+        is still recorded as a 500 and the incident is still processed.
+        """
+        mock_ec = MagicMock()
+        mock_isc = MagicMock()
+        django_mod._endpoint_collector = mock_ec
+        django_mod._incident_snapshot_collector = mock_isc
+
+        request = self._make_request(method="POST", path="/api/orders", route="api/orders", view_name="create_order")
+        response = MagicMock()
+        response.status_code = 500
+
+        with patch(
+            "amazon.opentelemetry.distro.serviceevents.instrumentation.django_instrumentation.time"
+        ) as mock_time:
+            mock_time.perf_counter_ns.return_value = 1050000000
+            # exc=None: the view returned a 500 status without raising.
+            _finalize_request(request, response, None)
+
+        # The request is still recorded as a 500 (the aggregate fault counter increments)...
+        ec_kwargs = mock_ec.record_request.call_args[1]
+        self.assertEqual(ec_kwargs["status_code"], 500)
+        # ...but with no captured exception type, no per-error breakdown is attributed.
+        self.assertIsNone(ec_kwargs["error_info"])
+        # The incident snapshot is still processed for the 500.
+        mock_isc.process_potential_incident.assert_called_once()
+
     @patch("amazon.opentelemetry.distro.serviceevents.instrumentation.django_instrumentation.clear_current_operation")
     def test_infers_500_from_exception(self, mock_clear):
         """When no response is provided, status 500 is inferred from exception."""
diff --git a/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/serviceevents/instrumentation/test_fastapi_instrumentation.py b/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/serviceevents/instrumentation/test_fastapi_instrumentation.py
@@ -398,7 +398,14 @@ async def mock_send(msg):
     @patch("amazon.opentelemetry.distro.serviceevents.instrumentation.fastapi_instrumentation.clear_current_operation")
     @patch("amazon.opentelemetry.distro.serviceevents.instrumentation.fastapi_instrumentation.set_current_operation")
     async def test_middleware_processes_incident_on_500(self, mock_set_op, mock_clear_op):
-        """Status 500 triggers incident snapshot processing."""
+        """Status 500 triggers incident snapshot processing.
+
+        The app sends a 500 status WITHOUT raising and with no monitor-captured
+        exception, so no real error type is available. error_info is therefore None
+        (the breakdown is omitted), matching Java's `errorType != null` gate — a 500
+        without a captured exception produces no per-error breakdown. The incident
+        snapshot is still processed and the request is still recorded.
+        """
         mock_ec = MagicMock()
         mock_isc = MagicMock()
         fastapi_mod._endpoint_collector = mock_ec
@@ -423,9 +430,9 @@ async def mock_app(scope, receive, send):
         call_kwargs = mock_isc.process_potential_incident.call_args[1]
         self.assertEqual(call_kwargs["status_code"], 500)
 
-        # Error info should be passed to endpoint collector
+        # No exception was captured, so no error breakdown is produced (Java parity).
         ec_kwargs = mock_ec.record_request.call_args[1]
-        self.assertIsNotNone(ec_kwargs["error_info"])
+        self.assertIsNone(ec_kwargs["error_info"])
 
     @patch("amazon.opentelemetry.distro.serviceevents.instrumentation.fastapi_instrumentation.clear_current_operation")
     @patch("amazon.opentelemetry.distro.serviceevents.instrumentation.fastapi_instrumentation.set_current_operation")
diff --git a/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/serviceevents/instrumentation/test_flask_instrumentation.py b/aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/serviceevents/instrumentation/test_flask_instrumentation.py
@@ -133,15 +133,16 @@ def test_extract_error_with_call_path(self):
         self.assertEqual(result["error_type"], "RuntimeError")
         self.assertEqual(result["function_name"], "my_func_123")
 
-    def test_extract_error_no_investigation_data(self):
-        """Returns default values when no investigation data exists."""
+    def test_extract_error_returns_none_when_no_error_type_captured(self):
+        """Returns None when no real error type can be resolved (no passed-in exception and
+        no monitor-captured one), matching Java's `errorType != null` gate. A 5xx with no
+        captured exception must NOT synthesize an "UnknownError" breakdown entry."""
         _ServiceEventsMonitorState.get_instance()
         # No begin_investigation call, so no inv data
 
         result = _extract_error_from_call_path(None, "/api/test", "GET")
 
-        self.assertEqual(result["error_type"], "UnknownError")
-        self.assertEqual(result["function_name"], "unknown")
+        self.assertIsNone(result)
 
     def test_extract_error_prefers_captured_exception_origin(self):
         """The function the monitor recorded as the thrower wins over call_path[0]."""