scaleapi
diff --git a/‎CHANGELOG.md‎
Lines changed: 6 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/agentex/lib/core/observability/tests/test_tracing_metrics.py‎
Lines changed: 93 additions & 0 deletions b/‎src/agentex/lib/core/observability/tests/test_tracing_metrics.py‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎src/agentex/lib/core/observability/tests/test_tracing_metrics_recording.py‎
Lines changed: 124 additions & 0 deletions b/‎src/agentex/lib/core/observability/tests/test_tracing_metrics_recording.py‎
Lines changed: 124 additions & 0 deletions
diff --git a/‎src/agentex/lib/core/observability/tracing_metrics.py‎
Lines changed: 165 additions & 0 deletions b/‎src/agentex/lib/core/observability/tracing_metrics.py‎
Lines changed: 165 additions & 0 deletions
@@ -1,5 +1,11 @@
 # Changelog
 
+## Unreleased
+
+### Features
+
+* **tracing:** emit OTel metrics for async span queue depth, batch drain, and SGP export success/failure (HTTP status labels). Disable SDK-side recording with ``AGENTEX_TRACING_METRICS=0``.
+
 ## 0.11.2 (2026-05-13)
 
 Full Changelog: [v0.11.1...v0.11.2](https://github.com/scaleapi/scale-agentex-python/compare/v0.11.1...v0.11.2)
 
@@ -0,0 +1,93 @@
+"""Tests for ``agentex.lib.core.observability.tracing_metrics``."""
+
+from __future__ import annotations
+
+import agentex.lib.core.observability.tracing_metrics as tracing_metrics
+from agentex.lib.core.observability.tracing_metrics import (
+    TracingMetrics,
+    classify_export_error,
+    get_tracing_metrics,
+    processor_label,
+)
+
+
+class TestClassifyExportError:
+    """Checks the ``(error_class, http_code)`` pair returned by ``classify_export_error``."""
+
+    @staticmethod
+    def _make_error(class_name: str, message: str) -> Exception:
+        """Instantiate a throwaway ``Exception`` subclass whose ``__name__`` is ``class_name``."""
+        return type(class_name, (Exception,), {})(message)
+
+    def test_scale_gp_authentication_error(self):
+        error = self._make_error(
+            "AuthenticationError",
+            "Error code: 401 - {'message': 'Not authorized to access Account'}",
+        )
+        assert classify_export_error(error) == ("authentication", "401")
+
+    def test_rate_limit_code(self):
+        error = self._make_error("APIError", "Error code: 429 - rate limited")
+        assert classify_export_error(error) == ("rate_limit", "429")
+
+    def test_server_error_code(self):
+        error = self._make_error("APIError", "Error code: 503 - unavailable")
+        assert classify_export_error(error) == ("server_error", "5xx")
+
+    def test_timeout_by_name(self):
+        # No status code in the message: the class name alone drives the result.
+        error = self._make_error("APITimeoutError", "slow")
+        assert classify_export_error(error) == ("timeout", "timeout")
+
+    def test_unknown_error(self):
+        error = self._make_error("WeirdError", "boom")
+        assert classify_export_error(error) == ("other_error", "unknown")
+
+
+class TestProcessorLabel:
+    """Checks the class-name to label mapping done by ``processor_label``."""
+
+    def test_sgp_async_processor(self):
+        # Only the class name matters, so an empty dynamically-built class is enough.
+        sgp_processor = type("SGPAsyncTracingProcessor", (), {})()
+        assert processor_label(sgp_processor) == "sgp"
+
+    def test_other_processor(self):
+        agentex_processor = type("AgentexAsyncTracingProcessor", (), {})()
+        assert processor_label(agentex_processor) == "other"
+
+
+class TestGetTracingMetrics:
+    """Covers the ``get_tracing_metrics`` singleton accessor and its instruments."""
+
+    # Attribute names every ``TracingMetrics`` instance is expected to expose.
+    _INSTRUMENT_NAMES = (
+        "span_events_enqueued",
+        "span_events_dropped",
+        "queue_depth",
+        "queue_lag_ms",
+        "batch_items",
+        "batch_size",
+        "batch_drain_duration_ms",
+        "export_batches",
+        "export_spans",
+        "export_batch_failures",
+        "export_spans_failed",
+        "shutdown_timeouts",
+        "shutdown_remaining_items",
+    )
+
+    @staticmethod
+    def _fresh_metrics(monkeypatch) -> TracingMetrics:
+        """Clear the cached singleton (``monkeypatch`` restores it) and build a new one."""
+        monkeypatch.setattr(tracing_metrics, "_tracing_metrics", None)
+        return get_tracing_metrics()
+
+    def test_returns_tracing_metrics_instance(self, monkeypatch):
+        assert isinstance(self._fresh_metrics(monkeypatch), TracingMetrics)
+
+    def test_singleton_returns_same_instance(self, monkeypatch):
+        first = self._fresh_metrics(monkeypatch)
+        # A second call without another reset must hand back the cached object.
+        assert first is get_tracing_metrics()
+
+    def test_instruments_exist(self, monkeypatch):
+        m = self._fresh_metrics(monkeypatch)
+        for name in self._INSTRUMENT_NAMES:
+            assert hasattr(m, name), f"missing instrument: {name}"
@@ -0,0 +1,124 @@
+"""Tests for ``agentex.lib.core.observability.tracing_metrics_recording``."""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+import agentex.lib.core.observability.tracing_metrics_recording as recording
+
+
+class _Item:
+    """Test double for a batch item; it carries only an ``enqueued_at`` value."""
+
+    # Timestamp handed to the constructor, or ``None`` when no timestamp is given.
+    enqueued_at: float | None
+
+    def __init__(self, enqueued_at: float | None) -> None:
+        self.enqueued_at = enqueued_at
+
+
+class TestIsMetricsEnabled:
+    """Checks how ``is_metrics_enabled`` reads the ``AGENTEX_TRACING_METRICS`` switch.
+
+    ``recording._metrics_enabled`` is reset through ``monkeypatch.setattr`` rather
+    than by plain assignment, so the value each test leaves behind (presumably a
+    cached result -- confirm against ``tracing_metrics_recording``) is rolled back
+    at teardown and cannot leak into later tests.
+    """
+
+    def test_enabled_by_default(self, monkeypatch):
+        monkeypatch.setattr(recording, "_metrics_enabled", None)
+        monkeypatch.delenv("AGENTEX_TRACING_METRICS", raising=False)
+        assert recording.is_metrics_enabled() is True
+
+    def test_disabled_by_zero(self, monkeypatch):
+        monkeypatch.setattr(recording, "_metrics_enabled", None)
+        monkeypatch.setenv("AGENTEX_TRACING_METRICS", "0")
+        assert recording.is_metrics_enabled() is False
+
+
+class TestRecordingHelpers:
+    """Exercises the ``record_*`` helpers with the kill switch on and off.
+
+    Every test configures the switch through ``_set_kill_switch``, which routes
+    both the environment variable and the ``recording._metrics_enabled`` reset
+    through ``monkeypatch``.  Both are therefore undone at teardown, so no state
+    set here carries over into other tests.
+    """
+
+    # Patch target for the metrics accessor that the helpers are expected to call.
+    _GET_METRICS = "agentex.lib.core.observability.tracing_metrics.get_tracing_metrics"
+
+    @staticmethod
+    def _set_kill_switch(monkeypatch, value: str) -> None:
+        """Set ``AGENTEX_TRACING_METRICS`` to ``value`` and clear the module-level flag."""
+        monkeypatch.setenv("AGENTEX_TRACING_METRICS", value)
+        monkeypatch.setattr(recording, "_metrics_enabled", None)
+
+    def test_record_span_enqueued_when_disabled_does_not_load_metrics(self, monkeypatch):
+        self._set_kill_switch(monkeypatch, "0")
+        with patch(self._GET_METRICS) as mock_get:
+            recording.record_span_enqueued("start")
+            mock_get.assert_not_called()
+
+    def test_record_span_enqueued_when_enabled(self, monkeypatch):
+        self._set_kill_switch(monkeypatch, "1")
+        mock_metrics = MagicMock()
+        with patch(self._GET_METRICS, return_value=mock_metrics):
+            recording.record_span_enqueued("end")
+        mock_metrics.span_events_enqueued.add.assert_called_once_with(1, {"event_type": "end"})
+
+    def test_monotonic_if_enabled_respects_kill_switch(self, monkeypatch):
+        self._set_kill_switch(monkeypatch, "0")
+        assert recording.monotonic_if_enabled() is None
+
+    def test_record_batch_coalesced_records_lag(self, monkeypatch):
+        self._set_kill_switch(monkeypatch, "1")
+        mock_metrics = MagicMock()
+        # The clock is frozen at 10.0 and the oldest item was enqueued at 9.0,
+        # so the expected recorded lag is 1000.0 ms.
+        with patch(self._GET_METRICS, return_value=mock_metrics), patch(
+            "agentex.lib.core.observability.tracing_metrics_recording.time.monotonic",
+            return_value=10.0,
+        ):
+            recording.record_batch_coalesced(
+                queue_depth=3,
+                batch_items=[_Item(9.5), _Item(9.0)],
+            )
+        mock_metrics.queue_depth.record.assert_called_once_with(3)
+        mock_metrics.batch_items.record.assert_called_once_with(2)
+        mock_metrics.queue_lag_ms.record.assert_called_once_with(1000.0)
+
+    def test_record_export_failure(self, monkeypatch):
+        self._set_kill_switch(monkeypatch, "1")
+        mock_metrics = MagicMock()
+
+        class AuthenticationError(Exception):
+            pass
+
+        exc = AuthenticationError("Error code: 401 - denied")
+        processor = type("SGPAsyncTracingProcessor", (), {})()
+
+        with patch(self._GET_METRICS, return_value=mock_metrics):
+            recording.record_export_failure(
+                processor=processor,
+                event_type="start",
+                span_count=5,
+                exc=exc,
+            )
+
+        mock_metrics.export_batch_failures.add.assert_called_once()
+        mock_metrics.export_spans_failed.add.assert_called_once_with(
+            5,
+            {
+                "processor": "sgp",
+                "event_type": "start",
+                "http_code": "401",
+                "error_class": "authentication",
+            },
+        )
+
+    def test_record_export_success(self, monkeypatch):
+        self._set_kill_switch(monkeypatch, "1")
+        mock_metrics = MagicMock()
+        with patch(self._GET_METRICS, return_value=mock_metrics):
+            recording.record_export_success(event_type="end", span_count=12)
+
+        success_attrs = {"processor": "sgp", "event_type": "end", "outcome": "success"}
+        mock_metrics.export_batches.add.assert_called_once_with(1, success_attrs)
+        mock_metrics.export_spans.add.assert_called_once_with(12, success_attrs)
@@ -0,0 +1,165 @@
+"""OTel metrics for async span queue and SGP export telemetry.
+
+Single source of truth for span-queue / export instrumentation.  Import
+``get_tracing_metrics()`` or the ``record_*`` helpers in
+``tracing_metrics_recording`` from hot paths — never configure a
+``MeterProvider`` here.
+
+The meter is no-op when the application has not configured a
+``MeterProvider``.  Set ``AGENTEX_TRACING_METRICS=0`` to skip recording
+entirely (see ``tracing_metrics_recording.is_metrics_enabled``).
+
+Cardinality is bounded:
+- ``event_type``: ``start`` | ``end``
+- ``processor``: ``sgp`` | ``other``
+- ``outcome``: ``success`` | ``failure`` (export counters only)
+- ``http_code``: small fixed set from ``classify_export_error``
+- ``error_class``: small fixed set from ``classify_export_error``
+- ``reason``: ``shutdown`` (drops only)
+- ``phase``: ``start`` | ``end`` (batch drain histograms)
+
+Resource attributes (``service.name``, ``k8s.*``, etc.) come from the
+host application's OTel resource configuration.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+from opentelemetry import metrics
+
+# Captures the numeric status from exception messages shaped like
+# "Error code: 401 - ..."; used by ``classify_export_error``.
+_HTTP_CODE_RE = re.compile(r"Error code:\s*(\d+)")
+
+
+class TracingMetrics:
+    """Container for the OTel instruments covering the span queue and export path.
+
+    All instruments are created in ``__init__`` from the ``agentex.tracing``
+    meter and exposed as plain attributes (counters and histograms).
+    """
+
+    def __init__(self) -> None:
+        tracing_meter = metrics.get_meter("agentex.tracing")
+        new_counter = tracing_meter.create_counter
+        new_histogram = tracing_meter.create_histogram
+
+        # Enqueue / drop counters.
+        self.span_events_enqueued = new_counter(
+            name="agentex.tracing.span_events.enqueued",
+            unit="1",
+            description="Span queue START/END events accepted by enqueue()",
+        )
+        self.span_events_dropped = new_counter(
+            name="agentex.tracing.span_events.dropped",
+            unit="1",
+            description="Span queue events dropped (e.g. shutdown)",
+        )
+
+        # Queue and batch histograms.
+        self.queue_depth = new_histogram(
+            name="agentex.tracing.queue.depth",
+            unit="1",
+            description="asyncio queue depth at the start of a drain batch",
+        )
+        self.queue_lag_ms = new_histogram(
+            name="agentex.tracing.queue.lag_ms",
+            unit="ms",
+            description="Max time from enqueue to drain-batch start for items in the batch",
+        )
+        self.batch_items = new_histogram(
+            name="agentex.tracing.batch.items",
+            unit="1",
+            description="Total span events coalesced in one linger/drain batch",
+        )
+        self.batch_size = new_histogram(
+            name="agentex.tracing.batch.size",
+            unit="1",
+            description="Span events in one START or END dispatch phase",
+        )
+        self.batch_drain_duration_ms = new_histogram(
+            name="agentex.tracing.batch.drain_duration_ms",
+            unit="ms",
+            description="Wall time for one START or END _process_items dispatch",
+        )
+
+        # Export counters (attempts, then failures).
+        self.export_batches = new_counter(
+            name="agentex.tracing.export.batches",
+            unit="1",
+            description="HTTP export batch attempts tagged with outcome",
+        )
+        self.export_spans = new_counter(
+            name="agentex.tracing.export.spans",
+            unit="1",
+            description="Spans included in HTTP export batches tagged with outcome",
+        )
+        self.export_batch_failures = new_counter(
+            name="agentex.tracing.export.batch_failures",
+            unit="1",
+            description="Failed HTTP export batches by processor and HTTP status",
+        )
+        self.export_spans_failed = new_counter(
+            name="agentex.tracing.export.spans_failed",
+            unit="1",
+            description="Spans in failed HTTP export batches by processor and HTTP status",
+        )
+
+        # Shutdown instruments.
+        self.shutdown_timeouts = new_counter(
+            name="agentex.tracing.shutdown.timeouts",
+            unit="1",
+            description="Span queue shutdown calls that hit the join timeout",
+        )
+        self.shutdown_remaining_items = new_histogram(
+            name="agentex.tracing.shutdown.remaining_items",
+            unit="1",
+            description="Queue depth when span queue shutdown times out",
+        )
+
+
+# Module-level cache for the singleton; ``None`` until the first accessor call.
+_tracing_metrics: Optional[TracingMetrics] = None
+
+
+def get_tracing_metrics() -> TracingMetrics:
+    """Return the shared ``TracingMetrics`` instance, building it on the first call."""
+    global _tracing_metrics
+    instance = _tracing_metrics
+    if instance is None:
+        instance = _tracing_metrics = TracingMetrics()
+    return instance
+
+
+def processor_label(processor: object) -> str:
+    """Return the low-cardinality ``processor`` label for a tracing processor.
+
+    The result is ``"sgp"`` when the instance's exact class name is
+    ``SGPAsyncTracingProcessor`` and ``"other"`` for everything else.
+    """
+    class_name = type(processor).__name__
+    return "sgp" if class_name == "SGPAsyncTracingProcessor" else "other"
+
+
+def classify_export_error(exc: BaseException) -> tuple[str, str]:
+    """Categorize an export failure into ``(error_class, http_code_label)``.
+
+    Classification uses, in order: the exception class name for timeouts and
+    connection failures, an ``Error code: <n>`` status parsed from ``str(exc)``,
+    and finally other well-known substrings of the class name.
+
+    Both values come from small fixed sets suitable for Prometheus labels:
+
+    - ``error_class``: ``timeout``, ``network_error``, ``authentication``,
+      ``rate_limit``, ``client_error``, ``server_error``, ``other_error``
+    - ``http_code_label``: ``timeout``, ``network``, ``401``, ``403``, ``429``,
+      ``1xx``, ``2xx``, ``3xx``, ``4xx``, ``5xx``, ``unknown``
+
+    Args:
+        exc: The exception raised by the export attempt.
+
+    Returns:
+        The ``(error_class, http_code_label)`` pair described above.
+    """
+    name = type(exc).__name__
+
+    # Class-name checks for transport failures run before the message is parsed.
+    if "Timeout" in name:
+        return "timeout", "timeout"
+    # "Connect" also matches "Connection", so one substring test covers both.
+    if "Connect" in name:
+        return "network_error", "network"
+
+    match = _HTTP_CODE_RE.search(str(exc))
+    if match:
+        code = int(match.group(1))
+        if code in (401, 403):
+            return "authentication", str(code)
+        if code == 429:
+            return "rate_limit", "429"
+        if 400 <= code < 500:
+            return "client_error", "4xx"
+        if 500 <= code < 600:
+            return "server_error", "5xx"
+        # Never emit the raw parsed number as a label: an arbitrary value in the
+        # message would create a new time series each time.  Bucket the remaining
+        # valid status classes and fold anything else into "unknown".
+        if 100 <= code < 400:
+            return "other_error", f"{code // 100}xx"
+        return "other_error", "unknown"
+
+    # No status code in the message: fall back to well-known class-name fragments.
+    if any(s in name for s in ("Authentication", "Permission")):
+        return "authentication", "unknown"
+    if "RateLimit" in name:
+        return "rate_limit", "429"
+    if any(s in name for s in ("ServerError", "InternalServer", "ServiceUnavailable", "BadGateway")):
+        return "server_error", "5xx"
+    if any(
+        s in name
+        for s in ("BadRequest", "NotFound", "Conflict", "UnprocessableEntity")
+    ):
+        return "client_error", "4xx"
+
+    return "other_error", "unknown"