perf(tracing): span queue linger + per-loop httpx keepalive

smoreinis · smoreinis · commit 958320a16b87 · 2026-05-20T12:40:37.000-07:00
Two compounding causes of slow SGP trace export under load:

- The async drain loop returned size-1 batches almost every time
  because there was no time window for spans to accumulate.  Adds a
  100ms linger (tunable via AGENTEX_SPAN_QUEUE_LINGER_MS) so
  concurrently-emitted spans coalesce into one upsert_batch call.

- httpx keepalive was disabled (max_keepalive_connections=0) in
  SGPAsyncTracingProcessor, AgentexAsyncTracingProcessor, and the ADK
  TracingModule to avoid "bound to a different event loop" errors in
  sync-ACP, so each span paid a full TLS handshake.  Keepalive is now
  enabled (max_keepalive_connections=20).  The two processors keep a
  per-event-loop client cache keyed on id(asyncio.get_running_loop());
  the TracingModule rebuilds its client whenever the loop id changes.
  Connections are reused within a loop and cross-loop safety is
  preserved.

Tests cover linger coalescing, batch-size cap interaction, per-loop
client caching, a keepalive-enabled regression guard, and
disabled-processor null-client behavior.
diff --git a/src/agentex/lib/adk/_modules/tracing.py b/src/agentex/lib/adk/_modules/tracing.py
@@ -67,14 +67,13 @@ def _tracing_service(self) -> TracingService:
         if self._tracing_service_lazy is None or (loop_id is not None and loop_id != self._bound_loop_id):
             import httpx
 
-            # Disable keepalive so each span HTTP call gets a fresh TCP
-            # connection.  Reused connections carry asyncio primitives bound
-            # to the event loop that created them; in sync-ACP / streaming
-            # contexts the loop context can shift between calls, causing
-            # "bound to a different event loop" RuntimeErrors.
+            # Keepalive ON: connections are reused within a single event
+            # loop, eliminating the TLS-handshake-per-span penalty under
+            # load.  Cross-loop safety is preserved by rebuilding the
+            # client whenever loop_id changes (the conditional above).
             agentex_client = create_async_agentex_client(
                 http_client=httpx.AsyncClient(
-                    limits=httpx.Limits(max_keepalive_connections=0),
+                    limits=httpx.Limits(max_keepalive_connections=20),
                 ),
             )
             tracer = AsyncTracer(agentex_client)
diff --git a/src/agentex/lib/core/tracing/processors/agentex_tracing_processor.py b/src/agentex/lib/core/tracing/processors/agentex_tracing_processor.py
@@ -1,4 +1,5 @@
-from typing import Any, Dict, override
+import asyncio
+from typing import TYPE_CHECKING, Any, Dict, override
 
 from agentex import Agentex
 from agentex.types.span import Span
@@ -9,6 +10,9 @@
     AsyncTracingProcessor,
 )
 
+if TYPE_CHECKING:
+    from agentex import AsyncAgentex
+
 
 class AgentexSyncTracingProcessor(SyncTracingProcessor):
     def __init__(self, config: AgentexTracingProcessorConfig):  # noqa: ARG002
@@ -67,19 +71,35 @@ def shutdown(self) -> None:
 
 class AgentexAsyncTracingProcessor(AsyncTracingProcessor):
     def __init__(self, config: AgentexTracingProcessorConfig):  # noqa: ARG002
+        # Per-event-loop client cache.  httpx.AsyncClient is bound to the
+        # loop that created it, so in sync-ACP / streaming contexts (where
+        # the active loop can change between requests) we keep one client
+        # per loop instead of disabling keepalive entirely.
+        self._clients_by_loop_id: dict[int, "AsyncAgentex"] = {}
+
+    def _build_client(self) -> "AsyncAgentex":
         import httpx
 
-        # Disable keepalive so each span HTTP call gets a fresh TCP connection.
-        # Reused connections carry asyncio primitives bound to the event loop
-        # that created them; in sync-ACP / streaming contexts the loop context
-        # can shift between calls, causing "bound to a different event loop"
-        # RuntimeErrors.
-        self.client = create_async_agentex_client(
+        # Keepalive ON: connections are reused within a single event loop,
+        # eliminating the TLS-handshake-per-span penalty under load.
+        return create_async_agentex_client(
             http_client=httpx.AsyncClient(
-                limits=httpx.Limits(max_keepalive_connections=0),
+                limits=httpx.Limits(max_keepalive_connections=20),
             ),
         )
 
+    @property
+    def client(self) -> "AsyncAgentex":
+        try:
+            loop_id = id(asyncio.get_running_loop())
+        except RuntimeError:
+            return self._build_client()
+        client = self._clients_by_loop_id.get(loop_id)
+        if client is None:
+            client = self._build_client()
+            self._clients_by_loop_id[loop_id] = client
+        return client
+
     # TODO(AGX1-199): Add batch create/update endpoints to Agentex API and use
     # them here instead of one HTTP call per span.
     # https://linear.app/scale-epd/issue/AGX1-199/add-agentex-batch-endpoint-for-traces
diff --git a/src/agentex/lib/core/tracing/processors/sgp_tracing_processor.py b/src/agentex/lib/core/tracing/processors/sgp_tracing_processor.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 from typing import cast, override
 
 import scale_gp_beta.lib.tracing as tracing
@@ -92,23 +93,47 @@ def shutdown(self) -> None:
 class SGPAsyncTracingProcessor(AsyncTracingProcessor):
     def __init__(self, config: SGPTracingProcessorConfig):
         self.disabled = config.sgp_api_key == "" or config.sgp_account_id == ""
+        self._config = config
+        # Per-event-loop client cache.  httpx.AsyncClient ties its connection
+        # pool to the loop it was created on; in sync-ACP / streaming contexts
+        # the active loop can change between requests.  Keying by loop id keeps
+        # keepalive on within each loop while staying safe across loops.  This
+        # processor may be constructed at module import time, when no loop is
+        # running, so clients are built lazily on first use, not in __init__.
+        self._clients_by_loop_id: dict[int, AsyncSGPClient] = {}
+        self.env_vars = EnvironmentVariables.refresh()
+
+    def _build_client(self) -> AsyncSGPClient:
         import httpx
 
-        # Disable keepalive so each HTTP call gets a fresh TCP connection,
-        # avoiding "bound to a different event loop" errors in sync-ACP.
-        self.sgp_async_client = (
-            AsyncSGPClient(
-                api_key=config.sgp_api_key,
-                account_id=config.sgp_account_id,
-                base_url=config.sgp_base_url,
-                http_client=httpx.AsyncClient(
-                    limits=httpx.Limits(max_keepalive_connections=0),
-                ),
-            )
-            if not self.disabled
-            else None
+        return AsyncSGPClient(
+            api_key=self._config.sgp_api_key,
+            account_id=self._config.sgp_account_id,
+            base_url=self._config.sgp_base_url,
+            # Keepalive ON: connections are reused within a single event loop,
+            # which removes the TLS-handshake-per-span penalty observed under
+            # load.  Cross-loop safety is preserved by the per-loop cache.
+            http_client=httpx.AsyncClient(
+                limits=httpx.Limits(max_keepalive_connections=20),
+            ),
         )
-        self.env_vars = EnvironmentVariables.refresh()
+
+    def _get_client(self) -> AsyncSGPClient | None:
+        """Return the AsyncSGPClient bound to the current event loop, creating
+        one on first use.  Returns None when the processor is disabled."""
+        if self.disabled:
+            return None
+        try:
+            loop_id = id(asyncio.get_running_loop())
+        except RuntimeError:
+            # Called from outside an event loop — should not happen on the
+            # hot path, but build a one-off client rather than crashing.
+            return self._build_client()
+        client = self._clients_by_loop_id.get(loop_id)
+        if client is None:
+            client = self._build_client()
+            self._clients_by_loop_id[loop_id] = client
+        return client
 
     @override
     async def on_span_start(self, span: Span) -> None:
@@ -123,31 +148,29 @@ async def on_spans_start(self, spans: list[Span]) -> None:
         if not spans:
             return
 
-        sgp_spans = [_build_sgp_span(span, self.env_vars) for span in spans]
-
-        if self.disabled:
+        client = self._get_client()
+        if client is None:
             logger.warning("SGP is disabled, skipping span upsert")
             return
-        await self.sgp_async_client.spans.upsert_batch(  # type: ignore[union-attr]
-            items=[s.to_request_params() for s in sgp_spans]
-        )
+
+        sgp_spans = [_build_sgp_span(span, self.env_vars) for span in spans]
+        await client.spans.upsert_batch(items=[s.to_request_params() for s in sgp_spans])
 
     @override
     async def on_spans_end(self, spans: list[Span]) -> None:
         if not spans:
             return
 
+        client = self._get_client()
+        if client is None:
+            return
+
         sgp_spans: list[SGPSpan] = []
         for span in spans:
             sgp_span = _build_sgp_span(span, self.env_vars)
             sgp_span.end_time = span.end_time.isoformat()  # type: ignore[union-attr]
             sgp_spans.append(sgp_span)
-
-        if self.disabled:
-            return
-        await self.sgp_async_client.spans.upsert_batch(  # type: ignore[union-attr]
-            items=[s.to_request_params() for s in sgp_spans]
-        )
+        await client.spans.upsert_batch(items=[s.to_request_params() for s in sgp_spans])
 
     @override
     async def shutdown(self) -> None:
diff --git a/src/agentex/lib/core/tracing/span_queue.py b/src/agentex/lib/core/tracing/span_queue.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 import asyncio
 from enum import Enum
 from dataclasses import dataclass
@@ -13,6 +14,25 @@
 logger = make_logger(__name__)
 
 _DEFAULT_BATCH_SIZE = 50
+_DEFAULT_LINGER_MS = 100
+
+
+def _read_linger_ms_env() -> int:
+    """Read AGENTEX_SPAN_QUEUE_LINGER_MS from the environment, falling back to
+    _DEFAULT_LINGER_MS when unset or unparseable.  Negative values are clamped
+    to 0 (i.e. "drain immediately, no linger")."""
+    raw = os.environ.get("AGENTEX_SPAN_QUEUE_LINGER_MS")
+    if raw is None:
+        return _DEFAULT_LINGER_MS
+    try:
+        return max(0, int(raw))
+    except ValueError:
+        logger.warning(
+            "Ignoring invalid AGENTEX_SPAN_QUEUE_LINGER_MS=%r; using default %d ms",
+            raw,
+            _DEFAULT_LINGER_MS,
+        )
+        return _DEFAULT_LINGER_MS
 
 
 class SpanEventType(str, Enum):
@@ -35,13 +55,23 @@ class AsyncSpanQueue:
     batch are flushed concurrently, then all END events, so that per-span
     start-before-end ordering is preserved while HTTP calls for independent
     spans execute in parallel.
+
+    Once the drain loop picks up the first item, it lingers up to
+    ``linger_ms`` waiting for more items to coalesce into the same batch.
+    Without the linger the drain almost always returned size-1 batches under
+    real agent workloads, because spans typically arrive a few ms apart.
     """
 
-    def __init__(self, batch_size: int = _DEFAULT_BATCH_SIZE) -> None:
+    def __init__(
+        self,
+        batch_size: int = _DEFAULT_BATCH_SIZE,
+        linger_ms: int | None = None,
+    ) -> None:
         self._queue: asyncio.Queue[_SpanQueueItem] = asyncio.Queue()
         self._drain_task: asyncio.Task[None] | None = None
         self._stopping = False
         self._batch_size = batch_size
+        self._linger_ms = _read_linger_ms_env() if linger_ms is None else max(0, linger_ms)
 
     def enqueue(
         self,
@@ -69,12 +99,30 @@ async def _drain_loop(self) -> None:
             first = await self._queue.get()
             batch: list[_SpanQueueItem] = [first]
 
-            # Opportunistically grab more ready items (non-blocking).
-            while len(batch) < self._batch_size:
-                try:
-                    batch.append(self._queue.get_nowait())
-                except asyncio.QueueEmpty:
-                    break
+            # Linger briefly so spans emitted within the window coalesce into
+            # one batch.  Stop when the batch fills or the linger window
+            # elapses.  The deadline is fixed once, right after the first item
+            # is dequeued, so later arrivals never extend the window.
+            if self._linger_ms > 0 and not self._stopping:
+                loop = asyncio.get_running_loop()
+                deadline = loop.time() + (self._linger_ms / 1000.0)
+                while len(batch) < self._batch_size:
+                    remaining = deadline - loop.time()
+                    if remaining <= 0:
+                        break
+                    try:
+                        batch.append(
+                            await asyncio.wait_for(self._queue.get(), timeout=remaining)
+                        )
+                    except asyncio.TimeoutError:
+                        break
+            else:
+                # No linger — drain whatever is already queued and stop.
+                while len(batch) < self._batch_size:
+                    try:
+                        batch.append(self._queue.get_nowait())
+                    except asyncio.QueueEmpty:
+                        break
 
             try:
                 # Separate START and END events.  Processing all STARTs before
diff --git a/tests/lib/core/tracing/processors/test_sgp_tracing_processor.py b/tests/lib/core/tracing/processors/test_sgp_tracing_processor.py
diff --git a/tests/lib/core/tracing/test_span_queue.py b/tests/lib/core/tracing/test_span_queue.py