refactor(tracing): split _run worker into named helpers and expand tests

mohammadatallah-scale · mohammadatallah-scale · commit e6ea039da808 · 2026-05-04T15:03:57.000-04:00
Code clarity changes (no behavior change):
- Split SGPAsyncTracingProcessor._run into _is_shutting_down,
  _wait_for_flush_signal, and _safe_drain helpers so the loop reads
  top-to-bottom: "while not shutting down, wait for trigger, drain."
- Add docstrings on _enqueue, _ensure_started, _drain, and
  _upsert_with_retry covering inputs, side effects, and dropped-batch
  semantics.

New tests (regression coverage that was missing):
- test_drain_splits_into_multiple_batches_above_max_batch_size: 80
  enqueued events split into multiple upsert_batch calls, each batch
  capped at MAX_BATCH_SIZE (50).
- test_worker_continues_after_unexpected_exception_in_one_batch: a
  RuntimeError on one upsert drops that batch; the worker keeps
  flushing and a subsequent batch lands. Covers the worker-survival
  path; the per-iteration try/except now lives in _safe_drain.
diff --git a/src/agentex/lib/core/tracing/processors/sgp_tracing_processor.py b/src/agentex/lib/core/tracing/processors/sgp_tracing_processor.py
@@ -164,8 +164,10 @@ def _add_source_to_span(self, span: Span) -> None:
     def _ensure_started(self) -> None:
         """Initialize per-loop queue + worker on first use, or after a loop swap.
 
-        Must be called from inside an async method so `get_running_loop()`
-        is safe.
+        Must be called from inside an async method so `get_running_loop()` is
+        safe. Idempotent on the same loop while the worker is healthy; on a
+        loop change or worker death, it rebuilds the queue and worker (items
+        in the previous queue are lost — they were tied to a now-dead loop).
         """
         if self.disabled:
             return
@@ -246,6 +248,8 @@ async def shutdown(self) -> None:
             self._worker.cancel()
 
     def _enqueue(self, sgp_span: SGPSpan) -> None:
+        """Push a span onto the queue and signal an early flush if the queue
+        has reached `DEFAULT_TRIGGER_QUEUE_SIZE`. Drops the span on overflow."""
         if self._queue is None:
             return
         try:
@@ -256,37 +260,51 @@ def _enqueue(self, sgp_span: SGPSpan) -> None:
         if self._flush_event is not None and self._queue.qsize() >= DEFAULT_TRIGGER_QUEUE_SIZE:
             self._flush_event.set()
 
-    async def _run(self) -> None:
+    def _is_shutting_down(self) -> bool:  # True only once a shutdown event exists and is set
+        return self._shutdown_event is not None and self._shutdown_event.is_set()
+
+    async def _wait_for_flush_signal(self) -> None:
+        """Block until either an early-flush signal arrives or the cadence
+        timer fires, then clear the flush signal. Returns either way; the
+        caller is responsible for draining."""
+        assert self._flush_event is not None
         try:
-            while not (self._shutdown_event and self._shutdown_event.is_set()):
-                # Wake on either an early-flush signal or the cadence timer.
-                assert self._flush_event is not None
-                try:
-                    await asyncio.wait_for(self._flush_event.wait(), timeout=DEFAULT_TRIGGER_CADENCE)
-                except asyncio.TimeoutError:
-                    pass
-                self._flush_event.clear()
-                # Per-iteration guard: an unexpected error during one drain
-                # must not kill the worker, otherwise queued items stay
-                # unflushed until shutdown.
-                try:
-                    await self._drain()
-                except asyncio.CancelledError:
-                    raise
-                except Exception:
-                    logger.exception("Tracing worker iteration failed; continuing")
+            await asyncio.wait_for(self._flush_event.wait(), timeout=DEFAULT_TRIGGER_CADENCE)
+        except asyncio.TimeoutError:
+            pass
+        self._flush_event.clear()
 
-            # Final drain on shutdown.
-            try:
-                await self._drain()
-            except Exception:
-                logger.exception("Final tracing drain failed; some spans may be lost")
+    async def _safe_drain(self, log_label: str) -> None:
+        """Run `_drain`; log any unexpected error with `log_label` as the message
+        so one bad iteration can't kill the worker. CancelledError is re-raised."""
+        try:
+            await self._drain()
+        except asyncio.CancelledError:
+            raise
+        except Exception:
+            logger.exception(log_label)
+
+    async def _run(self) -> None:
+        """Background worker. Sleeps until a flush trigger fires, drains the
+        queue, and repeats. On shutdown signal, does one final best-effort
+        drain of pending spans. The outermost try / except logs an unexpected
+        worker crash instead of losing it silently; CancelledError propagates."""
+        try:
+            while not self._is_shutting_down():
+                await self._wait_for_flush_signal()
+                await self._safe_drain("Tracing worker iteration failed; continuing")
+            await self._safe_drain("Final tracing drain failed; some spans may be lost")
         except asyncio.CancelledError:
             raise
         except Exception:
             logger.exception("Async tracing worker crashed")
 
     async def _drain(self) -> None:
+        """Pull spans from the queue and upsert them in batches of up to
+        `DEFAULT_MAX_BATCH_SIZE`. Stops when the queue is empty.
+
+        A span whose `to_request_params()` raises is dropped (logged); the
+        rest of the batch still goes out. This matches the SDK's exporter."""
         if self._queue is None or self.sgp_async_client is None:
             return
         while not self._queue.empty():
@@ -305,6 +323,13 @@ async def _drain(self) -> None:
             await self._upsert_with_retry(batch)
 
     async def _upsert_with_retry(self, batch: list[dict]) -> None:
+        """POST a single batch with the SDK's retry policy: 4 attempts with
+        exponential backoff (starting at `INITIAL_BACKOFF`, capped at `MAX_BACKOFF`).
+
+        - `APIError` triggers retry up to `DEFAULT_RETRIES` attempts.
+        - Anything else is logged and the batch is dropped (we don't know
+          whether the server saw the request, and the SDK already wraps
+          transport-level failures as `APIError`)."""
         if self.sgp_async_client is None:
             return
         backoff = INITIAL_BACKOFF
diff --git a/tests/lib/core/tracing/processors/test_sgp_tracing_processor.py b/tests/lib/core/tracing/processors/test_sgp_tracing_processor.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import uuid
+import asyncio
 from datetime import UTC, datetime
 from unittest.mock import AsyncMock, MagicMock, patch
 
@@ -259,3 +260,64 @@ async def test_shutdown_flushes_queued_spans_in_one_batch(self):
         items = client.spans.upsert_batch.call_args.kwargs["items"]
         # 5 starts + 5 ends = 10 enqueued items, well under MAX_BATCH_SIZE.
         assert len(items) == 10, f"Expected 10 items in the batch, got {len(items)}"
+
+    async def test_drain_splits_into_multiple_batches_above_max_batch_size(self):
+        """More than MAX_BATCH_SIZE (50) queued items must be split across several
+        upsert_batch calls: no call exceeds the cap and together they carry all 80."""
+        processor, client = self._make_processor()
+
+        with patch(f"{MODULE}.create_span", side_effect=lambda **kw: _make_mock_sgp_span()):
+            for _ in range(40):
+                span = _make_span()
+                await processor.on_span_start(span)
+                span.end_time = datetime.now(UTC)
+                await processor.on_span_end(span)
+
+        # 40 starts + 40 ends = 80 enqueued items. With MAX_BATCH_SIZE=50,
+        # that's at least 2 upsert calls.
+        await processor.shutdown()
+
+        assert client.spans.upsert_batch.call_count >= 2, (
+            f"Expected ≥2 batched upserts for 80 events, got {client.spans.upsert_batch.call_count}"
+        )
+        for call in client.spans.upsert_batch.call_args_list:
+            items = call.kwargs["items"]
+            assert len(items) <= 50, f"Batch of {len(items)} exceeds MAX_BATCH_SIZE=50"
+        total_items = sum(len(call.kwargs["items"]) for call in client.spans.upsert_batch.call_args_list)
+        assert total_items == 80, f"Expected 80 items across all batches, got {total_items}"
+
+    async def test_worker_continues_after_unexpected_exception_in_one_batch(self):
+        """A single upsert raising an unexpected (non-APIError) exception
+        must drop that batch and let the worker keep flushing subsequent
+        ones. Regression test for the worker surviving a failed batch."""
+        processor, client = self._make_processor()
+
+        # First call raises (unexpected exception → batch dropped),
+        # subsequent calls succeed.
+        client.spans.upsert_batch.side_effect = [RuntimeError("boom"), None]
+
+        with patch(f"{MODULE}.create_span", side_effect=lambda **kw: _make_mock_sgp_span()):
+            # First flush — will raise inside _upsert_with_retry, batch dropped.
+            span_a = _make_span()
+            await processor.on_span_start(span_a)
+            span_a.end_time = datetime.now(UTC)
+            await processor.on_span_end(span_a)
+            assert processor._flush_event is not None
+            processor._flush_event.set()
+            # Yield (bounded) until the worker has attempted the failing flush;
+            # the number of loop turns needed is an event-loop scheduling detail.
+            for _ in range(100):
+                if client.spans.upsert_batch.call_count:
+                    break
+                await asyncio.sleep(0)
+
+            # Worker must still be alive and able to handle a second batch.
+            span_b = _make_span()
+            await processor.on_span_start(span_b)
+            span_b.end_time = datetime.now(UTC)
+            await processor.on_span_end(span_b)
+
+        await processor.shutdown()
+
+        calls = client.spans.upsert_batch.call_count  # first raised, second succeeded
+        assert calls == 2, f"Worker should have made a second upsert attempt after the first failed; got {calls}"