feat: Metrics to count all non-terminal execution statuses (#186)

morgan-wowk · web-flow · commit 9b5a9dc73cc3 · 2026-04-27T13:30:23.000-07:00
### TL;DR Added a metrics polling service that periodically queries the database to emit execution status count gauges for monitoring active (non-terminal) pipeline executions. ![Screenshot 2026-03-23 at 5.24.33 PM.png](https://app.graphite.com/user-attachments/assets/bde3e0ea-123d-484b-abae-23c418332a5b.png) ### What changed? - Added `EXECUTIONS` metric unit to `MetricUnit` enum - Created `execution_status_count` observable gauge to track execution node counts by status - Implemented `PollingService` class in new `metrics_polling.py` module that: - Queries the database every 30 seconds for execution status counts - Only tracks active (non-terminal) statuses like PENDING, RUNNING, etc. - Thread-safely updates gauge observations with current counts - Integrated metrics poller as a daemon thread in the application startup, with automatic detection of OpenTelemetry metrics configuration ### How to test? 1. Set the OpenTelemetry metrics exporter endpoint environment variable (see `examples/observability/otel-jaeger-prometheus`) 2. Start the application and verify the metrics poller thread launches 3. Create pipeline executions in various active statuses 4. Check that the `execution.status.count` gauge emits observations with correct counts for each status 5. Verify that terminal statuses (SUCCEEDED, FAILED) are not included in the gauge metrics ### Why make this change? This enables real-time monitoring of pipeline execution health by providing visibility into how many executions are currently in each active state, which is essential for operational observability.
diff --git a/cloud_pipelines_backend/instrumentation/metrics.py b/cloud_pipelines_backend/instrumentation/metrics.py
@@ -40,6 +40,7 @@ class MetricUnit(str, enum.Enum):
 
     SECONDS = "s"
     ERRORS = "{error}"
+    EXECUTIONS = "{execution}"
 
 
 # ---------------------------------------------------------------------------
@@ -59,6 +60,13 @@ class MetricUnit(str, enum.Enum):
     unit=MetricUnit.SECONDS,
 )
 
+execution_status_count = orchestrator_meter.create_observable_gauge(
+    name="execution.status.count",
+    callbacks=[],
+    description="Number of execution nodes in each active (non-terminal) status",
+    unit=MetricUnit.EXECUTIONS,
+)
+
 
 # ---------------------------------------------------------------------------
 # SQLAlchemy event listeners
diff --git a/cloud_pipelines_backend/instrumentation/metrics_polling.py b/cloud_pipelines_backend/instrumentation/metrics_polling.py
@@ -0,0 +1,107 @@
+"""Metrics polling.
+
+Periodically queries the DB and updates ObservableGauges. Currently emits
+execution status counts; add new DB-backed metrics here as needed.
+
+Only fluctuating (non-terminal) statuses are emitted as status count gauges —
+terminal statuses like SUCCEEDED and FAILED only ever climb and are not useful
+as gauges.
+"""
+
+import logging
+import time
+import typing
+
+import sqlalchemy as sql
+from opentelemetry import metrics as otel_metrics
+from sqlalchemy import orm
+
+from .. import backend_types_sql as bts
+from . import metrics as app_metrics
+from .opentelemetry._internal import configuration as otel_configuration
+
+_logger = logging.getLogger(__name__)
+
+
+# All statuses minus terminal (ended) ones — these fluctuate up and down
+_ACTIVE_STATUSES: frozenset[bts.ContainerExecutionStatus] = (
+    frozenset(bts.ContainerExecutionStatus) - bts.CONTAINER_STATUSES_ENDED
+)
+
+
+def _empty_status_counts() -> dict[str, int]:
+    return {s.value: 0 for s in _ACTIVE_STATUSES}
+
+
+class PollingService:
+    """Polls the DB periodically and emits execution status count gauges."""
+
+    def __init__(
+        self,
+        *,
+        session_factory: typing.Callable[[], orm.Session],
+        poll_interval_seconds: float = 30.0,
+    ) -> None:
+        self._session_factory = session_factory
+        self._poll_interval_seconds = poll_interval_seconds
+        # Initialize all active statuses to 0
+        self._counts: dict[str, int] = _empty_status_counts()
+        # Register our observe method as the gauge callback.
+        # The OTel SDK stores callbacks in _callbacks; we append after creation
+        # since create_observable_gauge is called at module load time in metrics.py.
+        app_metrics.execution_status_count._callbacks.append(self._observe)
+
+    def run_loop(self) -> None:
+        while True:
+            try:
+                self._poll()
+            except Exception:
+                _logger.exception("Metrics PollingService: error polling DB")
+            time.sleep(self._poll_interval_seconds)
+
+    def _poll(self) -> None:
+        with self._session_factory() as session:
+            rows = session.execute(
+                sql.select(
+                    bts.ExecutionNode.container_execution_status,
+                    sql.func.count().label("count"),
+                )
+                .where(
+                    bts.ExecutionNode.container_execution_status.in_(_ACTIVE_STATUSES)
+                )
+                .group_by(bts.ExecutionNode.container_execution_status)
+            ).all()
+        new_counts = _empty_status_counts()
+        for status, count in rows:
+            if status is not None:
+                new_counts[status.value] = count
+        # CPython: attribute assignment is atomic under the GIL; no lock needed.
+        # If GIL-free Python is ever adopted, revisit this.
+        self._counts = new_counts
+        _logger.debug(f"Metrics PollingService: polled status counts: {new_counts}")
+
+    def _observe(
+        self, _options: otel_metrics.CallbackOptions
+    ) -> typing.Iterable[otel_metrics.Observation]:
+        counts = self._counts.copy()
+        for status_value, count in counts.items():
+            yield otel_metrics.Observation(count, {"execution.status": status_value})
+
+
+def run(*, db_engine: sql.Engine) -> None:
+    """Check OTel config and run the metrics polling loop (blocking).
+
+    Logs and returns immediately if no metrics endpoint is configured.
+    """
+    otel_config = otel_configuration.resolve()
+    if otel_config is None or otel_config.metrics is None:
+        _logger.info(
+            f"No OTel metrics endpoint configured"
+            f" (set {otel_configuration.EnvVar.METRIC_EXPORTER_ENDPOINT})"
+            f" — metrics poller not starting"
+        )
+        return
+    session_factory = orm.sessionmaker(
+        autocommit=False, autoflush=False, bind=db_engine
+    )
+    PollingService(session_factory=session_factory).run_loop()
diff --git a/start_local.py b/start_local.py
@@ -211,6 +211,13 @@ def run_orchestrator(
 # endregion
 
 
+# region: Metrics poller initialization
+
+from cloud_pipelines_backend.instrumentation import metrics_polling
+
+# endregion
+
+
 # region: API Server initialization
 import contextlib
 import threading
@@ -228,9 +235,9 @@ def run_orchestrator(
 @contextlib.asynccontextmanager
 async def lifespan(app: fastapi.FastAPI):
     database_ops.initialize_and_migrate_db(db_engine=db_engine)
+    threading.Thread(target=run_configured_orchestrator, daemon=True).start()
     threading.Thread(
-        target=run_configured_orchestrator,
-        daemon=True,
+        target=metrics_polling.run, kwargs={"db_engine": db_engine}, daemon=True
     ).start()
     if os.environ.get("GOOGLE_CLOUD_SHELL") == "true":
         # TODO: Find a way to get fastapi/starlette/uvicorn port