Skip to content

Commit 323c1d1

Browse files
committed
refactor: migrate kernel live_stat from Valkey to Prometheus
1 parent f55366d commit 323c1d1

12 files changed

Lines changed: 518 additions & 61 deletions

File tree

changes/11330.enhance.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Migrate kernel `live_stat` GraphQL resolver from Valkey to Prometheus while preserving the legacy wire shape

src/ai/backend/agent/stats.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,17 @@
2626
from ai.backend.common import msgpack
2727
from ai.backend.common.identity import is_containerized
2828
from ai.backend.common.metrics.metric import StageObserver
29-
from ai.backend.common.metrics.types import UTILIZATION_METRIC_INTERVAL
29+
from ai.backend.common.metrics.types import (
30+
UTILIZATION_METRIC_INTERVAL,
31+
MetricValue,
32+
MovingStatValue,
33+
)
3034
from ai.backend.common.types import (
3135
PID,
3236
ContainerId,
3337
DeviceId,
3438
KernelId,
3539
MetricKey,
36-
MetricValue,
37-
MovingStatValue,
3840
SessionId,
3941
SlotName,
4042
)

src/ai/backend/appproxy/worker/types.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,9 @@
4242
SafeGauge,
4343
SafeHistogram,
4444
)
45+
from ai.backend.common.metrics.types import MetricValue, MovingStatValue
4546
from ai.backend.common.types import (
4647
MetricKey,
47-
MetricValue,
48-
MovingStatValue,
4948
RuntimeVariant,
5049
)
5150

src/ai/backend/client/output/formatters.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
import humanize
1111

12-
from ai.backend.common.types import MetricValue
12+
from ai.backend.common.metrics.types import MetricValue
1313

1414
from .types import AbstractOutputFormatter, FieldSpec
1515

src/ai/backend/common/clients/valkey_client/valkey_stat/client.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from ai.backend.common.exception import BackendAIError
2626
from ai.backend.common.json import dump_json_str, load_json
2727
from ai.backend.common.metrics.metric import DomainType, LayerType
28+
from ai.backend.common.metrics.types import MetricValue
2829
from ai.backend.common.resilience import (
2930
BackoffStrategy,
3031
MetricArgs,
@@ -34,7 +35,7 @@
3435
RetryPolicy,
3536
)
3637
from ai.backend.common.resource.types import TotalResourceData
37-
from ai.backend.common.types import AccessKey, MetricKey, MetricValue, ValkeyTarget
38+
from ai.backend.common.types import AccessKey, MetricKey, ValkeyTarget
3839
from ai.backend.logging.utils import BraceStyleAdapter
3940

4041
log = BraceStyleAdapter(logging.getLogger(__spec__.name))
Lines changed: 97 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,107 @@
1-
from typing import Final
1+
from typing import Final, TypedDict
22

33
UNDEFINED: Final[str] = "undefined"
44

5+
6+
class MovingStatValue(TypedDict):
7+
min: str
8+
max: str
9+
sum: str
10+
avg: str
11+
diff: str
12+
rate: str
13+
version: int | None # for legacy client compatibility
14+
15+
16+
MetricValue = TypedDict(
17+
"MetricValue",
18+
{
19+
"current": str,
20+
"capacity": str,
21+
"pct": str,
22+
"unit_hint": str,
23+
"stats.min": str,
24+
"stats.max": str,
25+
"stats.sum": str,
26+
"stats.avg": str,
27+
"stats.diff": str,
28+
"stats.rate": str,
29+
"stats.version": int | None,
30+
},
31+
)
32+
33+
34+
def make_default_metric_value(unit_hint: str) -> MetricValue:
35+
"""Return a `MetricValue` populated with neutral defaults.
36+
37+
All numeric string fields are `"0"` (including `capacity`, matching the
38+
legacy Valkey shape where every metric carried a string capacity).
39+
`unit_hint` is supplied by the caller.
40+
"""
41+
return MetricValue({
42+
"current": "0",
43+
"capacity": "0",
44+
"pct": "0",
45+
"unit_hint": unit_hint,
46+
"stats.min": "0",
47+
"stats.max": "0",
48+
"stats.sum": "0",
49+
"stats.avg": "0",
50+
"stats.diff": "0",
51+
"stats.rate": "0",
52+
"stats.version": None,
53+
})
54+
55+
556
UTILIZATION_METRIC_INTERVAL: Final[float] = 5.0
UTILIZATION_METRIC_DETENTION: Final[float] = 600.0  # 10 minutes

CONTAINER_UTILIZATION_METRIC_NAME: Final[str] = "backendai_container_utilization"
CONTAINER_UTILIZATION_METRIC_LABEL_NAME: Final[str] = "container_metric_name"
DEVICE_UTILIZATION_METRIC_LABEL_NAME: Final[str] = "device_metric_name"
PROCESS_UTILIZATION_METRIC_LABEL_NAME: Final[str] = "process_metric_name"

# Metric-name classification consumed by the legacy live_stat dict converter.
# These mirror what the agent's MovingStatistics produced back when kernel
# stats were stored in Valkey:
# - RATE_STAT_METRICS: stats.rate carries a meaningful per-second rate.
# - DIFF_STAT_METRICS: stats.diff carries a meaningful delta over the window.
RATE_STAT_METRICS: Final[frozenset[str]] = frozenset({"net_rx", "net_tx"})
DIFF_STAT_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"})

# Unit hint per metric name as emitted by the agent
# (source of truth: src/ai/backend/agent/docker/intrinsic.py).
METRIC_UNIT_HINTS: Final[dict[str, str]] = {
    "cpu_used": "msec",
    "cpu_util": "percent",
    "mem": "bytes",
    "net_rx": "bps",
    "net_tx": "bps",
    "io_read": "bytes",
    "io_write": "bytes",
    "io_scratch_size": "bytes",
}


def resolve_unit_hint(metric_name: str) -> str:
    """Return the unit_hint for a Backend.AI container metric name.

    Prometheus samples do not carry the agent-side ``unit_hint``, so the
    manager must reconstruct it from the metric name alone. Lookup order:

    1. an explicit entry in :data:`METRIC_UNIT_HINTS` wins;
    2. otherwise a naming-convention fallback covers plugin metrics that
       follow Backend.AI conventions (e.g. ``cuda_util``, ``gpu_mem``);
    3. as a last resort the metric name itself is returned, which preserves
       the sample data and surfaces the missing registration to the WebUI
       via the response.
    """
    registered = METRIC_UNIT_HINTS.get(metric_name)
    if registered is not None:
        return registered
    # Convention-based fallback, checked in priority order.
    convention_rules = (
        (lambda name: name.endswith("_util"), "percent"),
        (lambda name: name == "mem" or name.endswith("_mem"), "bytes"),
        (lambda name: name.startswith("io_"), "bytes"),
        (lambda name: name.startswith("net_"), "bps"),
    )
    for matches, hint in convention_rules:
        if matches(metric_name):
            return hint
    return metric_name

src/ai/backend/common/types.py

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,6 @@
106106
"KernelEnqueueingConfig",
107107
"KernelId",
108108
"MetricKey",
109-
"MetricValue",
110109
"ModelServiceProfile",
111110
"ModelServiceStatus",
112111
"MountExpression",
@@ -115,7 +114,6 @@
115114
"MountPermissionLiteral",
116115
"MountPoint",
117116
"MountTypes",
118-
"MovingStatValue",
119117
"PreemptionMode",
120118
"PreemptionOrder",
121119
"PromMetric",
@@ -565,34 +563,6 @@ class AbuseReport(TypedDict):
565563
abuse_report: str | None
566564

567565

568-
class MovingStatValue(TypedDict):
569-
min: str
570-
max: str
571-
sum: str
572-
avg: str
573-
diff: str
574-
rate: str
575-
version: int | None # for legacy client compatibility
576-
577-
578-
MetricValue = TypedDict(
579-
"MetricValue",
580-
{
581-
"current": str,
582-
"capacity": str | None,
583-
"pct": str,
584-
"unit_hint": str,
585-
"stats.min": str,
586-
"stats.max": str,
587-
"stats.sum": str,
588-
"stats.avg": str,
589-
"stats.diff": str,
590-
"stats.rate": str,
591-
"stats.version": int | None,
592-
},
593-
)
594-
595-
596566
class IntrinsicSlotNames(enum.Enum):
597567
CPU = SlotName("cpu")
598568
MEMORY = SlotName("mem")

src/ai/backend/manager/api/gql_legacy/kernel.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
KernelId,
2525
SessionId,
2626
)
27+
from ai.backend.manager.api.gql_legacy.stat_converter import LegacyLiveStatConverter
2728
from ai.backend.manager.data.kernel.types import KernelStatus
2829
from ai.backend.manager.defs import DEFAULT_ROLE
2930
from ai.backend.manager.models.group import groups
@@ -42,6 +43,7 @@
4243
QueryFilterParser,
4344
)
4445
from ai.backend.manager.models.user import UserRole, users
46+
from ai.backend.manager.services.metric.actions.live_stat import ContainerLiveStatAction
4547

4648
from .base import (
4749
BigInt,
@@ -67,6 +69,23 @@
6769
)
6870

6971

72+
async def _batch_load_kernel_live_stat(
    ctx: GraphQueryContext,
    kernel_ids: Sequence[KernelId],
) -> list[dict[str, Any] | None]:
    """Prometheus-backed replacement for the old Valkey
    `KernelStatistics.by_kernel` dataloader.

    Returns one entry per requested kernel id, in request order: the legacy
    `dict[metric_name, MetricValue]` wire shape, or `None` for kernels that
    have no Prometheus samples.
    """
    if not kernel_ids:
        # Nothing to load; skip issuing an empty Prometheus query.
        return []
    action_result = await ctx.processors.metric.query_container_live_stat.wait_for_complete(
        ContainerLiveStatAction(kernel_ids=[*kernel_ids])
    )
    stats_by_kernel = LegacyLiveStatConverter().convert(action_result.stats)
    # Missing kernels map to None, preserving positional alignment with input.
    return [stats_by_kernel.get(kernel_id) for kernel_id in kernel_ids]
87+
88+
7089
class KernelNode(graphene.ObjectType): # type: ignore[misc]
7190
class Meta:
7291
interfaces = (AsyncNode,)
@@ -190,17 +209,10 @@ async def resolve_image(self, info: graphene.ResolveInfo) -> ImageNode | None:
190209
async def resolve_live_stat(self, info: graphene.ResolveInfo) -> dict[str, Any] | None:
191210
graph_ctx: GraphQueryContext = info.context
192211
loader = graph_ctx.dataloader_manager.get_loader_by_func(
193-
graph_ctx, self.batch_load_live_stat
212+
graph_ctx, _batch_load_kernel_live_stat
194213
)
195214
return cast(dict[str, Any] | None, await loader.load(self.row_id))
196215

197-
@classmethod
198-
async def batch_load_live_stat(
199-
cls, ctx: GraphQueryContext, kernel_ids: Sequence[KernelId]
200-
) -> list[dict[str, Any] | None]:
201-
kernel_ids_str = [str(kid) for kid in kernel_ids]
202-
return await ctx.valkey_stat.get_session_statistics_batch(kernel_ids_str)
203-
204216

205217
class KernelConnection(Connection):
206218
class Meta:
@@ -313,7 +325,9 @@ def from_row(cls, ctx: GraphQueryContext, row: KernelRow | None) -> ComputeConta
313325
# we can leave last_stat value for legacy support, as an alias to last_stat
314326
async def resolve_live_stat(self, info: graphene.ResolveInfo) -> Mapping[str, Any] | None:
315327
graph_ctx: GraphQueryContext = info.context
316-
loader = graph_ctx.dataloader_manager.get_loader(graph_ctx, "KernelStatistics.by_kernel")
328+
loader = graph_ctx.dataloader_manager.get_loader_by_func(
329+
graph_ctx, _batch_load_kernel_live_stat
330+
)
317331
return cast(Mapping[str, Any] | None, await loader.load(self.id))
318332

319333
async def resolve_last_stat(self, info: graphene.ResolveInfo) -> Mapping[str, Any] | None:
@@ -606,7 +620,9 @@ class Meta:
606620
# we can leave last_stat value for legacy support, as an alias to last_stat
607621
async def resolve_live_stat(self, info: graphene.ResolveInfo) -> Mapping[str, Any] | None:
608622
graph_ctx: GraphQueryContext = info.context
609-
loader = graph_ctx.dataloader_manager.get_loader(graph_ctx, "KernelStatistics.by_kernel")
623+
loader = graph_ctx.dataloader_manager.get_loader_by_func(
624+
graph_ctx, _batch_load_kernel_live_stat
625+
)
610626
return cast(Mapping[str, Any] | None, await loader.load(self.id))
611627

612628
async def resolve_last_stat(self, info: graphene.ResolveInfo) -> Mapping[str, Any] | None:
@@ -632,7 +648,9 @@ async def _resolve_legacy_metric(
632648
if value is None:
633649
return convert_type(0)
634650
return convert_type(value)
635-
loader = graph_ctx.dataloader_manager.get_loader(graph_ctx, "KernelStatistics.by_kernel")
651+
loader = graph_ctx.dataloader_manager.get_loader_by_func(
652+
graph_ctx, _batch_load_kernel_live_stat
653+
)
636654
kstat = await loader.load(self.id)
637655
if kstat is None:
638656
return convert_type(0)

0 commit comments

Comments
 (0)