From 53463a321ad4dd7d00a119f0551abe2a69c2fc10 Mon Sep 17 00:00:00 2001 From: BoKeum Date: Fri, 8 May 2026 17:25:20 +0900 Subject: [PATCH 01/11] refactor(BA-5878): switch back to PromQL window-based max/avg live stats Revert agent-side stats.max/avg export in favor of computing max/avg via PromQL window expressions on the manager. Container live-stat fan-out grows from 3 to 5 queries (gauge / diff / rate / max / avg). - Re-introduce MAX / AVG to ValueType plus from_legacy_live_stat_label to map "stats.max" / "stats.avg" labels back into typed value types. - Build max/avg templates that union a gauge sub-expression with a rate sub-expression and label_replace the result back to the legacy "stats.max" / "stats.avg" label. - Classify metrics into gauge-shape vs rate-shape stats sources, with exact names for built-ins and regex patterns for accelerator metrics. - Repository merges 5 query responses in a generic loop instead of unpacking three buckets. Co-Authored-By: Claude Opus 4.7 --- changes/11360.feature.md | 1 + .../common/clients/prometheus/client.py | 13 +- .../clients/prometheus/fixed_query_builder.py | 153 +++++++++++++++--- .../common/clients/prometheus/metric_types.py | 23 ++- .../common/clients/prometheus/types.py | 16 ++ .../test_container_metric.py | 69 ++++++++ 6 files changed, 244 insertions(+), 31 deletions(-) create mode 100644 changes/11360.feature.md diff --git a/changes/11360.feature.md b/changes/11360.feature.md new file mode 100644 index 00000000000..4031dca85a0 --- /dev/null +++ b/changes/11360.feature.md @@ -0,0 +1 @@ +Add window-based max/avg container live stats queries via PromQL to populate legacy `stats.max` / `stats.avg` fields diff --git a/src/ai/backend/common/clients/prometheus/client.py b/src/ai/backend/common/clients/prometheus/client.py index 9e4397576fa..a6607a70d80 100644 --- a/src/ai/backend/common/clients/prometheus/client.py +++ b/src/ai/backend/common/clients/prometheus/client.py @@ -84,13 +84,12 @@ async def fetch_container_live_stats( kernel_ids: Sequence[KernelId], ) -> dict[KernelId, list[MetricValue]]: queries = self._fixed_query_builder.get_container_live_stat_queries(kernel_ids) - gauge_response = await self._query_instant(queries.gauge) - diff_response = await self._query_instant(queries.diff) - rate_response = await self._query_instant(queries.rate) - gauge = KernelMetricValuesByKernel.from_prometheus_response(gauge_response) - diff = KernelMetricValuesByKernel.from_prometheus_response(diff_response) - rate = KernelMetricValuesByKernel.from_prometheus_response(rate_response) - merged = gauge.merged_with(diff).merged_with(rate) + merged = KernelMetricValuesByKernel(values_by_kernel={}) + for preset in queries.to_list(): + response = await self._query_instant(preset) + merged = merged.merged_with( + KernelMetricValuesByKernel.from_prometheus_response(response) + ) return merged.values_by_kernel async def execute_preset( diff --git a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py index 18fc8f9bd10..7496da4f07d 100644 --- a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py +++ b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py @@ -6,6 +6,12 @@ from ai.backend.common.clients.prometheus.metric_types import ( DIFF_METRICS, RATE_METRICS, + STATS_AVG_GAUGE_METRIC_PATTERNS, + STATS_AVG_GAUGE_METRICS, + STATS_AVG_OVER_RATE_METRICS, + STATS_MAX_GAUGE_METRIC_PATTERNS, + STATS_MAX_GAUGE_METRICS, + STATS_MAX_OVER_RATE_METRICS, ContainerLiveStatQueries, ContainerMetricOptionalLabel, MetricType, @@ -47,10 +53,49 @@ class LabelValuesQuery: metric_match: str +@dataclass(frozen=True) +class _LiveStatQuerySpec: + template: str + metric_name_filter: frozenset[str] | None = None + value_type_filter: ValueType | None = None + + +@dataclass(frozen=True) +class _StatsBucket: + """Window-stats bucket spec (gauge metrics + rate metrics for a single stat).""" + + value_type: ValueType + gauge_metrics: frozenset[str] + rate_metrics: frozenset[str] + gauge_metric_patterns: frozenset[str] = frozenset() + + def _regex_union(values: Sequence[str]) -> str: return "|".join(re.escape(value) for value in values) +def _metric_name_regex( + metric_names: frozenset[str], + metric_patterns: frozenset[str] = frozenset(), +) -> str: + exact_parts = [re.escape(value) for value in sorted(metric_names)] + return "|".join([*exact_parts, *sorted(metric_patterns)]) + + +_MAX_STATS_BUCKET: Final[_StatsBucket] = _StatsBucket( + value_type=ValueType.MAX, + gauge_metrics=STATS_MAX_GAUGE_METRICS, + rate_metrics=STATS_MAX_OVER_RATE_METRICS, + gauge_metric_patterns=STATS_MAX_GAUGE_METRIC_PATTERNS, +) +_AVG_STATS_BUCKET: Final[_StatsBucket] = _StatsBucket( + value_type=ValueType.AVG, + gauge_metrics=STATS_AVG_GAUGE_METRICS, + rate_metrics=STATS_AVG_OVER_RATE_METRICS, + gauge_metric_patterns=STATS_AVG_GAUGE_METRIC_PATTERNS, +) + + class FixedQueryBuilder: _timewindow: str @@ -100,50 +145,116 @@ def get_container_live_stat_queries( self, kernel_ids: Sequence[KernelId], ) -> ContainerLiveStatQueries: + kernel_id_regex = _regex_union([str(kid) for kid in kernel_ids]) + group_by = ",".join(sorted(_LIVE_STAT_GROUP_BY)) return ContainerLiveStatQueries( - gauge=self._get_container_live_stat_query( + gauge=self._get_live_stat_query( kernel_ids, - metric_type=MetricType.GAUGE, + _LiveStatQuerySpec(template=self._get_template(MetricType.GAUGE)), ), - diff=self._get_container_live_stat_query( + diff=self._get_live_stat_query( kernel_ids, - metric_type=MetricType.DIFF, - metric_name_filter=DIFF_METRICS, - value_type_filter=ValueType.CURRENT, + _LiveStatQuerySpec( + template=self._get_template(MetricType.DIFF), + metric_name_filter=DIFF_METRICS, + value_type_filter=ValueType.CURRENT, + ), ), - rate=self._get_container_live_stat_query( + rate=self._get_live_stat_query( kernel_ids, - metric_type=MetricType.RATE, - metric_name_filter=RATE_METRICS, - value_type_filter=ValueType.CURRENT, + _LiveStatQuerySpec( + template=self._get_template(MetricType.RATE), + metric_name_filter=RATE_METRICS, + value_type_filter=ValueType.CURRENT, + ), ), + max=self._build_stats_preset(_MAX_STATS_BUCKET, kernel_id_regex, group_by), + avg=self._build_stats_preset(_AVG_STATS_BUCKET, kernel_id_regex, group_by), + ) + + def _build_stats_preset( + self, + bucket: _StatsBucket, + kernel_id_regex: str, + group_by: str, + ) -> MetricPreset: + return MetricPreset( + template=self._render_stats_query( + bucket, + kernel_id_regex=kernel_id_regex, + group_by=group_by, + ) ) - def _get_container_live_stat_query( + def _get_live_stat_query( self, kernel_ids: Sequence[KernelId], - *, - metric_type: MetricType, - metric_name_filter: frozenset[str] | None = None, - value_type_filter: ValueType | None = None, + spec: _LiveStatQuerySpec, ) -> MetricPreset: labels: dict[str, LabelMatcher] = { "kernel_id": LabelMatcher.regex(_regex_union([str(kid) for kid in kernel_ids])) } - if metric_name_filter is not None: + if spec.metric_name_filter is not None: labels["container_metric_name"] = LabelMatcher.regex( - _regex_union(sorted(metric_name_filter)) + _regex_union(sorted(spec.metric_name_filter)) ) - if value_type_filter is not None: - labels["value_type"] = LabelMatcher.exact(value_type_filter.value) + if spec.value_type_filter is not None: + labels["value_type"] = LabelMatcher.exact(spec.value_type_filter.value) return MetricPreset( - template=self._get_template(metric_type), - labels=labels, + template=spec.template, group_by=_LIVE_STAT_GROUP_BY, + labels=labels, window=self._timewindow, ) + def _render_stats_query( + self, + bucket: _StatsBucket, + *, + kernel_id_regex: str, + group_by: str, + ) -> str: + parts: list[str] = [] + stat_fn = f"{bucket.value_type.value}_over_time" + stat_label = bucket.value_type.to_live_stat_label() + if bucket.gauge_metrics or bucket.gauge_metric_patterns: + gauge_regex = _metric_name_regex(bucket.gauge_metrics, bucket.gauge_metric_patterns) + gauge_labels = self._live_stat_current_labels( + kernel_id_regex=kernel_id_regex, + metric_name_regex=gauge_regex, + ) + parts.append( + f"label_replace({stat_fn}((sum by ({group_by})(" + f"{CONTAINER_UTILIZATION_METRIC_NAME}{{{gauge_labels}}}))[{self._timewindow}:])," + f'"value_type","{stat_label}","value_type",".*")' + ) + if bucket.rate_metrics: + rate_regex = _regex_union(sorted(bucket.rate_metrics)) + rate_labels = self._live_stat_current_labels( + kernel_id_regex=kernel_id_regex, + metric_name_regex=rate_regex, + ) + parts.append( + f"label_replace({stat_fn}((sum by ({group_by})(rate(" + f"{CONTAINER_UTILIZATION_METRIC_NAME}{{{rate_labels}}}" + f"[{self._timewindow}])))[{self._timewindow}:])," + f'"value_type","{stat_label}","value_type",".*")' + ) + return " or ".join(parts) + + def _live_stat_current_labels( + self, + *, + kernel_id_regex: str, + metric_name_regex: str, + ) -> str: + return ( + f'kernel_id=~"{kernel_id_regex}"' + f',container_metric_name=~"{metric_name_regex}"' + f',value_type="{ValueType.CURRENT.value}"' + ) + def _get_template(self, metric_type: MetricType) -> str: match metric_type: case MetricType.GAUGE: diff --git a/src/ai/backend/common/clients/prometheus/metric_types.py b/src/ai/backend/common/clients/prometheus/metric_types.py index 6ce41499666..1e481aa9cb8 100644 --- a/src/ai/backend/common/clients/prometheus/metric_types.py +++ b/src/ai/backend/common/clients/prometheus/metric_types.py @@ -61,19 +61,36 @@ class MetricType(StrEnum): @dataclass(frozen=True) class ContainerLiveStatQueries: - """Gauge / diff / rate query preset bundle for container live stats.""" + """Gauge / diff / rate / max / avg query preset bundle for container live stats.""" gauge: MetricPreset diff: MetricPreset rate: MetricPreset + max: MetricPreset + avg: MetricPreset def to_list(self) -> list[MetricPreset]: - return [self.gauge, self.diff, self.rate] + return [self.gauge, self.diff, self.rate, self.max, self.avg] DIFF_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"}) RATE_METRICS: Final[frozenset[str]] = frozenset({"net_rx", "net_tx"}) +# Window stats: built-ins are exact, accelerator/plugin metrics use patterns. +STATS_MAX_GAUGE_METRICS: Final[frozenset[str]] = frozenset({ + "mem", + "io_scratch_size", +}) +STATS_MAX_GAUGE_METRIC_PATTERNS: Final[frozenset[str]] = frozenset({ + r"[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power)", +}) +STATS_AVG_GAUGE_METRICS: Final[frozenset[str]] = frozenset() +STATS_AVG_GAUGE_METRIC_PATTERNS: Final[frozenset[str]] = frozenset({ + r"[A-Za-z0-9][A-Za-z0-9_-]*_util", +}) +STATS_MAX_OVER_RATE_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"}) +STATS_AVG_OVER_RATE_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"}) + @dataclass class ContainerMetricResponseInfo: @@ -184,7 +201,7 @@ def from_prometheus_response(cls, response: PrometheusResponse) -> Self: container_metric_name = cast(str, info.container_metric_name) value_type_str = cast(str, info.value_type) try: - value_type = ValueType(value_type_str) + value_type = ValueType.from_legacy_live_stat_label(value_type_str) kernel_id = KernelId(UUID(kernel_id_str)) except ValueError: continue diff --git a/src/ai/backend/common/clients/prometheus/types.py b/src/ai/backend/common/clients/prometheus/types.py index ef16273ab98..003d5a00d5d 100644 --- a/src/ai/backend/common/clients/prometheus/types.py +++ b/src/ai/backend/common/clients/prometheus/types.py @@ -8,6 +8,22 @@ class ValueType(StrEnum): CURRENT = "current" CAPACITY = "capacity" PCT = "pct" + MAX = "max" + AVG = "avg" + RATE = "rate" + + @classmethod + def from_legacy_live_stat_label(cls, value: str) -> "ValueType": + if value.startswith("stats."): + return cls(value.removeprefix("stats.")) + return cls(value) + + def to_live_stat_label(self) -> str: + match self: + case ValueType.MAX | ValueType.AVG | ValueType.RATE: + return f"stats.{self.value}" + case _: + return self.value @dataclass(frozen=True) diff --git a/tests/unit/manager/services/utilization_metric/test_container_metric.py b/tests/unit/manager/services/utilization_metric/test_container_metric.py index d6d6898974e..9a782a8fc0b 100644 --- a/tests/unit/manager/services/utilization_metric/test_container_metric.py +++ b/tests/unit/manager/services/utilization_metric/test_container_metric.py @@ -14,6 +14,7 @@ from ai.backend.common.clients.prometheus.metric_types import ( ContainerMetricOptionalLabel, ContainerMetricResponseInfo, + KernelMetricValuesByKernel, MetricType, ValueType, ) @@ -30,6 +31,7 @@ InvalidAPIParameters, PrometheusConnectionError, ) +from ai.backend.common.types import KernelId from ai.backend.manager.repositories.metric.repository import MetricRepository from ai.backend.manager.services.metric.actions.container import ( ContainerMetricAction, @@ -804,6 +806,73 @@ async def test_build_query_renders_expected_promql(self, case: BuiltinQueryTestC assert rendered_query == case.expected_query +class TestLiveStatQueryProvider: + """Characterization tests for container live stat PromQL.""" + + def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None: + kernel_id = KernelId(UUID("12345678-1234-5678-1234-567812345678")) + fixed_query_builder = FixedQueryBuilder("5m") + + queries = fixed_query_builder.get_container_live_stat_queries([kernel_id]) + + assert queries.max.render() == ( + "label_replace(max_over_time((sum by (container_metric_name,kernel_id,value_type)(" + "backendai_container_utilization" + '{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",' + 'container_metric_name=~"io_scratch_size|mem|' + '[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power)",' + 'value_type="current"}))[5m:]),' + '"value_type","stats.max","value_type",".*")' + " or " + "label_replace(max_over_time((sum by (container_metric_name,kernel_id,value_type)(rate(" + "backendai_container_utilization" + '{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",' + 'container_metric_name=~"cpu_util",value_type="current"}' + "[5m])))[5m:])," + '"value_type","stats.max","value_type",".*")' + ) + assert queries.avg.render() == ( + "label_replace(avg_over_time((sum by (container_metric_name,kernel_id,value_type)(" + "backendai_container_utilization" + '{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",' + 'container_metric_name=~"[A-Za-z0-9][A-Za-z0-9_-]*_util",' + 'value_type="current"}))[5m:]),' + '"value_type","stats.avg","value_type",".*")' + " or " + "label_replace(avg_over_time((sum by (container_metric_name,kernel_id,value_type)(rate(" + "backendai_container_utilization" + '{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",' + 'container_metric_name=~"cpu_util",value_type="current"}' + "[5m])))[5m:])," + '"value_type","stats.avg","value_type",".*")' + ) + + +class TestKernelMetricValuesByKernel: + def test_from_prometheus_response_maps_legacy_stat_label_to_value_type(self) -> None: + kernel_id = KernelId(UUID("12345678-1234-5678-1234-567812345678")) + response = PrometheusResponse( + status="success", + data=PrometheusQueryData( + result_type="vector", + result=[ + MetricResponse( + metric=MetricResponseInfo( + kernel_id=str(kernel_id), + container_metric_name="mem", + value_type="stats.max", + ), + values=[(1704067200.0, "1024")], + ) + ], + ), + ) + + result = KernelMetricValuesByKernel.from_prometheus_response(response) + + assert result.values_by_kernel[kernel_id][0].value_type == ValueType.MAX + + class TestMetricResponseInfoParsing: """Unit tests for MetricResponseInfo parsing behavior.""" From 520fccafb1fd7861b494d6aef9d7610b0ee30a8f Mon Sep 17 00:00:00 2001 From: BoKeum Date: Sun, 10 May 2026 17:19:08 +0900 Subject: [PATCH 02/11] fix(BA-5878): unbreak live-stat regex and tighten log/enum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PromQL parser rejects \- as an unknown escape sequence inside a regex literal, so re.escape over-escaping broke every container live-stat query (kernel_id UUIDs always contain hyphens). Strip the backslash from \- after escaping so the rendered queries are RE2-acceptable. Also drop the unused ValueType.RATE — no producer ever emits it and no consumer matches on it; only MAX/AVG round-trip to the legacy stats.* labels. And include the underlying exception in the warning emitted from MetricRepository.query_container_live_stats so "empty results" no longer hide the real Prometheus failure mode. Co-Authored-By: Claude Opus 4.7 --- .../common/clients/prometheus/fixed_query_builder.py | 2 +- src/ai/backend/common/clients/prometheus/types.py | 3 +-- src/ai/backend/manager/repositories/metric/repository.py | 4 ++-- .../services/utilization_metric/test_container_metric.py | 8 ++++---- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py index 7496da4f07d..e3ea98f463c 100644 --- a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py +++ b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py @@ -71,7 +71,7 @@ class _StatsBucket: def _regex_union(values: Sequence[str]) -> str: - return "|".join(re.escape(value) for value in values) + return "|".join(re.escape(value).replace(r"\-", "-") for value in values) def _metric_name_regex( diff --git a/src/ai/backend/common/clients/prometheus/types.py b/src/ai/backend/common/clients/prometheus/types.py index 003d5a00d5d..bc2c6a7b2fd 100644 --- a/src/ai/backend/common/clients/prometheus/types.py +++ b/src/ai/backend/common/clients/prometheus/types.py @@ -10,7 +10,6 @@ class ValueType(StrEnum): PCT = "pct" MAX = "max" AVG = "avg" - RATE = "rate" @classmethod def from_legacy_live_stat_label(cls, value: str) -> "ValueType": @@ -20,7 +19,7 @@ def from_legacy_live_stat_label(cls, value: str) -> "ValueType": def to_live_stat_label(self) -> str: match self: - case ValueType.MAX | ValueType.AVG | ValueType.RATE: + case ValueType.MAX | ValueType.AVG: return f"stats.{self.value}" case _: return self.value diff --git a/src/ai/backend/manager/repositories/metric/repository.py b/src/ai/backend/manager/repositories/metric/repository.py index e7278927b37..ccbda445cfa 100644 --- a/src/ai/backend/manager/repositories/metric/repository.py +++ b/src/ai/backend/manager/repositories/metric/repository.py @@ -70,7 +70,7 @@ async def query_container_live_stats( return KernelLiveStatBatchResult.empty(kernel_ids) try: values_by_kernel = await self._prometheus_client.fetch_container_live_stats(kernel_ids) - except (PrometheusConnectionError, FailedToGetMetric): - log.warning("Failed to query metrics for kernel live stats, returning empty results") + except (PrometheusConnectionError, FailedToGetMetric) as e: + log.warning("Failed to query metrics for kernel live stats: {!r}", e) return KernelLiveStatBatchResult.empty(kernel_ids) return KernelLiveStatBatchResult.from_metric_values(kernel_ids, values_by_kernel) diff --git a/tests/unit/manager/services/utilization_metric/test_container_metric.py b/tests/unit/manager/services/utilization_metric/test_container_metric.py index 9a782a8fc0b..d09c3f0cbf1 100644 --- a/tests/unit/manager/services/utilization_metric/test_container_metric.py +++ b/tests/unit/manager/services/utilization_metric/test_container_metric.py @@ -818,7 +818,7 @@ def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None assert queries.max.render() == ( "label_replace(max_over_time((sum by (container_metric_name,kernel_id,value_type)(" "backendai_container_utilization" - '{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",' + '{kernel_id=~"12345678-1234-5678-1234-567812345678",' 'container_metric_name=~"io_scratch_size|mem|' '[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power)",' 'value_type="current"}))[5m:]),' @@ -826,7 +826,7 @@ def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None " or " "label_replace(max_over_time((sum by (container_metric_name,kernel_id,value_type)(rate(" "backendai_container_utilization" - '{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",' + '{kernel_id=~"12345678-1234-5678-1234-567812345678",' 'container_metric_name=~"cpu_util",value_type="current"}' "[5m])))[5m:])," '"value_type","stats.max","value_type",".*")' @@ -834,14 +834,14 @@ def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None assert queries.avg.render() == ( "label_replace(avg_over_time((sum by (container_metric_name,kernel_id,value_type)(" "backendai_container_utilization" - '{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",' + '{kernel_id=~"12345678-1234-5678-1234-567812345678",' 'container_metric_name=~"[A-Za-z0-9][A-Za-z0-9_-]*_util",' 'value_type="current"}))[5m:]),' '"value_type","stats.avg","value_type",".*")' " or " "label_replace(avg_over_time((sum by (container_metric_name,kernel_id,value_type)(rate(" "backendai_container_utilization" - '{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",' + '{kernel_id=~"12345678-1234-5678-1234-567812345678",' 'container_metric_name=~"cpu_util",value_type="current"}' "[5m])))[5m:])," '"value_type","stats.avg","value_type",".*")' From d7377b58f45f39c52b75ccda0bf921956aab5d78 Mon Sep 17 00:00:00 2001 From: BoKeum Date: Sun, 10 May 2026 17:34:38 +0900 Subject: [PATCH 03/11] refactor(BA-5878): factor stats subquery skeleton out of _render_stats_query MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gauge and rate branches both produced the same label_replace + *_over_time + sum_by skeleton, differing only in whether the inner selector was wrapped with rate(...[window]). The two branches embedded that skeleton as inline f-strings, which hid the symmetry and was hard to read. Extract two methods on FixedQueryBuilder: - _utilization_selector() — builds {METRIC}{labels}, hiding the brace-doubling away from rendering callers. - _window_stat_subquery() — wraps a selector in label_replace({stat_fn}((sum by ({group_by})({selector}))[window:]), "value_type","{label}","value_type",".*"). This is the single place where the stats-subquery shape lives. _render_stats_query now reads as: pick a regex, build a selector (optionally wrapped in rate()), and feed it through the same skeleton. Rendered output is byte-for-byte unchanged (characterization tests pass without touching the fixture). Co-Authored-By: Claude Opus 4.7 --- .../clients/prometheus/fixed_query_builder.py | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py index e3ea98f463c..0cf415687ff 100644 --- a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py +++ b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py @@ -215,34 +215,40 @@ def _render_stats_query( kernel_id_regex: str, group_by: str, ) -> str: - parts: list[str] = [] stat_fn = f"{bucket.value_type.value}_over_time" stat_label = bucket.value_type.to_live_stat_label() + parts: list[str] = [] if bucket.gauge_metrics or bucket.gauge_metric_patterns: gauge_regex = _metric_name_regex(bucket.gauge_metrics, bucket.gauge_metric_patterns) - gauge_labels = self._live_stat_current_labels( - kernel_id_regex=kernel_id_regex, - metric_name_regex=gauge_regex, - ) - parts.append( - f"label_replace({stat_fn}((sum by ({group_by})(" - f"{CONTAINER_UTILIZATION_METRIC_NAME}{{{gauge_labels}}}))[{self._timewindow}:])," - f'"value_type","{stat_label}","value_type",".*")' - ) + selector = self._utilization_selector(kernel_id_regex, gauge_regex) + parts.append(self._window_stat_subquery(stat_fn, selector, group_by, stat_label)) if bucket.rate_metrics: rate_regex = _regex_union(sorted(bucket.rate_metrics)) - rate_labels = self._live_stat_current_labels( - kernel_id_regex=kernel_id_regex, - metric_name_regex=rate_regex, - ) - parts.append( - f"label_replace({stat_fn}((sum by ({group_by})(rate(" - f"{CONTAINER_UTILIZATION_METRIC_NAME}{{{rate_labels}}}" - f"[{self._timewindow}])))[{self._timewindow}:])," - f'"value_type","{stat_label}","value_type",".*")' - ) + base = self._utilization_selector(kernel_id_regex, rate_regex) + selector = f"rate({base}[{self._timewindow}])" + parts.append(self._window_stat_subquery(stat_fn, selector, group_by, stat_label)) return " or ".join(parts) + def _utilization_selector(self, kernel_id_regex: str, metric_name_regex: str) -> str: + labels = self._live_stat_current_labels( + kernel_id_regex=kernel_id_regex, + metric_name_regex=metric_name_regex, + ) + return f"{CONTAINER_UTILIZATION_METRIC_NAME}{{{labels}}}" + + def _window_stat_subquery( + self, + stat_fn: str, + selector: str, + group_by: str, + stat_label: str, + ) -> str: + return ( + f"label_replace(" + f"{stat_fn}((sum by ({group_by})({selector}))[{self._timewindow}:])," + f'"value_type","{stat_label}","value_type",".*")' + ) + def _live_stat_current_labels( self, *, From 6a31eb6179dc2e6b80c1f234bc28cd1718bf3aaa Mon Sep 17 00:00:00 2001 From: BoKeum Date: Sun, 10 May 2026 17:53:17 +0900 Subject: [PATCH 04/11] refactor(BA-5878): unify live-stat preset call shape The five fields of ContainerLiveStatQueries were built through three asymmetric idioms: gauge/diff/rate constructed _LiveStatQuerySpec on the fly inside the call, max/avg used pre-built module constants but asked the caller to compute kernel_id_regex/group_by and pass them positionally. Promote the three filtered-query specs to module-level Final constants (_GAUGE_LIVE_STAT_SPEC / _DIFF_LIVE_STAT_SPEC / _RATE_LIVE_STAT_SPEC) so they sit alongside _MAX_STATS_BUCKET / _AVG_STATS_BUCKET, and rename the two builders so all five fields read as self._build_*_preset(kernel_ids, _CONSTANT). The kernel_id_regex / group_by computation moves inside _build_window_stats_preset where it is actually consumed, instead of leaking to the caller. Co-Authored-By: Claude Opus 4.7 --- .../clients/prometheus/fixed_query_builder.py | 52 +++++++++---------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py index 0cf415687ff..02317912c4a 100644 --- a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py +++ b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py @@ -82,6 +82,20 @@ def _metric_name_regex( return "|".join([*exact_parts, *sorted(metric_patterns)]) +_GAUGE_LIVE_STAT_SPEC: Final[_LiveStatQuerySpec] = _LiveStatQuerySpec( + template=_GAUGE_TEMPLATE, +) +_DIFF_LIVE_STAT_SPEC: Final[_LiveStatQuerySpec] = _LiveStatQuerySpec( + template=_DIFF_TEMPLATE, + metric_name_filter=DIFF_METRICS, + value_type_filter=ValueType.CURRENT, +) +_RATE_LIVE_STAT_SPEC: Final[_LiveStatQuerySpec] = _LiveStatQuerySpec( + template=_RATE_TEMPLATE, + metric_name_filter=RATE_METRICS, + value_type_filter=ValueType.CURRENT, +) + _MAX_STATS_BUCKET: Final[_StatsBucket] = _StatsBucket( value_type=ValueType.MAX, gauge_metrics=STATS_MAX_GAUGE_METRICS, @@ -145,39 +159,21 @@ def get_container_live_stat_queries( self, kernel_ids: Sequence[KernelId], ) -> ContainerLiveStatQueries: - kernel_id_regex = _regex_union([str(kid) for kid in kernel_ids]) - group_by = ",".join(sorted(_LIVE_STAT_GROUP_BY)) return ContainerLiveStatQueries( - gauge=self._get_live_stat_query( - kernel_ids, - _LiveStatQuerySpec(template=self._get_template(MetricType.GAUGE)), - ), - diff=self._get_live_stat_query( - kernel_ids, - _LiveStatQuerySpec( - template=self._get_template(MetricType.DIFF), - metric_name_filter=DIFF_METRICS, - value_type_filter=ValueType.CURRENT, - ), - ), - rate=self._get_live_stat_query( - kernel_ids, - _LiveStatQuerySpec( - template=self._get_template(MetricType.RATE), - metric_name_filter=RATE_METRICS, - value_type_filter=ValueType.CURRENT, - ), - ), - max=self._build_stats_preset(_MAX_STATS_BUCKET, kernel_id_regex, group_by), - avg=self._build_stats_preset(_AVG_STATS_BUCKET, kernel_id_regex, group_by), + gauge=self._build_filtered_preset(kernel_ids, _GAUGE_LIVE_STAT_SPEC), + diff=self._build_filtered_preset(kernel_ids, _DIFF_LIVE_STAT_SPEC), + rate=self._build_filtered_preset(kernel_ids, _RATE_LIVE_STAT_SPEC), + max=self._build_window_stats_preset(kernel_ids, _MAX_STATS_BUCKET), + avg=self._build_window_stats_preset(kernel_ids, _AVG_STATS_BUCKET), ) - def _build_stats_preset( + def _build_window_stats_preset( self, + kernel_ids: Sequence[KernelId], bucket: _StatsBucket, - kernel_id_regex: str, - group_by: str, ) -> MetricPreset: + kernel_id_regex = _regex_union([str(kid) for kid in kernel_ids]) + group_by = ",".join(sorted(_LIVE_STAT_GROUP_BY)) return MetricPreset( template=self._render_stats_query( bucket, @@ -186,7 +182,7 @@ def _build_stats_preset( ) ) - def _get_live_stat_query( + def _build_filtered_preset( self, kernel_ids: Sequence[KernelId], spec: _LiveStatQuerySpec, From c45735f549690ff77d928b07cbd3fae7ce11b6f5 Mon Sep 17 00:00:00 2001 From: BoKeum Date: Sun, 10 May 2026 17:56:15 +0900 Subject: [PATCH 05/11] chore: drop unreachable default arm from _get_template match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The match over MetricType already covers all three variants, so the case _ -> raise UnreachableError(...) arm was dead code that Pylance flagged as unreachable. mypy's exhaustiveness check now enforces the same invariant statically — adding a new MetricType variant without a case will fail type check, which is the same protection the runtime raise gave but caught earlier. Co-Authored-By: Claude Opus 4.7 --- .../backend/common/clients/prometheus/fixed_query_builder.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py index 02317912c4a..365d5ad0afa 100644 --- a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py +++ b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py @@ -19,7 +19,6 @@ from ai.backend.common.clients.prometheus.preset import LabelMatcher, MetricPreset from ai.backend.common.clients.prometheus.querier import ContainerMetricQuerier from ai.backend.common.clients.prometheus.types import ValueType -from ai.backend.common.exception import UnreachableError from ai.backend.common.metrics.types import ( CONTAINER_UTILIZATION_METRIC_LABEL_NAME, CONTAINER_UTILIZATION_METRIC_NAME, @@ -265,5 +264,3 @@ def _get_template(self, metric_type: MetricType) -> str: return _RATE_TEMPLATE case MetricType.DIFF: return _DIFF_TEMPLATE - case _: - raise UnreachableError(f"Unknown metric type: {metric_type}") From 1f50031a057c61f26ad87b58a834c14147982e93 Mon Sep 17 00:00:00 2001 From: BoKeum Date: Sun, 10 May 2026 18:00:59 +0900 Subject: [PATCH 06/11] refactor(BA-5878): clarify legacy-direction in ValueType label helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two helpers were named "to_live_stat_label" / "from_legacy_live_stat_label", which obscured their actual asymmetry: only the encoder produces the legacy "stats." form, while the decoder accepts both legacy and current shapes. Rename them to: - ValueType.to_legacy_live_stat_label — encoder, legacy emission - ValueType.from_live_stat_label — decoder, accepts either form and document the "stats." prefix as the legacy convention so the removeprefix branch reads as historical compatibility, not generic parsing. Co-Authored-By: Claude Opus 4.7 --- .../common/clients/prometheus/fixed_query_builder.py | 2 +- src/ai/backend/common/clients/prometheus/metric_types.py | 2 +- src/ai/backend/common/clients/prometheus/types.py | 8 +++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py index 365d5ad0afa..b736ff224bd 100644 --- a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py +++ b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py @@ -211,7 +211,7 @@ def _render_stats_query( group_by: str, ) -> str: stat_fn = f"{bucket.value_type.value}_over_time" - stat_label = bucket.value_type.to_live_stat_label() + stat_label = bucket.value_type.to_legacy_live_stat_label() parts: list[str] = [] if bucket.gauge_metrics or bucket.gauge_metric_patterns: gauge_regex = _metric_name_regex(bucket.gauge_metrics, bucket.gauge_metric_patterns) diff --git a/src/ai/backend/common/clients/prometheus/metric_types.py b/src/ai/backend/common/clients/prometheus/metric_types.py index 1e481aa9cb8..b90818a835d 100644 --- a/src/ai/backend/common/clients/prometheus/metric_types.py +++ b/src/ai/backend/common/clients/prometheus/metric_types.py @@ -201,7 +201,7 @@ def from_prometheus_response(cls, response: PrometheusResponse) -> Self: container_metric_name = cast(str, info.container_metric_name) value_type_str = cast(str, info.value_type) try: - value_type = ValueType.from_legacy_live_stat_label(value_type_str) + value_type = ValueType.from_live_stat_label(value_type_str) kernel_id = KernelId(UUID(kernel_id_str)) except ValueError: continue diff --git a/src/ai/backend/common/clients/prometheus/types.py b/src/ai/backend/common/clients/prometheus/types.py index bc2c6a7b2fd..4f0e424e04c 100644 --- a/src/ai/backend/common/clients/prometheus/types.py +++ b/src/ai/backend/common/clients/prometheus/types.py @@ -12,12 +12,14 @@ class ValueType(StrEnum): AVG = "avg" @classmethod - def from_legacy_live_stat_label(cls, value: str) -> "ValueType": - if value.startswith("stats."): + def from_live_stat_label(cls, value: str) -> "ValueType": + if value.startswith( + "stats." + ): # Legacy live_stat labels were prefixed with "stats." (e.g. "stats.max", "stats.avg") return cls(value.removeprefix("stats.")) return cls(value) - def to_live_stat_label(self) -> str: + def to_legacy_live_stat_label(self) -> str: match self: case ValueType.MAX | ValueType.AVG: return f"stats.{self.value}" From e781b81932649dc5fdaeaf065726b8132d78e27c Mon Sep 17 00:00:00 2001 From: BoKeum Date: Sun, 10 May 2026 18:34:58 +0900 Subject: [PATCH 07/11] fix(BA-5878): close accelerator pattern gap for *_power and *_temperature The window-stat gauge patterns lagged behind what legacy accelerator plugins actually publish: - *_power: legacy emits both stats.max and stats.avg; this PR was emitting only stats.max. - *_temperature: legacy emits both stats.max and stats.avg; this PR was emitting neither. Surveyed plugins (rebellions/common, rebellions/atom_max, habana, ipu, mock): - All emit *_mem (max only) and *_util (max + avg). - Only mock currently emits *_power and *_temperature, both with {avg, max} filters. Extend STATS_MAX_GAUGE_METRIC_PATTERNS to include _temperature and STATS_AVG_GAUGE_METRIC_PATTERNS to include _power and _temperature so the new pipeline matches what every legacy plugin actually publishes. Co-Authored-By: Claude Opus 4.7 --- src/ai/backend/common/clients/prometheus/metric_types.py | 4 ++-- .../services/utilization_metric/test_container_metric.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ai/backend/common/clients/prometheus/metric_types.py b/src/ai/backend/common/clients/prometheus/metric_types.py index b90818a835d..b8d6c344c5b 100644 --- a/src/ai/backend/common/clients/prometheus/metric_types.py +++ b/src/ai/backend/common/clients/prometheus/metric_types.py @@ -82,11 +82,11 @@ def to_list(self) -> list[MetricPreset]: "io_scratch_size", }) STATS_MAX_GAUGE_METRIC_PATTERNS: Final[frozenset[str]] = frozenset({ - r"[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power)", + r"[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power|temperature)", }) STATS_AVG_GAUGE_METRICS: Final[frozenset[str]] = frozenset() STATS_AVG_GAUGE_METRIC_PATTERNS: Final[frozenset[str]] = frozenset({ - r"[A-Za-z0-9][A-Za-z0-9_-]*_util", + r"[A-Za-z0-9][A-Za-z0-9_-]*_(util|power|temperature)", }) STATS_MAX_OVER_RATE_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"}) STATS_AVG_OVER_RATE_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"}) diff --git a/tests/unit/manager/services/utilization_metric/test_container_metric.py b/tests/unit/manager/services/utilization_metric/test_container_metric.py index d09c3f0cbf1..12d19e73f40 100644 --- a/tests/unit/manager/services/utilization_metric/test_container_metric.py +++ b/tests/unit/manager/services/utilization_metric/test_container_metric.py @@ -820,7 +820,7 @@ def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None "backendai_container_utilization" '{kernel_id=~"12345678-1234-5678-1234-567812345678",' 'container_metric_name=~"io_scratch_size|mem|' - '[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power)",' + '[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power|temperature)",' 'value_type="current"}))[5m:]),' '"value_type","stats.max","value_type",".*")' " or " @@ -835,7 +835,7 @@ def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None "label_replace(avg_over_time((sum by (container_metric_name,kernel_id,value_type)(" "backendai_container_utilization" '{kernel_id=~"12345678-1234-5678-1234-567812345678",' - 'container_metric_name=~"[A-Za-z0-9][A-Za-z0-9_-]*_util",' + 'container_metric_name=~"[A-Za-z0-9][A-Za-z0-9_-]*_(util|power|temperature)",' 'value_type="current"}))[5m:]),' '"value_type","stats.avg","value_type",".*")' " or " From 58c123e71aacf30022d8583b11a7b185085b3e40 Mon Sep 17 00:00:00 2001 From: BoKeum Date: Sun, 10 May 2026 18:53:00 +0900 Subject: [PATCH 08/11] feat(BA-5878): emit stats.rate window queries for io_read/io_write/net_rx/net_tx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Legacy live_stat consumers — most importantly the legacy_compute_session.net_rx_bytes / net_tx_bytes / io_read_bytes / io_write_bytes GraphQL resolvers — read the "stats.rate" key from the per-metric live_stat dict. The new PromQL pipeline previously emitted nothing under that label, leaving those four legacy fields uncovered: - io_read / io_write: legacy agent's stats_filter={"rate"}, which the new pipeline did not produce. - net_rx / net_tx: legacy agent's stats_filter is empty, so the agent publishes no stats.rate at all even though the legacy resolver expects it — making the resolver always return 0. The new pipeline now produces a value where legacy never did. Two metric shapes flow through the new bucket: - Gauge-shape (net_rx, net_tx): the metric's `current` value is already a per-second rate (set by agent's current_hook = lambda m: m.stats.rate), so PromQL only needs to sum across replicas and label_replace to "stats.rate". - Counter-shape (io_read, io_write): the value is a cumulative byte counter, so PromQL applies rate(...[window]) before label_replace. Live verified against Prometheus on a running kernel: {net_rx, stats.rate} = 27530 {net_tx, stats.rate} = 30378 {io_read, stats.rate} = 0 {io_write, stats.rate} = 0 Re-introduces ValueType.RATE (and its to_legacy_live_stat_label / from_live_stat_label round-trip) that was removed earlier in the branch when no producer existed; the round-trip is now load-bearing again. Co-Authored-By: Claude Opus 4.7 --- .../clients/prometheus/fixed_query_builder.py | 28 +++++++++++++++++++ .../common/clients/prometheus/metric_types.py | 15 ++++++++-- .../common/clients/prometheus/types.py | 3 +- .../test_container_metric.py | 23 +++++++++++++++ 4 files changed, 66 insertions(+), 3 deletions(-) diff --git a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py index b736ff224bd..6fe7d15b6d0 100644 --- a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py +++ b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py @@ -12,6 +12,8 @@ STATS_MAX_GAUGE_METRIC_PATTERNS, STATS_MAX_GAUGE_METRICS, STATS_MAX_OVER_RATE_METRICS, + STATS_RATE_COUNTER_METRICS, + STATS_RATE_GAUGE_METRICS, ContainerLiveStatQueries, ContainerMetricOptionalLabel, MetricType, @@ -164,6 +166,32 @@ def get_container_live_stat_queries( rate=self._build_filtered_preset(kernel_ids, _RATE_LIVE_STAT_SPEC), max=self._build_window_stats_preset(kernel_ids, _MAX_STATS_BUCKET), avg=self._build_window_stats_preset(kernel_ids, _AVG_STATS_BUCKET), + rate_stats=self._build_rate_stats_preset(kernel_ids), + ) + + def _build_rate_stats_preset( + self, + kernel_ids: Sequence[KernelId], + ) -> MetricPreset: + kernel_id_regex = _regex_union([str(kid) for kid in kernel_ids]) + group_by = ",".join(sorted(_LIVE_STAT_GROUP_BY)) + stat_label = ValueType.RATE.to_legacy_live_stat_label() + parts: list[str] = [] + if STATS_RATE_GAUGE_METRICS: + gauge_regex = _regex_union(sorted(STATS_RATE_GAUGE_METRICS)) + selector = self._utilization_selector(kernel_id_regex, gauge_regex) + parts.append(self._labelled_sum(selector, group_by, stat_label)) + if STATS_RATE_COUNTER_METRICS: + counter_regex = _regex_union(sorted(STATS_RATE_COUNTER_METRICS)) + base = self._utilization_selector(kernel_id_regex, counter_regex) + selector = f"rate({base}[{self._timewindow}])" + parts.append(self._labelled_sum(selector, group_by, stat_label)) + return MetricPreset(template=" or ".join(parts)) + + def _labelled_sum(self, selector: str, group_by: str, stat_label: str) -> str: + return ( + f"label_replace(sum by ({group_by})({selector})," + f'"value_type","{stat_label}","value_type",".*")' ) def _build_window_stats_preset( diff --git a/src/ai/backend/common/clients/prometheus/metric_types.py b/src/ai/backend/common/clients/prometheus/metric_types.py index b8d6c344c5b..766301af2a2 100644 --- a/src/ai/backend/common/clients/prometheus/metric_types.py +++ b/src/ai/backend/common/clients/prometheus/metric_types.py @@ -61,16 +61,17 @@ class MetricType(StrEnum): @dataclass(frozen=True) class ContainerLiveStatQueries: - """Gauge / diff / rate / max / avg query preset bundle for container live stats.""" + """Gauge / diff / rate / max / avg / rate_stats query preset bundle for container live stats.""" gauge: MetricPreset diff: MetricPreset rate: MetricPreset max: MetricPreset avg: MetricPreset + rate_stats: MetricPreset def to_list(self) -> list[MetricPreset]: - return [self.gauge, self.diff, self.rate, self.max, self.avg] + return [self.gauge, self.diff, self.rate, self.max, self.avg, self.rate_stats] DIFF_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"}) @@ -91,6 +92,16 @@ def to_list(self) -> list[MetricPreset]: STATS_MAX_OVER_RATE_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"}) STATS_AVG_OVER_RATE_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"}) +# stats.rate emission targets the legacy stats.rate live_stat label. +# Two metric shapes flow in: +# * "gauge" set: agent's current_hook already publishes per-second rate as +# the metric's `current` value, so we only need to sum across replicas +# and relabel to stats.rate (no PromQL rate() wrap). +# * "counter" set: the published series is a cumulative byte counter, so +# we apply rate(...[window]) to get bytes/sec before relabel. +STATS_RATE_GAUGE_METRICS: Final[frozenset[str]] = frozenset({"net_rx", "net_tx"}) +STATS_RATE_COUNTER_METRICS: Final[frozenset[str]] = frozenset({"io_read", "io_write"}) + @dataclass class ContainerMetricResponseInfo: diff --git a/src/ai/backend/common/clients/prometheus/types.py b/src/ai/backend/common/clients/prometheus/types.py index 4f0e424e04c..ef0129f9d31 100644 --- a/src/ai/backend/common/clients/prometheus/types.py +++ b/src/ai/backend/common/clients/prometheus/types.py @@ -10,6 +10,7 @@ class ValueType(StrEnum): PCT = "pct" MAX = "max" AVG = "avg" + RATE = "rate" @classmethod def from_live_stat_label(cls, value: str) -> "ValueType": @@ -21,7 +22,7 @@ def from_live_stat_label(cls, value: str) -> "ValueType": def to_legacy_live_stat_label(self) -> str: match self: - case ValueType.MAX | ValueType.AVG: + case ValueType.MAX | ValueType.AVG | ValueType.RATE: return f"stats.{self.value}" case _: return self.value diff --git a/tests/unit/manager/services/utilization_metric/test_container_metric.py b/tests/unit/manager/services/utilization_metric/test_container_metric.py index 12d19e73f40..f65a5ddfd72 100644 --- a/tests/unit/manager/services/utilization_metric/test_container_metric.py +++ b/tests/unit/manager/services/utilization_metric/test_container_metric.py @@ -847,6 +847,29 @@ def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None '"value_type","stats.avg","value_type",".*")' ) + def test_rate_stats_query_renders_legacy_stats_rate_label(self) -> None: + kernel_id = KernelId(UUID("12345678-1234-5678-1234-567812345678")) + fixed_query_builder = FixedQueryBuilder("5m") + + queries = fixed_query_builder.get_container_live_stat_queries([kernel_id]) + + assert queries.rate_stats.render() == ( + "label_replace(sum by (container_metric_name,kernel_id,value_type)(" + "backendai_container_utilization" + '{kernel_id=~"12345678-1234-5678-1234-567812345678",' + 'container_metric_name=~"net_rx|net_tx",' + 'value_type="current"}),' + '"value_type","stats.rate","value_type",".*")' + " or " + "label_replace(sum by (container_metric_name,kernel_id,value_type)(rate(" + "backendai_container_utilization" + '{kernel_id=~"12345678-1234-5678-1234-567812345678",' + 'container_metric_name=~"io_read|io_write",' + 'value_type="current"}' + "[5m]))," + '"value_type","stats.rate","value_type",".*")' + ) + class TestKernelMetricValuesByKernel: def test_from_prometheus_response_maps_legacy_stat_label_to_value_type(self) -> None: From 56d6638f39b442dceb0a674d0221ebf9d30ae3fa Mon Sep 17 00:00:00 2001 From: BoKeum Date: Sun, 10 May 2026 19:07:21 +0900 Subject: [PATCH 09/11] refactor(BA-5878): drop "stats." prefix from query-time value_type labels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Earlier the new pipeline emitted "stats.max" / "stats.avg" / "stats.rate" as the rewritten value_type label, mirroring the legacy live_stat dict keys. That coupling was unnecessary: the labels are synthesized at query time via label_replace and never persist in the Prometheus TSDB (only "current" / "capacity" series actually get scraped). The legacy "stats." prefix is a Valkey live_stat artifact that future legacy- compat consumers should re-attach in a dedicated converter, not something the manager pipeline needs to carry. The new pipeline now emits clean, ValueType-enum-aligned labels: stats.max -> max stats.avg -> avg stats.rate -> rate Round-trip helpers go away: - ValueType.to_legacy_live_stat_label() removed (was a 1-arm match that always returned "stats.{value}" for the three stat slots). - ValueType.from_live_stat_label() removed; KernelMetricValuesByKernel parses the response label via ValueType(value_type_str) directly. Drop trailing .value on f-string interpolation of ValueType members — StrEnum's __format__ already returns the string value, so f"{ValueType.MAX}" == "max" without the explicit attribute access. Test fixtures and the from_prometheus_response characterisation test now use the bare labels. Co-Authored-By: Claude Opus 4.7 --- .../clients/prometheus/fixed_query_builder.py | 18 ++++++++---------- .../common/clients/prometheus/metric_types.py | 2 +- .../backend/common/clients/prometheus/types.py | 15 --------------- .../test_container_metric.py | 16 ++++++++-------- 4 files changed, 17 insertions(+), 34 deletions(-) diff --git a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py index 6fe7d15b6d0..504720b2516 100644 --- a/src/ai/backend/common/clients/prometheus/fixed_query_builder.py +++ b/src/ai/backend/common/clients/prometheus/fixed_query_builder.py @@ -175,20 +175,19 @@ def _build_rate_stats_preset( ) -> MetricPreset: kernel_id_regex = _regex_union([str(kid) for kid in kernel_ids]) group_by = ",".join(sorted(_LIVE_STAT_GROUP_BY)) - stat_label = ValueType.RATE.to_legacy_live_stat_label() parts: list[str] = [] if STATS_RATE_GAUGE_METRICS: gauge_regex = _regex_union(sorted(STATS_RATE_GAUGE_METRICS)) selector = self._utilization_selector(kernel_id_regex, gauge_regex) - parts.append(self._labelled_sum(selector, group_by, stat_label)) + parts.append(self._labelled_sum(selector, group_by, ValueType.RATE)) if STATS_RATE_COUNTER_METRICS: counter_regex = _regex_union(sorted(STATS_RATE_COUNTER_METRICS)) base = self._utilization_selector(kernel_id_regex, counter_regex) selector = f"rate({base}[{self._timewindow}])" - parts.append(self._labelled_sum(selector, group_by, stat_label)) + parts.append(self._labelled_sum(selector, group_by, ValueType.RATE)) return MetricPreset(template=" or ".join(parts)) - def _labelled_sum(self, selector: str, group_by: str, stat_label: str) -> str: + def _labelled_sum(self, selector: str, group_by: str, stat_label: ValueType) -> str: return ( f"label_replace(sum by ({group_by})({selector})," f'"value_type","{stat_label}","value_type",".*")' @@ -238,18 +237,17 @@ def _render_stats_query( kernel_id_regex: str, group_by: str, ) -> str: - stat_fn = f"{bucket.value_type.value}_over_time" - stat_label = bucket.value_type.to_legacy_live_stat_label() + stat_fn = f"{bucket.value_type}_over_time" parts: list[str] = [] if bucket.gauge_metrics or bucket.gauge_metric_patterns: gauge_regex = _metric_name_regex(bucket.gauge_metrics, bucket.gauge_metric_patterns) selector = self._utilization_selector(kernel_id_regex, gauge_regex) - parts.append(self._window_stat_subquery(stat_fn, selector, group_by, stat_label)) + parts.append(self._window_stat_subquery(stat_fn, selector, group_by, bucket.value_type)) if bucket.rate_metrics: rate_regex = _regex_union(sorted(bucket.rate_metrics)) base = self._utilization_selector(kernel_id_regex, rate_regex) selector = f"rate({base}[{self._timewindow}])" - parts.append(self._window_stat_subquery(stat_fn, selector, group_by, stat_label)) + parts.append(self._window_stat_subquery(stat_fn, selector, group_by, bucket.value_type)) return " or ".join(parts) def _utilization_selector(self, kernel_id_regex: str, metric_name_regex: str) -> str: @@ -264,7 +262,7 @@ def _window_stat_subquery( stat_fn: str, selector: str, group_by: str, - stat_label: str, + stat_label: ValueType, ) -> str: return ( f"label_replace(" @@ -281,7 +279,7 @@ def _live_stat_current_labels( return ( f'kernel_id=~"{kernel_id_regex}"' f',container_metric_name=~"{metric_name_regex}"' - f',value_type="{ValueType.CURRENT.value}"' + f',value_type="{ValueType.CURRENT}"' ) def _get_template(self, metric_type: MetricType) -> str: diff --git a/src/ai/backend/common/clients/prometheus/metric_types.py b/src/ai/backend/common/clients/prometheus/metric_types.py index 766301af2a2..fe07a6783c7 100644 --- a/src/ai/backend/common/clients/prometheus/metric_types.py +++ b/src/ai/backend/common/clients/prometheus/metric_types.py @@ -212,7 +212,7 @@ def from_prometheus_response(cls, response: PrometheusResponse) -> Self: container_metric_name = cast(str, info.container_metric_name) value_type_str = cast(str, info.value_type) try: - value_type = ValueType.from_live_stat_label(value_type_str) + value_type = ValueType(value_type_str) kernel_id = KernelId(UUID(kernel_id_str)) except ValueError: continue diff --git a/src/ai/backend/common/clients/prometheus/types.py b/src/ai/backend/common/clients/prometheus/types.py index ef0129f9d31..7219c7f58cd 100644 --- a/src/ai/backend/common/clients/prometheus/types.py +++ b/src/ai/backend/common/clients/prometheus/types.py @@ -12,21 +12,6 @@ class ValueType(StrEnum): AVG = "avg" RATE = "rate" - @classmethod - def from_live_stat_label(cls, value: str) -> "ValueType": - if value.startswith( - "stats." - ): # Legacy live_stat labels were prefixed with "stats." (e.g. "stats.max", "stats.avg") - return cls(value.removeprefix("stats.")) - return cls(value) - - def to_legacy_live_stat_label(self) -> str: - match self: - case ValueType.MAX | ValueType.AVG | ValueType.RATE: - return f"stats.{self.value}" - case _: - return self.value - @dataclass(frozen=True) class MetricValue: diff --git a/tests/unit/manager/services/utilization_metric/test_container_metric.py b/tests/unit/manager/services/utilization_metric/test_container_metric.py index f65a5ddfd72..70e0b98a007 100644 --- a/tests/unit/manager/services/utilization_metric/test_container_metric.py +++ b/tests/unit/manager/services/utilization_metric/test_container_metric.py @@ -822,14 +822,14 @@ def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None 'container_metric_name=~"io_scratch_size|mem|' '[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power|temperature)",' 'value_type="current"}))[5m:]),' - '"value_type","stats.max","value_type",".*")' + '"value_type","max","value_type",".*")' " or " "label_replace(max_over_time((sum by (container_metric_name,kernel_id,value_type)(rate(" "backendai_container_utilization" '{kernel_id=~"12345678-1234-5678-1234-567812345678",' 'container_metric_name=~"cpu_util",value_type="current"}' "[5m])))[5m:])," - '"value_type","stats.max","value_type",".*")' + '"value_type","max","value_type",".*")' ) assert queries.avg.render() == ( "label_replace(avg_over_time((sum by (container_metric_name,kernel_id,value_type)(" @@ -837,14 +837,14 @@ def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None '{kernel_id=~"12345678-1234-5678-1234-567812345678",' 'container_metric_name=~"[A-Za-z0-9][A-Za-z0-9_-]*_(util|power|temperature)",' 'value_type="current"}))[5m:]),' - '"value_type","stats.avg","value_type",".*")' + '"value_type","avg","value_type",".*")' " or " "label_replace(avg_over_time((sum by (container_metric_name,kernel_id,value_type)(rate(" "backendai_container_utilization" '{kernel_id=~"12345678-1234-5678-1234-567812345678",' 'container_metric_name=~"cpu_util",value_type="current"}' "[5m])))[5m:])," - '"value_type","stats.avg","value_type",".*")' + '"value_type","avg","value_type",".*")' ) def test_rate_stats_query_renders_legacy_stats_rate_label(self) -> None: @@ -859,7 +859,7 @@ def test_rate_stats_query_renders_legacy_stats_rate_label(self) -> None: '{kernel_id=~"12345678-1234-5678-1234-567812345678",' 'container_metric_name=~"net_rx|net_tx",' 'value_type="current"}),' - '"value_type","stats.rate","value_type",".*")' + '"value_type","rate","value_type",".*")' " or " "label_replace(sum by (container_metric_name,kernel_id,value_type)(rate(" "backendai_container_utilization" @@ -867,12 +867,12 @@ def test_rate_stats_query_renders_legacy_stats_rate_label(self) -> None: 'container_metric_name=~"io_read|io_write",' 'value_type="current"}' "[5m]))," - '"value_type","stats.rate","value_type",".*")' + '"value_type","rate","value_type",".*")' ) class TestKernelMetricValuesByKernel: - def test_from_prometheus_response_maps_legacy_stat_label_to_value_type(self) -> None: + def test_from_prometheus_response_parses_value_type_into_enum(self) -> None: kernel_id = KernelId(UUID("12345678-1234-5678-1234-567812345678")) response = PrometheusResponse( status="success", @@ -883,7 +883,7 @@ def test_from_prometheus_response_maps_legacy_stat_label_to_value_type(self) -> metric=MetricResponseInfo( kernel_id=str(kernel_id), container_metric_name="mem", - value_type="stats.max", + value_type="max", ), values=[(1704067200.0, "1024")], ) From 9f34a315ad1d2ab53de0152a1321e3cc74225cdb Mon Sep 17 00:00:00 2001 From: BoKeum Date: Sun, 10 May 2026 19:18:51 +0900 Subject: [PATCH 10/11] docs(BA-5878): reframe news fragment around legacy stats.* compat The original phrasing led with the implementation ("window-based max/avg queries via PromQL") rather than the user-facing motivation ("a parallel supply for legacy stats.* live_stat fields that doesn't suffer the agent-restart / accumulator failure modes"). Updates the changelog entry to lead with the compat angle, expands coverage to include stats.rate (now also produced), and notes the restart-safe / window semantics that make this a meaningful upgrade over the legacy producer. Co-Authored-By: Claude Opus 4.7 --- changes/11360.feature.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes/11360.feature.md b/changes/11360.feature.md index 4031dca85a0..2a87eeac417 100644 --- a/changes/11360.feature.md +++ b/changes/11360.feature.md @@ -1 +1 @@ -Add window-based max/avg container live stats queries via PromQL to populate legacy `stats.max` / `stats.avg` fields +Provide a manager-side parallel supply for legacy `live_stat` `stats.max` / `stats.avg` / `stats.rate` fields, computed from Prometheus on demand instead of from the agent's in-memory `MovingStatistics` accumulator. Survives agent / manager / host restart, stays consistent across sessions, and uses a sliding window (default 5m) instead of unbounded lifetime accumulation. From 7458db74fbe5531b5e5593ea0b63ec9a17159fca Mon Sep 17 00:00:00 2001 From: BoKeum Date: Sun, 10 May 2026 21:48:55 +0900 Subject: [PATCH 11/11] refactor(BA-5878): centralize accel-suffix list as plugin extension point MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Derive STATS_MAX_GAUGE_METRIC_PATTERNS / STATS_AVG_GAUGE_METRIC_PATTERNS from a single _ACCEL_GAUGE_SUFFIXES_* source so adding a new accelerator metric kind (e.g. clock, voltage) is a one-suffix edit instead of editing two regex strings in lockstep. Functionally identical — the generated regex is the same alternation, just sorted alphabetically. Snapshot tests adjusted for the new sort order. Co-Authored-By: Claude Opus 4.7 --- .../common/clients/prometheus/metric_types.py | 26 ++++++++++++++++--- .../test_container_metric.py | 4 +-- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/ai/backend/common/clients/prometheus/metric_types.py b/src/ai/backend/common/clients/prometheus/metric_types.py index fe07a6783c7..ce672b828db 100644 --- a/src/ai/backend/common/clients/prometheus/metric_types.py +++ b/src/ai/backend/common/clients/prometheus/metric_types.py @@ -74,20 +74,38 @@ def to_list(self) -> list[MetricPreset]: return [self.gauge, self.diff, self.rate, self.max, self.avg, self.rate_stats] +# Backend.AI accelerator/plugin gauge metric naming convention. +# Adding a new suffix here is the single edit needed to extend stats.{max,avg} +# coverage to a new family of accelerator metrics (e.g., adding "clock" auto- +# covers cuda_clock / gpu_clock / tpu_clock). +_ACCEL_GAUGE_SUFFIXES_MAX_ONLY: Final[frozenset[str]] = frozenset({"mem"}) +_ACCEL_GAUGE_SUFFIXES_WITH_AVG: Final[frozenset[str]] = frozenset({ + "util", + "power", + "temperature", +}) + + +def _accel_suffix_pattern(suffixes: frozenset[str]) -> str: + body = "|".join(sorted(suffixes)) + return rf"[A-Za-z0-9][A-Za-z0-9_-]*_({body})" + + DIFF_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"}) RATE_METRICS: Final[frozenset[str]] = frozenset({"net_rx", "net_tx"}) -# Window stats: built-ins are exact, accelerator/plugin metrics use patterns. +# Intrinsic gauge metrics that don't follow the accelerator suffix convention. STATS_MAX_GAUGE_METRICS: Final[frozenset[str]] = frozenset({ "mem", "io_scratch_size", }) +STATS_AVG_GAUGE_METRICS: Final[frozenset[str]] = frozenset() +# Pattern-based gauge coverage for plugin/accelerator metrics. STATS_MAX_GAUGE_METRIC_PATTERNS: Final[frozenset[str]] = frozenset({ - r"[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power|temperature)", + _accel_suffix_pattern(_ACCEL_GAUGE_SUFFIXES_MAX_ONLY | _ACCEL_GAUGE_SUFFIXES_WITH_AVG), }) -STATS_AVG_GAUGE_METRICS: Final[frozenset[str]] = frozenset() STATS_AVG_GAUGE_METRIC_PATTERNS: Final[frozenset[str]] = frozenset({ - r"[A-Za-z0-9][A-Za-z0-9_-]*_(util|power|temperature)", + _accel_suffix_pattern(_ACCEL_GAUGE_SUFFIXES_WITH_AVG), }) STATS_MAX_OVER_RATE_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"}) STATS_AVG_OVER_RATE_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"}) diff --git a/tests/unit/manager/services/utilization_metric/test_container_metric.py b/tests/unit/manager/services/utilization_metric/test_container_metric.py index 70e0b98a007..05a17f24e4b 100644 --- a/tests/unit/manager/services/utilization_metric/test_container_metric.py +++ b/tests/unit/manager/services/utilization_metric/test_container_metric.py @@ -820,7 +820,7 @@ def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None "backendai_container_utilization" '{kernel_id=~"12345678-1234-5678-1234-567812345678",' 'container_metric_name=~"io_scratch_size|mem|' - '[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power|temperature)",' + '[A-Za-z0-9][A-Za-z0-9_-]*_(mem|power|temperature|util)",' 'value_type="current"}))[5m:]),' '"value_type","max","value_type",".*")' " or " @@ -835,7 +835,7 @@ def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None "label_replace(avg_over_time((sum by (container_metric_name,kernel_id,value_type)(" "backendai_container_utilization" '{kernel_id=~"12345678-1234-5678-1234-567812345678",' - 'container_metric_name=~"[A-Za-z0-9][A-Za-z0-9_-]*_(util|power|temperature)",' + 'container_metric_name=~"[A-Za-z0-9][A-Za-z0-9_-]*_(power|temperature|util)",' 'value_type="current"}))[5m:]),' '"value_type","avg","value_type",".*")' " or "