Skip to content

Commit 53463a3

Browse files
seedspiritclaude
and committed
refactor(BA-5878): switch back to PromQL window-based max/avg live stats
Revert agent-side stats.max/avg export in favor of computing max/avg via PromQL window expressions on the manager. Container live-stat fan-out grows from 3 to 5 queries (gauge / diff / rate / max / avg). - Re-introduce MAX / AVG to ValueType plus from_legacy_live_stat_label to map "stats.max" / "stats.avg" labels back into typed value types. - Build max/avg templates that union a gauge sub-expression with a rate sub-expression and label_replace the result back to the legacy "stats.max" / "stats.avg" label. - Classify metrics into gauge-shape vs rate-shape stats sources, with exact names for built-ins and regex patterns for accelerator metrics. - Repository merges 5 query responses in a generic loop instead of unpacking three buckets. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 16f4bc6 commit 53463a3

6 files changed

Lines changed: 244 additions & 31 deletions

File tree

changes/11360.feature.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add window-based max/avg container live stats queries via PromQL to populate legacy `stats.max` / `stats.avg` fields

src/ai/backend/common/clients/prometheus/client.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,12 @@ async def fetch_container_live_stats(
8484
kernel_ids: Sequence[KernelId],
8585
) -> dict[KernelId, list[MetricValue]]:
8686
queries = self._fixed_query_builder.get_container_live_stat_queries(kernel_ids)
87-
gauge_response = await self._query_instant(queries.gauge)
88-
diff_response = await self._query_instant(queries.diff)
89-
rate_response = await self._query_instant(queries.rate)
90-
gauge = KernelMetricValuesByKernel.from_prometheus_response(gauge_response)
91-
diff = KernelMetricValuesByKernel.from_prometheus_response(diff_response)
92-
rate = KernelMetricValuesByKernel.from_prometheus_response(rate_response)
93-
merged = gauge.merged_with(diff).merged_with(rate)
87+
merged = KernelMetricValuesByKernel(values_by_kernel={})
88+
for preset in queries.to_list():
89+
response = await self._query_instant(preset)
90+
merged = merged.merged_with(
91+
KernelMetricValuesByKernel.from_prometheus_response(response)
92+
)
9493
return merged.values_by_kernel
9594

9695
async def execute_preset(

src/ai/backend/common/clients/prometheus/fixed_query_builder.py

Lines changed: 132 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@
66
from ai.backend.common.clients.prometheus.metric_types import (
77
DIFF_METRICS,
88
RATE_METRICS,
9+
STATS_AVG_GAUGE_METRIC_PATTERNS,
10+
STATS_AVG_GAUGE_METRICS,
11+
STATS_AVG_OVER_RATE_METRICS,
12+
STATS_MAX_GAUGE_METRIC_PATTERNS,
13+
STATS_MAX_GAUGE_METRICS,
14+
STATS_MAX_OVER_RATE_METRICS,
915
ContainerLiveStatQueries,
1016
ContainerMetricOptionalLabel,
1117
MetricType,
@@ -47,10 +53,49 @@ class LabelValuesQuery:
4753
metric_match: str
4854

4955

56+
@dataclass(frozen=True)
57+
class _LiveStatQuerySpec:
58+
template: str
59+
metric_name_filter: frozenset[str] | None = None
60+
value_type_filter: ValueType | None = None
61+
62+
63+
@dataclass(frozen=True)
64+
class _StatsBucket:
65+
"""Window-stats bucket spec (gauge metrics + rate metrics for a single stat)."""
66+
67+
value_type: ValueType
68+
gauge_metrics: frozenset[str]
69+
rate_metrics: frozenset[str]
70+
gauge_metric_patterns: frozenset[str] = frozenset()
71+
72+
5073
def _regex_union(values: Sequence[str]) -> str:
5174
return "|".join(re.escape(value) for value in values)
5275

5376

77+
def _metric_name_regex(
78+
metric_names: frozenset[str],
79+
metric_patterns: frozenset[str] = frozenset(),
80+
) -> str:
81+
exact_parts = [re.escape(value) for value in sorted(metric_names)]
82+
return "|".join([*exact_parts, *sorted(metric_patterns)])
83+
84+
85+
_MAX_STATS_BUCKET: Final[_StatsBucket] = _StatsBucket(
86+
value_type=ValueType.MAX,
87+
gauge_metrics=STATS_MAX_GAUGE_METRICS,
88+
rate_metrics=STATS_MAX_OVER_RATE_METRICS,
89+
gauge_metric_patterns=STATS_MAX_GAUGE_METRIC_PATTERNS,
90+
)
91+
_AVG_STATS_BUCKET: Final[_StatsBucket] = _StatsBucket(
92+
value_type=ValueType.AVG,
93+
gauge_metrics=STATS_AVG_GAUGE_METRICS,
94+
rate_metrics=STATS_AVG_OVER_RATE_METRICS,
95+
gauge_metric_patterns=STATS_AVG_GAUGE_METRIC_PATTERNS,
96+
)
97+
98+
5499
class FixedQueryBuilder:
55100
_timewindow: str
56101

@@ -100,50 +145,116 @@ def get_container_live_stat_queries(
100145
self,
101146
kernel_ids: Sequence[KernelId],
102147
) -> ContainerLiveStatQueries:
148+
kernel_id_regex = _regex_union([str(kid) for kid in kernel_ids])
149+
group_by = ",".join(sorted(_LIVE_STAT_GROUP_BY))
103150
return ContainerLiveStatQueries(
104-
gauge=self._get_container_live_stat_query(
151+
gauge=self._get_live_stat_query(
105152
kernel_ids,
106-
metric_type=MetricType.GAUGE,
153+
_LiveStatQuerySpec(template=self._get_template(MetricType.GAUGE)),
107154
),
108-
diff=self._get_container_live_stat_query(
155+
diff=self._get_live_stat_query(
109156
kernel_ids,
110-
metric_type=MetricType.DIFF,
111-
metric_name_filter=DIFF_METRICS,
112-
value_type_filter=ValueType.CURRENT,
157+
_LiveStatQuerySpec(
158+
template=self._get_template(MetricType.DIFF),
159+
metric_name_filter=DIFF_METRICS,
160+
value_type_filter=ValueType.CURRENT,
161+
),
113162
),
114-
rate=self._get_container_live_stat_query(
163+
rate=self._get_live_stat_query(
115164
kernel_ids,
116-
metric_type=MetricType.RATE,
117-
metric_name_filter=RATE_METRICS,
118-
value_type_filter=ValueType.CURRENT,
165+
_LiveStatQuerySpec(
166+
template=self._get_template(MetricType.RATE),
167+
metric_name_filter=RATE_METRICS,
168+
value_type_filter=ValueType.CURRENT,
169+
),
119170
),
171+
max=self._build_stats_preset(_MAX_STATS_BUCKET, kernel_id_regex, group_by),
172+
avg=self._build_stats_preset(_AVG_STATS_BUCKET, kernel_id_regex, group_by),
173+
)
174+
175+
def _build_stats_preset(
176+
self,
177+
bucket: _StatsBucket,
178+
kernel_id_regex: str,
179+
group_by: str,
180+
) -> MetricPreset:
181+
return MetricPreset(
182+
template=self._render_stats_query(
183+
bucket,
184+
kernel_id_regex=kernel_id_regex,
185+
group_by=group_by,
186+
)
120187
)
121188

122-
def _get_container_live_stat_query(
189+
def _get_live_stat_query(
123190
self,
124191
kernel_ids: Sequence[KernelId],
125-
*,
126-
metric_type: MetricType,
127-
metric_name_filter: frozenset[str] | None = None,
128-
value_type_filter: ValueType | None = None,
192+
spec: _LiveStatQuerySpec,
129193
) -> MetricPreset:
130194
labels: dict[str, LabelMatcher] = {
131195
"kernel_id": LabelMatcher.regex(_regex_union([str(kid) for kid in kernel_ids]))
132196
}
133-
if metric_name_filter is not None:
197+
if spec.metric_name_filter is not None:
134198
labels["container_metric_name"] = LabelMatcher.regex(
135-
_regex_union(sorted(metric_name_filter))
199+
_regex_union(sorted(spec.metric_name_filter))
136200
)
137-
if value_type_filter is not None:
138-
labels["value_type"] = LabelMatcher.exact(value_type_filter.value)
201+
if spec.value_type_filter is not None:
202+
labels["value_type"] = LabelMatcher.exact(spec.value_type_filter.value)
139203

140204
return MetricPreset(
141-
template=self._get_template(metric_type),
142-
labels=labels,
205+
template=spec.template,
143206
group_by=_LIVE_STAT_GROUP_BY,
207+
labels=labels,
144208
window=self._timewindow,
145209
)
146210

211+
def _render_stats_query(
212+
self,
213+
bucket: _StatsBucket,
214+
*,
215+
kernel_id_regex: str,
216+
group_by: str,
217+
) -> str:
218+
parts: list[str] = []
219+
stat_fn = f"{bucket.value_type.value}_over_time"
220+
stat_label = bucket.value_type.to_live_stat_label()
221+
if bucket.gauge_metrics or bucket.gauge_metric_patterns:
222+
gauge_regex = _metric_name_regex(bucket.gauge_metrics, bucket.gauge_metric_patterns)
223+
gauge_labels = self._live_stat_current_labels(
224+
kernel_id_regex=kernel_id_regex,
225+
metric_name_regex=gauge_regex,
226+
)
227+
parts.append(
228+
f"label_replace({stat_fn}((sum by ({group_by})("
229+
f"{CONTAINER_UTILIZATION_METRIC_NAME}{{{gauge_labels}}}))[{self._timewindow}:]),"
230+
f'"value_type","{stat_label}","value_type",".*")'
231+
)
232+
if bucket.rate_metrics:
233+
rate_regex = _regex_union(sorted(bucket.rate_metrics))
234+
rate_labels = self._live_stat_current_labels(
235+
kernel_id_regex=kernel_id_regex,
236+
metric_name_regex=rate_regex,
237+
)
238+
parts.append(
239+
f"label_replace({stat_fn}((sum by ({group_by})(rate("
240+
f"{CONTAINER_UTILIZATION_METRIC_NAME}{{{rate_labels}}}"
241+
f"[{self._timewindow}])))[{self._timewindow}:]),"
242+
f'"value_type","{stat_label}","value_type",".*")'
243+
)
244+
return " or ".join(parts)
245+
246+
def _live_stat_current_labels(
247+
self,
248+
*,
249+
kernel_id_regex: str,
250+
metric_name_regex: str,
251+
) -> str:
252+
return (
253+
f'kernel_id=~"{kernel_id_regex}"'
254+
f',container_metric_name=~"{metric_name_regex}"'
255+
f',value_type="{ValueType.CURRENT.value}"'
256+
)
257+
147258
def _get_template(self, metric_type: MetricType) -> str:
148259
match metric_type:
149260
case MetricType.GAUGE:

src/ai/backend/common/clients/prometheus/metric_types.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,19 +61,36 @@ class MetricType(StrEnum):
6161

6262
@dataclass(frozen=True)
6363
class ContainerLiveStatQueries:
64-
"""Gauge / diff / rate query preset bundle for container live stats."""
64+
"""Gauge / diff / rate / max / avg query preset bundle for container live stats."""
6565

6666
gauge: MetricPreset
6767
diff: MetricPreset
6868
rate: MetricPreset
69+
max: MetricPreset
70+
avg: MetricPreset
6971

7072
def to_list(self) -> list[MetricPreset]:
71-
return [self.gauge, self.diff, self.rate]
73+
return [self.gauge, self.diff, self.rate, self.max, self.avg]
7274

7375

7476
DIFF_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"})
7577
RATE_METRICS: Final[frozenset[str]] = frozenset({"net_rx", "net_tx"})
7678

79+
# Window stats: built-ins are exact, accelerator/plugin metrics use patterns.
80+
STATS_MAX_GAUGE_METRICS: Final[frozenset[str]] = frozenset({
81+
"mem",
82+
"io_scratch_size",
83+
})
84+
STATS_MAX_GAUGE_METRIC_PATTERNS: Final[frozenset[str]] = frozenset({
85+
r"[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power)",
86+
})
87+
STATS_AVG_GAUGE_METRICS: Final[frozenset[str]] = frozenset()
88+
STATS_AVG_GAUGE_METRIC_PATTERNS: Final[frozenset[str]] = frozenset({
89+
r"[A-Za-z0-9][A-Za-z0-9_-]*_util",
90+
})
91+
STATS_MAX_OVER_RATE_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"})
92+
STATS_AVG_OVER_RATE_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"})
93+
7794

7895
@dataclass
7996
class ContainerMetricResponseInfo:
@@ -184,7 +201,7 @@ def from_prometheus_response(cls, response: PrometheusResponse) -> Self:
184201
container_metric_name = cast(str, info.container_metric_name)
185202
value_type_str = cast(str, info.value_type)
186203
try:
187-
value_type = ValueType(value_type_str)
204+
value_type = ValueType.from_legacy_live_stat_label(value_type_str)
188205
kernel_id = KernelId(UUID(kernel_id_str))
189206
except ValueError:
190207
continue

src/ai/backend/common/clients/prometheus/types.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,22 @@ class ValueType(StrEnum):
88
CURRENT = "current"
99
CAPACITY = "capacity"
1010
PCT = "pct"
11+
MAX = "max"
12+
AVG = "avg"
13+
RATE = "rate"
14+
15+
@classmethod
16+
def from_legacy_live_stat_label(cls, value: str) -> "ValueType":
17+
if value.startswith("stats."):
18+
return cls(value.removeprefix("stats."))
19+
return cls(value)
20+
21+
def to_live_stat_label(self) -> str:
22+
match self:
23+
case ValueType.MAX | ValueType.AVG | ValueType.RATE:
24+
return f"stats.{self.value}"
25+
case _:
26+
return self.value
1127

1228

1329
@dataclass(frozen=True)

tests/unit/manager/services/utilization_metric/test_container_metric.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from ai.backend.common.clients.prometheus.metric_types import (
1515
ContainerMetricOptionalLabel,
1616
ContainerMetricResponseInfo,
17+
KernelMetricValuesByKernel,
1718
MetricType,
1819
ValueType,
1920
)
@@ -30,6 +31,7 @@
3031
InvalidAPIParameters,
3132
PrometheusConnectionError,
3233
)
34+
from ai.backend.common.types import KernelId
3335
from ai.backend.manager.repositories.metric.repository import MetricRepository
3436
from ai.backend.manager.services.metric.actions.container import (
3537
ContainerMetricAction,
@@ -804,6 +806,73 @@ async def test_build_query_renders_expected_promql(self, case: BuiltinQueryTestC
804806
assert rendered_query == case.expected_query
805807

806808

809+
class TestLiveStatQueryProvider:
810+
"""Characterization tests for container live stat PromQL."""
811+
812+
def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None:
813+
kernel_id = KernelId(UUID("12345678-1234-5678-1234-567812345678"))
814+
fixed_query_builder = FixedQueryBuilder("5m")
815+
816+
queries = fixed_query_builder.get_container_live_stat_queries([kernel_id])
817+
818+
assert queries.max.render() == (
819+
"label_replace(max_over_time((sum by (container_metric_name,kernel_id,value_type)("
820+
"backendai_container_utilization"
821+
'{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",'
822+
'container_metric_name=~"io_scratch_size|mem|'
823+
'[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power)",'
824+
'value_type="current"}))[5m:]),'
825+
'"value_type","stats.max","value_type",".*")'
826+
" or "
827+
"label_replace(max_over_time((sum by (container_metric_name,kernel_id,value_type)(rate("
828+
"backendai_container_utilization"
829+
'{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",'
830+
'container_metric_name=~"cpu_util",value_type="current"}'
831+
"[5m])))[5m:]),"
832+
'"value_type","stats.max","value_type",".*")'
833+
)
834+
assert queries.avg.render() == (
835+
"label_replace(avg_over_time((sum by (container_metric_name,kernel_id,value_type)("
836+
"backendai_container_utilization"
837+
'{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",'
838+
'container_metric_name=~"[A-Za-z0-9][A-Za-z0-9_-]*_util",'
839+
'value_type="current"}))[5m:]),'
840+
'"value_type","stats.avg","value_type",".*")'
841+
" or "
842+
"label_replace(avg_over_time((sum by (container_metric_name,kernel_id,value_type)(rate("
843+
"backendai_container_utilization"
844+
'{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",'
845+
'container_metric_name=~"cpu_util",value_type="current"}'
846+
"[5m])))[5m:]),"
847+
'"value_type","stats.avg","value_type",".*")'
848+
)
849+
850+
851+
class TestKernelMetricValuesByKernel:
852+
def test_from_prometheus_response_maps_legacy_stat_label_to_value_type(self) -> None:
853+
kernel_id = KernelId(UUID("12345678-1234-5678-1234-567812345678"))
854+
response = PrometheusResponse(
855+
status="success",
856+
data=PrometheusQueryData(
857+
result_type="vector",
858+
result=[
859+
MetricResponse(
860+
metric=MetricResponseInfo(
861+
kernel_id=str(kernel_id),
862+
container_metric_name="mem",
863+
value_type="stats.max",
864+
),
865+
values=[(1704067200.0, "1024")],
866+
)
867+
],
868+
),
869+
)
870+
871+
result = KernelMetricValuesByKernel.from_prometheus_response(response)
872+
873+
assert result.values_by_kernel[kernel_id][0].value_type == ValueType.MAX
874+
875+
807876
class TestMetricResponseInfoParsing:
808877
"""Unit tests for MetricResponseInfo parsing behavior."""
809878

0 commit comments

Comments
 (0)