Skip to content

Commit 520fcca

Browse files
seedspirit and claude committed
fix(BA-5878): unbreak live-stat regex and tighten log/enum
PromQL parser rejects \- as an unknown escape sequence inside a regex literal, so re.escape over-escaping broke every container live-stat query (kernel_id UUIDs always contain hyphens). Strip the backslash from \- after escaping so the rendered queries are RE2-acceptable.

Also drop the unused ValueType.RATE — no producer ever emits it and no consumer matches on it; only MAX/AVG round-trip to the legacy stats.* labels.

And include the underlying exception in the warning emitted from MetricRepository.query_container_live_stats so "empty results" no longer hide the real Prometheus failure mode.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 53463a3 commit 520fcca

4 files changed

Lines changed: 8 additions & 9 deletions

File tree

src/ai/backend/common/clients/prometheus/fixed_query_builder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class _StatsBucket:
7171

7272

7373
def _regex_union(values: Sequence[str]) -> str:
74-
return "|".join(re.escape(value) for value in values)
74+
return "|".join(re.escape(value).replace(r"\-", "-") for value in values)
7575

7676

7777
def _metric_name_regex(

src/ai/backend/common/clients/prometheus/types.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ class ValueType(StrEnum):
1010
PCT = "pct"
1111
MAX = "max"
1212
AVG = "avg"
13-
RATE = "rate"
1413

1514
@classmethod
1615
def from_legacy_live_stat_label(cls, value: str) -> "ValueType":
@@ -20,7 +19,7 @@ def from_legacy_live_stat_label(cls, value: str) -> "ValueType":
2019

2120
def to_live_stat_label(self) -> str:
2221
match self:
23-
case ValueType.MAX | ValueType.AVG | ValueType.RATE:
22+
case ValueType.MAX | ValueType.AVG:
2423
return f"stats.{self.value}"
2524
case _:
2625
return self.value

src/ai/backend/manager/repositories/metric/repository.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ async def query_container_live_stats(
7070
return KernelLiveStatBatchResult.empty(kernel_ids)
7171
try:
7272
values_by_kernel = await self._prometheus_client.fetch_container_live_stats(kernel_ids)
73-
except (PrometheusConnectionError, FailedToGetMetric):
74-
log.warning("Failed to query metrics for kernel live stats, returning empty results")
73+
except (PrometheusConnectionError, FailedToGetMetric) as e:
74+
log.warning("Failed to query metrics for kernel live stats: {!r}", e)
7575
return KernelLiveStatBatchResult.empty(kernel_ids)
7676
return KernelLiveStatBatchResult.from_metric_values(kernel_ids, values_by_kernel)

tests/unit/manager/services/utilization_metric/test_container_metric.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -818,30 +818,30 @@ def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None
818818
assert queries.max.render() == (
819819
"label_replace(max_over_time((sum by (container_metric_name,kernel_id,value_type)("
820820
"backendai_container_utilization"
821-
'{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",'
821+
'{kernel_id=~"12345678-1234-5678-1234-567812345678",'
822822
'container_metric_name=~"io_scratch_size|mem|'
823823
'[A-Za-z0-9][A-Za-z0-9_-]*_(mem|util|power)",'
824824
'value_type="current"}))[5m:]),'
825825
'"value_type","stats.max","value_type",".*")'
826826
" or "
827827
"label_replace(max_over_time((sum by (container_metric_name,kernel_id,value_type)(rate("
828828
"backendai_container_utilization"
829-
'{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",'
829+
'{kernel_id=~"12345678-1234-5678-1234-567812345678",'
830830
'container_metric_name=~"cpu_util",value_type="current"}'
831831
"[5m])))[5m:]),"
832832
'"value_type","stats.max","value_type",".*")'
833833
)
834834
assert queries.avg.render() == (
835835
"label_replace(avg_over_time((sum by (container_metric_name,kernel_id,value_type)("
836836
"backendai_container_utilization"
837-
'{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",'
837+
'{kernel_id=~"12345678-1234-5678-1234-567812345678",'
838838
'container_metric_name=~"[A-Za-z0-9][A-Za-z0-9_-]*_util",'
839839
'value_type="current"}))[5m:]),'
840840
'"value_type","stats.avg","value_type",".*")'
841841
" or "
842842
"label_replace(avg_over_time((sum by (container_metric_name,kernel_id,value_type)(rate("
843843
"backendai_container_utilization"
844-
'{kernel_id=~"12345678\\-1234\\-5678\\-1234\\-567812345678",'
844+
'{kernel_id=~"12345678-1234-5678-1234-567812345678",'
845845
'container_metric_name=~"cpu_util",value_type="current"}'
846846
"[5m])))[5m:]),"
847847
'"value_type","stats.avg","value_type",".*")'

0 commit comments

Comments
 (0)