|
6 | 6 | from ai.backend.common.clients.prometheus.metric_types import ( |
7 | 7 | DIFF_METRICS, |
8 | 8 | RATE_METRICS, |
| 9 | + STATS_AVG_GAUGE_METRIC_PATTERNS, |
| 10 | + STATS_AVG_GAUGE_METRICS, |
| 11 | + STATS_AVG_OVER_RATE_METRICS, |
| 12 | + STATS_MAX_GAUGE_METRIC_PATTERNS, |
| 13 | + STATS_MAX_GAUGE_METRICS, |
| 14 | + STATS_MAX_OVER_RATE_METRICS, |
9 | 15 | ContainerLiveStatQueries, |
10 | 16 | ContainerMetricOptionalLabel, |
11 | 17 | MetricType, |
@@ -47,10 +53,49 @@ class LabelValuesQuery: |
47 | 53 | metric_match: str |
48 | 54 |
|
49 | 55 |
|
@dataclass(frozen=True)
class _LiveStatQuerySpec:
    """Immutable description of a single live-stat query.

    Consumed by ``FixedQueryBuilder._get_live_stat_query``, which copies
    ``template`` onto the resulting preset and turns each non-``None`` filter
    into a label matcher.
    """

    # Query template stored verbatim on the resulting preset.
    template: str
    # When set, restricts ``container_metric_name`` to these names (regex union).
    metric_name_filter: frozenset[str] | None = None
    # When set, restricts ``value_type`` to exactly this value.
    value_type_filter: ValueType | None = None
| 61 | + |
| 62 | + |
@dataclass(frozen=True)
class _StatsBucket:
    """Inputs for one window statistic: which metrics feed it and how.

    ``value_type`` selects the statistic; the remaining fields list the
    metric names (and raw name patterns) that are aggregated for it.
    """

    # Statistic this bucket renders (its ``value`` names the ``*_over_time`` function).
    value_type: ValueType
    # Metric names aggregated directly over the window.
    gauge_metrics: frozenset[str]
    # Metric names wrapped in ``rate(...)`` before being aggregated.
    rate_metrics: frozenset[str]
    # Extra gauge metric-name regexes, inserted unescaped next to ``gauge_metrics``.
    gauge_metric_patterns: frozenset[str] = frozenset()
| 71 | + |
| 72 | + |
50 | 73 | def _regex_union(values: Sequence[str]) -> str: |
51 | 74 | return "|".join(re.escape(value) for value in values) |
52 | 75 |
|
53 | 76 |
|
| 77 | +def _metric_name_regex( |
| 78 | + metric_names: frozenset[str], |
| 79 | + metric_patterns: frozenset[str] = frozenset(), |
| 80 | +) -> str: |
| 81 | + exact_parts = [re.escape(value) for value in sorted(metric_names)] |
| 82 | + return "|".join([*exact_parts, *sorted(metric_patterns)]) |
| 83 | + |
| 84 | + |
# Metrics reported as the maximum over the query window.
_MAX_STATS_BUCKET: Final[_StatsBucket] = _StatsBucket(
    value_type=ValueType.MAX,
    gauge_metrics=STATS_MAX_GAUGE_METRICS,
    gauge_metric_patterns=STATS_MAX_GAUGE_METRIC_PATTERNS,
    rate_metrics=STATS_MAX_OVER_RATE_METRICS,
)

# Metrics reported as the average over the query window.
_AVG_STATS_BUCKET: Final[_StatsBucket] = _StatsBucket(
    value_type=ValueType.AVG,
    gauge_metrics=STATS_AVG_GAUGE_METRICS,
    gauge_metric_patterns=STATS_AVG_GAUGE_METRIC_PATTERNS,
    rate_metrics=STATS_AVG_OVER_RATE_METRICS,
)
| 97 | + |
| 98 | + |
54 | 99 | class FixedQueryBuilder: |
55 | 100 | _timewindow: str |
56 | 101 |
|
@@ -100,50 +145,116 @@ def get_container_live_stat_queries( |
100 | 145 | self, |
101 | 146 | kernel_ids: Sequence[KernelId], |
102 | 147 | ) -> ContainerLiveStatQueries: |
| 148 | + kernel_id_regex = _regex_union([str(kid) for kid in kernel_ids]) |
| 149 | + group_by = ",".join(sorted(_LIVE_STAT_GROUP_BY)) |
103 | 150 | return ContainerLiveStatQueries( |
104 | | - gauge=self._get_container_live_stat_query( |
| 151 | + gauge=self._get_live_stat_query( |
105 | 152 | kernel_ids, |
106 | | - metric_type=MetricType.GAUGE, |
| 153 | + _LiveStatQuerySpec(template=self._get_template(MetricType.GAUGE)), |
107 | 154 | ), |
108 | | - diff=self._get_container_live_stat_query( |
| 155 | + diff=self._get_live_stat_query( |
109 | 156 | kernel_ids, |
110 | | - metric_type=MetricType.DIFF, |
111 | | - metric_name_filter=DIFF_METRICS, |
112 | | - value_type_filter=ValueType.CURRENT, |
| 157 | + _LiveStatQuerySpec( |
| 158 | + template=self._get_template(MetricType.DIFF), |
| 159 | + metric_name_filter=DIFF_METRICS, |
| 160 | + value_type_filter=ValueType.CURRENT, |
| 161 | + ), |
113 | 162 | ), |
114 | | - rate=self._get_container_live_stat_query( |
| 163 | + rate=self._get_live_stat_query( |
115 | 164 | kernel_ids, |
116 | | - metric_type=MetricType.RATE, |
117 | | - metric_name_filter=RATE_METRICS, |
118 | | - value_type_filter=ValueType.CURRENT, |
| 165 | + _LiveStatQuerySpec( |
| 166 | + template=self._get_template(MetricType.RATE), |
| 167 | + metric_name_filter=RATE_METRICS, |
| 168 | + value_type_filter=ValueType.CURRENT, |
| 169 | + ), |
119 | 170 | ), |
| 171 | + max=self._build_stats_preset(_MAX_STATS_BUCKET, kernel_id_regex, group_by), |
| 172 | + avg=self._build_stats_preset(_AVG_STATS_BUCKET, kernel_id_regex, group_by), |
| 173 | + ) |
| 174 | + |
def _build_stats_preset(
    self,
    bucket: _StatsBucket,
    kernel_id_regex: str,
    group_by: str,
) -> MetricPreset:
    """Wrap the rendered window-stats query for *bucket* in a ``MetricPreset``.

    Only ``template`` is set on the preset: the query text returned by
    ``_render_stats_query`` already embeds the label matchers, the grouping
    and the time window.
    """
    rendered_query = self._render_stats_query(
        bucket,
        kernel_id_regex=kernel_id_regex,
        group_by=group_by,
    )
    return MetricPreset(template=rendered_query)
121 | 188 |
|
def _get_live_stat_query(
    self,
    kernel_ids: Sequence[KernelId],
    spec: _LiveStatQuerySpec,
) -> MetricPreset:
    """Build the preset for one live-stat query described by *spec*.

    The ``kernel_id`` matcher is always present; ``container_metric_name``
    and ``value_type`` matchers are added only when the corresponding
    filter on *spec* is not ``None``.
    """
    kernel_id_pattern = _regex_union([str(kernel_id) for kernel_id in kernel_ids])
    matchers: dict[str, LabelMatcher] = {
        "kernel_id": LabelMatcher.regex(kernel_id_pattern),
    }

    name_filter = spec.metric_name_filter
    if name_filter is not None:
        # Sorted so the generated regex does not depend on set iteration order.
        matchers["container_metric_name"] = LabelMatcher.regex(
            _regex_union(sorted(name_filter))
        )

    value_type = spec.value_type_filter
    if value_type is not None:
        matchers["value_type"] = LabelMatcher.exact(value_type.value)

    return MetricPreset(
        template=spec.template,
        group_by=_LIVE_STAT_GROUP_BY,
        labels=matchers,
        window=self._timewindow,
    )
146 | 210 |
|
def _render_stats_query(
    self,
    bucket: _StatsBucket,
    *,
    kernel_id_regex: str,
    group_by: str,
) -> str:
    """Render the window-statistic query text for *bucket*.

    Up to two branches are produced and joined with `` or ``:

    * gauge metrics: ``<stat>_over_time`` over a subquery of the summed series;
    * rate metrics: the same, with the series wrapped in ``rate(...)`` first.

    Each branch is passed through ``label_replace`` so its ``value_type``
    label carries the bucket's live-stat label. A bucket with no metrics
    yields an empty string.
    """
    over_time_fn = f"{bucket.value_type.value}_over_time"
    label_value = bucket.value_type.to_live_stat_label()

    def _selector(name_regex: str) -> str:
        # Series selector restricted to the requested kernels and metric names.
        matchers = self._live_stat_current_labels(
            kernel_id_regex=kernel_id_regex,
            metric_name_regex=name_regex,
        )
        return f"{CONTAINER_UTILIZATION_METRIC_NAME}{{{matchers}}}"

    def _windowed(inner: str) -> str:
        # Sum per group, evaluate as a subquery over the window, then relabel.
        return (
            f"label_replace({over_time_fn}((sum by ({group_by})({inner}))"
            f"[{self._timewindow}:]),"
            f'"value_type","{label_value}","value_type",".*")'
        )

    branches: list[str] = []
    if bucket.gauge_metrics or bucket.gauge_metric_patterns:
        gauge_names = _metric_name_regex(bucket.gauge_metrics, bucket.gauge_metric_patterns)
        branches.append(_windowed(_selector(gauge_names)))
    if bucket.rate_metrics:
        rate_names = _regex_union(sorted(bucket.rate_metrics))
        branches.append(_windowed(f"rate({_selector(rate_names)}[{self._timewindow}])"))
    return " or ".join(branches)
| 245 | + |
def _live_stat_current_labels(
    self,
    *,
    kernel_id_regex: str,
    metric_name_regex: str,
) -> str:
    """Return the label-matcher list selecting ``current`` samples.

    The result is the text that goes between the braces of a series
    selector: regex matchers on ``kernel_id`` and ``container_metric_name``
    plus an exact matcher on ``value_type``.

    Args:
        kernel_id_regex: Plain regex (not yet string-escaped) for kernel IDs.
        metric_name_regex: Plain regex (not yet string-escaped) for metric names.

    Both regexes are embedded in double-quoted PromQL string literals, where
    a backslash starts an escape sequence. Regexes built with ``re.escape``
    contain sequences such as ``\\-`` or ``\\.`` that are not valid string
    escapes, so backslashes and double quotes are escaped here to keep the
    query parseable and the regex intact after unquoting.
    """

    def _quoted(regex: str) -> str:
        # Backslashes first, so the ones added for quotes are not doubled again.
        return '"' + regex.replace("\\", "\\\\").replace('"', '\\"') + '"'

    return (
        f"kernel_id=~{_quoted(kernel_id_regex)}"
        f",container_metric_name=~{_quoted(metric_name_regex)}"
        f',value_type="{ValueType.CURRENT.value}"'
    )
| 257 | + |
147 | 258 | def _get_template(self, metric_type: MetricType) -> str: |
148 | 259 | match metric_type: |
149 | 260 | case MetricType.GAUGE: |
|
0 commit comments