Skip to content

Commit 29ac4d5

Browse files
apollo_dashboard,apollo_deployments: add top-level intervalSec to Alerts; use it in alert builder
Move the evaluation interval from a per-alert field to a top-level field on the Alerts struct. The interval is group-scoped in Grafana, so a single value for all alerts is the correct model.

Also restructure alert_builder.py to upload/dump per rule group instead of per individual alert, using the top-level intervalSec as the group interval.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 0631f58 commit 29ac4d5

5 files changed

Lines changed: 70 additions & 117 deletions

File tree

crates/apollo_dashboard/resources/dev_grafana_alerts.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
{
2+
"intervalSec": 30,
23
"alerts": [
34
{
45
"name": "batcher_storage_open_read_transactions",

crates/apollo_dashboard/src/alert_definitions.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -692,5 +692,5 @@ pub fn get_apollo_alerts() -> Alerts {
692692
alerts.push(get_state_sync_lag());
693693
alerts.append(&mut get_state_sync_stuck_vec());
694694

695-
Alerts::new(alerts)
695+
Alerts::new(alerts, EVALUATION_INTERVAL_SEC_DEFAULT)
696696
}

crates/apollo_dashboard/src/alerts.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,14 @@ pub(crate) const SECS_IN_MIN: u64 = 60;
1616
/// Alerts to be configured in the dashboard.
1717
#[derive(Debug, Serialize)]
1818
pub struct Alerts {
19+
// The interval in seconds between evaluations of all alerts in the group.
20+
#[serde(rename = "intervalSec")]
21+
interval_sec: u64,
1922
alerts: Vec<Alert>,
2023
}
2124

2225
impl Alerts {
23-
pub(crate) fn new(alerts: Vec<Alert>) -> Self {
26+
pub(crate) fn new(alerts: Vec<Alert>, interval_sec: u64) -> Self {
2427
// Validate that there are no duplicate alert names.
2528
alerts
2629
.iter()
@@ -39,7 +42,7 @@ impl Alerts {
3942
panic!("Duplicate placeholder name found across alerts: {duplicate}")
4043
});
4144

42-
Self { alerts }
45+
Self { interval_sec, alerts }
4346
}
4447
}
4548

deployments/monitoring/src/builders/alert_builder.py

Lines changed: 35 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python3
22

33
import argparse
4+
import collections
45
import json
56
import os
67
import sys
@@ -20,12 +21,6 @@
2021
)
2122
from common.logger import get_logger
2223
from grafana_client import GrafanaApi
23-
from grafana_client.client import (
24-
GrafanaBadInputError,
25-
GrafanaClientError,
26-
GrafanaException,
27-
GrafanaServerError,
28-
)
2924
from tenacity import before_sleep_log, retry, stop_after_attempt, wait_fixed
3025

3126
# Global logger (initialized in alert_builder function)
@@ -69,7 +64,6 @@ def create_alert_rule(
6964
title: str,
7065
folder_uid: str,
7166
rule_group: str,
72-
interval_sec: int,
7367
_for: str,
7468
expr: str,
7569
conditions: list[dict[str, any]],
@@ -82,7 +76,6 @@ def create_alert_rule(
8276
alert_rule["title"] = title
8377
alert_rule["folderUID"] = folder_uid
8478
alert_rule["ruleGroup"] = rule_group
85-
alert_rule["intervalSec"] = interval_sec
8679
alert_rule["for"] = _for
8780
alert_rule["labels"] = labels
8881
alert_rule["data"] = [
@@ -100,6 +93,10 @@ def create_alert_rule(
10093
return alert_rule
10194

10295

96+
def create_rule_group(name: str, interval_sec: int, rules: list[dict[str, any]]) -> dict[str, any]:
97+
return {"name": name, "interval": interval_sec, "rules": rules}
98+
99+
103100
def get_all_folders(client: GrafanaApi) -> list[dict[str, any]]:
104101
logger.debug("Getting all folders")
105102
return client.folder.get_all_folders()
@@ -128,25 +125,14 @@ def create_folder_return_uid(client: GrafanaApi, title: str) -> str:
128125
return folder["uid"]
129126

130127

131-
def dump_alert(output_dir: str, alert: dict[str, any]) -> None:
132-
alert_full_path = f"{output_dir}/{alert['name']}.json".lower().replace(" ", "_")
128+
def dump_rule_group(output_dir: str, rule_group: dict[str, any]) -> None:
129+
group_full_path = f"{output_dir}/{rule_group['name']}.json".lower().replace(" ", "_")
133130
os.makedirs(output_dir, exist_ok=True)
134-
ordered = {"title": alert["title"], **{k: v for k, v in alert.items() if k != "title"}}
135-
with open(alert_full_path, "w") as f:
136-
json.dump(ordered, f, indent=2)
137-
# Format with professional colors: Alert (white bold), name (cyan), saved to (white bold), path (dim cyan)
131+
with open(group_full_path, "w") as f:
132+
json.dump(rule_group, f, indent=2)
138133
logger.info(
139-
f'[bold white]Alert[/bold white] "[blue]{alert["name"]}[/blue]" [bold white]saved to[/bold white] [dim white]{alert_full_path}[/dim white]'
140-
)
141-
142-
143-
def get_alert_rule_group(client: GrafanaApi, folder_uid: str, group_uid: str) -> str:
144-
logger.debug(f'Getting alert rule group "{group_uid}"')
145-
rule_group = client.alertingprovisioning.get_rule_group(
146-
folder_uid=folder_uid, group_uid=group_uid
134+
f'[bold white]Rule group[/bold white] "[blue]{rule_group["name"]}[/blue]" [bold white]saved to[/bold white] [dim white]{group_full_path}[/dim white]'
147135
)
148-
logger.debug(f"Got alert group: {rule_group}")
149-
return rule_group
150136

151137

152138
@retry(
@@ -296,7 +282,10 @@ def alert_builder(args: argparse.Namespace):
296282
# Exit cleanly without traceback
297283
sys.exit(1)
298284

299-
alerts = []
285+
interval_sec = dev_alerts["intervalSec"]
286+
287+
# group_name -> list of alert rules (preserving insertion order within each group)
288+
groups: dict[str, list[dict[str, any]]] = collections.defaultdict(list)
300289

301290
for dev_alert in dev_alerts["alerts"]:
302291
# Apply config overrides to replace placeholders
@@ -315,13 +304,14 @@ def alert_builder(args: argparse.Namespace):
315304
)
316305
else:
317306
expr = remove_expr_placeholder(expr=dev_alert["expr"])
318-
alerts.append(
307+
308+
group_name = dev_alert["ruleGroup"]
309+
groups[group_name].append(
319310
create_alert_rule(
320311
name=dev_alert["name"],
321312
title=dev_alert["title"],
322313
folder_uid=folder_uid,
323-
interval_sec=dev_alert["intervalSec"],
324-
rule_group=dev_alert["ruleGroup"],
314+
rule_group=group_name,
325315
_for=dev_alert["for"],
326316
expr=expr,
327317
conditions=dev_alert["conditions"],
@@ -333,66 +323,31 @@ def alert_builder(args: argparse.Namespace):
333323
)
334324
)
335325

336-
alerts.sort(key=lambda a: a["title"])
326+
rule_groups = [
327+
create_rule_group(
328+
name=group_name,
329+
interval_sec=interval_sec,
330+
rules=sorted(rules, key=lambda a: a["title"]),
331+
)
332+
for group_name, rules in sorted(groups.items())
333+
]
337334

338-
for alert in alerts:
335+
for rule_group in rule_groups:
339336
if args.debug:
340-
logger.debug(json.dumps(alert))
337+
logger.debug(json.dumps(rule_group))
341338
if not args.dry_run:
342-
alert_created_or_exists = False
343339
try:
344-
client.alertingprovisioning.create_alertrule(
345-
alertrule=alert,
346-
disable_provenance=True,
347-
)
348-
logger.info(f'Alert "{alert["name"]}" uploaded to Grafana successfully')
349-
alert_created_or_exists = True
350-
351-
except GrafanaBadInputError as e:
352-
if "alerting.alert-rule.conflict" in e.message:
353-
logger.info(f'Alert "{alert["name"]}" already exists. Skipping creation.')
354-
alert_created_or_exists = True
355-
else:
356-
# Handle other bad input errors
357-
logger.error(
358-
f'Failed to create alert "{alert["name"]}". Bad input: {e.message}'
359-
)
360-
except GrafanaClientError as e:
361-
# Handle other client-side errors (e.g., invalid request)
362-
logger.error(f'Failed to create alert "{alert["name"]}". Client error: {e.message}')
363-
except GrafanaServerError as e:
364-
# Handle server-side errors (5xx errors)
365-
logger.error(f'Failed to create alert "{alert["name"]}". Server error: {e.message}')
366-
except GrafanaException as e:
367-
# Catch any other Grafana-related exceptions
368-
logger.error(
369-
f'Failed to create alert "{alert["name"]}". Grafana error: {e.message}'
340+
update_alert_rule_group(
341+
client=client,
342+
folder_uid=folder_uid,
343+
group_uid=rule_group["name"],
344+
alertrule_group=rule_group,
370345
)
371346
except Exception as e:
372-
# Catch any other exceptions (non-Grafana-related)
373-
logger.error(f'Failed to create alert "{alert["name"]}". Unexpected error: {e}')
374-
375-
# Only update rule group interval if alert was successfully created or already exists
376-
if alert_created_or_exists:
377-
try:
378-
group_uid = alert["ruleGroup"]
379-
rule_group = get_alert_rule_group(
380-
client=client, folder_uid=folder_uid, group_uid=group_uid
381-
)
382-
if rule_group["interval"] != alert["intervalSec"]:
383-
rule_group["interval"] = alert["intervalSec"]
384-
update_alert_rule_group(
385-
client=client,
386-
folder_uid=folder_uid,
387-
group_uid=group_uid,
388-
alertrule_group=rule_group,
389-
)
390-
logger.info(f'Alert rule group "{group_uid}" updated successfully')
391-
except Exception as e:
392-
logger.error(f'Failed to update alert rule group "{alert["ruleGroup"]}". {e}')
347+
logger.error(f'Failed to update rule group "{rule_group["name"]}". {e}')
393348

394349
if args.out_dir:
395350
output_dir = f"{args.out_dir}/alerts"
396-
dump_alert(output_dir=output_dir, alert=alert)
351+
dump_rule_group(output_dir=output_dir, rule_group=rule_group)
397352

398353
logger.info("Done building grafana alerts")

deployments/sequencer/src/constructs/grafana.py

Lines changed: 28 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,7 @@ def __init__(
105105
super().__init__(scope, id, cluster, namespace)
106106

107107
self.grafana_alert_group = grafana_alert_rule_group
108-
self.grafana_alert_files = self.grafana_alert_group.get_alert_files()
109-
self.hash_value = generate_random_hash(from_string=f"{self.cluster}-{self.namespace}")
110-
self.custom_name = f"{self.namespace}-arg-{self.hash_value}"
111-
self._get_shared_grafana_alert_rule_group()
108+
self._create_alert_rule_group_crds()
112109

113110
def _exec_err_state_enum_selector(self, exec_err_state: str) -> Optional[str]:
114111
"""Convert string to ExecErrState enum."""
@@ -167,33 +164,30 @@ def _get_shared_grafana_alert_rule_group_rules(self, rule: Dict[str, Any]):
167164
],
168165
)
169166

170-
def _get_shared_grafana_alert_rule_group_spec(self):
171-
"""Build the spec for the alert rule group."""
172-
loaded_alert_rules = [
173-
self.grafana_alert_group.load(str(alert_file))
174-
for alert_file in self.grafana_alert_files
175-
]
176-
# Keep rule order deterministic so generated YAML has stable PR diffs.
177-
loaded_alert_rules.sort(key=lambda rule: rule["title"].lower())
178-
rules = [
179-
self._get_shared_grafana_alert_rule_group_rules(alert_rule)
180-
for alert_rule in loaded_alert_rules
181-
]
182-
183-
return SharedGrafanaAlertRuleGroupSpec(
184-
name=self.custom_name,
185-
instance_selector=SharedGrafanaAlertRuleGroupSpecInstanceSelector(),
186-
interval="1m",
187-
editable=False,
188-
folder_ref=self.cluster,
189-
rules=rules,
190-
)
191-
192-
def _get_shared_grafana_alert_rule_group(self):
193-
"""Create the SharedGrafanaAlertRuleGroup resource."""
194-
return SharedGrafanaAlertRuleGroup(
195-
self,
196-
self.node.id,
197-
metadata=self._get_api_object_metadata(name=self.custom_name),
198-
spec=self._get_shared_grafana_alert_rule_group_spec(),
199-
)
167+
def _create_alert_rule_group_crds(self):
168+
"""Create one SharedGrafanaAlertRuleGroup CRD per rule group."""
169+
for group_file in sorted(self.grafana_alert_group.get_alert_files()):
170+
group_data = self.grafana_alert_group.load(str(group_file))
171+
group_name = group_data["name"]
172+
interval_sec = group_data["interval"]
173+
# Keep rule order deterministic so generated YAML has stable PR diffs.
174+
rules_data = sorted(group_data["rules"], key=lambda rule: rule["title"].lower())
175+
176+
k8s_group_name = group_name.replace("_", "-")
177+
custom_name = f"{self.namespace}-arg-{k8s_group_name}"
178+
179+
rules = [self._get_shared_grafana_alert_rule_group_rules(rule) for rule in rules_data]
180+
spec = SharedGrafanaAlertRuleGroupSpec(
181+
name=custom_name,
182+
instance_selector=SharedGrafanaAlertRuleGroupSpecInstanceSelector(),
183+
interval=f"{interval_sec}s",
184+
editable=False,
185+
folder_ref=self.cluster,
186+
rules=rules,
187+
)
188+
SharedGrafanaAlertRuleGroup(
189+
self,
190+
custom_name,
191+
metadata=self._get_api_object_metadata(name=custom_name),
192+
spec=spec,
193+
)

0 commit comments

Comments
 (0)