Skip to content

Commit 3f7193d

Browse files
apollo_dashboard,apollo_deployments: add top-level intervalSec to Alerts; use it in alert builder
Move the evaluation interval from a per-alert field to a top-level field on the Alerts struct. The interval is group-scoped in Grafana, so a single value for all alerts is the correct model. Also restructure alert_builder.py to upload/dump per rule-group instead of per individual alert, using the top-level intervalSec as the group interval.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent d67b8d8 commit 3f7193d

4 files changed

Lines changed: 44 additions & 83 deletions

File tree

crates/apollo_dashboard/resources/dev_grafana_alerts.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
{
2+
"intervalSec": 30,
23
"alerts": [
34
{
45
"name": "batcher_storage_open_read_transactions",

crates/apollo_dashboard/src/alert_definitions.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -635,5 +635,5 @@ pub fn get_apollo_alerts() -> Alerts {
635635
alerts.push(get_state_sync_lag());
636636
alerts.append(&mut get_state_sync_stuck_vec());
637637

638-
Alerts::new(alerts)
638+
Alerts::new(alerts, EVALUATION_INTERVAL_SEC_DEFAULT)
639639
}

crates/apollo_dashboard/src/alerts.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,14 @@ pub(crate) const SECS_IN_MIN: u64 = 60;
1616
/// Alerts to be configured in the dashboard.
1717
#[derive(Debug, Serialize)]
1818
pub struct Alerts {
19+
// The interval in seconds between evaluations of all alerts in the group.
20+
#[serde(rename = "intervalSec")]
21+
interval_sec: u64,
1922
alerts: Vec<Alert>,
2023
}
2124

2225
impl Alerts {
23-
pub(crate) fn new(alerts: Vec<Alert>) -> Self {
26+
pub(crate) fn new(alerts: Vec<Alert>, interval_sec: u64) -> Self {
2427
// Validate that there are no duplicate alert names.
2528
alerts
2629
.iter()
@@ -39,7 +42,7 @@ impl Alerts {
3942
panic!("Duplicate placeholder name found across alerts: {duplicate}")
4043
});
4144

42-
Self { alerts }
45+
Self { interval_sec, alerts }
4346
}
4447
}
4548

deployments/monitoring/src/builders/alert_builder.py

Lines changed: 37 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python3
22

33
import argparse
4+
import collections
45
import json
56
import os
67
import sys
@@ -20,12 +21,6 @@
2021
)
2122
from common.logger import get_logger
2223
from grafana_client import GrafanaApi
23-
from grafana_client.client import (
24-
GrafanaBadInputError,
25-
GrafanaClientError,
26-
GrafanaException,
27-
GrafanaServerError,
28-
)
2924
from tenacity import before_sleep_log, retry, stop_after_attempt, wait_fixed
3025

3126
# Global logger (initialized in alert_builder function)
@@ -69,7 +64,6 @@ def create_alert_rule(
6964
title: str,
7065
folder_uid: str,
7166
rule_group: str,
72-
interval_sec: int,
7367
_for: str,
7468
expr: str,
7569
conditions: list[dict[str, any]],
@@ -82,7 +76,6 @@ def create_alert_rule(
8276
alert_rule["title"] = title
8377
alert_rule["folderUID"] = folder_uid
8478
alert_rule["ruleGroup"] = rule_group
85-
alert_rule["intervalSec"] = interval_sec
8679
alert_rule["for"] = _for
8780
alert_rule["labels"] = labels
8881
alert_rule["data"] = [
@@ -100,6 +93,12 @@ def create_alert_rule(
10093
return alert_rule
10194

10295

96+
def create_rule_group(
97+
name: str, interval_sec: int, rules: list[dict[str, any]]
98+
) -> dict[str, any]:
99+
return {"name": name, "interval": interval_sec, "rules": rules}
100+
101+
103102
def get_all_folders(client: GrafanaApi) -> list[dict[str, any]]:
104103
logger.debug("Getting all folders")
105104
return client.folder.get_all_folders()
@@ -128,25 +127,14 @@ def create_folder_return_uid(client: GrafanaApi, title: str) -> str:
128127
return folder["uid"]
129128

130129

131-
def dump_alert(output_dir: str, alert: dict[str, any]) -> None:
132-
alert_full_path = f"{output_dir}/{alert['name']}.json".lower().replace(" ", "_")
130+
def dump_rule_group(output_dir: str, rule_group: dict[str, any]) -> None:
131+
group_full_path = f"{output_dir}/{rule_group['name']}.json".lower().replace(" ", "_")
133132
os.makedirs(output_dir, exist_ok=True)
134-
ordered = {"title": alert["title"], **{k: v for k, v in alert.items() if k != "title"}}
135-
with open(alert_full_path, "w") as f:
136-
json.dump(ordered, f, indent=2)
137-
# Format with professional colors: Alert (white bold), name (cyan), saved to (white bold), path (dim cyan)
133+
with open(group_full_path, "w") as f:
134+
json.dump(rule_group, f, indent=2)
138135
logger.info(
139-
f'[bold white]Alert[/bold white] "[blue]{alert["name"]}[/blue]" [bold white]saved to[/bold white] [dim white]{alert_full_path}[/dim white]'
140-
)
141-
142-
143-
def get_alert_rule_group(client: GrafanaApi, folder_uid: str, group_uid: str) -> str:
144-
logger.debug(f'Getting alert rule group "{group_uid}"')
145-
rule_group = client.alertingprovisioning.get_rule_group(
146-
folder_uid=folder_uid, group_uid=group_uid
136+
f'[bold white]Rule group[/bold white] "[blue]{rule_group["name"]}[/blue]" [bold white]saved to[/bold white] [dim white]{group_full_path}[/dim white]'
147137
)
148-
logger.debug(f"Got alert group: {rule_group}")
149-
return rule_group
150138

151139

152140
@retry(
@@ -296,7 +284,10 @@ def alert_builder(args: argparse.Namespace):
296284
# Exit cleanly without traceback
297285
sys.exit(1)
298286

299-
alerts = []
287+
interval_sec = dev_alerts["intervalSec"]
288+
289+
# group_name -> list of alert rules (preserving insertion order within each group)
290+
groups: dict[str, list[dict[str, any]]] = collections.defaultdict(list)
300291

301292
for dev_alert in dev_alerts["alerts"]:
302293
# Apply config overrides to replace placeholders
@@ -315,13 +306,14 @@ def alert_builder(args: argparse.Namespace):
315306
)
316307
else:
317308
expr = remove_expr_placeholder(expr=dev_alert["expr"])
318-
alerts.append(
309+
310+
group_name = dev_alert["ruleGroup"]
311+
groups[group_name].append(
319312
create_alert_rule(
320313
name=dev_alert["name"],
321314
title=dev_alert["title"],
322315
folder_uid=folder_uid,
323-
interval_sec=dev_alert["intervalSec"],
324-
rule_group=dev_alert["ruleGroup"],
316+
rule_group=group_name,
325317
_for=dev_alert["for"],
326318
expr=expr,
327319
conditions=dev_alert["conditions"],
@@ -333,66 +325,31 @@ def alert_builder(args: argparse.Namespace):
333325
)
334326
)
335327

336-
alerts.sort(key=lambda a: a["title"])
328+
rule_groups = [
329+
create_rule_group(
330+
name=group_name,
331+
interval_sec=interval_sec,
332+
rules=sorted(rules, key=lambda a: a["title"]),
333+
)
334+
for group_name, rules in sorted(groups.items())
335+
]
337336

338-
for alert in alerts:
337+
for rule_group in rule_groups:
339338
if args.debug:
340-
logger.debug(json.dumps(alert))
339+
logger.debug(json.dumps(rule_group))
341340
if not args.dry_run:
342-
alert_created_or_exists = False
343341
try:
344-
client.alertingprovisioning.create_alertrule(
345-
alertrule=alert,
346-
disable_provenance=True,
347-
)
348-
logger.info(f'Alert "{alert["name"]}" uploaded to Grafana successfully')
349-
alert_created_or_exists = True
350-
351-
except GrafanaBadInputError as e:
352-
if "alerting.alert-rule.conflict" in e.message:
353-
logger.info(f'Alert "{alert["name"]}" already exists. Skipping creation.')
354-
alert_created_or_exists = True
355-
else:
356-
# Handle other bad input errors
357-
logger.error(
358-
f'Failed to create alert "{alert["name"]}". Bad input: {e.message}'
359-
)
360-
except GrafanaClientError as e:
361-
# Handle other client-side errors (e.g., invalid request)
362-
logger.error(f'Failed to create alert "{alert["name"]}". Client error: {e.message}')
363-
except GrafanaServerError as e:
364-
# Handle server-side errors (5xx errors)
365-
logger.error(f'Failed to create alert "{alert["name"]}". Server error: {e.message}')
366-
except GrafanaException as e:
367-
# Catch any other Grafana-related exceptions
368-
logger.error(
369-
f'Failed to create alert "{alert["name"]}". Grafana error: {e.message}'
342+
update_alert_rule_group(
343+
client=client,
344+
folder_uid=folder_uid,
345+
group_uid=rule_group["name"],
346+
alertrule_group=rule_group,
370347
)
371348
except Exception as e:
372-
# Catch any other exceptions (non-Grafana-related)
373-
logger.error(f'Failed to create alert "{alert["name"]}". Unexpected error: {e}')
374-
375-
# Only update rule group interval if alert was successfully created or already exists
376-
if alert_created_or_exists:
377-
try:
378-
group_uid = alert["ruleGroup"]
379-
rule_group = get_alert_rule_group(
380-
client=client, folder_uid=folder_uid, group_uid=group_uid
381-
)
382-
if rule_group["interval"] != alert["intervalSec"]:
383-
rule_group["interval"] = alert["intervalSec"]
384-
update_alert_rule_group(
385-
client=client,
386-
folder_uid=folder_uid,
387-
group_uid=group_uid,
388-
alertrule_group=rule_group,
389-
)
390-
logger.info(f'Alert rule group "{group_uid}" updated successfully')
391-
except Exception as e:
392-
logger.error(f'Failed to update alert rule group "{alert["ruleGroup"]}". {e}')
349+
logger.error(f'Failed to update rule group "{rule_group["name"]}". {e}')
393350

394351
if args.out_dir:
395352
output_dir = f"{args.out_dir}/alerts"
396-
dump_alert(output_dir=output_dir, alert=alert)
353+
dump_rule_group(output_dir=output_dir, rule_group=rule_group)
397354

398355
logger.info("Done building grafana alerts")

0 commit comments

Comments
 (0)