Skip to content

Commit 58846b5

Browse files
committed
fix: alert check called nonexistent manager methods (cluster + VM targets)
The alert loop called manager.get_cluster_summary() and manager.get_resources() — neither method exists on PegaProxManager. Cluster + VM target alerts have been raising AttributeError since the loop was first written, killing the entire iteration before any other alerts got evaluated. Only Node-target alerts ever worked.

- cluster targets: aggregate cpu/mem/disk from get_node_status() across online nodes (avg cpu, sum used/total for memory + disk)
- vm targets: switch to manager.get_vm_resources() (the real method name)
- per-alert try/except so a single broken metric lookup doesn't take down the rest of the queue

Surfaced by a customer hitting /api/alerts/force-check and getting "'PegaProxManager' object has no attribute 'get_cluster_summary'" — which the diagnostic endpoint exposed instead of swallowing.
1 parent e0b6806 commit 58846b5

1 file changed

Lines changed: 63 additions & 43 deletions

File tree

pegaprox/background/alerts.py

Lines changed: 63 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -178,49 +178,69 @@ def check_and_send_alerts():
178178

179179
if cluster_id in cluster_managers:
180180
manager = cluster_managers[cluster_id]
181-
182-
if target_type == 'cluster':
183-
# Get cluster-wide metrics
184-
summary = manager.get_cluster_summary()
185-
if metric == 'cpu':
186-
current_value = summary.get('cpu_usage', 0)
187-
elif metric == 'memory':
188-
mem = summary.get('memory', {})
189-
if mem.get('total', 0) > 0:
190-
current_value = (mem.get('used', 0) / mem.get('total', 1)) * 100
191-
elif metric == 'disk':
192-
storage = summary.get('storage', {})
193-
if storage.get('total', 0) > 0:
194-
current_value = (storage.get('used', 0) / storage.get('total', 1)) * 100
195-
target_name = manager.config.name
196-
197-
elif target_type == 'node':
198-
node_summary = manager.get_node_summary(target_id)
199-
if metric == 'cpu':
200-
current_value = node_summary.get('cpu', 0) * 100
201-
elif metric == 'memory':
202-
mem = node_summary.get('memory', {})
203-
if mem.get('total', 0) > 0:
204-
current_value = (mem.get('used', 0) / mem.get('total', 1)) * 100
205-
elif metric == 'disk':
206-
rootfs = node_summary.get('rootfs', {})
207-
if rootfs.get('total', 0) > 0:
208-
current_value = (rootfs.get('used', 0) / rootfs.get('total', 1)) * 100
209-
210-
elif target_type == 'vm':
211-
# Get VM metrics
212-
for res in manager.get_resources():
213-
if str(res.get('vmid')) == str(target_id):
214-
if metric == 'cpu':
215-
current_value = res.get('cpu', 0) * 100
216-
elif metric == 'memory':
217-
if res.get('maxmem', 0) > 0:
218-
current_value = (res.get('mem', 0) / res.get('maxmem', 1)) * 100
219-
elif metric == 'disk':
220-
if res.get('maxdisk', 0) > 0:
221-
current_value = (res.get('disk', 0) / res.get('maxdisk', 1)) * 100
222-
target_name = res.get('name', target_id)
223-
break
181+
182+
# NS May 2026 — guard the metric lookup. The old code called
183+
# `manager.get_cluster_summary()` and `manager.get_resources()` —
184+
# neither method exists on PegaProxManager. Cluster + VM targets
185+
# have been raising AttributeError since this was written.
186+
try:
187+
if target_type == 'cluster':
188+
# Aggregate cluster CPU/mem/disk from per-node status
189+
per_node = manager.get_node_status() or {}
190+
online = [n for n in per_node.values()
191+
if (n.get('status') or '').lower() == 'online']
192+
if metric == 'cpu' and online:
193+
current_value = sum(n.get('cpu_percent', 0) for n in online) / len(online)
194+
elif metric == 'memory':
195+
used = sum(n.get('mem_used', 0) for n in online)
196+
total = sum(n.get('mem_total', 0) for n in online)
197+
if total > 0:
198+
current_value = used / total * 100
199+
elif metric == 'disk':
200+
used = sum(n.get('disk_used', 0) for n in online)
201+
total = sum(n.get('disk_total', 0) for n in online)
202+
if total > 0:
203+
current_value = used / total * 100
204+
try:
205+
target_name = manager.config.name
206+
except Exception:
207+
target_name = cluster_id
208+
209+
elif target_type == 'node':
210+
node_summary = manager.get_node_summary(target_id) or {}
211+
if metric == 'cpu':
212+
current_value = node_summary.get('cpu', 0) * 100
213+
elif metric == 'memory':
214+
mem = node_summary.get('memory', {}) or {}
215+
if mem.get('total', 0) > 0:
216+
current_value = (mem.get('used', 0) / mem.get('total', 1)) * 100
217+
elif metric == 'disk':
218+
rootfs = node_summary.get('rootfs', {}) or {}
219+
if rootfs.get('total', 0) > 0:
220+
current_value = (rootfs.get('used', 0) / rootfs.get('total', 1)) * 100
221+
222+
elif target_type == 'vm':
223+
# MK: was `manager.get_resources()` which doesn't exist; the
224+
# actual VM enumerator on PegaProxManager is get_vm_resources()
225+
fetch = getattr(manager, 'get_vm_resources', None)
226+
vms = fetch() if callable(fetch) else []
227+
for res in (vms or []):
228+
if str(res.get('vmid')) == str(target_id):
229+
if metric == 'cpu':
230+
current_value = res.get('cpu', 0) * 100
231+
elif metric == 'memory':
232+
if res.get('maxmem', 0) > 0:
233+
current_value = (res.get('mem', 0) / res.get('maxmem', 1)) * 100
234+
elif metric == 'disk':
235+
if res.get('maxdisk', 0) > 0:
236+
current_value = (res.get('disk', 0) / res.get('maxdisk', 1)) * 100
237+
target_name = res.get('name', target_id)
238+
break
239+
except Exception as e:
240+
logging.warning(f"[AlertCheck] alert {alert_id} metric lookup raised: {e}")
241+
_record_eval(alert_id, reason=f'metric lookup error: {e}',
242+
cluster_id=cluster_id, metric=metric, target_type=target_type)
243+
continue
224244

225245
if current_value is None:
226246
_record_eval(alert_id, reason=f"metric '{metric}' returned no value for {target_type} '{target_id}'",

0 commit comments

Comments
 (0)