@@ -178,49 +178,69 @@ def check_and_send_alerts():
178178
179179 if cluster_id in cluster_managers :
180180 manager = cluster_managers [cluster_id ]
181-
182- if target_type == 'cluster' :
183- # Get cluster-wide metrics
184- summary = manager .get_cluster_summary ()
185- if metric == 'cpu' :
186- current_value = summary .get ('cpu_usage' , 0 )
187- elif metric == 'memory' :
188- mem = summary .get ('memory' , {})
189- if mem .get ('total' , 0 ) > 0 :
190- current_value = (mem .get ('used' , 0 ) / mem .get ('total' , 1 )) * 100
191- elif metric == 'disk' :
192- storage = summary .get ('storage' , {})
193- if storage .get ('total' , 0 ) > 0 :
194- current_value = (storage .get ('used' , 0 ) / storage .get ('total' , 1 )) * 100
195- target_name = manager .config .name
196-
197- elif target_type == 'node' :
198- node_summary = manager .get_node_summary (target_id )
199- if metric == 'cpu' :
200- current_value = node_summary .get ('cpu' , 0 ) * 100
201- elif metric == 'memory' :
202- mem = node_summary .get ('memory' , {})
203- if mem .get ('total' , 0 ) > 0 :
204- current_value = (mem .get ('used' , 0 ) / mem .get ('total' , 1 )) * 100
205- elif metric == 'disk' :
206- rootfs = node_summary .get ('rootfs' , {})
207- if rootfs .get ('total' , 0 ) > 0 :
208- current_value = (rootfs .get ('used' , 0 ) / rootfs .get ('total' , 1 )) * 100
209-
210- elif target_type == 'vm' :
211- # Get VM metrics
212- for res in manager .get_resources ():
213- if str (res .get ('vmid' )) == str (target_id ):
214- if metric == 'cpu' :
215- current_value = res .get ('cpu' , 0 ) * 100
216- elif metric == 'memory' :
217- if res .get ('maxmem' , 0 ) > 0 :
218- current_value = (res .get ('mem' , 0 ) / res .get ('maxmem' , 1 )) * 100
219- elif metric == 'disk' :
220- if res .get ('maxdisk' , 0 ) > 0 :
221- current_value = (res .get ('disk' , 0 ) / res .get ('maxdisk' , 1 )) * 100
222- target_name = res .get ('name' , target_id )
223- break
181+
182+ # NS May 2026 — guard the metric lookup. The old code called
183+ # `manager.get_cluster_summary()` and `manager.get_resources()` —
184+ # neither method exists on PegaProxManager. Cluster + VM targets
185+ # have been raising AttributeError since this was written.
186+ try :
187+ if target_type == 'cluster' :
188+ # Aggregate cluster CPU/mem/disk from per-node status
189+ per_node = manager .get_node_status () or {}
190+ online = [n for n in per_node .values ()
191+ if (n .get ('status' ) or '' ).lower () == 'online' ]
192+ if metric == 'cpu' and online :
193+ current_value = sum (n .get ('cpu_percent' , 0 ) for n in online ) / len (online )
194+ elif metric == 'memory' :
195+ used = sum (n .get ('mem_used' , 0 ) for n in online )
196+ total = sum (n .get ('mem_total' , 0 ) for n in online )
197+ if total > 0 :
198+ current_value = used / total * 100
199+ elif metric == 'disk' :
200+ used = sum (n .get ('disk_used' , 0 ) for n in online )
201+ total = sum (n .get ('disk_total' , 0 ) for n in online )
202+ if total > 0 :
203+ current_value = used / total * 100
204+ try :
205+ target_name = manager .config .name
206+ except Exception :
207+ target_name = cluster_id
208+
209+ elif target_type == 'node' :
210+ node_summary = manager .get_node_summary (target_id ) or {}
211+ if metric == 'cpu' :
212+ current_value = node_summary .get ('cpu' , 0 ) * 100
213+ elif metric == 'memory' :
214+ mem = node_summary .get ('memory' , {}) or {}
215+ if mem .get ('total' , 0 ) > 0 :
216+ current_value = (mem .get ('used' , 0 ) / mem .get ('total' , 1 )) * 100
217+ elif metric == 'disk' :
218+ rootfs = node_summary .get ('rootfs' , {}) or {}
219+ if rootfs .get ('total' , 0 ) > 0 :
220+ current_value = (rootfs .get ('used' , 0 ) / rootfs .get ('total' , 1 )) * 100
221+
222+ elif target_type == 'vm' :
223+ # MK: was `manager.get_resources()` which doesn't exist; the
224+ # actual VM enumerator on PegaProxManager is get_vm_resources()
225+ fetch = getattr (manager , 'get_vm_resources' , None )
226+ vms = fetch () if callable (fetch ) else []
227+ for res in (vms or []):
228+ if str (res .get ('vmid' )) == str (target_id ):
229+ if metric == 'cpu' :
230+ current_value = res .get ('cpu' , 0 ) * 100
231+ elif metric == 'memory' :
232+ if res .get ('maxmem' , 0 ) > 0 :
233+ current_value = (res .get ('mem' , 0 ) / res .get ('maxmem' , 1 )) * 100
234+ elif metric == 'disk' :
235+ if res .get ('maxdisk' , 0 ) > 0 :
236+ current_value = (res .get ('disk' , 0 ) / res .get ('maxdisk' , 1 )) * 100
237+ target_name = res .get ('name' , target_id )
238+ break
239+ except Exception as e :
240+ logging .warning (f"[AlertCheck] alert { alert_id } metric lookup raised: { e } " )
241+ _record_eval (alert_id , reason = f'metric lookup error: { e } ' ,
242+ cluster_id = cluster_id , metric = metric , target_type = target_type )
243+ continue
224244
225245 if current_value is None :
226246 _record_eval (alert_id , reason = f"metric '{ metric } ' returned no value for { target_type } '{ target_id } '" ,
0 commit comments