Skip to content

Commit 4647c39

Browse files
committed
Added the option to alert on snapmirror lag time based on a percentage (over 100%) of the last scheduled update.
1 parent 6cd77e4 commit 4647c39

File tree

3 files changed

+63
-36
lines changed

3 files changed

+63
-36
lines changed

Monitoring/monitor-ontap-services/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,7 @@ Each rule should be an object with one, or more, of the following keys:
409409
"rules": [
410410
{
411411
"maxLagTime": 86400,
412+
"maxLagTimePercent": 200
412413
},
413414
{
414415
"healthy": false
@@ -474,6 +475,8 @@ In the above example, it will alert on:
474475
- Any network interfaces that are down.
475476
- Any EMS message that has an event name of “passwd.changed”.
476477
- Any EMS message that has a severity of "alert" or "emergency".
478+
- Any SnapMirror relationship with a lag time more than 200% of the amount of time since its last scheduled update, if it has a schedule associated with it.
479+
Otherwise, if the last successful update was more than 86400 seconds (24 hours) ago.
477480
- Any SnapMirror relationship with a lag time more than 86400 seconds (24 hours).
478481
- Any SnapMirror relationship that has a non-healthy status.
479482
- Any SnapMirror update that hasn't had any flow of data in 600 seconds (10 minutes).

Monitoring/monitor-ontap-services/cloudformation.yaml

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -497,8 +497,8 @@ Resources:
497497
# "matching conditions." It is intended to be run as a Lambda function, but
498498
# can be run as a standalone program.
499499
#
500-
# Version: v2.11
501-
# Date: 2025-04-22-15:41:20
500+
# Version: v2.12
501+
# Date: 2025-04-24-15:46:12
502502
################################################################################
503503
504504
import json
@@ -878,9 +878,9 @@ Resources:
878878
# This function is used to find an existing SM relationship based on the source
879879
# and destination path passed in. It returns None if one isn't found.
880880
################################################################################
881-
def getPreviousSMRecord(relationShips, sourceCluster, sourcePath, destPath):
881+
def getPreviousSMRecord(relationShips, uuid):
882882
for relationship in relationShips:
883-
if relationship['sourcePath'] == sourcePath and relationship['destPath'] == destPath and relationship['sourceCluster'] == sourceCluster:
883+
if relationship.get('uuid') == uuid:
884884
relationship['refresh'] = True
885885
return(relationship)
886886
@@ -974,11 +974,20 @@ Resources:
974974
cron_expression = f"{minutes} {hours} {daysOfMonth} {months} {daysOfWeek}"
975975
#
976976
# Initialize CronSim with the cron expression and current time.
977-
it = CronSim(cron_expression, datetime.datetime.now(), reverse=True)
977+
curTime = datetime.datetime.now()
978+
curTimeSec = curTime.timestamp()
979+
it = CronSim(cron_expression, curTime, reverse=True)
978980
#
979981
# Get the last run time.
980-
last_run_time = next(it)
981-
return last_run_time.timestamp()
982+
lastRunTime = next(it)
983+
lastRunTimeSec = lastRunTime.timestamp()
984+
#
985+
# If the lastRunTime is now, or within a minute (the resolution of cron),
986+
# then go one more back.
987+
if (curTimeSec - lastRunTimeSec) <= 60:
988+
lastRunTime = next(it)
989+
lastRunTimeSec = lastRunTime.timestamp()
990+
return int(lastRunTimeSec)
982991
else:
983992
logger.error(f'API call to {endpoint} failed. HTTP status code: {response.status}.')
984993
return -1
@@ -1108,7 +1117,10 @@ Resources:
11081117
#
11091118
# For lag time if maxLagTimePercent is defined check to see if there is a schedule,
11101119
# if there is, alert on that; otherwise alert on the maxLagTime.
1111-
if record.get("lag_time") is not None:
1120+
# But, first check that lag_time is defined, and that the state is not "uninitialized",
1121+
# since the lag_time is set to the oldest snapshot of the source volume which would
1122+
# cause a false positive.
1123+
if record.get("lag_time") is not None and record["state"].lower() != "uninitialized":
11121124
lagSeconds = parseLagTime(record["lag_time"])
11131125
if maxLagTimePercent is not None:
11141126
lastScheduledUpdate = getLastScheduledUpdate(record)
@@ -1161,21 +1173,18 @@ Resources:
11611173
events.append(event)
11621174
11631175
if stalledTransferSeconds is not None:
1164-
if record.get('transfer') and record['transfer']['state'].lower() == "transferring":
1165-
sourcePath = record['source']['path']
1166-
destPath = record['destination']['path']
1176+
if record.get('transfer') is not None and record['transfer']['state'].lower() == "transferring":
1177+
transferUuid = record['transfer']['uuid']
11671178
bytesTransferred = record['transfer']['bytes_transferred']
1168-
1169-
prevRec = getPreviousSMRecord(smRelationships, sourceClusterName, sourcePath, destPath)
1170-
1179+
prevRec = getPreviousSMRecord(smRelationships, transferUuid) # This reset the "refresh" field if found.
11711180
if prevRec != None:
11721181
timeDiff=curTime - prevRec["time"]
11731182
if prevRec['bytesTransferred'] == bytesTransferred:
11741183
if (curTime - prevRec['time']) > stalledTransferSeconds:
11751184
uniqueIdentifier = record['uuid'] + "_" + "transfer"
11761185
11771186
if not eventExist(events, uniqueIdentifier):
1178-
message = f'Snapmiorror transfer has stalled: {sourceClusterName}::{sourcePath} -> {clusterName}::{destPath}.'
1187+
message = f"Snapmiorror transfer has stalled: {sourceClusterName}::{record['source']['path']} -> {clusterName}::{record['destination']['path']}."
11791188
sendAlert(message, "WARNING")
11801189
changedEvents=True
11811190
event = {
@@ -1194,9 +1203,7 @@ Resources:
11941203
"time": curTime,
11951204
"refresh": True,
11961205
"bytesTransferred": bytesTransferred,
1197-
"sourcePath": sourcePath,
1198-
"destPath": destPath,
1199-
"sourceCluster": sourceClusterName
1206+
"uuid": transferUuid
12001207
}
12011208
updateRelationships = True
12021209
smRelationships.append(prevRec)
@@ -1205,7 +1212,12 @@ Resources:
12051212
i = 0
12061213
while i < len(smRelationships):
12071214
if not smRelationships[i]["refresh"]:
1208-
logger.debug(f'Deleting smRelationship: {smRelationships[i]["destPath"]}')
1215+
relationshipId = smRelationships[i].get("uuid")
1216+
if relationshipId is None:
1217+
id="Old format"
1218+
else:
1219+
id = relationshipId
1220+
logger.debug(f'Deleting smRelationship: {id}')
12091221
del smRelationships[i]
12101222
updateRelationships = True
12111223
else:

Monitoring/monitor-ontap-services/monitor_ontap_services.py

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -399,9 +399,9 @@ def processEMSEvents(service):
399399
# This function is used to find an existing SM relationship based on the source
400400
# and destination path passed in. It returns None if one isn't found.
401401
################################################################################
402-
def getPreviousSMRecord(relationShips, sourceCluster, sourcePath, destPath):
402+
def getPreviousSMRecord(relationShips, uuid):
403403
for relationship in relationShips:
404-
if relationship['sourcePath'] == sourcePath and relationship['destPath'] == destPath and relationship['sourceCluster'] == sourceCluster:
404+
if relationship.get('uuid') == uuid:
405405
relationship['refresh'] = True
406406
return(relationship)
407407

@@ -495,11 +495,20 @@ def getLastRunTime(scheduleUUID):
495495
cron_expression = f"{minutes} {hours} {daysOfMonth} {months} {daysOfWeek}"
496496
#
497497
# Initialize CronSim with the cron expression and current time.
498-
it = CronSim(cron_expression, datetime.datetime.now(), reverse=True)
498+
curTime = datetime.datetime.now()
499+
curTimeSec = curTime.timestamp()
500+
it = CronSim(cron_expression, curTime, reverse=True)
499501
#
500502
# Get the last run time.
501-
last_run_time = next(it)
502-
return last_run_time.timestamp()
503+
lastRunTime = next(it)
504+
lastRunTimeSec = lastRunTime.timestamp()
505+
#
506+
# If the lastRunTime is now, or within a minute (the resolution of cron),
507+
# then go one more back.
508+
if (curTimeSec - lastRunTimeSec) <= 60:
509+
lastRunTime = next(it)
510+
lastRunTimeSec = lastRunTime.timestamp()
511+
return int(lastRunTimeSec)
503512
else:
504513
logger.error(f'API call to {endpoint} failed. HTTP status code: {response.status}.')
505514
return -1
@@ -629,7 +638,10 @@ def processSnapMirrorRelationships(service):
629638
#
630639
# For lag time if maxLagTimePercent is defined check to see if there is a schedule,
631640
# if there is, alert on that; otherwise alert on the maxLagTime.
632-
if record.get("lag_time") is not None:
641+
# But, first check that lag_time is defined, and that the state is not "uninitialized",
642+
# since the lag_time is set to the oldest snapshot of the source volume which would
643+
# cause a false positive.
644+
if record.get("lag_time") is not None and record["state"].lower() != "uninitialized":
633645
lagSeconds = parseLagTime(record["lag_time"])
634646
if maxLagTimePercent is not None:
635647
lastScheduledUpdate = getLastScheduledUpdate(record)
@@ -682,21 +694,18 @@ def processSnapMirrorRelationships(service):
682694
events.append(event)
683695

684696
if stalledTransferSeconds is not None:
685-
if record.get('transfer') and record['transfer']['state'].lower() == "transferring":
686-
sourcePath = record['source']['path']
687-
destPath = record['destination']['path']
697+
if record.get('transfer') is not None and record['transfer']['state'].lower() == "transferring":
698+
transferUuid = record['transfer']['uuid']
688699
bytesTransferred = record['transfer']['bytes_transferred']
689-
690-
prevRec = getPreviousSMRecord(smRelationships, sourceClusterName, sourcePath, destPath)
691-
700+
prevRec = getPreviousSMRecord(smRelationships, transferUuid) # This reset the "refresh" field if found.
692701
if prevRec != None:
693702
timeDiff=curTime - prevRec["time"]
694703
if prevRec['bytesTransferred'] == bytesTransferred:
695704
if (curTime - prevRec['time']) > stalledTransferSeconds:
696705
uniqueIdentifier = record['uuid'] + "_" + "transfer"
697706

698707
if not eventExist(events, uniqueIdentifier):
699-
message = f'Snapmiorror transfer has stalled: {sourceClusterName}::{sourcePath} -> {clusterName}::{destPath}.'
708+
message = f"Snapmiorror transfer has stalled: {sourceClusterName}::{record['source']['path']} -> {clusterName}::{record['destination']['path']}."
700709
sendAlert(message, "WARNING")
701710
changedEvents=True
702711
event = {
@@ -715,9 +724,7 @@ def processSnapMirrorRelationships(service):
715724
"time": curTime,
716725
"refresh": True,
717726
"bytesTransferred": bytesTransferred,
718-
"sourcePath": sourcePath,
719-
"destPath": destPath,
720-
"sourceCluster": sourceClusterName
727+
"uuid": transferUuid
721728
}
722729
updateRelationships = True
723730
smRelationships.append(prevRec)
@@ -726,7 +733,12 @@ def processSnapMirrorRelationships(service):
726733
i = 0
727734
while i < len(smRelationships):
728735
if not smRelationships[i]["refresh"]:
729-
logger.debug(f'Deleting smRelationship: {smRelationships[i]["destPath"]}')
736+
relationshipId = smRelationships[i].get("uuid")
737+
if relationshipId is None:
738+
id="Old format"
739+
else:
740+
id = relationshipId
741+
logger.debug(f'Deleting smRelationship: {id}')
730742
del smRelationships[i]
731743
updateRelationships = True
732744
else:

0 commit comments

Comments
 (0)