diff --git a/Monitoring/monitor-ontap-services/README.md b/Monitoring/monitor-ontap-services/README.md index b923527c..f852fa43 100644 --- a/Monitoring/monitor-ontap-services/README.md +++ b/Monitoring/monitor-ontap-services/README.md @@ -1,131 +1,179 @@ # Monitoring ONTAP Services ## Introduction -This program is used to monitor various services of a NetApp ONTAP file system. It uses the ONTAP APIs to obtain the required information to determine if any of the conditions that are being monitored have been met. If they have, then the program will send an SNS message to the specified SNS topic. The program will also send a syslog message to a syslog server if the syslogIP parameter is set. The program will store the event information in an S3 bucket so that it can be compared against it before sending a second message for the same event. The configuration files is also kept in the S3 bucket for easy access. +This program is used to monitor various services of an AWS FSx for NetApp ONTAP file system and alert you if anything +is outside of the specified conditions. It uses the ONTAP APIs to obtain the required information to +determine if any of the conditions that are being monitored have been met. +If they have, then the program will send an SNS message to the specified SNS topic. The program can also send +a syslog message to a syslog server as well as store the event information into a CloudWatch Log Stream. +The program will store the event information in an S3 bucket so that it can be compared against to ensure +it doesn't send multiple messages for the same event. You can configure the program either via environment variables +or via a configuration file. The configuration file is kept in the S3 bucket for easy access. + Here is an itemized list of the services that this program can monitor: - If the file system is available. - If the underlying Data ONTAP version has changed. - If the file system is running off its partner node (i.e. 
is running in failover mode). -- Any EMS message, with filtering to allow you to only be alerted on the ones you care about. -- If a SnapMirror relationship hasn't been updated in a specified amount of time. +- If any of the network interfaces are down. +- Any EMS message. Filtering is provided to allow you to only be alerted on the EMS messages you care about. +- If any of the vservers are down. +- If any of the protocol (NFS & CIFS) servers within a vserver are down. +- If a SnapMirror relationship hasn't been updated within either a specified amount of time or as a percentage of time since its last scheduled update. - If a SnapMirror update has stalled. - If a SnapMirror relationship is in a "non-healthy" state. -- If the aggregate is over a certain percentage full. User can set two thresholds (Warning and Critical). -- If a volume is over a certain percentage full. User can set two thresholds (Warning and Critical). -- If any quotas are over a certain percentage full. User can follow both soft and hard limits. +- If the aggregate is over a certain percentage full. You can set two thresholds (Warning and Critical). +- If a volume is over a certain percentage full. You can set two thresholds (Warning and Critical). +- If a volume is using more than a specified percentage of its inodes. You can set two thresholds (Warning and Critical). +- If a volume is offline. +- If any quotas are over a certain percentage full. You can be alerted on both soft and hard limits. ## Architecture -The program is designed to be run as a Lambda function. It is triggered by an EventBridge rule that is set to run -on a regular basis. The program will then use the ONTAP APIs to obtain the required information to determine if -any of the conditions that are being monitored have been met. If they have, then the program will send an SNS message -to the specified SNS topic. The program will also send a syslog message to a syslog server if the syslogIP parameter -is set.
The program will store the event information in an S3 bucket so that it can be compared against it before sending -a second message for the same event. The configuration files is also kept in the S3 bucket for easy access. +The program is designed to be run as a Lambda function but can be run as a standalone program. +As a Lambda function it can be set up to run on a regular basis by creating an EventBridge schedule. +Once the program has been invoked it will use the ONTAP APIs to obtain the required information +from the ONTAP system. It will compare this information against the conditions that have been +specified in the conditions file. If any of the conditions have been met, then the program will send an SNS message +to the specified SNS topic and, optionally, send a syslog message as well as put an event +into a CloudWatch log group. The program stores event information in an S3 bucket so it can ensure that it doesn't +send duplicate messages for the same event. The configuration file is also kept in the S3 bucket for easy access. + +Since the program must be able to communicate with the FSxN file system management endpoint, it must +run within a VPC that has connectivity to the FSxN file system. This requires special considerations for +a Lambda function, both how it is deployed, and how it is able to access AWS services. You can read more about +that in the [Endpoints for AWS Services](#endpoints-for-aws-services) section below. ![Architecture](images/Monitoring_ONTAP_Services_Architecture-2.png) ## Prerequisites - An FSx for NetApp ONTAP file system you want to monitor. +- An S3 bucket to store the configuration and event status files, as well as the Lambda layer zip file. + - You will need to download the [Lambda layer zip file](https://raw.githubusercontent.com/NetApp/FSx-ONTAP-samples-scripts/main/Monitoring/monitor-ontap-services/lambda_layer.zip) from this repo and upload it to the S3 bucket. Be sure to preserve the name `lambda_layer.zip`.
- The security group associated with the FSx for ONTAP file system must allow inbound traffic from the Lambda function over TCP port 443. - An SNS topic to send the alerts to. - An AWS Secrets Manager secret that holds the FSx for ONTAP file system credentials. There should be two keys in the secret, one for the username and one for the password. +- Optionally: + - A CloudWatch Log Group to store events. + - A syslog server to receive event messages. ## Installation -There are two ways to install this program. You can either perform all the steps show in the [Manual Installation](#manual-installation) section below, or run -the CloudFormation template that is provided in this repository. The manual installation is more involved, but it gives you more control and allows to you -make changes to settings that aren't available in the CloudFormation template. The CloudFormation template is easier to use, but it doesn't allow for as much -customization. +There are two ways to install this program. You can either perform all the steps shown in the +[Manual Installation](#manual-installation) section below, or run the [CloudFormation template](cloudformation.yaml) +that is provided in this repository. The manual installation is more involved, but it gives you +more control and allows you to make changes to settings that aren't available through the CloudFormation template. +The CloudFormation template is easier to use, but it doesn't allow for as much customization. ### Installation using the CloudFormation template The CloudFormation template will do the following: -- Create a role for the Lambda function to use. The permissions will be the same as what is outlined in the [Create an AWS Role](#create-an-aws-role) section below. -- Create the Lambda function with the Python code provided in this repository. -- Create an S3 bucket for the Lambda function to store the matching conditions file, and the event information, in. 
-- Create an EventBridge Schedule to trigger the Lambda function every 15 minutes. If you want the function to run more or less frequently, you can change that after the CloudFormation stack has been created. +- Create a role for the Lambda function to use. The permissions will be the same as what + is outlined in the [Create an AWS Role](#create-an-aws-role) section below. + **NOTE:** You can provide the ARN of an existing role to use instead of having it create a new one. - Create a role that allows the EventBridge schedule to trigger the Lambda function. + The only permission that this role needs is to be able to invoke a Lambda function. + **NOTE:** You can provide the ARN of an existing role to use instead of having it create a new one. +- Create the Lambda function with the Python code provided in this repository. +- Create an EventBridge Schedule to trigger the Lambda function. By default, it will trigger + it to run every 15 minutes, although there is a parameter that will allow you to set it to whatever interval you want. - Optionally create a CloudWatch alarm that will alert you if the Lambda function fails. -- Optionally create a VPC Endpoints for the SNS, Secrets Manager and/or S3 services. +- Optionally create VPC Endpoints for the SNS, Secrets Manager, CloudWatch and/or S3 AWS services. To install the program using the CloudFormation template, you will need to do the following: -1. Download the CloudFormation template from this repository. You can do that by clicking on the 'cloudformation.yaml' file in the repository, then clicking on the download icon next to the "Raw" button at the top right of the page. That should cause your browser to download the file to you local computer. -2. Go to the CloudFormation service in the AWS console and click on "Create stack (with new resources)". +1. Download the CloudFormation template from this repository.
You can do that by clicking on + the [cloudformation.yaml](./cloudformation.yaml) file in the repository, then clicking on + the download icon next to the "Raw" button at the top right of the page. That should + cause your browser to download the file to your local computer. +2. Go to the [CloudFormation service in the AWS console](https://us-west-2.console.aws.amazon.com/cloudformation/) and click on "Create stack (with new resources)". 3. Choose the "Upload a template file" option and select the CloudFormation template you downloaded in step 1. -4. This should bring up a new window with several of parameters to provide values to. Most have defaults, but some do require values to be provided. See the list below for what each parameter is for. +4. This should bring up a new window with several parameters to provide values to. Most have + defaults, but some do require values to be provided. See the list below for what each parameter is for. |Parameter Name | Notes| |---|---| -|Stackname|The name you want to assign to the CloudFormation stack. Note that this name is used as a base name for the resources it creates, so please keep it **under 25 characters**. Also, since it is used as part of the s3 bucket name that it creates to keep event information in, it **must be in all lower case letters**.| +|Stackname|The name you want to assign to the CloudFormation stack. Note that this name is used as a base name for some of the resources it creates, so please keep it **under 25 characters**.| |OntapAdminServer|The DNS name, or IP address, of the management endpoint of the FSxN file system you wish to monitor.| -|SubnetIds|The subnet IDs that the Lambda function will be attached to. Must have connectivity to the FSxN file system you wish to monitor.| -|SecurityGroupIds|The security group IDs that the Lambda function will be attached to. 
The security group most allow outbound traffic over port 443 to the SNS, Secrets Manager and S3 endpoints, as well as the FSxN file system you want to monitor.| +|S3BucketName|The name of the S3 bucket where you want the program to store event information. It should also have a copy of the `lambda_layer.zip` file. **NOTE** This bucket must be in the same region where this CloudFormation stack is being created.| +|SubnetIds|The subnet IDs that the Lambda function will be attached to. They must have connectivity to the FSxN file system management endpoint that you wish to monitor.| +|SecurityGroupIds|The security group IDs that the Lambda function will be attached to. The security group must allow outbound traffic over port 443 to the SNS, Secrets Manager, and CloudWatch and S3 AWS service endpoints, as well as the FSxN file system you want to monitor.| |SnsTopicArn|The ARN of the SNS topic you want the program to publish alert messages to.| -|SecretArn|The ARN of the secret within the AWS Secrets Manager that holds the FSxN file system credentials. **NOTE:** The secret must be in the same region as the FSxN file system.| -|SecretUsernameKey|The key name within the secret that holds the username portion of the FSxN file system credentials.| -|SecretPasswordKey|The key name within the secret that holds the password portion of the FSxN file system credentials.| +|CloudWatchLogGroupName|The name of **an existing** CloudWatch Log Group that the Lambda function can send event messages to. It will create a new Log Stream within the Log Group every day that is unique to this file system so you can use the same Log Group for multiple instances of this program. 
If this field is left blank, alerts will not be sent to CloudWatch.| +|SecretArn|The ARN of the secret within the AWS Secrets Manager that holds the FSxN file system credentials.| +|SecretUsernameKey|The name of the key within the secret that holds the username portion of the FSxN file system credentials.| +|SecretPasswordKey|The name of the key within the secret that holds the password portion of the FSxN file system credentials.| |CheckInterval|The interval, in minutes, that the EventBridge schedule will trigger the Lambda function. The default is 15 minutes.| |CreateCloudWatchAlarm|Set to "true" if you want to create a CloudWatch alarm that will alert you if the Lambda function fails.| -|CreateSNSEndpoint|Set to "true" if you want to create an SNS endpoint. **NOTE:** If an SNS Endpoint already exist for the specified Subnet the creation will fail, causing the entire CloudFormation script to fail. Since the Lambda function will be running within your VPC it will most likely not have access to the Internet, therefore a endpoint will need to be created if you don't already have one. Please read the [Endpoints for AWS services](#endpoints-for-aws-services) for more information.| -|CreateSecretsManagerEndpoint|Set to "true" if you want create a Secrets Manager endpoint. **NOTE:** If an SecretsManager Endpoint already exist for the specified Subnet the creation will fail, causing the entire CloudFormation script to fail. Please read the [Endpoints for AWS services](#endpoints-for-aws-services) for more information.| -|CreateS3Endpoint|Set to "true" if you want create an S3 endpoint. **NOTE:** If an S3 Gateway Endpoint already exist for the specified VPC the creation will fail, causing the entire CloudFormation script to fail. Note that this will be a "Gateway" type endpoint, since they are free to use. Please read the [Endpoints for AWS services](#endpoints-for-aws-services) for more information.| -|RoutetableIds|The route table IDs to update to use the S3 endpoint. 
Since the S3 endpoint is of type 'Gateway' route tables have to be updated to use it. This parameter is only needed if createS3Endpoint is set to 'true'.| -|VpcId|The VPC ID where the FSxN file system is located. This is only needed if you are creating an endpoint.| -|EndpointSecurityGroupIds|The security group IDs that the endpoint will be attached to. The security group must allow traffic over TCP port 443 from the Lambda function. This is only needed if you are creating an SNS or SecretsManager endpoint.| - -The remaining parameters are used to create the matching conditions file, which specify when the program will send an SNS alert. -You can read more about it in the [Matching Conditions File](#matching-conditions-file) section below. All these parameters have default values -so you don't have to set them if you don't want to. Note that if you enable EMS alerts, then the default rule will +|CreateSecretsManagerEndpoint|Set to "true" if you want to create a Secrets Manager endpoint. **NOTE:** If a SecretsManager Endpoint already exists for the specified Subnet the endpoint creation will fail, causing the entire CloudFormation stack to fail. Please read the [Endpoints for AWS services](#endpoints-for-aws-services) section for more information.| +|CreateSNSEndpoint|Set to "true" if you want to create an SNS endpoint. **NOTE:** If an SNS Endpoint already exists for the specified Subnet the endpoint creation will fail, causing the entire CloudFormation stack to fail. Please read the [Endpoints for AWS services](#endpoints-for-aws-services) section for more information.| +|CreateCWEndpoint|Set to "true" if you want to create a CloudWatch endpoint. **NOTE:** If a CloudWatch Endpoint already exists for the specified Subnet the endpoint creation will fail, causing the entire CloudFormation stack to fail. Please read the [Endpoints for AWS services](#endpoints-for-aws-services) section for more information.| +|CreateS3Endpoint|Set to "true" if you want to create an S3 endpoint.
**NOTE:** If an S3 Gateway Endpoint already exists for the specified VPC the endpoint creation will fail, causing the entire CloudFormation stack to fail. Note that this will be a "Gateway" type endpoint, since they are free to use. Please read the [Endpoints for AWS services](#endpoints-for-aws-services) section for more information.| +|RoutetableIds|The route table IDs to update to use the S3 endpoint. Since the S3 endpoint is of type `Gateway` route tables have to be updated to use it. This parameter is only needed if you are creating an S3 endpoint.| +|VpcId|The ID of the VPC where the subnets provided above are located. This is only needed if you are creating an endpoint.| +|EndpointSecurityGroupIds|The security group IDs that the endpoint will be attached to. The security group must allow traffic over TCP port 443 from the Lambda function. This is only needed if you are creating an SNS, CloudWatch or SecretsManager endpoint.| +|LambdaRoleArn|The ARN of the role that the Lambda function will use. This role must have the permissions listed in the [Create an AWS Role](#create-an-aws-role) section below. If left blank a role will be created for you.| +|SchedulerRoleArn|The ARN of the role that the EventBridge schedule will use to trigger the Lambda function. It just needs the permission to invoke a Lambda function. If left blank a role will be created for you.| + +The remaining parameters are used to create the matching conditions configuration file, which specifies when the program will send an alert. +You can read more about it in the [Matching Conditions File](#matching-conditions-file) section below. All these parameters have reasonable default values +so you probably won't have to change any of them. Note that if you enable EMS alerts, then the default rule will send all EMS messages that have a severity of `Error`, `Alert` or `Emergency`. You can change the matching conditions at any time by updating the matching conditions file that is created in the S3 bucket.
The name of the file will be \<OntapAdminServer\>-conditions where "\<OntapAdminServer\>" is the value you -set for the OntapAdminServer parameter. To find the name of the S3 bucket, or any of the resources that were -created, you can go to the CloudFormation service in the AWS console, click on the stack you created -(based on the name you provided as the first parameter above), and then click on the "Resources" tab. +set for the OntapAdminServer parameter. ### Post Installation Checks -After the stack has been created, I would recommend checking the status of the Lambda function to make sure it is +After the stack has been created, check the status of the Lambda function to make sure it is not in an error state. To find the Lambda function go to the Resources tab of the CloudFormation stack and click on the "Physical ID" of the Lambda function. This should bring you to the Lambda service in the AWS -console. Once there, you can click on the "Monitor" tab to see if the function has been invoked. Locate the +console. Once there, click on the "Monitor" tab to see if the function has been invoked. Locate the "Error count and success rate(%)" chart, which is usually found at the top right corner of the "Monitor" dashboard. Within the "CheckInterval" number of minutes there should be at least one dot on that chart. Note that sometimes -the chart is initially slow to reflect any status so you might have to be patient, and continue to press the "refresh" -button (the icon with a circle on it) to see an status. Once you see a dot on the chart, when you hover your mouse +the chart is initially slow to reflect any status so you might have to be patient. Continue to press the "refresh" +button (the icon with a circle on it) to update the status. Once you see a dot on the chart, when you hover your mouse over it, you should see the "success rate" and "number of errors." The success rate should be 100% and the number of errors should be 0.
If it is not, then scroll down to the CloudWatch Logs section and click on the most recent log stream. This will show you the output of the Lambda function. If there are any errors, they will be displayed -there. If you can't figure out what the error is, then please create an issue in this repository and someone will -help you. +there. If you can't figure out what is causing an error, then please create an issue in this repository and someone +will help you. + +--- ### Manual Installation If you want more control over the installation then you can install it manually by following the steps below. Note that these instructions assume you have familiarity with how to create the various AWS service mentioned below. If you do not, -I would recommend using the CloudFormation method of deploying the program. Afterwards, if you need to change things, make the required -modifications then. +the recommended course of action is to use the CloudFormation method of deploying the program. Then, if you need to change things, +you can make the required modifications using the information found below. #### Create an AWS Role -This program doesn't need many permissions. It just needs to be able to read the FSxN file system credentials stored in a Secrets Manager secret, -read and write objects in an s3 bucket, and be able to publish SNS messages. Below is the specific list of permissions -needed. The easiest way to give the Lambda function the permissions it needs is by creating a role with these -permissions and assigning it to the Lambda function. +This program doesn't need many AWS permissions. It just needs to be able to read the FSxN file system credentials stored in a Secrets Manager secret, +read and write objects in an s3 bucket, be able to publish SNS messages, and optionally create CloudWatch log Streams and put events. +Below is the specific list of permissions needed. 
| Permission | Reason | |:------------------------------|:----------------| -|secretsmanager:GetSecretValue | Needs to be able to retrieve the FSxN administrator credentials. | -|sns:Publish | Since it sends messages (alerts) via SNS, it needs to be able to do so. | -|s3:PutObject | The program stores its state information in various s3 objects.| -|s3:GetObject | The program reads previous state information, as well as configuration from various s3 objects. | -|s3:ListBucket | To allow the program to know if an object exist or not. | -|ec2:CreateNetworkInterface | Since the program runs as a Lambda function within your VPC, it needs to be able to create a network interface in your VPC. you can read more about that [here](https://docs.aws.amazon.com/lambda/latest/dg/configuration-vpc.html). | +|secretsmanager:GetSecretValue | To be able to retrieve the FSxN administrator credentials.| +|sns:Publish | To allow it to send messages (alerts) via SNS.| +|s3:PutObject | So it can store its state information in various s3 objects.| +|s3:GetObject | So it can retrieve previous state information, as well as configuration files, from various s3 objects. | +|s3:ListBucket | So it can detect if an object exists or not. | +|logs:CreateLogStream | If you want the program to send its logs to CloudWatch, it needs to be able to create a log stream. | +|logs:PutLogEvents | If you want the program to send its logs to CloudWatch, it needs to be able to put log events into the log stream. | +|logs:DescribeLogStreams | If you want the program to send its logs to CloudWatch, it needs to be able to see if a log stream already exists before attempting to send events to it. | +|ec2:CreateNetworkInterface | Since the program runs as a Lambda function within your VPC, it needs to be able to create a network interface in your VPC. You can read more about that [here](https://docs.aws.amazon.com/lambda/latest/dg/configuration-vpc.html).
| |ec2:DeleteNetworkInterface | Since it created a network interface, it needs to be able to delete it when not needed anymore. | -|ec2:DescribeNetworkInterfaces | So it can check to see if an network interface already exist. | +|ec2:DescribeNetworkInterfaces | So it can check to see if a network interface already exists. | #### Create an S3 Bucket -One of the goals of the program is to not send multiple messages for the same event. It does this by storing the event -information in an s3 object so it can be compared against before sending a second message for the same event. +The first use of the s3 bucket will be to store the Lambda layer zip file. This is required to include some dependencies that +aren't included in the AWS Lambda environment. Currently the only dependency in the zip file is [cronsim](https://pypi.org/project/cronsim/). +This is used to interpret the SnapMirror schedules to be able to report on lag issues. You can download the zip file from this repository by clicking on +the [lambda_layer.zip](https://raw.githubusercontent.com/NetApp/FSx-ONTAP-samples-scripts/main/Monitoring/monitor-ontap-services/lambda_layer.zip) link. +You will refer to this file, and bucket, when you create the Lambda function. + +Another use of the s3 bucket is to store events that have already been reported on so they can be compared against +to ensure the program does not send multiple messages for the same event. Note that it doesn't keep every event indefinitely, it only stores them while the condition is true. So, say for example it sends an alert for a SnapMirror relationship that has a lag time that is too long. It will send the alert and store the event. Once a successful SnapMirror synchronization has happened, the event will be removed -from the s3 object allowing for a new event to be created and alerted on. +from the s3 object allowing for a new event to be created and alerted on.
If you want to keep the event information +longer than that, please configure the program to store them in a CloudWatch log group. So, for the program to function, you will need to provide an S3 bucket for it to store event history. It is recommended to have a separate bucket for each deployment of this program. However, that isn't required, since you can @@ -134,50 +182,97 @@ overwrite the event files of another instance. This bucket is also used to store the Matching Conditions file. You can read more about it in the [Matching Conditions File](#matching-conditions-file) below. -**Note:** This bucket must be in the same region as the FSxN file system. - #### Create an SNS Topic Since the way this program sends alerts is via an SNS topic, you need to either create an SNS topic, or use an existing one. -**Note:** This SNS topic must be in the same region as the FSxN file system. +#### Create a Secrets Manager Secret +Since the program issues API calls to the FSxN file system, it needs to be able to authenticate itself to the FSxN file system. +The safest way to provide credentials to the program is to use the AWS Secrets Manager service. Therefore, a Secrets Manager +secret must be created that contains the FSxN file system credentials. The secret should contain two keys, one for the +username and one for the password. + +The following command will create the secret for you. Just replace the values in the command with your own. + +```bash +aws secretsmanager create-secret --name <secret_name> --secret-string '{"username":"<username>","password":"<password>"}' +``` + +#### Create a CloudWatch Log Group +If you want the program to send its logs to CloudWatch, you will need to create a CloudWatch Log Group for it to +send its logs to. The program will create a new log stream within the log group every day that is unique to the file system. +This step is optional if you don't want to send the logs to CloudWatch.
#### Endpoints for AWS Services -If you deploy this as a Lambda function, you will have to attach it to the VPC that your FSx file system resides -in so it can run ONTAP APIs against it. When you do that, it is likely that Lambda function will not have access the -Internet. Therefore, the Lambda function will require an AWS Service Endpoints for -any service that it uses. In the case of this program, it needs an endpoint for the SNS, Secrets Manager and the S3 services. -For the S3 service, it is best to deploy a "Gateway" type endpoint, since they are free. Unfortunately, you can't -deploy a Gateway type endpoints for the SNS and Secret Manager services, so those have to be "Interface" type. If -you don't setup the endpoints, the Lambda function will hang on the first AWS API call it tries to perform, which is typically calling the -Secrets Managers to obtain the credentials of the FSx File System. So, if you -find that the Lambda function times out, even after adjusting the timeout to more than a minute, then chances -are this is your problem. - -**NOTE:** The way the Lambda function is able to use the "local" (i.e. within the subnet) Interface endpoint, as -opposed to the Internet facing one, is usually from the DNS resolution of the endpoint hostname -"\.\.amazonaws.com". In order for that to happen, you have to enable “Private DNS names” -for the endpoint. In order to do that, it is required to enable “DNS Hostnames” within the VPC settings. This VPC -setting is not enabled by default. After making these changes, if you are using Route53 as your DNS resolver for -your VPC, then it will automatically return the local endpoint IP address instead of the Internet facing one. 
-However, if you have your VPC setup to not use Route53 as its DNS resolver then you'll need to override the -endpoint that the Lambda function uses for the SNS and Secrets Manager services by setting the snsEndPointHostname, -and secretsManagerEndPointHostname configuration variables (you'll see how to do that below). You should set -them to the "local" DNS name of the respective endpoints. +If you deploy this program as a Lambda function, you will have to run it within a VPC that has connectivity to the FSxN file +system that you want to monitor. The Lambda function will also need access to the AWS service endpoints for the AWS services +that it uses (S3, SNS, CloudWatch, and SecretsManager). Access to these service endpoints is typically routed through the Internet. +However, because of the way AWS gives Lambda access to your subnet, it will not be allowed access to the Internet through an Internet Gateway, +but it will be allowed access to the Internet through a NAT Gateway. Therefore, in order to allow access to these service endpoints you'll need to do one of the following: +- Deploy the Lambda function in a subnet that does **not** have an Internet Gateway, yet does have access + to the Internet via a NAT Gateway or through a Transit Gateway. +- Deploy an AWS service endpoint for each of the AWS services that the Lambda function uses. + +If you do need to deploy AWS service endpoints, keep the following in mind: +- Since VPC endpoints can't traverse AWS regions, all AWS assets (e.g. FSx for ONTAP file system, SecretsManager Secret, S3 bucket, + SNS Topic and CloudWatch Log Group) must be in the same region as the VPC endpoint. +- For interface type endpoints, a network interface will be created in the VPC's subnet to allow access. + To transparently access the service via the VPC endpoint, AWS will update its + DNS entry for the service endpoint to point to the IP address of the VPC interface endpoint.
This is only + done if the "Private DNS names" option is enabled for the endpoint and "DNS Hostnames" is enabled for the VPC. + If those two options aren't possible, and/or you aren't using AWS's DNS resolver, then you can set the + following configuration parameters to override the hostname that the program + uses for the specified AWS service endpoints. You will want to set the parameter to the DNS hostname for the + endpoint. It typically starts with 'vpce'. + + |Configuration Parameter|AWS Service Name| + |---|---| + |snsEndPointHostname|Simple Notification Service (SNS)| + |secretsManagerEndPointHostname|Secrets Manager| + |cloudWatchLogsEndPointHostname|CloudWatch Logs| +- For the S3 service endpoint it is best to deploy a "Gateway" type endpoint, since they are free. For the other + services they will have to be of type "interface." + - In order for a gateway type service endpoint to be accessible, a route has to be created in the + subnet's routing table. + +**NOTE:** One indication that the Lambda function doesn't have access to an AWS service endpoint is if the Lambda function +times out, even after adjusting the timeout to more than several minutes. #### Lambda Function There are a few things you need to do to properly configure the Lambda function. -- Give it the permissions listed above. -- Put it in a VPC and subnet that has access to the FSxN file system management endpoint. **NOTE:** It must be in the same region as the FSxN file system. -- Increase the total run time to at least 20 seconds. You might have to raise that if you have a lot of components in your FSxN file system. However, if you have to raise it to more than a minute, it could be an issue with the endpoint causing the calls to the AWS services to hang. See the [Endpoints for AWS Services](#endpoints-for-aws-services) section above for more information. -- Provide for the base configuration via environment variables and/or a configuration file. 
See the [Configuration Parameters](#configuration-parameters) section below for more information. -- Create the "Matching Conditions" file, that specifies when the Lambda function should send alerts. See the [Matching Conditions File](#matching-conditions-file) section below for more information. -- Set up an EventBridge Schedule rule to trigger the function on a regular basis. +- Assign it the role you created above. +- Put it in a VPC and subnet that has access to the FSxN file system management endpoint. +- Assign it the security group that allows outbound traffic over TCP port 443 to the FSxN file system management endpoint. + +Once you have created the function you will be able to: +- Copy the Python code from the [monitor_ontap_services.py](monitor_ontap_services.py) + file found in this repository into the code box and deploy it. +- Add the Lambda layer to the function. You do this by first creating a Lambda layer then adding it to your function. + To create a Lambda layer go to the Lambda service page on the AWS console and click on the "Layers" + tab under the "Additional resources" section. Then, click on the "Create layer" button. + From there you'll need to provide a name for the layer, and the path to the + [lambda_layer.zip](https://raw.githubusercontent.com/NetApp/FSx-ONTAP-samples-scripts/main/Monitoring/monitor-ontap-services/lambda_layer.zip) + file that you should download from this repository. If you uploaded that into the S3 bucket you created above, then + just provide the S3 object URL of the file. For example, `https://BUCKET_NAME.s3.REGION.amazonaws.com/lambda_layer.zip`. + Once you have the layer created, you can add it to your Lambda function by going to the Lambda + function in the AWS console, and clicking on the `Code` tab and scrolling down to the Layers section. + Click on the "Add a layer" button. From there you can select the layer you just created. +- Increase the total run time to at least 20 seconds. 
You might have to raise that if you have a lot + of components in your FSxN file system. However, if you have to raise it to more than a couple minutes + and the function still times out, then it could be an issue with the endpoint causing the calls to the + AWS services to hang. See the [Endpoints for AWS Services](#endpoints-for-aws-services) section above + for more information. +- Provide for the base configuration via environment variables and/or a configuration file. + See the [Configuration Parameters](#configuration-parameters) section below for more information. +- Create the "Matching Conditions" file, that specifies when the Lambda function should send alerts. + See the [Matching Conditions File](#matching-conditions-file) section below for more information. +- Once you have tested the function to ensure it works, set up an EventBridge Schedule + rule to trigger the function on a regular basis. ##### Configuration Parameters Below is a list of parameters that are used to configure the program. Some parameters are required to be set -for the program to function, and others that are optional. Some of the optional ones are still required but -will have a usable default value if the parameter is not set. For the parameters that aren't required to be +while others are optional. Some of the optional ones are still required to be set to something but +will have a usable default value if the parameter is not explicitly set. For the parameters that aren't required to be set via an environment variable, they can be set by creating a "configuration file" and putting the assignments in it. The assignments should be of the form "parameter=value". The default filename for the configuration file is what you set the OntapAdminServer variable to plus the string "-config". 
If you want to use a different @@ -187,78 +282,84 @@ filename, then set the configFilename environment variable to the name of your c |Parameter Name | Required | Required as an Environment Variable | Default Value | Description | |:--------------|:--------:|:-----------------------------------:|:--------------|:------------| -| s3BucketName | Yes | Yes | None | Set to the name of the S3 bucket you want the program to store events to. It will also read the matching configuration file from this bucket. | -| s3BucketRegion | Yes | Yes | None | Set to the region the S3 bucket resides in. | -| OntapAdminServer | Yes | Yes | None | Set to the DNS name,or IP address of the ONTAP server you wish to monitor. | +| s3BucketName | Yes | Yes | None | Set to the name of the S3 bucket where you want the program to store events to. It will also read the matching configuration file from this bucket. | +| s3BucketRegion | Yes | Yes | None | Set to the region where the S3 bucket is located. | +| OntapAdminServer | Yes | Yes | None | Set to the DNS name, or IP address, of the ONTAP server you wish to monitor. | | configFilename | No | No | OntapAdminServer + "-config" | Set to the filename (S3 object) that contains parameter assignments. It's okay if it doesn't exist, as long as there are environment variables for all the required parameters. | -| emsEventsFilename | No | No | OntapAdminServer + "-emsEvents" | Set to the filename (S3 object) that you want the program to store the EMS events that it alerts on into. This file will be created as necessary. | -| smEventsFilesname | No | No | OntapAdminServer + "-smEvents" | Set to the filename (S3 object) that you want the program to store the SnapMirror alerts into. This file will be created as necessary. | -| smRelationshipsFilename | No | No | OntapAdminServer + "-smRelationships" | Set to the filename (S3 object) that you want the program to store the SnapMirror relationships into. This file will be created as necessary. 
| -| storageEventsFilename | No | No | OntapAdminServer + "-storageEvents" | Set to the filename (S3 object) that you want the program to store the Storage alerts into. This file will be created as necessary. | -| quotaEventsFilename | No | No | OntapAdminServer + "-quotaEvents" | Set to the filename (S3 object) that you want the program to store the Quota alerts into. This file will be created as necessary. | -| systemStatusFilename | No | No | OntapAdminServer + "-systemStatus" | Set to the filename (S3 object) that you want the program to store the overall system status information into. This file will be created as necessary. | -| snsTopicArn | Yes | No | None | Set to the ARN of the SNS topic you want the program to publish alert messages to. | +| emsEventsFilename | No | No | OntapAdminServer + "-emsEvents" | Set to the filename (S3 object) where you want the program to store the EMS events that it has alerted on. This file will be created as necessary. | +| smEventsFilesname | No | No | OntapAdminServer + "-smEvents" | Set to the filename (S3 object) where you want the program to store the SnapMirror events that it has alerted on. This file will be created as necessary. | +| smRelationshipsFilename | No | No | OntapAdminServer + "-smRelationships" | Set to the filename (S3 object) where you want the program to store the SnapMirror relationships. This file is used to track the number of bytes transferred so it can detect stalled SnapMirror updates. This file will be created as necessary. | +| storageEventsFilename | No | No | OntapAdminServer + "-storageEvents" | Set to the filename (S3 object) where you want the program to store the Storage Utilization events it has alerted on. This file will be created as necessary. | +| quotaEventsFilename | No | No | OntapAdminServer + "-quotaEvents" | Set to the filename (S3 object) where you want the program to store the Quota Utilization events it has alerted on. This file will be created as necessary. 
| +| vserverEventsFilename | No | No | OntapAdminServer + "-vserverEvents" | Set to the filename (S3 object) where you want the program to store the vserver events it has alerted on. This file will be created as necessary. | +| systemStatusFilename | No | No | OntapAdminServer + "-systemStatus" | Set to the filename (S3 object) where you want the program to store the overall system status information into. This file will be created as necessary. | +| snsTopicArn | Yes | No | None | Set to the ARN of the SNS topic you want the program to publish alert messages to. | +| cloudWatchLogGroupName | No | No | None | The name of **an existing** CloudWatch log group that the Lambda function will also send alerts to. If left blank, alerts will not be sent to CloudWatch.| | conditionsFilename | Yes | No | OntapAdminServer + "-conditions" | Set to the filename (S3 object) where you want the program to read the matching condition information from. | | secretArn | Yes | No | None | Set to the ARN of the secret within the AWS Secrets Manager that holds the FSxN credentials. | -| secretUsernameKey | Yes | No | None | Set to the key name within the secretName that holds the username portion of the FSxN credentials. | -| secretPasswordKey | Yes | No | None | Set to the key name within the secretName that holds the password portion of the FSxN credentials. | -| snsEndPointHostname | No | No | None | Set to the DNS hostname assigned to the SNS endpoint created above. | -| secretsManagerEndPointHostname | No | No | None | Set to the DNS hostname assigned to the SecretsManager endpoint created above. | -| syslogIP | No | No | None | To have the program send syslog messages along with SNS messages set this to the IP address (or hostname) of the syslog server to send the messages to.| +| secretUsernameKey | Yes | No | None | Set to the key name within the AWS Secrets Manager secret that holds the username portion of the FSxN credentials. 
| +| secretPasswordKey | Yes | No | None | Set to the key name within the AWS Secrets Manager secret that holds the password portion of the FSxN credentials. | +| snsEndPointHostname | No | No | None | Set to the DNS hostname assigned to the SNS endpoint. Only needed if you had to create a VPC endpoint for the SNS service. | +| secretsManagerEndPointHostname | No | No | None | Set to the DNS hostname assigned to the SecretsManager endpoint created above. Only needed if you had to create a VPC endpoint for the Secrets Manager service.| +| cloudWatchLogsEndPointHostname | No | No | None | Set to the DNS hostname assigned to the CloudWatch Logs endpoint created above. Only needed if you had to create a VPC endpoint for the Cloud Watch Logs service| +| syslogIP | No | No | None | Set to the IP address (or DNS hostname) of the syslog server where you want alerts sent to.| ##### Matching Conditions File The Matching Conditions file allows you to specify which events you want to be alerted on. The format of the file is JSON. JSON is basically a series of "key" : "value" pairs. Where the value can be object that also has "key" : "value" pairs. For more information about the format of a JSON file, please refer to this [page](https://www.json.org/json-en.html). -The JSON schema in this file is made up of an array of objects, with with a key name of "services". Each element of the "services" array -is an object with two keys. The first key is “name" which specifies the name of the service it is going to provide +The JSON schema in this file is made up of an array of objects, with a key name of "services". Each element of the "services" array +is an object with at least two keys. The first key is “name" which specifies the name of the service it is going to provide matching conditions (rules) for. The second key is "rules" which is an array of objects that provide the specific -matching condition. Note that each service's rules has its own unique schema. 
The following is the unique schema -for each of the service's rules. +matching condition. Note that each service's rules has its own unique schema. The following is the definition of each service's schema. ###### Matching condition schema for System Health (systemHealth) Each rule should be an object with one, or more, of the following keys: |Key Name|Value Type|Notes| |---|---|---| -|versionChange|Boolean (true, false)|If 'true' the program will send an alert when the ONTAP version changes. If it is set to false, it will not report on version changes.| -|failover|Boolean|If 'true' the program will send an alert if the FSxN cluster is running on its standby node. If it is set to false, it will not report on failover status.| -|networkInterfaces|Boolean|If 'true' the program will send an alert if any of the network interfaces are down. If it is set to false, it will not report on any network interfaces that are down.| +|versionChange|Boolean (true, false)|If `true` the program will send an alert when the ONTAP version changes. If it is set to `false`, it will not report on version changes.| +|failover|Boolean|If 'true' the program will send an alert if the FSxN cluster is running on its standby node. If it is set to `false`, it will not report on failover status.| +|networkInterfaces|Boolean|If 'true' the program will send an alert if any of the network interfaces are down. 
If it is set to `false`, it will not report on any network interfaces that are down.| ###### Matching condition schema for EMS Messages (ems) -Each rule should be an object with three keys: +Each rule should be an object with three keys, with an optional 4th key: |Key Name|Value Type|Notes| |---|---|---| -|name|String|Which will match on the EMS event name.| -|message|String|Which will match on the EMS event message text.| -|severity|String|Which will match on the severity of the EMS event (debug, informational, notice, error, alert or emergency).| - -Note that all values to each of the keys are used as a regular expressions against the associated EMS component. So, for -example, if you want to match on any event message text that starts with “snapmirror” then you would put “\^snapmirror”. -The “\^” character matches the beginning on the string. If you want to match on a specific EMS event name, then you should -anchor it with an regular express that starts with “\^” for the beginning of the string and ends with “\$” for the end of -the string. For example, “^arw.volume.state$’. For a complete explanation of the regular expression syntax and special -characters, please see the Python documentation found [here](https://docs.python.org/3/library/re.html). +|name|String|Regular expression that will match on an EMS event name.| +|message|String|Regular expression that will match on an EMS event message text.| +|severity|String|Regular expression that will match on the severity of the EMS event (debug, informational, notice, error, alert or emergency).| +|filter|String|If an event's message text matches this regular expression, then the EMS event will be skipped. Try to be as specific as possible to avoid unintentional filtering. This key is optional.| + +Note that all values to each of the keys are used as regular expressions against the associated EMS component. 
For +example, if you want to match on any event message text that starts with “snapmirror” then you would put `^snapmirror`. +The `^` character matches the beginning of the string. If you want to match on a specific EMS event name, then you should +anchor it with a regular expression that starts with `^` for the beginning of the string and ends with `$` for the end of +the string. For example, `^arw.volume.state$`. For a complete explanation of the regular expression syntax and special +characters, please refer to the [Python documentation](https://docs.python.org/3/library/re.html). ###### Matching condition schema for SnapMirror relationships (snapmirror) Each rule should be an object with one, or more, of the following keys: |Key Name|Value Type|Notes| |---|---|---| -|maxLagTime|Integer|Specifies the maximum allowable time, in seconds, since the last successful SnapMirror update before an alert will be sent.| +|maxLagTime|Integer|Specifies the maximum allowable time, in seconds, since the last successful SnapMirror update before an alert will be sent. Only used if maxLagTimePercent hasn't been provided, or if the SnapMirror relationship, and the policy it is assigned to, don't have a schedule associated with them. Best practice is to provide both maxLagTime and maxLagTimePercent to ensure all relationships get monitored, in case a schedule gets accidentally removed.| +|maxLagTimePercent|Integer|Specifies the maximum allowable time, in terms of percent of the amount of time since the last scheduled SnapMirror update, before an alert will be sent. Should be over 100. 
For example, a value of 200 means 2 times the period since the last scheduled update and if that was supposed to have happened 1 hour ago, it would alert if the relationship hasn't been updated within 2 hours.| |stalledTransferSeconds|Integer|Specifies the minimum number of seconds that have to transpire before a SnapMirror transfer will be considered stalled.| -|health|Boolean|If true will alert with the relationship is health. If false will alert with the relationship is unhealthy.| +|healthy|Boolean|If `true` will alert when the relationship is healthy. If `false` will alert when the relationship is unhealthy.| -###### Matching condition schema for Storage (storage) +###### Matching condition schema for Storage Utilization (storage) Each rule should be an object with one, or more, of the following keys: - |Key Name|Value Type|Notes| |---|---|---| |aggrWarnPercentUsed|Integer|Specifies the maximum allowable physical storage (aggregate) utilization (between 0 and 100) before an alert is sent.| |aggrCriticalPercentUsed|Integer|Specifies the maximum allowable physical storage (aggregate) utilization (between 0 and 100) before an alert is sent.| |volumeWarnPercentUsed|Integer|Specifies the maximum allowable volume utilization (between 0 and 100) before an alert is sent.| |volumeCriticalPercentUsed|Integer|Specifies the maximum allowable volume utilization (between 0 and 100) before an alert is sent.| +|volumeWarnFilesPercentUsed|Integer|Specifies the maximum allowable volume files (inodes) utilization (between 0 and 100) before an alert is sent.| +|volumeCriticalFilesPercentUsed|Integer|Specifies the maximum allowable volume files (inodes) utilization (between 0 and 100) before an alert is sent.| +|offline|Boolean|If `true` will alert if the volume is offline.| ###### Matching condition schema for Quota (quota) Each rule should be an object with one, or more, of the following keys: @@ -269,6 +370,14 @@ Each rule should be an object with one, or more, of the following 
keys: |maxSoftQuotaSpacePercentUsed|Integer|Specifies the maximum allowable storage utilization (between 0 and 100) against the soft quota limit before an alert is sent.| |maxQuotaInodesPercentUsed|Integer|Specifies the maximum allowable inode utilization (between 0 and 100) before an alert is sent.| +###### Matching condition schema for Vserver (vserver) +Each rule should be an object with one, or more, of the following keys: +|Key Name|Value Type|Notes| +|---|---|---| +|vserverState|Boolean|If `true` will alert if the vserver is not in `running` state.| +|nfsProtocolState|Boolean|If `true` will alert if the NFS protocol is not enabled on a vserver.| +|cifsProtocolState|Boolean|If `true` will alert if the CIFS protocol is enabled for a vserver but doesn't have an `online` status.| + ###### Example Matching conditions file: ```json { @@ -305,6 +414,7 @@ Each rule should be an object with one, or more, of the following keys: "rules": [ { - "maxLagTime": 86400 + "maxLagTime": 86400, + "maxLagTimePercent": 200 }, { "healthy": false @@ -316,18 +426,33 @@ Each rule should be an object with one, or more, of the following keys: }, { "name": "storage", + "exceptions": [{"svm": "fsx", "name": "fsx_root"}], "rules": [ { - "aggrWarnPercentUsed": 80 + "aggrWarnPercentUsed": 80, + "aggrCriticalPercentUsed": 95 }, { - "aggrCriticalPercentUsed": 95 + "volumeWarnPercentUsed": 85, + "volumeCriticalPercentUsed": 90 }, { - "volumeWarnPercentUsed": 85 + "volumeWarnFilesPercentUsed": 85, + "volumeCriticalFilesPercentUsed": 90 + } + ] + }, + { + "name": "storage", + "match": [{"svm": "fsx", "name": "fsx_root"}], + "rules": [ + { + "volumeWarnPercentUsed": 75, + "volumeCriticalPercentUsed": 85 }, { - "volumeCriticalPercentUsed": 90 + "volumeWarnFilesPercentUsed": 80, + "volumeCriticalFilesPercentUsed": 90 } ] }, @@ -355,18 +480,29 @@ In the above example, it will alert on: - Any network interfaces that are down. - Any EMS message that has an event name of “passwd.changed”. 
- Any EMS message that has a severity of "alert" or “emergency”. +- Any SnapMirror relationship with a lag time more than 200% of the amount of time since its last scheduled update, if it has a schedule associated with it. + Otherwise, if the last successful update has been more than 86400 seconds (24 hours). - Any SnapMirror relationship with a lag time more than 86400 seconds (24 hours). - Any SnapMirror relationship that has a non-healthy status. - Any SnapMirror update that hasn't had any flow of data in 600 seconds (10 minutes). - If the cluster aggregate is more than 80% full. - If the cluster aggregate is more than 95% full. -- If any volume is more than 85% full. -- if any volume is more than 90% full. +- If any volume, except for the 'fsx_root' volume in the 'fsx' SVM, is more than 85% full. +- If any volume, except for the 'fsx_root' volume in the 'fsx' SVM, is more than 90% full. +- If any volume, except for the 'fsx_root' volume in the 'fsx' SVM, is using more than 85% of its inodes. +- If any volume, except for the 'fsx_root' volume in the 'fsx' SVM, is using more than 90% of its inodes. +- If the 'fsx_root' volume in the 'fsx' SVM is more than 75% full. +- If the 'fsx_root' volume in the 'fsx' SVM is more than 85% full. +- If the 'fsx_root' volume in the 'fsx' SVM is using more than 80% of its inodes. +- If the 'fsx_root' volume in the 'fsx' SVM is using more than 90% of its inodes. - If any quota policies where the space utilization is more than 95% of the hard limit. - If any quota policies where the space utilization is more than 100% of the soft limit. - If any quota policies are showing any inode utilization more than 95% -A matching conditions file must be created and stored in the S3 bucket with the name given as the "conditionsFilename" configuration variable. Feel free to use the example above as a starting point. 
Note that you should ensure it is in valid JSON format, otherwise the program will fail to load the file. There are various programs and websites that can validate a JSON file for you. +A matching conditions file must be created and stored in the S3 bucket with the name given as the "conditionsFilename" +configuration variable. Feel free to use the example above as a starting point. Note that you should ensure it +is in valid JSON format, otherwise the program will fail to load the file. There are various programs and +websites that can validate a JSON file for you. ## Author Information diff --git a/Monitoring/monitor-ontap-services/buildLambdaLayer b/Monitoring/monitor-ontap-services/buildLambdaLayer new file mode 100755 index 00000000..72f0a63a --- /dev/null +++ b/Monitoring/monitor-ontap-services/buildLambdaLayer @@ -0,0 +1,7 @@ +#!/bin/bash +# +# This script builds the Lambda Layer required for the monitor_ontap_services to function. +rm -rf python lambda_layer.zip +mkdir python +pip3 install --target ./python 'cronsim>=2.6' pytz +zip -r lambda_layer.zip python diff --git a/Monitoring/monitor-ontap-services/cloudformation.yaml b/Monitoring/monitor-ontap-services/cloudformation.yaml index 09057158..00a49bff 100644 --- a/Monitoring/monitor-ontap-services/cloudformation.yaml +++ b/Monitoring/monitor-ontap-services/cloudformation.yaml @@ -8,20 +8,25 @@ Metadata: default: "Configuration Parameters" Parameters: - OntapAdminSever + - s3BucketName - subNetIds - securityGroupIds - snsTopicArn + - cloudWatchLogGroupArn - secretArn - secretUsernameKey - secretPasswordKey - checkInterval - createWatchdogAlarm - - createSecretManagerEndpoint + - createSecretsManagerEndpoint - createSNSEndpoint + - createCloudWatchLogsEndpoint - createS3Endpoint - routeTableIds - vpcId - endpointSecurityGroupIds + - LambdaRoleArn + - SchedulerRoleArn - Label: default: "Alert Parameters" Parameters: @@ -29,27 +34,38 @@ Metadata: - failoverAlert - emsEventsAlert - snapMirrorLagTimeAlert + - 
snapMirrorLagTimePercentAlert - snapMirrorStalledAlert - snapMirrorHealthAlert - fileSystemUtilizationWarnAlert - fileSystemUtilizationCriticalAlert - volumeUtilizationWarnAlert - volumeUtilizationCriticalAlert + - volumeFileUtilizationWarnAlert + - volumeFileUtilizationCriticalAlert + - volumeOfflineAlert - softQuotaUtilizationAlert - hardQuotaUtilizationAlert - inodesQuotaUtilizationAlert + - vserverStateAlert + - vserverNFSProtocolStateAlert + - vserverCIFSProtocolStateAlert Parameters: OntapAdminSever: Description: "The DNS name, or IP address, of the management endpoint of the FSxN file system to be monitored." Type: String + s3BucketName: + Description: "The name of the S3 bucket to use to store the status and configuration files. Must also be holding the lambda_layer.zip file." + Type: String + subNetIds: - Description: "The subnet IDs where the FSxN file system is located." + Description: "The subnet IDs where you want to deploy the Lambda function. Must have connectivity to the FSxN file system to be monitored." Type: "List" securityGroupIds: - Description: "The security group IDs to associate with the Lambda function." + Description: "The security group IDs to associate with the Lambda function. Must allow outbound traffic over TCP port 443 to the FSxN file system, and the AWS service endpoints." Type: "List" Default: "" @@ -57,8 +73,13 @@ Parameters: Description: "The ARN of the SNS topic where you want alerts sent to." Type: String + cloudWatchLogGroupArn: + Description: "The ARN of the CloudWatch log group to send alerts to. If left blank, alerts will not be sent to CloudWatch. Note that the log group must already exist." + Type: String + Default: "" + secretArn: - Description: "The ARN of the secret that holds the FSxN credentials to use." + Description: "The ARN of the Secrets Manager secret that holds the FSxN credentials to use." 
Type: String secretUsernameKey: @@ -72,13 +93,13 @@ Parameters: Default: "password" createWatchdogAlarm: - Description: "Create a CloudWatch alarm to monitor the monitor-ontap-services Lambda function." + Description: "Create a CloudWatch alarm to monitor the Lambda function. It will alert you if the function fails to run successfully." Type: String Default: "true" AllowedValues: ["true", "false"] - createSecretManagerEndpoint: - Description: "Create a secret manager endpoint." + createSecretsManagerEndpoint: + Description: "Create a Secrets Manager endpoint." Type: String Default: "false" AllowedValues: ["true", "false"] @@ -89,6 +110,12 @@ Parameters: Default: "false" AllowedValues: ["true", "false"] + createCloudWatchLogsEndpoint: + Description: "Create a CloudWatch logs endpoint." + Type: String + Default: "false" + AllowedValues: ["true", "false"] + createS3Endpoint: Description: "Create an S3 endpoint." Type: String @@ -106,10 +133,20 @@ Parameters: Default: "" endpointSecurityGroupIds: - Description: "The security group IDs, comma separated list, to associate with the SNS and SecretsManager endpoints. Must allow traffic from from the Lambda function over TCP port 443. This parameter is only needed if you are creating the SNS or SecretsManager endpoint." + Description: "The security group IDs, comma separated list, to associate with the SNS, SecretsManager and/or CloudWatch Logs endpoints. Must allow traffic from from the Lambda function over TCP port 443. This parameter is only needed if you are creating the SNS, SecretsManager, or CloudWatch Logs endpoint." Type: CommaDelimitedList Default: "" + LambdaRoleArn: + Description: "The ARN of the role to use for the Lambda function. This is only needed if you want to provide an existing role, otherwise an appropriate one will be created for you." + Type: String + Default: "" + + SchedulerRoleArn: + Description: "The ARN of the role to use for the Lambda scheduler. 
This is only needed if you want to provide an existing role, otherwise an appropriate one will be created for you." + Type: String + Default: "" + checkInterval: Description: "The interval, in minutes, between checks." Type: Number @@ -135,12 +172,18 @@ Parameters: AllowedValues: ["true", "false"] snapMirrorLagTimeAlert: - Description: "Alert when an SnapMirror update time is more than the specified seconds. Set to 0 to disable this alert." + Description: "Alert when a SnapMirror update time is more than the specified seconds. Set to 0 to disable this alert. Recommended to set both snapMirrorLagTimeAlert and snapMirrorLagTimePercentAlert." Type: Number Default: 86400 + snapMirrorLagTimePercentAlert: + Description: "Alert when the last succesful SnapMirror update time is more than this percent of the amount of time since the last scheduled update. Must be more than 100. A value of 200 means 2 times the update interval. Set to 0 to disable this alert." + Type: Number + MinValue: 100 + Default: 200 + snapMirrorStalledAlert: - Description: "Alert when an SnapMirror update hasn't transferred any new data in the specified seconds. Set to 0 to disable this alert." + Description: "Alert when a SnapMirror update hasn't transferred any new data in the specified seconds. Set to 0 to disable this alert." Type: Number Default: 600 MinValue: 60 @@ -171,13 +214,29 @@ Parameters: Type: Number Default: 95 + volumeFileUtilizationWarnAlert: + Description: "Alert when a volume inode utilization exceeds this threshold in percentage." + Type: Number + Default: 90 + + volumeFileUtilizationCriticalAlert: + Description: "Alert when a volume inode utilization exceeds this threshold in percentage." + Type: Number + Default: 95 + + volumeOfflineAlert: + Description: "Alert when a volume goes offline." + Type: String + AllowedValues: ["true", "false"] + Default: "true" + softQuotaUtilizationAlert: - Description: "Alert when a quota exceeds this threshold in percentage." 
+ Description: "Alert when a soft quota exceeds this threshold in percentage." Type: Number Default: 100 hardQuotaUtilizationAlert: - Description: "Alert when a quota exceeds this threshold in percentage." + Description: "Alert when a hard quota exceeds this threshold in percentage." Type: Number Default: 80 @@ -186,16 +245,39 @@ Parameters: Type: Number Default: 80 + vserverStateAlert: + Description: "Alert when a vserver goes offline." + Type: String + AllowedValues: ["true", "false"] + Default: "true" + + vserverNFSProtocolStateAlert: + Description: "Alert when a vserver's NFS protocol goes offline." + Type: String + AllowedValues: ["true", "false"] + Default: "true" + + vserverCIFSProtocolStateAlert: + Description: "Alert when a vserver's CIFS protocol goes offline." + Type: String + AllowedValues: ["true", "false"] + Default: "true" + Conditions: - CreateSecretManagerEndpoint: !Equals [!Ref createSecretManagerEndpoint, "true"] + CreateSecretsManagerEndpoint: !Equals [!Ref createSecretsManagerEndpoint, "true"] CreateSNSEndpoint: !Equals [!Ref createSNSEndpoint, "true"] CreateS3Endpoint: !Equals [!Ref createS3Endpoint, "true"] + CreateCloudWatchLogsEndpoint: !Equals [!Ref createCloudWatchLogsEndpoint, "true"] CreateWatchdogAlarm: !Equals [!Ref createWatchdogAlarm, "true"] + CreateLambdaRoleWithCW: !And [!Equals [!Ref LambdaRoleArn, ""], !Not [!Equals [!Ref cloudWatchLogGroupArn, ""]]] + CreateLambdaRoleWithoutCW: !And [!Equals [!Ref LambdaRoleArn, ""], !Equals [!Ref cloudWatchLogGroupArn, ""]] + CreateSchedulerRole: !Equals [!Ref SchedulerRoleArn, ""] + IncludeCloudWatchPermissions: !Not [!Equals [!Ref cloudWatchLogGroupArn, ""]] Resources: SecretManagerEndpoint: Type: AWS::EC2::VPCEndpoint - Condition: CreateSecretManagerEndpoint + Condition: CreateSecretsManagerEndpoint Properties: VpcId: !Ref vpcId ServiceName: !Sub "com.amazonaws.${AWS::Region}.secretsmanager" @@ -204,6 +286,17 @@ Resources: SubnetIds: !Ref subNetIds SecurityGroupIds: !Ref 
endpointSecurityGroupIds + CWEndpoint: + Type: AWS::EC2::VPCEndpoint + Condition: CreateCloudWatchLogsEndpoint + Properties: + VpcId: !Ref vpcId + ServiceName: !Sub "com.amazonaws.${AWS::Region}.logs" + VpcEndpointType: 'Interface' + PrivateDnsEnabled: true + SubnetIds: !Ref subNetIds + SecurityGroupIds: !Ref endpointSecurityGroupIds + SNSEndpoint: Type: AWS::EC2::VPCEndpoint Condition: CreateSNSEndpoint @@ -224,12 +317,6 @@ Resources: VpcEndpointType: 'Gateway' RouteTableIds: !Ref routeTableIds - s3Bucket: - Type: "AWS::S3::Bucket" - Properties: - BucketName: !Sub "monitor-ontap-services-${AWS::StackName}" - AccessControl: "Private" - watchDogAlarm: Type: "AWS::CloudWatch::Alarm" Condition: CreateWatchdogAlarm @@ -250,8 +337,9 @@ Resources: AlarmActions: - !Ref snsTopicArn - LambdaRole: + LambdaRoleWithoutCW: Type: "AWS::IAM::Role" + Condition: CreateLambdaRoleWithoutCW Properties: RoleName: !Sub "mon-ontap-services-${AWS::StackName}" AssumeRolePolicyDocument: @@ -263,7 +351,7 @@ Resources: Action: "sts:AssumeRole" ManagedPolicyArns: - - "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" + - "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole" Policies: - PolicyName: "LambdaPolicy" @@ -277,22 +365,53 @@ Resources: - "s3:GetObject" - "s3:PutObject" - "s3:ListBucket" - Resource: + Resource: - !Ref secretArn - !Ref snsTopicArn - - !GetAtt s3Bucket.Arn - - !Sub - - "${s3BucketArn}/*" - - s3BucketArn: !GetAtt s3Bucket.Arn + - !Sub "arn:aws:s3:::${s3BucketName}" + - !Sub "arn:aws:s3:::${s3BucketName}/*" + + LambdaRoleWithCW: + Type: "AWS::IAM::Role" + Condition: CreateLambdaRoleWithCW + Properties: + RoleName: !Sub "mon-ontap-services-${AWS::StackName}" + AssumeRolePolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: "Allow" + Principal: + Service: "lambda.amazonaws.com" + Action: "sts:AssumeRole" + + ManagedPolicyArns: + - "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole" + + Policies: + - PolicyName:
"LambdaPolicy" + PolicyDocument: + Version: "2012-10-17" + Statement: - Effect: "Allow" Action: - - "ec2:CreateNetworkInterface" - - "ec2:DeleteNetworkInterface" - - "ec2:DescribeNetworkInterfaces" - Resource: "*" + - "secretsManager:GetSecretValue" + - "sns:Publish" + - "logs:PutLogEvents" + - "logs:DescribeLogStreams" + - "logs:CreateLogStream" + - "s3:GetObject" + - "s3:PutObject" + - "s3:ListBucket" + Resource: + - !Ref secretArn + - !Ref snsTopicArn + - !Sub "arn:aws:s3:::${s3BucketName}" + - !Sub "arn:aws:s3:::${s3BucketName}/*" + - !Ref cloudWatchLogGroupArn SchedulerRole: Type: "AWS::IAM::Role" + Condition: CreateSchedulerRole Properties: RoleName: !Sub "SchedulerRole-${AWS::StackName}" AssumeRolePolicyDocument: @@ -323,46 +442,66 @@ Resources: ScheduleExpression: !Sub "rate(${checkInterval} minutes)" Target: Arn: !GetAtt LambdaFunction.Arn - RoleArn: !GetAtt SchedulerRole.Arn + RoleArn: !If [CreateSchedulerRole, !GetAtt SchedulerRole.Arn, !Ref SchedulerRoleArn] + + LambdaLayer: + Type: "AWS::Lambda::LayerVersion" + Properties: + LayerName: !Sub "MOS-${AWS::StackName}" + Content: + S3Bucket: !Ref s3BucketName + S3Key: "lambda_layer.zip" + CompatibleRuntimes: + - "python3.12" LambdaFunction: Type: "AWS::Lambda::Function" Properties: FunctionName: !Sub "monitor-ontap-services-${AWS::StackName}" - Role: !GetAtt LambdaRole.Arn + Role: !If [CreateLambdaRoleWithCW, !GetAtt LambdaRoleWithCW.Arn, !If [CreateLambdaRoleWithoutCW, !GetAtt LambdaRoleWithoutCW.Arn, !Ref LambdaRoleArn]] VpcConfig: SecurityGroupIds: !Ref securityGroupIds SubnetIds: !Ref subNetIds PackageType: "Zip" Runtime: "python3.12" + Layers: + - !GetAtt LambdaLayer.LayerVersionArn Handler: "index.lambda_handler" Timeout: 60 Environment: Variables: s3BucketRegion: !Ref AWS::Region - s3BucketArn: !GetAtt s3Bucket.Arn + s3BucketName: !Ref s3BucketName OntapAdminServer: !Ref OntapAdminSever secretArn: !Ref secretArn secretUsernameKey: !Ref secretUsernameKey secretPasswordKey: !Ref secretPasswordKey 
snsTopicArn: !Ref snsTopicArn + cloudWatchLogGroupArn: !Ref cloudWatchLogGroupArn initialVersionChangeAlert: !Ref versionChangeAlert initialFailoverAlert: !Ref failoverAlert initialEmsEventsAlert: !Ref emsEventsAlert initialSnapMirrorLagTimeAlert: !Ref snapMirrorLagTimeAlert + initialSnapMirrorLagTimePercentAlert: !Ref snapMirrorLagTimePercentAlert initialSnapMirrorStalledAlert: !Ref snapMirrorStalledAlert initialSnapMirrorHealthAlert: !Ref snapMirrorHealthAlert initialFileSystemUtilizationWarnAlert: !Ref fileSystemUtilizationWarnAlert initialFileSystemUtilizationCriticalAlert: !Ref fileSystemUtilizationCriticalAlert initialVolumeUtilizationWarnAlert: !Ref volumeUtilizationWarnAlert initialVolumeUtilizationCriticalAlert: !Ref volumeUtilizationCriticalAlert + initialVolumeFileUtilizationWarnAlert: !Ref volumeFileUtilizationWarnAlert + initialVolumeFileUtilizationCriticalAlert: !Ref volumeFileUtilizationCriticalAlert + initialVolumeOfflineAlert: !Ref volumeOfflineAlert initialSoftQuotaUtilizationAlert: !Ref softQuotaUtilizationAlert initialHardQuotaUtilizationAlert: !Ref hardQuotaUtilizationAlert initialInodesQuotaUtilizationAlert: !Ref inodesQuotaUtilizationAlert + initialVserverStateAlert: !Ref vserverStateAlert + initialVserverNFSProtocolStateAlert: !Ref vserverNFSProtocolStateAlert + initialVserverCIFSProtocolStateAlert: !Ref vserverCIFSProtocolStateAlert Code: ZipFile: | - #!/bin/python3.11 + #!/bin/python3 ################################################################################ # THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF @@ -382,16 +521,18 @@ Resources: # "matching conditions." It is intended to be run as a Lambda function, but # can be run as a standalone program. 
# - # Version: v2.1 - # Date: 2024-08-23-22:01:08 + # Version: v2.14 + # Date: 2025-04-29-12:53:45 ################################################################################ import json import re import os import datetime + import pytz import logging from logging.handlers import SysLogHandler + from cronsim import CronSim import urllib3 from urllib3.util import Retry import botocore @@ -400,7 +541,7 @@ Resources: eventResilience = 4 # Times an event has to be missing before it is removed # from the alert history. # This was added since the Ontap API that returns EMS - # events would often drop some events and then including + # events would often drop some events and then including # them in the subsequent calls. If I don't "age" the # alert history duplicate alerts will be sent. initialVersion = "Initial Run" # The version to store if this is the first @@ -443,7 +584,7 @@ Resources: elif string[end:endp1] == "M": num=num*60 elif string[end:endp1] != "S": - print(f'Unknown lag time specifier "{string[end:endp1]}".') + logger.warning(f'Unknown lag time specifier "{string[end:endp1]}".') return (num, endp1) @@ -509,7 +650,7 @@ Resources: # 'True'. ################################################################################ def checkSystem(): - global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger + global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger, clusterTimezone changedEvents = False # @@ -533,11 +674,11 @@ Resources: else: fsxStatus = json.loads(data["Body"].read().decode('UTF-8')) # - # Get the cluster name and ONTAP version from the FSxN. + # Get the cluster name, ONTAP version and timezone from the FSxN. # This is also a way to test that the FSxN cluster is accessible. 
badHTTPStatus = False try: - endpoint = f'https://{config["OntapAdminServer"]}/api/cluster?fields=version,name' + endpoint = f'https://{config["OntapAdminServer"]}/api/cluster?fields=version,name,timezone' response = http.request('GET', endpoint, headers=headers, timeout=5.0) if response.status == 200: if not fsxStatus["systemHealth"]: @@ -558,8 +699,10 @@ Resources: clusterVersion = data["version"]["full"].split()[2].replace(":", "") if fsxStatus["version"] == initialVersion: fsxStatus["version"] = clusterVersion + # + # Get the Timezone for SnapMirror lag time calculations. + clusterTimezone = data["timezone"]["name"] else: - print(f'API call to {endpoint} failed. HTTP status code: {response.status}.') badHTTPStatus = True raise Exception(f'API call to {endpoint} failed. HTTP status code: {response.status}.') except: @@ -572,14 +715,13 @@ Resources: message = f'CRITICAL: Received a non 200 HTTP status code ({response.status}) when trying to access {clusterName}.' else: message = f'CRITICAL: Failed to issue API against {clusterName}. Cluster could be down.' - logger.critical(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "CRITICAL") fsxStatus["systemHealth"] = False changedEvents = True if changedEvents: s3Client.put_object(Key=config["systemStatusFilename"], Bucket=config["s3BucketName"], Body=json.dumps(fsxStatus).encode('UTF-8')) - # + # # If the cluster is done, return false so the program can exit cleanly. return(fsxStatus["systemHealth"]) @@ -609,8 +751,7 @@ Resources: if lkey == "versionchange": if rule[key] and clusterVersion != fsxStatus["version"]: message = f'NOTICE: The ONTAP vesion changed on cluster {clusterName} from {fsxStatus["version"]} to {clusterVersion}.' 
- logger.info(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "INFO") fsxStatus["version"] = clusterVersion changedEvents = True elif lkey == "failover": @@ -624,12 +765,11 @@ Resources: data = json.loads(response.data) if data["num_records"] != fsxStatus["numberNodes"]: message = f'Alert: The number of nodes on cluster {clusterName} went from {fsxStatus["numberNodes"]} to {data["num_records"]}.' - logger.info(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "INFO") fsxStatus["numberNodes"] = data["num_records"] changedEvents = True else: - print(f'API call to {endpoint} failed. HTTP status code: {response.status}.') + logger.warning(f'API call to {endpoint} failed. HTTP status code: {response.status}.') elif lkey == "networkinterfaces": if rule[key]: endpoint = f'https://{config["OntapAdminServer"]}/api/network/ip/interfaces?fields=state' @@ -639,15 +779,14 @@ Resources: # Decrement the refresh field to know if any events have really gone away. for interface in fsxStatus["downInterfaces"]: interface["refresh"] -= 1 - + data = json.loads(response.data) for interface in data["records"]: if interface.get("state") != None and interface["state"] != "up": uniqueIdentifier = interface["name"] if(not eventExist(fsxStatus["downInterfaces"], uniqueIdentifier)): # Resets the refresh key. message = f'Alert: Network interface {interface["name"]} on cluster {clusterName} is down.' 
- logger.info(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "WARNING") event = { "index": uniqueIdentifier, "refresh": eventResilience @@ -659,7 +798,7 @@ Resources: i = 0 while i < len(fsxStatus["downInterfaces"]): if fsxStatus["downInterfaces"][i]["refresh"] <= 0: - print(f'Deleting downed interface: {fsxStatus["downInterfaces"][i]["index"]}') + logger.debug(f'Deleting interface: {fsxStatus["downInterfaces"][i]["index"]}') del fsxStatus["downInterfaces"][i] changedEvents = True else: @@ -667,9 +806,9 @@ Resources: changedEvents = True i += 1 else: - print(f'API call to {endpoint} failed. HTTP status code: {response.status}.') + logger.warning(f'API call to {endpoint} failed. HTTP status code: {response.status}.') else: - print(f'Unknown System Health alert type: "{key}".') + logger.warning(f'Unknown System Health alert type: "{key}".') if changedEvents: s3Client.put_object(Key=config["systemStatusFilename"], Bucket=config["s3BucketName"], Body=json.dumps(fsxStatus).encode('UTF-8')) @@ -709,27 +848,31 @@ Resources: logger.debug(f'Received {len(data["records"])} EMS records.') for record in data["records"]: for rule in service["rules"]: - if (re.search(rule["name"], record["message"]["name"]) and + messageFilter = rule.get("filter") + if messageFilter == None or messageFilter == "": + messageFilter = "ThisShouldn'tMatchAnything" + + if (not re.search(messageFilter, record["log_message"]) and + re.search(rule["name"], record["message"]["name"]) and re.search(rule["severity"], record["message"]["severity"]) and re.search(rule["message"], record["log_message"])): if (not eventExist (events, record["index"])): # This resets the "refresh" field if found. 
message = f'{record["time"]} : {clusterName} {record["message"]["name"]}({record["message"]["severity"]}) - {record["log_message"]}' useverity=record["message"]["severity"].upper() if useverity == "EMERGENCY": - logger.critical(message) + sendAlert(message, "CRITICAL") elif useverity == "ALERT": - logger.error(message) - elif useverity == "ERROR": - logger.warning(message) + sendAlert(message, "ERROR") + elif useverity == "ERROR": + sendAlert(message, "WARNING") elif useverity == "NOTICE" or useverity == "INFORMATIONAL": - logger.info(message) + sendAlert(message, "INFO") elif useverity == "DEBUG": - logger.debug(message) + sendAlert(message, "DEBUG") else: - print(f'Received unknown severity from ONTAP "{record["message"]["severity"]}". The message received is next.') - logger.info(f'Received unknown severity from ONTAP "{record["message"]["severity"]}". The message received is next.') - logger.info(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(f'Received unknown severity from ONTAP "{record["message"]["severity"]}". The message received is next.', "INFO") + sendAlert(message, "INFO") + changedEvents = True event = { "index": record["index"], @@ -738,14 +881,13 @@ Resources: "message": record["log_message"], "refresh": eventResilience } - print(message) events.append(event) # # Now that we have processed all the events, check to see if any events should be deleted. i = 0 while i < len(events): if events[i]["refresh"] <= 0: - print(f'Deleting event: {events[i]["time"]} : {events[i]["message"]}') + logger.debug(f'Deleting event: {events[i]["time"]} : {events[i]["message"]}') del events[i] changedEvents = True else: @@ -758,32 +900,170 @@ Resources: if changedEvents: s3Client.put_object(Key=config["emsEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8')) else: - print(f'API call to {endpoint} failed. 
HTTP status code: {response.status}.') - logger.debug(f'API call to {endpoint} failed. HTTP status code: {response.status}.') + logger.warning(f'API call to {endpoint} failed. HTTP status code: {response.status}.') ################################################################################ # This function is used to find an existing SM relationship based on the source # and destinatino path passed in. It returns None if one isn't found ################################################################################ - def getPreviousSMRecord(relationShips, sourceCluster, sourcePath, destPath): + def getPreviousSMRecord(relationShips, uuid): for relationship in relationShips: - if relationship['sourcePath'] == sourcePath and relationship['destPath'] == destPath and relationship['sourceCluster'] == sourceCluster: + if relationship.get('uuid') == uuid: relationship['refresh'] = True return(relationship) return(None) + ################################################################################ + # This function will convert seconds into an ascii string of number days, hours, + # minutes, and seconds. It will return the string. 
+ ################################################################################ + def lagTimeStr(seconds): + days = seconds // (60 * 60 * 24) + seconds = seconds - (days * (60 * 60 * 24)) + hours = seconds // (60 * 60) + seconds = seconds - (hours * (60 * 60)) + minutes = seconds // 60 + seconds = seconds - (minutes * 60) + + timeStr="" + if days > 0: + plural = "s" if days != 1 else "" + timeStr = f'{days} day{plural} ' + if hours > 0 or days > 0: + plural = "s" if hours != 1 else "" + timeStr += f'{hours} hour{plural} ' + if minutes > 0 or days > 0 or hours > 0: + plural = "s" if minutes != 1 else "" + timeStr += f'{minutes} minute{plural} and ' + plural = "s" if seconds != 1 else "" + timeStr += f'{seconds} second{plural}' + return timeStr + + ################################################################################ + # This function converts an array of numbers to a comma separated string. If + # the array is empty, it returns "*". + ################################################################################ + def convertArrayToString(array): + + text = "" + for item in array: + if text != "": + text += "," + text += str(item) + + return text if text != "" else "*" + + ################################################################################ + # This function takes a schedule dictionary and returns the last time it should + # run. It returns the time in seconds since the UNIX epoch. + ################################################################################ + def getLastRunTime(scheduleUUID): + global config, http, headers, clusterName, clusterVersion, logger, clusterTimezone + + minutes = "" + hours = "" + months = "" + daysOfMonth = "" + daysOfWeek = "" + # + # Run the API call to get the schedule information. 
+ endpoint = f'https://{config["OntapAdminServer"]}/api/cluster/schedules/{scheduleUUID}?fields=*' + response = http.request('GET', endpoint, headers=headers) + if response.status == 200: + schedule = json.loads(response.data) + + if schedule['cron'].get("minutes") is not None: + minutes = convertArrayToString(schedule['cron']['minutes']) + else: + minutes = "*" + + if schedule['cron'].get("hours") is not None: + hours = convertArrayToString(schedule['cron']['hours']) + else: + hours = "*" + + if schedule['cron'].get("days") is not None: + daysOfMonth = convertArrayToString(schedule['cron']['days']) + else: + daysOfMonth = "*" + + if schedule['cron'].get("months") is not None: + months = convertArrayToString(schedule['cron']['months']) + else: + months = "*" + + if schedule['cron'].get("weekdays") is not None: + daysOfWeek = convertArrayToString(schedule['cron']['weekdays']) + else: + daysOfWeek = "*" + # + # Create the cron expression. + cron_expression = f"{minutes} {hours} {daysOfMonth} {months} {daysOfWeek}" + # + # Initialize CronSim with the cron expression and current time. + curTime = datetime.datetime.now(pytz.timezone(clusterTimezone) if clusterTimezone != None else datetime.timezone.utc) + curTimeSec = curTime.timestamp() + it = CronSim(cron_expression, curTime, reverse=True) + # + # Get the last run time. + lastRunTime = next(it) + lastRunTimeSec = lastRunTime.timestamp() + return int(lastRunTimeSec) + else: + logger.error(f'API call to {endpoint} failed. HTTP status code: {response.status}.') + return -1 + + ################################################################################ + ################################################################################ + def getPolicySchedule(policyUUID): + global config, http, headers, clusterName, clusterVersion, logger + + # Run the API call to get the policy information. 
+ endpoint = f'https://{config["OntapAdminServer"]}/api/snapmirror/policies/{policyUUID}?fields=*' + response = http.request('GET', endpoint, headers=headers) + if response.status == 200: + data = json.loads(response.data) + if data.get('transfer_schedule') != None: + return data['transfer_schedule']['uuid'] + else: + return None + else: + logger.error(f'API call to {endpoint} failed. HTTP status code: {response.status}.') + return None + + ################################################################################ + # This function is used to find the last time a SnapMirror relationship should + # have been updated. It returns the time in seconds since the UNIX epoch. + ################################################################################ + def getLastScheduledUpdate(record): + global config, http, headers, clusterName, clusterVersion, logger + # + # First check to see if there is a schedule associated with the SM relationship. + if record.get("transfer_schedule") is not None: + lastRunTime = getLastRunTime(record["transfer_schedule"]["uuid"]) + else: + # + # If there is no schedule at the relationship level, check to see + # if the policy has one. + scheduleUUID = getPolicySchedule(record["policy"]["uuid"]) + if scheduleUUID is not None: + lastRunTime = getLastRunTime(scheduleUUID) + else: + lastRunTime = -1 + return lastRunTime + ################################################################################ # This function is used to check SnapMirror relationships. ################################################################################ def processSnapMirrorRelationships(service): - global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger + global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger, clusterTimezone # # Get the saved events so we can ensure we are only reporting on new ones. 
try: data = s3Client.get_object(Key=config["smEventsFilename"], Bucket=config["s3BucketName"]) except botocore.exceptions.ClientError as err: - # If the error is that the object doesn't exist, then it will get created once an alert it sent. + # If the error is that the object doesn't exist, then it will get created once an alert is sent. if err.response['Error']['Code'] == "NoSuchKey": events = [] else: @@ -801,7 +1081,7 @@ Resources: try: data = s3Client.get_object(Key=config["smRelationshipsFilename"], Bucket=config["s3BucketName"]) except botocore.exceptions.ClientError as err: - # If the error is that the object doesn't exist, then it will get created once an alert it sent. + # If the error is that the object doesn't exist, then it will get created once an alert is sent. if err.response['Error']['Code'] == "NoSuchKey": smRelationships = [] else: @@ -816,112 +1096,154 @@ Resources: updateRelationships = False # # Get the current time in seconds since UNIX epoch 01/01/1970. - curTime = int(datetime.datetime.now().timestamp()) + curTimeSeconds = int(datetime.datetime.now(pytz.timezone(clusterTimezone) if clusterTimezone != None else datetime.timezone.utc).timestamp()) + # + # Consolidate all the rules so we can decide how to process lagtime. + maxLagTime = None + maxLagTimePercent = None + healthy = None + stalledTransferSeconds = None + offline = None + for rule in service["rules"]: + for key in rule.keys(): + lkey = key.lower() + if lkey == "maxlagtime": + maxLagTime = rule[key] + maxLagTimeKey = key + elif lkey == "maxlagtimepercent": + maxLagTimePercent = rule[key] + maxLagTimePercentKey = key + elif lkey == "healthy": + healthy = rule[key] + healthyKey = key + elif lkey == "stalledtransferseconds": + stalledTransferSeconds = rule[key] + stalledTransferSecondsKey = key + else: + logger.warning(f'Unknown snapmirror alert type: "{key}".') # # Run the API call to get the current state of all the snapmirror relationships.
endpoint = f'https://{config["OntapAdminServer"]}/api/snapmirror/relationships?fields=*' response = http.request('GET', endpoint, headers=headers) if response.status == 200: data = json.loads(response.data) - for record in data["records"]: - for rule in service["rules"]: - for key in rule.keys(): - lkey = key.lower() - # - # If the source cluster isn't defined, then assume it is a local SM relationship. - sourceCluster = record['source'].get('cluster') - if sourceCluster == None: - sourceClusterName = clusterName - else: - sourceClusterName = sourceCluster['name'] - - if lkey == "maxlagtime": - if record.get("lag_time") != None: - lagSeconds = parseLagTime(record["lag_time"]) - if lagSeconds > rule["maxLagTime"]: - uniqueIdentifier = record["uuid"] + "_" + key + # + # Since there are multiple ways to process lag time, make sure to only do it one way for each relationship. + processedLagTime = False + # + # If the source cluster isn't defined, then assume it is a local SM relationship. + if record['source'].get('cluster') is None: + sourceClusterName = clusterName + else: + sourceClusterName = record['source']['cluster']['name'] + # + # For lag time if maxLagTimePercent is defined check to see if there is a schedule, + # if there is a schedule alert on that otherwise alert on the maxLagTime. + # But, first check that lag_time is defined, and that the state is not "uninitialized", + # since the lag_time is set to the oldest snapshot of the source volume which would + # cause a false positive.
+ if record.get("lag_time") is not None and record["state"].lower() != "uninitialized": + lagSeconds = parseLagTime(record["lag_time"]) + if maxLagTimePercent is not None: + lastScheduledUpdate = getLastScheduledUpdate(record) + if lastScheduledUpdate != -1: + processedLagTime = True + if lagSeconds > ((curTimeSeconds - lastScheduledUpdate) * maxLagTimePercent/100): + # + # If the transfer is in progress, and they have stalled transfer alert enabled, we don't need to alert on the lag time. + if not (record.get("transfer") is not None and record["transfer"]["state"].lower() in ["transferring", "finalizing", "preparing", "fasttransferring"] and stalledTransferSeconds is not None): + uniqueIdentifier = record["uuid"] + "_" + maxLagTimePercentKey if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. - message = f'Snapmirror Lag Alert: {sourceClusterName}::{record["source"]["path"]} -> {clusterName}::{record["destination"]["path"]} has a lag time of {lagSeconds} seconds.' - logger.warning(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + timeStr = lagTimeStr(lagSeconds) + asciiTime = datetime.datetime.fromtimestamp(lastScheduledUpdate).strftime('%Y-%m-%d %H:%M:%S') + message = f'Snapmirror Lag Alert: {sourceClusterName}::{record["source"]["path"]} -> {clusterName}::{record["destination"]["path"]} has a lag time of {lagSeconds} seconds ({timeStr}) which is more than {maxLagTimePercent}% of its last scheduled update at {asciiTime}.' + sendAlert(message, "WARNING") changedEvents=True event = { "index": uniqueIdentifier, "message": message, "refresh": eventResilience } - print(message) events.append(event) - elif lkey == "healthy": - if not record["healthy"]: - uniqueIdentifier = record["uuid"] + "_" + key - if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. 
- message = f'Snapmirror Health Alert: {sourceClusterName}::{record["source"]["path"]} {clusterName}::{record["destination"]["path"]} has a status of {record["healthy"]}' - logger.warning(message) # Intentionally put this before adding the reasons, since I'm not sure how syslog will handle a multi-line message. - for reason in record["unhealthy_reason"]: - message += "\n" + reason["message"] - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') - changedEvents=True - event = { - "index": uniqueIdentifier, - "message": message, - "refresh": eventResilience - } - print(message) - events.append(event) - elif lkey == "stalledtransferseconds": - if record.get('transfer') and record['transfer']['state'].lower() == "transferring": - sourcePath = record['source']['path'] - destPath = record['destination']['path'] - bytesTransferred = record['transfer']['bytes_transferred'] - - prevRec = getPreviousSMRecord(smRelationships, sourceClusterName, sourcePath, destPath) - - if prevRec != None: - timeDiff=curTime - prevRec["time"] - print(f'transfer bytes last time:{prevRec["bytesTransferred"]} this time:{bytesTransferred} and {timeDiff} > {rule[key]}') - if prevRec['bytesTransferred'] == bytesTransferred: - if (curTime - prevRec['time']) > rule[key]: - uniqueIdentifier = record['uuid'] + "_" + "transfer" - - if not eventExist(events, uniqueIdentifier): - message = f'Snapmiorror transfer has stalled: {sourceClusterName}::{sourcePath} -> {clusterName}::{destPath}.' 
- logger.warning(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject='Monitor ONTAP Services Alert for cluster {clusterName}') - changedEvents=True - event = { - "index": uniqueIdentifier, - "message": message, - "refresh": eventResilience - } - print(message) - events.append(event) - else: - prevRec['time'] = curTime - prevRec['refresh'] = True - prevRec['bytesTransferred'] = bytesTransferred - updateRelationships = True - else: - prevRec = { - "time": curTime, - "refresh": True, - "bytesTransferred": bytesTransferred, - "sourcePath": sourcePath, - "destPath": destPath, - "sourceCluster": sourceClusterName - } - updateRelationships = True - smRelationships.append(prevRec) + + if maxLagTime is not None and not processedLagTime: + if lagSeconds > maxLagTime: + uniqueIdentifier = record["uuid"] + "_" + maxLagTimeKey + if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. + timeStr = lagTimeStr(lagSeconds) + message = f'Snapmirror Lag Alert: {sourceClusterName}::{record["source"]["path"]} -> {clusterName}::{record["destination"]["path"]} has a lag time of {lagSeconds} seconds, or {timeStr} which is more than {maxLagTime}.' + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + + if healthy is not None: + if not healthy and not record["healthy"]: # Report on "not healthy" and the status is "not healthy" + uniqueIdentifier = record["uuid"] + "_" + healthyKey + if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. + message = f'Snapmirror Health Alert: {sourceClusterName}::{record["source"]["path"]} {clusterName}::{record["destination"]["path"]} has a status of {record["healthy"]}.' 
+ for reason in record["unhealthy_reason"]: + message += "\n" + reason["message"] + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + + if stalledTransferSeconds is not None: + if record.get('transfer') is not None and record['transfer']['state'].lower() == "transferring": + transferUuid = record['transfer']['uuid'] + bytesTransferred = record['transfer']['bytes_transferred'] + prevRec = getPreviousSMRecord(smRelationships, transferUuid) # This reset the "refresh" field if found. + if prevRec != None: + timeDiff=curTimeSeconds - prevRec["time"] + if prevRec['bytesTransferred'] == bytesTransferred: + if (curTimeSeconds - prevRec['time']) > stalledTransferSeconds: + uniqueIdentifier = record['uuid'] + "_" + "transfer" + + if not eventExist(events, uniqueIdentifier): + message = f"Snapmiorror transfer has stalled: {sourceClusterName}::{record['source']['path']} -> {clusterName}::{record['destination']['path']}." + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + else: + prevRec['time'] = curTimeSeconds + prevRec['refresh'] = True + prevRec['bytesTransferred'] = bytesTransferred + updateRelationships = True else: - message = f'Unknown snapmirror alert type: "{key}".' - logger.warning(message) - print(message) + prevRec = { + "time": curTimeSeconds, + "refresh": True, + "bytesTransferred": bytesTransferred, + "uuid": transferUuid + } + updateRelationships = True + smRelationships.append(prevRec) # # After processing the records, see if any SM relationships need to be removed. 
i = 0 while i < len(smRelationships): if not smRelationships[i]["refresh"]: + relationshipId = smRelationships[i].get("uuid") + if relationshipId is None: + id="Old format" + else: + id = relationshipId + logger.debug(f'Deleting smRelationship: {id}') del smRelationships[i] updateRelationships = True else: @@ -935,7 +1257,7 @@ Resources: i = 0 while i < len(events): if events[i]["refresh"] <= 0: - print(f'Deleting event: {events[i]["message"]}') + logger.debug(f'Deleting event: {events[i]["message"]}') del events[i] changedEvents = True else: @@ -948,7 +1270,7 @@ Resources: if(changedEvents): s3Client.put_object(Key=config["smEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8')) else: - print(f'API call to {endpoint} failed. HTTP status code {response.status}.') + logger.warning(f'API call to {endpoint} failed. HTTP status code {response.status}.') ################################################################################ # This function is used to check all the volume and aggregate utlization. @@ -973,71 +1295,112 @@ Resources: # Decrement the refresh field to know if any records have really gone away. for event in events: event["refresh"] -= 1 + # + # Run the API call to get the physical storage used. + endpoint = f'https://{config["OntapAdminServer"]}/api/storage/aggregates?fields=space' + aggrResponse = http.request('GET', endpoint, headers=headers) + if aggrResponse.status != 200: + logger.error(f'API call to {endpoint} failed. HTTP status code {aggrResponse.status}.') + aggrResponse = None + # + # Run the API call to get the volume information. + endpoint = f'https://{config["OntapAdminServer"]}/api/storage/volumes?fields=space,files,svm,state' + volumeResponse = http.request('GET', endpoint, headers=headers) + if volumeResponse.status != 200: + logger.error(f'API call to {endpoint} failed. HTTP status code {volumeResponse.status}.') + volumeResponse = None + # + # If both API calls failed, no point on continuing. 
+ if volumeResponse is None and aggrResponse is None: + return for rule in service["rules"]: for key in rule.keys(): lkey=key.lower() if lkey == "aggrwarnpercentused" or lkey == 'aggrcriticalpercentused': - # - # Run the API call to get the physical storage used. - endpoint = f'https://{config["OntapAdminServer"]}/api/storage/aggregates?fields=space' - response = http.request('GET', endpoint, headers=headers) - if response.status == 200: - data = json.loads(response.data) + if aggrResponse is not None: + data = json.loads(aggrResponse.data) for aggr in data["records"]: if aggr["space"]["block_storage"]["used_percent"] >= rule[key]: uniqueIdentifier = aggr["uuid"] + "_" + key if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. alertType = 'Warning' if lkey == "aggrwarnpercentused" else 'Critical' message = f'Aggregate {alertType} Alert: Aggregate {aggr["name"]} on {clusterName} is {aggr["space"]["block_storage"]["used_percent"]}% full, which is more or equal to {rule[key]}% full.' - logger.warning(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "WARNING") changedEvents = True event = { "index": uniqueIdentifier, "message": message, "refresh": eventResilience } - print(event) + logger.debug(event) events.append(event) - else: - print(f'API call to {endpoint} failed. HTTP status code {response.status}.') elif lkey == "volumewarnpercentused" or lkey == "volumecriticalpercentused": - # - # Run the API call to get the volume information. 
- endpoint = f'https://{config["OntapAdminServer"]}/api/storage/volumes?fields=space,svm' - response = http.request('GET', endpoint, headers=headers) - if response.status == 200: - data = json.loads(response.data) + if volumeResponse is not None: + data = json.loads(volumeResponse.data) for record in data["records"]: if record["space"].get("percent_used"): if record["space"]["percent_used"] >= rule[key]: uniqueIdentifier = record["uuid"] + "_" + key if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. alertType = 'Warning' if lkey == "volumewarnpercentused" else 'Critical' - message = f'Volume Usage {alertType} Alert: volume {record["svm"]["name"]}:/{record["name"]} on {clusterName} is {record["space"]["percent_used"]}% full, which is more or equal to {rule[key]}% full.' - logger.warning(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + message = f'Volume Usage {alertType} Alert: volume {record["svm"]["name"]}:{record["name"]} on {clusterName} is {record["space"]["percent_used"]}% full, which is more or equal to {rule[key]}% full.' + sendAlert(message, "WARNING") changedEvents = True event = { "index": uniqueIdentifier, "message": message, "refresh": eventResilience } - print(message) events.append(event) - else: - print(f'API call to {endpoint} failed. HTTP status code {response.status}.') + elif lkey == "volumewarnfilespercentused" or lkey == "volumecriticalfilespercentused": + if volumeResponse is not None: + data = json.loads(volumeResponse.data) + for record in data["records"]: + # + # If a volume is offline, the API will not report the "files" information. 
+ if record.get("files") is not None: + maxFiles = record["files"].get("maximum") + usedFiles = record["files"].get("used") + if maxFiles != None and usedFiles != None: + percentUsed = (usedFiles / maxFiles) * 100 + if percentUsed >= rule[key]: + uniqueIdentifier = record["uuid"] + "_" + key + if not eventExist(events, uniqueIdentifier): + alertType = 'Warning' if lkey == "volumewarnfilespercentused" else 'Critical' + message = f"Volume File (inode) Usage {alertType} Alert: volume {record['svm']['name']}:{record['name']} on {clusterName} is using {percentUsed:.0f}% of it's inodes, which is more or equal to {rule[key]}% utilization." + sendAlert(message, "WARNING") + changedEvents = True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + elif lkey == "offline": + data = json.loads(volumeResponse.data) + for record in data["records"]: + if rule[key] and record["state"].lower() == "offline": + uniqueIdentifier = f'{record["uuid"]}_{key}_{rule[key]}' + if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. + message = f"Volume Offline Alert: volume {record['svm']['name']}:{record['name']} on {clusterName} is offline." + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) else: message = f'Unknown storage alert type: "{key}".' logger.warning(message) - print(message) # # After processing the records, see if any events need to be removed. 
i = 0 while i < len(events): if events[i]["refresh"] <= 0: - print(f'Deleting event: {events[i]["message"]}') + logger.debug(f'Deleting event: {events[i]["message"]}') del events[i] changedEvents = True else: @@ -1050,6 +1413,56 @@ Resources: if(changedEvents): s3Client.put_object(Key=config["storageEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8')) + ################################################################################ + # This function sends the message to the various alerting systems. + ################################################################################ + def sendAlert(message, severity): + global config, snsClient, logger, cloudWatchClient + + + if severity == "CRITICAL": + logger.critical(message) + elif severity == "ERROR": + logger.error(message) + elif severity == "WARNING": + logger.warning(message) + elif severity == "INFO": + logger.info(message) + elif severity == "DEBUG": + logger.debug(message) + else: + logger.info(message) + + snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'{severity}: Monitor ONTAP Services Alert for cluster {clusterName}') + + if cloudWatchClient is not None: + # + # Create a new log stream for the current day if it doesn't exist. + dateStr = datetime.datetime.now().strftime("%Y-%m-%d") + logStreamName = f'{clusterName}-monitor-ontap-services-{dateStr}' + # + # Don't ask me why AWS puts a ":*" at the end of the log group ARN, but they do. + logGroupName = config["cloudWatchLogGroupArn"].split(":")[-2] if config["cloudWatchLogGroupArn"].endswith(":*") else config["cloudWatchLogGroupArn"].split(":")[-1] + # + # Check to see if the log stream already exists. 
+ logStreams = cloudWatchClient.describe_log_streams(logGroupName=logGroupName, logStreamNamePrefix=logStreamName) + if len(logStreams["logStreams"]) == 0: + cloudWatchClient.create_log_stream( + logGroupName=logGroupName, + logStreamName=logStreamName) + # + # Send the message to CloudWatch. + cloudWatchClient.put_log_events( + logGroupName=logGroupName, + logStreamName=logStreamName, + logEvents=[ + { + 'timestamp': int(datetime.datetime.now().timestamp() * 1000), + 'message': message + }, + ] + ) + ################################################################################ # This function is used to check utilization of quota limits. ################################################################################ @@ -1087,18 +1500,18 @@ Resources: # # Since the quota report might not have the files key, and even if it does, it might not have # the hard_limit_percent" key, need to check for their existencae first. - if(record.get("files") != None and record["files"]["used"].get("hard_limit_percent") != None and + if(record.get("files") is not None and record["files"]["used"].get("hard_limit_percent") is not None and record["files"]["used"]["hard_limit_percent"] > rule[key]): uniqueIdentifier = str(record["index"]) + "_" + key if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. - if record.get("qtree") != None: + if record.get("qtree") is not None: qtree=f' under qtree: {record["qtree"]["name"]} ' else: qtree=' ' - if record.get("users") != None: + if record.get("users") is not None: users=None for user in record["users"]: - if users == None: + if users is None: users = user["name"] else: users += ',{user["name"]}' @@ -1106,29 +1519,28 @@ Resources: else: user='' message = f'Quota Inode Usage Alert: Quota of type "{record["type"]}" on {record["svm"]["name"]}:/{record["volume"]["name"]}{qtree}{user}on {clusterName} is using {record["files"]["used"]["hard_limit_percent"]}% which is more than {rule[key]}% of its inodes.' 
- logger.warning(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "WARNING") changedEvents=True event = { "index": uniqueIdentifier, "message": message, "refresh": eventResilience } - print(message) + logger.debug(message) events.append(event) elif lkey == "maxhardquotaspacepercentused": - if(record.get("space") != None and record["space"]["used"].get("hard_limit_percent") and + if(record.get("space") is not None and record["space"]["used"].get("hard_limit_percent") and record["space"]["used"]["hard_limit_percent"] >= rule[key]): uniqueIdentifier = str(record["index"]) + "_" + key if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. - if record.get("qtree") != None: + if record.get("qtree") is not None: qtree=f' under qtree: {record["qtree"]["name"]} ' else: qtree=" " - if record.get("users") != None: + if record.get("users") is not None: users=None for user in record["users"]: - if users == None: + if users is None: users = user["name"] else: users += ',{user["name"]}' @@ -1136,29 +1548,28 @@ Resources: else: user='' message = f'Quota Space Usage Alert: Hard quota of type "{record["type"]}" on {record["svm"]["name"]}:/{record["volume"]["name"]}{qtree}{user}on {clusterName} is using {record["space"]["used"]["hard_limit_percent"]}% which is more than {rule[key]}% of its allocaed space.' 
- logger.warning(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "WARNING") changedEvents=True event = { "index": uniqueIdentifier, "message": message, "refresh": eventResilience } - print(message) + logger.debug(message) events.append(event) elif lkey == "maxsoftquotaspacepercentused": - if(record.get("space") != None and record["space"]["used"].get("soft_limit_percent") and + if(record.get("space") is not None and record["space"]["used"].get("soft_limit_percent") and record["space"]["used"]["soft_limit_percent"] >= rule[key]): uniqueIdentifier = str(record["index"]) + "_" + key if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. - if record.get("qtree") != None: + if record.get("qtree") is not None: qtree=f' under qtree: {record["qtree"]["name"]} ' else: qtree=" " - if record.get("users") != None: + if record.get("users") is not None: users=None for user in record["users"]: - if users == None: + if users is None: users = user["name"] else: users += ',{user["name"]}' @@ -1166,26 +1577,24 @@ Resources: else: user='' message = f'Quota Space Usage Alert: Soft quota of type "{record["type"]}" on {record["svm"]["name"]}:/{record["volume"]["name"]}{qtree}{user}on {clusterName} is using {record["space"]["used"]["soft_limit_percent"]}% which is more than {rule[key]}% of its allocaed space.' - logger.info(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "WARNING") changedEvents=True event = { "index": uniqueIdentifier, "message": message, "refresh": eventResilience } - print(message) + logger.debug(message) events.append(event) else: message = f'Unknown quota matching condition type "{key}".' logger.warning(message) - print(message) # # After processing the records, see if any events need to be removed. 
i=0 while i < len(events): if events[i]["refresh"] <= 0: - print(f'Deleting event: {events[i]["message"]}') + logger.debug(f'Deleting event: {events[i]["message"]}') del events[i] changedEvents = True else: @@ -1198,7 +1607,135 @@ Resources: if(changedEvents): s3Client.put_object(Key=config["quotaEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8')) else: - print(f'API call to {endpoint} failed. HTTP status code {response.status}.') + logger.error(f'API call to {endpoint} failed. HTTP status code {response.status}.') + + ################################################################################ + ################################################################################ + def processVserver(service): + global config, s3Client, snsClient, http, headers, clusterName, logger + + changedEvents=False + # + # Get the saved events so we can ensure we are only reporting on new ones. + try: + data = s3Client.get_object(Key=config["vserverEventsFilename"], Bucket=config["s3BucketName"]) + except botocore.exceptions.ClientError as err: + # If the error is that the object doesn't exist, then it will get created once an alert it sent. + if err.response['Error']['Code'] == "NoSuchKey": + events = [] + else: + raise err + else: + events = json.loads(data["Body"].read().decode('UTF-8')) + # + # Decrement the refresh field to know if any records have really gone away. + for event in events: + event["refresh"] -= 1 + # + # Consolidate the rules + vserverState = None + nfsProtocolState = None + cifsProtocolState = None + for rule in service["rules"]: + for key in rule.keys(): + lkey = key.lower() # Convert to all lower case so the key can be case insensitive. 
+ if lkey == "vserverstate": + vserverState = rule[key] + vserverStateKey = key + elif lkey == "nfsprotocolstate": + nfsProtocolState = rule[key] + nfsProtocolStateKey = key + elif lkey == "cifsprotocolstate": + cifsProtocolState = rule[key] + cifsProtocolStateKey = key + # + # Check for any vservers that are down. + if vserverState is not None and vserverState: + # + # Run the API call to get the vserver state for each vserver. + endpoint = f'https://{config["OntapAdminServer"]}/api/svm/svms?fields=state' + response = http.request('GET', endpoint, headers=headers) + if response.status == 200: + data = json.loads(response.data) + for record in data["records"]: + if record["state"].lower() != "running": + uniqueIdentifier = str(record["uuid"]) + "_" + vserverStateKey + if not eventExist(events, uniqueIdentifier): + message = f'SVM State Alert: SVM {record["name"]} on {clusterName} is not online.' + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + else: + logger.error(f'API call to {endpoint} failed. HTTP status code {response.status}.') + + if nfsProtocolState is not None and nfsProtocolState: + # + # Run the API call to get the NFS protocol state for each vserver. + endpoint = f'https://{config["OntapAdminServer"]}/api/protocols/nfs/services?fields=state' + response = http.request('GET', endpoint, headers=headers) + if response.status == 200: + data = json.loads(response.data) + for record in data["records"]: + if record["state"].lower() != "online": + uniqueIdentifier = str(record["svm"]["uuid"]) + "_" + nfsProtocolStateKey + if not eventExist(events, uniqueIdentifier): + message = f'NFS Protocol State Alert: NFS protocol on {record["svm"]["name"]} on {clusterName} is not online.' 
+ sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + else: + logger.error(f'API call to {endpoint} failed. HTTP status code {response.status}.') + + if cifsProtocolState is not None and cifsProtocolState: + # + # Run the API call to get the NFS protocol state for each vserver. + endpoint = f'https://{config["OntapAdminServer"]}/api/protocols/cifs/services?fields=enabled' + response = http.request('GET', endpoint, headers=headers) + if response.status == 200: + data = json.loads(response.data) + for record in data["records"]: + if not record["enabled"]: + uniqueIdentifier = str(record["svm"]["uuid"]) + "_" + cifsProtocolStateKey + if not eventExist(events, uniqueIdentifier): + message = f'CIFS Protocol State Alert: CIFS protocol on {record["svm"]["name"]} on {clusterName} is not online.' + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + else: + logger.error(f'API call to {endpoint} failed. HTTP status code {response.status}.') + + # + # After processing the records, see if any events need to be removed. + i=0 + while i < len(events): + if events[i]["refresh"] <= 0: + logger.debug(f'Deleting event: {events[i]["message"]}') + del events[i] + changedEvents = True + else: + # If an event wasn't refreshed, then we need to save the new refresh count. + if events[i]["refresh"] != eventResilience: + changedEvents = True + i += 1 + # + # If the events array changed, save it. + if(changedEvents): + s3Client.put_object(Key=config["vserverEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8')) ################################################################################ # This function returns the index of the service in the conditions dictionary. 
@@ -1210,7 +1747,7 @@ Resources: if conditions["services"][i]["name"] == targetService: return i i += 1 - + return None ################################################################################ @@ -1228,7 +1765,8 @@ Resources: {"name": "ems", "rules": []}, {"name": "snapmirror", "rules": []}, {"name": "storage", "rules": []}, - {"name": "quota", "rules": []} + {"name": "quota", "rules": []}, + {"name": "vserver", "rules": []} ]} # # Now, add rules based on the environment variables. @@ -1260,6 +1798,10 @@ Resources: value = int(value) if value > 0: conditions["services"][getServiceIndex("snapmirror", conditions)]["rules"].append({"maxLagTime": value}) + elif name == "initialSnapMirrorLagTimePercentAlert": + value = int(value) + if value > 0: + conditions["services"][getServiceIndex("snapmirror", conditions)]["rules"].append({"maxLagTimePercent": value}) elif name == "initialSnapMirrorStalledAlert": value = int(value) if value > 0: @@ -1280,6 +1822,19 @@ Resources: value = int(value) if value > 0: conditions["services"][getServiceIndex("storage", conditions)]["rules"].append({"volumeCriticalPercentUsed": value}) + elif name == "initialVolumeFileUtilizationWarnAlert": + value = int(value) + if value > 0: + conditions["services"][getServiceIndex("storage", conditions)]["rules"].append({"volumeWarnFilesPercentUsed": value}) + elif name == "initialVolumeFileUtilizationCriticalAlert": + value = int(value) + if value > 0: + conditions["services"][getServiceIndex("storage", conditions)]["rules"].append({"volumeCriticalFilesPercentUsed": value}) + elif name == "initialVolumeOfflineAlert": + if value == "true": + conditions["services"][getServiceIndex("storage", conditions)]["rules"].append({"offline": True}) + else: + conditions["services"][getServiceIndex("storage", conditions)]["rules"].append({"offline": False}) elif name == "initialSoftQuotaUtilizationAlert": value = int(value) if value > 0: @@ -1292,6 +1847,21 @@ Resources: value = int(value) if value > 0: 
conditions["services"][getServiceIndex("quota", conditions)]["rules"].append({"maxQuotaInodesPercentUsed": value}) + elif name == "initialVserverStateAlert": + if value == "true": + conditions["services"][getServiceIndex("vserver", conditions)]["rules"].append({"vserverState": True}) + else: + conditions["services"][getServiceIndex("vserver", conditions)]["rules"].append({"vserverState": False}) + elif name == "initialVserverNFSProtocolStateAlert": + if value == "true": + conditions["services"][getServiceIndex("vserver", conditions)]["rules"].append({"nfsProtocolState": True}) + else: + conditions["services"][getServiceIndex("vserver", conditions)]["rules"].append({"nfsProtocolState": False}) + elif name == "initialVserverCIFSProtocolStateAlert": + if value == "true": + conditions["services"][getServiceIndex("vserver", conditions)]["rules"].append({"cifsProtocolState": True}) + else: + conditions["services"][getServiceIndex("vserver", conditions)]["rules"].append({"cifsProtocolState": False}) return conditions @@ -1319,7 +1889,9 @@ Resources: "configFilename": None, "secretsManagerEndPointHostname": None, "snsEndPointHostname": None, + "cloudWatchLogsEndPointHostname": None, "syslogIP": None, + "cloudWatchLogGroupArn": None, "awsAccountId": None } @@ -1330,7 +1902,8 @@ Resources: "conditionsFilename": None, "storageEventsFilename": None, "quotaEventsFilename": None, - "systemStatusFilename": None + "systemStatusFilename": None, + "vserverEventsFilename": None } config = { @@ -1347,13 +1920,21 @@ Resources: for var in config: config[var] = os.environ.get(var) # - # Check to see if s3BacketArn was provided instead of s3BucketName. - if config["s3BucketName"] == None and os.environ.get("s3BucketArn") != None: + # Since the CloudFormation template will set the environment variables + # to an empty string if someone doesn't provide a value, reset the + # values back to None. 
+ for var in config: + if config[var] == "": + config[var] = None + # + # Since CloudFormation has to pass an ARN, get the Bucket name from it. + # Too bad the bucket ARN doesn't include the region, like most (all?) the others do. + if config["s3BucketName"] is None and os.environ.get("s3BucketArn") is not None: config["s3BucketName"] = os.environ.get("s3BucketArn").split(":")[-1] # # Check that required environmental variables are there. for var in requiredEnvVariables: - if config[var] == None: + if config[var] is None: raise Exception (f'\n\nMissing required environment variable "{var}".') # # Open a client to the s3 service. @@ -1361,14 +1942,9 @@ Resources: # # Calculate the config filename if it hasn't already been provided. defaultConfigFilename = config["OntapAdminServer"] + "-config" - if config["configFilename"] == None: + if config["configFilename"] is None: config["configFilename"] = defaultConfigFilename # - # Calculate the conditions filename if it hasn't already been provided. - defaultConditionsFilename = config["OntapAdminServer"] + "-conditions" - if config["conditionsFilename"] == None: - config["conditionsFilename"] = defaultConditionsFilename - # # Process the config file if it exist. try: lines = s3Client.get_object(Key=config["configFilename"], Bucket=config["s3BucketName"])['Body'].iter_lines() @@ -1377,7 +1953,7 @@ Resources: raise err else: if config["configFilename"] != defaultConfigFilename: - print(f"Warning, did not find file '{config['configFilename']}' in s3 bucket '{config['s3BucketName']}' in region '{config['s3BucketRegion']}'.") + logger.warning(f"Warning, did not find file '{config['configFilename']}' in s3 bucket '{config['s3BucketName']}' in region '{config['s3BucketRegion']}'.") else: # # While iterating through the file, get rid of any "export ", comments, blank lines, or anything else that isn't key=value. 
@@ -1392,42 +1968,39 @@ Resources: (key, value) = line.split("=") key = key.strip() value = value.strip() - # - # Preserve any environment variables settings. - if key in config: - if config[key] == None: - config[key] = value + if len(value) == 0: + logger.warning(f"Warning, empty value for key '{key}'. Ignored.") else: - print(f"Warning, unknown config parameter '{key}'.") + # + # Preserve any environment variables settings. + if key in config: + if config[key] is None: + config[key] = value + else: + logger.warning(f"Warning, unknown config parameter '{key}'.") # # Now, fill in the filenames for any that aren't already defined. for filename in filenameVariables: - if config[filename] == None: + if config[filename] is None: config[filename] = config["OntapAdminServer"] + "-" + filename.replace("Filename", "") # - # Define the endpoints if alternates weren't provided. - if config.get("secretArn") != None: + # Define endpoints if alternates weren't provided. + if config.get("secretArn") is not None and config["secretsManagerEndPointHostname"] is None: secretRegion = config["secretArn"].split(":")[3] - else: - # - # Give it a value so secretsManagerEndPointHostname can be set. The check for all variables will correctly error out because secretArn is missing. - secretRegion = "No-secretArn-was-provided" - if config["secretsManagerEndPointHostname"] == None or config["secretsManagerEndPointHostname"] == "": config["secretsManagerEndPointHostname"] = f'secretsmanager.{secretRegion}.amazonaws.com' - if config.get("snsTopicArn") != None: + if config.get("snsTopicArn") is not None and config["snsEndPointHostname"] is None: snsRegion = config["snsTopicArn"].split(":")[3] - else: - # - # Give it a value so snsEndPointHostname can be set. The check for all variables will correctly error out because snsTopicArn is missing. 
- snsRegion = "No-snsTopicArn-was-provided" - if config["snsEndPointHostname"] == None or config["snsEndPointHostname"] == "": config["snsEndPointHostname"] = f'sns.{snsRegion}.amazonaws.com' + + if config.get("cloudWatchLogGroupArn") is not None and config["cloudWatchLogsEndPointHostname"] is None: + cloudWatchRegion = config["cloudWatchLogGroupArn"].split(":")[3] + config["cloudWatchLogsEndPointHostname"] = f'logs.{cloudWatchRegion}.amazonaws.com' # # Now, check that all the configuration parameters have been set. for key in config: - if config[key] == None and key not in optionalVariables: - raise Exception(f'Missing configuration parameter "{key}".') + if config[key] is None and key not in optionalVariables: + raise Exception(f'\n\nMissing configuration parameter "{key}".\n\n') ################################################################################ # Main logic @@ -1435,18 +2008,30 @@ Resources: def lambda_handler(event, context): # # Define global variables so we don't have to pass them to all the functions. - global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger + global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger, cloudWatchClient, clusterTimezone + # + # Set up logging. + logger = logging.getLogger("mon_fsxn_service") + if lambdaFunction: + logger.setLevel(logging.INFO) # Anything at this level and above this get logged. + else: # Assume we are running in a test environment. + logger.setLevel(logging.DEBUG) # Anything at this level and above this get logged. + formatter = logging.Formatter( + fmt="%(name)s:%(funcName)s - Level:%(levelname)s - Message:%(message)s", + datefmt="%Y-%m-%d %H:%M:%S" + ) + loggerscreen = logging.StreamHandler() + loggerscreen.setFormatter(formatter) + logger.addHandler(loggerscreen) # # Read in the configuraiton. readInConfig() # This defines the s3Client variable. # - # Set up loging. 
- logger = logging.getLogger("mon_fsxn_service") - logger.setLevel(logging.DEBUG) # Anything at this level and above this get logged. - if config["syslogIP"] != None: + # Set up the logger to log to a file and to syslog. + if config["syslogIP"] is not None: # # Due to a bug with the SysLogHandler() of not sending proper framing with a message - # when using TCP (it should end it with a LF and not a NUL like it does now) you must add + # when using TCP (it should end it with a LF and not a NUL like it does now) you must add # an additional frame delimiter to the receiving syslog server. With rsyslog, you add # a AddtlFrameDelimiter="0" directive to the "input()" line where they have it listen # to a TCP port. For example: @@ -1454,7 +2039,7 @@ Resources: # # provides TCP syslog reception # module(load="imtcp") # input(type="imtcp" port="514" AddtlFrameDelimiter="0") - # + # # Because of this bug, I am going to stick with UDP, the default protocol used by # the syslog handler. If TCP is required, then the above changes will have to be made # to the syslog server. Or, the program will have to handle closing and opening the @@ -1484,12 +2069,12 @@ Resources: # Get the username and password of the ONTAP/FSxN system. 
secretsInfo = client.get_secret_value(SecretId=config["secretArn"]) secrets = json.loads(secretsInfo['SecretString']) - if secrets.get(config['secretUsernameKey']) == None: - print(f'Error, "{config["secretUsernameKey"]}" not found in secret "{config["secretArn"]}".') + if secrets.get(config['secretUsernameKey']) is None: + logger.critical(f'Error, "{config["secretUsernameKey"]}" not found in secret "{config["secretArn"]}".') return - if secrets.get(config['secretPasswordKey']) == None: - print(f'Error, "{config["secretPasswordKey"]}" not found in secret "{config["secretArn"]}".') + if secrets.get(config['secretPasswordKey']) is None: + logger.critical(f'Error, "{config["secretPasswordKey"]}" not found in secret "{config["secretArn"]}".') return username = secrets[config['secretUsernameKey']] @@ -1499,6 +2084,10 @@ Resources: #s3Client = boto3.client('s3', config["s3BucketRegion"]) # Defined in readInConfig() snsRegion = config["snsTopicArn"].split(":")[3] snsClient = boto3.client('sns', region_name=snsRegion, endpoint_url=f'https://{config["snsEndPointHostname"]}') + cloudWatchClient = None + if config["cloudWatchLogGroupArn"] is not None: + cloudWatchRegion = config["cloudWatchLogGroupArn"].split(":")[3] + cloudWatchClient = boto3.client('logs', region_name=cloudWatchRegion, endpoint_url=f'https://{config["cloudWatchLogsEndPointHostname"]}') # # Create a http handle to make ONTAP/FSxN API calls with. auth = urllib3.make_headers(basic_auth=f'{username}:{password}') @@ -1512,15 +2101,17 @@ Resources: # Get the conditions we know what to alert on. 
try: data = s3Client.get_object(Key=config["conditionsFilename"], Bucket=config["s3BucketName"]) + matchingConditions = json.loads(data["Body"].read().decode('UTF-8')) except botocore.exceptions.ClientError as err: if err.response['Error']['Code'] != "NoSuchKey": - print(f'\n\nError, could not retrieve configuration file {config["conditionsFilename"]} from: s3://{config["s3BucketName"]}.\nBelow is additional information:\n\n') + logger.error(f'Error, could not retrieve configuration file {config["conditionsFilename"]} from: s3://{config["s3BucketName"]}.\nBelow is additional information:') raise err else: matchingConditions = buildDefaultMatchingConditions() s3Client.put_object(Key=config["conditionsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(matchingConditions, indent=4).encode('UTF-8')) - else: - matchingConditions = json.loads(data["Body"].read().decode('UTF-8')) + except json.decoder.JSONDecodeError as err: + logger.error(f'Error, could not decode JSON from configuration file "{config["conditionsFilename"]}". 
The error message from the decoder:\n{err}\n') + return if(checkSystem()): # @@ -1536,11 +2127,13 @@ Resources: processStorageUtilization(service) elif service["name"].lower() == "quota": processQuotaUtilization(service) + elif service["name"].lower() == "vserver": + processVserver(service) else: - print(f'Unknown service "{service["name"]}".') + logger.warning(f'Unknown service "{service["name"]}".') return - if os.environ.get('AWS_LAMBDA_FUNCTION_NAME') == None: + if os.environ.get('AWS_LAMBDA_FUNCTION_NAME') is None: lambdaFunction = False lambda_handler(None, None) else: diff --git a/Monitoring/monitor-ontap-services/images/Monitoring_ONTAP_Services_Architecture-2.png b/Monitoring/monitor-ontap-services/images/Monitoring_ONTAP_Services_Architecture-2.png index c45d7bd2..4079fd32 100755 Binary files a/Monitoring/monitor-ontap-services/images/Monitoring_ONTAP_Services_Architecture-2.png and b/Monitoring/monitor-ontap-services/images/Monitoring_ONTAP_Services_Architecture-2.png differ diff --git a/Monitoring/monitor-ontap-services/lambda_layer.zip b/Monitoring/monitor-ontap-services/lambda_layer.zip new file mode 100644 index 00000000..ef9139fb Binary files /dev/null and b/Monitoring/monitor-ontap-services/lambda_layer.zip differ diff --git a/Monitoring/monitor-ontap-services/monitor_ontap_services.py b/Monitoring/monitor-ontap-services/monitor_ontap_services.py index 3c6baadb..616cd7ed 100755 --- a/Monitoring/monitor-ontap-services/monitor_ontap_services.py +++ b/Monitoring/monitor-ontap-services/monitor_ontap_services.py @@ -1,4 +1,4 @@ -#!/bin/python3.11 +#!/bin/python3 ################################################################################ # THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF @@ -26,8 +26,10 @@ import re import os import datetime +import pytz import logging from logging.handlers import SysLogHandler +from cronsim import CronSim import urllib3 
from urllib3.util import Retry import botocore @@ -36,7 +38,7 @@ eventResilience = 4 # Times an event has to be missing before it is removed # from the alert history. # This was added since the Ontap API that returns EMS - # events would often drop some events and then including + # events would often drop some events and then including # them in the subsequent calls. If I don't "age" the # alert history duplicate alerts will be sent. initialVersion = "Initial Run" # The version to store if this is the first @@ -79,7 +81,7 @@ def getNumber(string, start): elif string[end:endp1] == "M": num=num*60 elif string[end:endp1] != "S": - print(f'Unknown lag time specifier "{string[end:endp1]}".') + logger.warning(f'Unknown lag time specifier "{string[end:endp1]}".') return (num, endp1) @@ -145,7 +147,7 @@ def eventExist (events, uniqueIdentifier): # 'True'. ################################################################################ def checkSystem(): - global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger + global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger, clusterTimezone changedEvents = False # @@ -169,11 +171,11 @@ def checkSystem(): else: fsxStatus = json.loads(data["Body"].read().decode('UTF-8')) # - # Get the cluster name and ONTAP version from the FSxN. + # Get the cluster name, ONTAP version and timezone from the FSxN. # This is also a way to test that the FSxN cluster is accessible. 
badHTTPStatus = False try: - endpoint = f'https://{config["OntapAdminServer"]}/api/cluster?fields=version,name' + endpoint = f'https://{config["OntapAdminServer"]}/api/cluster?fields=version,name,timezone' response = http.request('GET', endpoint, headers=headers, timeout=5.0) if response.status == 200: if not fsxStatus["systemHealth"]: @@ -194,8 +196,10 @@ def checkSystem(): clusterVersion = data["version"]["full"].split()[2].replace(":", "") if fsxStatus["version"] == initialVersion: fsxStatus["version"] = clusterVersion + # + # Get the Timezone for SnapMirror lag time calculations. + clusterTimezone = data["timezone"]["name"] else: - print(f'API call to {endpoint} failed. HTTP status code: {response.status}.') badHTTPStatus = True raise Exception(f'API call to {endpoint} failed. HTTP status code: {response.status}.') except: @@ -208,14 +212,13 @@ def checkSystem(): message = f'CRITICAL: Received a non 200 HTTP status code ({response.status}) when trying to access {clusterName}.' else: message = f'CRITICAL: Failed to issue API against {clusterName}. Cluster could be down.' - logger.critical(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "CRITICAL") fsxStatus["systemHealth"] = False changedEvents = True if changedEvents: s3Client.put_object(Key=config["systemStatusFilename"], Bucket=config["s3BucketName"], Body=json.dumps(fsxStatus).encode('UTF-8')) - # + # # If the cluster is done, return false so the program can exit cleanly. return(fsxStatus["systemHealth"]) @@ -245,8 +248,7 @@ def checkSystemHealth(service): if lkey == "versionchange": if rule[key] and clusterVersion != fsxStatus["version"]: message = f'NOTICE: The ONTAP vesion changed on cluster {clusterName} from {fsxStatus["version"]} to {clusterVersion}.' 
- logger.info(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "INFO") fsxStatus["version"] = clusterVersion changedEvents = True elif lkey == "failover": @@ -260,12 +262,11 @@ def checkSystemHealth(service): data = json.loads(response.data) if data["num_records"] != fsxStatus["numberNodes"]: message = f'Alert: The number of nodes on cluster {clusterName} went from {fsxStatus["numberNodes"]} to {data["num_records"]}.' - logger.info(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "INFO") fsxStatus["numberNodes"] = data["num_records"] changedEvents = True else: - print(f'API call to {endpoint} failed. HTTP status code: {response.status}.') + logger.warning(f'API call to {endpoint} failed. HTTP status code: {response.status}.') elif lkey == "networkinterfaces": if rule[key]: endpoint = f'https://{config["OntapAdminServer"]}/api/network/ip/interfaces?fields=state' @@ -275,15 +276,14 @@ def checkSystemHealth(service): # Decrement the refresh field to know if any events have really gone away. for interface in fsxStatus["downInterfaces"]: interface["refresh"] -= 1 - + data = json.loads(response.data) for interface in data["records"]: if interface.get("state") != None and interface["state"] != "up": uniqueIdentifier = interface["name"] if(not eventExist(fsxStatus["downInterfaces"], uniqueIdentifier)): # Resets the refresh key. message = f'Alert: Network interface {interface["name"]} on cluster {clusterName} is down.' 
- logger.info(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "WARNING") event = { "index": uniqueIdentifier, "refresh": eventResilience @@ -295,7 +295,7 @@ def checkSystemHealth(service): i = 0 while i < len(fsxStatus["downInterfaces"]): if fsxStatus["downInterfaces"][i]["refresh"] <= 0: - print(f'Deleting downed interface: {fsxStatus["downInterfaces"][i]["index"]}') + logger.debug(f'Deleting interface: {fsxStatus["downInterfaces"][i]["index"]}') del fsxStatus["downInterfaces"][i] changedEvents = True else: @@ -303,9 +303,9 @@ def checkSystemHealth(service): changedEvents = True i += 1 else: - print(f'API call to {endpoint} failed. HTTP status code: {response.status}.') + logger.warning(f'API call to {endpoint} failed. HTTP status code: {response.status}.') else: - print(f'Unknown System Health alert type: "{key}".') + logger.warning(f'Unknown System Health alert type: "{key}".') if changedEvents: s3Client.put_object(Key=config["systemStatusFilename"], Bucket=config["s3BucketName"], Body=json.dumps(fsxStatus).encode('UTF-8')) @@ -345,27 +345,31 @@ def processEMSEvents(service): logger.debug(f'Received {len(data["records"])} EMS records.') for record in data["records"]: for rule in service["rules"]: - if (re.search(rule["name"], record["message"]["name"]) and + messageFilter = rule.get("filter") + if messageFilter == None or messageFilter == "": + messageFilter = "ThisShouldn'tMatchAnything" + + if (not re.search(messageFilter, record["log_message"]) and + re.search(rule["name"], record["message"]["name"]) and re.search(rule["severity"], record["message"]["severity"]) and re.search(rule["message"], record["log_message"])): if (not eventExist (events, record["index"])): # This resets the "refresh" field if found. 
message = f'{record["time"]} : {clusterName} {record["message"]["name"]}({record["message"]["severity"]}) - {record["log_message"]}' useverity=record["message"]["severity"].upper() if useverity == "EMERGENCY": - logger.critical(message) + sendAlert(message, "CRITICAL") elif useverity == "ALERT": - logger.error(message) - elif useverity == "ERROR": - logger.warning(message) + sendAlert(message, "ERROR") + elif useverity == "ERROR": + sendAlert(message, "WARNING") elif useverity == "NOTICE" or useverity == "INFORMATIONAL": - logger.info(message) + sendAlert(message, "INFO") elif useverity == "DEBUG": - logger.debug(message) + sendAlert(message, "DEBUG") else: - print(f'Received unknown severity from ONTAP "{record["message"]["severity"]}". The message received is next.') - logger.info(f'Received unknown severity from ONTAP "{record["message"]["severity"]}". The message received is next.') - logger.info(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(f'Received unknown severity from ONTAP "{record["message"]["severity"]}". The message received is next.', "INFO") + sendAlert(message, "INFO") + changedEvents = True event = { "index": record["index"], @@ -374,14 +378,13 @@ def processEMSEvents(service): "message": record["log_message"], "refresh": eventResilience } - print(message) events.append(event) # # Now that we have processed all the events, check to see if any events should be deleted. 
i = 0 while i < len(events): if events[i]["refresh"] <= 0: - print(f'Deleting event: {events[i]["time"]} : {events[i]["message"]}') + logger.debug(f'Deleting event: {events[i]["time"]} : {events[i]["message"]}') del events[i] changedEvents = True else: @@ -394,32 +397,170 @@ def processEMSEvents(service): if changedEvents: s3Client.put_object(Key=config["emsEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8')) else: - print(f'API call to {endpoint} failed. HTTP status code: {response.status}.') - logger.debug(f'API call to {endpoint} failed. HTTP status code: {response.status}.') + logger.warning(f'API call to {endpoint} failed. HTTP status code: {response.status}.') ################################################################################ # This function is used to find an existing SM relationship based on the source # and destinatino path passed in. It returns None if one isn't found ################################################################################ -def getPreviousSMRecord(relationShips, sourceCluster, sourcePath, destPath): +def getPreviousSMRecord(relationShips, uuid): for relationship in relationShips: - if relationship['sourcePath'] == sourcePath and relationship['destPath'] == destPath and relationship['sourceCluster'] == sourceCluster: + if relationship.get('uuid') == uuid: relationship['refresh'] = True return(relationship) return(None) +################################################################################ +# This function will convert seconds into an ascii string of number days, hours, +# minutes, and seconds. It will return the string. 
+################################################################################ +def lagTimeStr(seconds): + days = seconds // (60 * 60 * 24) + seconds = seconds - (days * (60 * 60 * 24)) + hours = seconds // (60 * 60) + seconds = seconds - (hours * (60 * 60)) + minutes = seconds // 60 + seconds = seconds - (minutes * 60) + + timeStr="" + if days > 0: + plural = "s" if days != 1 else "" + timeStr = f'{days} day{plural} ' + if hours > 0 or days > 0: + plural = "s" if hours != 1 else "" + timeStr += f'{hours} hour{plural} ' + if minutes > 0 or days > 0 or hours > 0: + plural = "s" if minutes != 1 else "" + timeStr += f'{minutes} minute{plural} and ' + plural = "s" if seconds != 1 else "" + timeStr += f'{seconds} second{plural}' + return timeStr + +################################################################################ +# This function converts an array of numbers to a comma separated string. If +# the array is empty, it returns "*". +################################################################################ +def convertArrayToString(array): + + text = "" + for item in array: + if text != "": + text += "," + text += str(item) + + return text if text != "" else "*" + +################################################################################ +# This function takes a schedule dictionary and returns the last time it should +# run. It returns the time in seconds since the UNIX epoch. +################################################################################ +def getLastRunTime(scheduleUUID): + global config, http, headers, clusterName, clusterVersion, logger, clusterTimezone + + minutes = "" + hours = "" + months = "" + daysOfMonth = "" + daysOfWeek = "" + # + # Run the API call to get the schedule information. 
+ endpoint = f'https://{config["OntapAdminServer"]}/api/cluster/schedules/{scheduleUUID}?fields=*' + response = http.request('GET', endpoint, headers=headers) + if response.status == 200: + schedule = json.loads(response.data) + + if schedule['cron'].get("minutes") is not None: + minutes = convertArrayToString(schedule['cron']['minutes']) + else: + minutes = "*" + + if schedule['cron'].get("hours") is not None: + hours = convertArrayToString(schedule['cron']['hours']) + else: + hours = "*" + + if schedule['cron'].get("days") is not None: + daysOfMonth = convertArrayToString(schedule['cron']['days']) + else: + daysOfMonth = "*" + + if schedule['cron'].get("months") is not None: + months = convertArrayToString(schedule['cron']['months']) + else: + months = "*" + + if schedule['cron'].get("weekdays") is not None: + daysOfWeek = convertArrayToString(schedule['cron']['weekdays']) + else: + daysOfWeek = "*" + # + # Create the cron expression. + cron_expression = f"{minutes} {hours} {daysOfMonth} {months} {daysOfWeek}" + # + # Initialize CronSim with the cron expression and current time. + curTime = datetime.datetime.now(pytz.timezone(clusterTimezone) if clusterTimezone != None else datetime.timezone.utc) + curTimeSec = curTime.timestamp() + it = CronSim(cron_expression, curTime, reverse=True) + # + # Get the last run time. + lastRunTime = next(it) + lastRunTimeSec = lastRunTime.timestamp() + return int(lastRunTimeSec) + else: + logger.error(f'API call to {endpoint} failed. HTTP status code: {response.status}.') + return -1 + +################################################################################ +################################################################################ +def getPolicySchedule(policyUUID): + global config, http, headers, clusterName, clusterVersion, logger + + # Run the API call to get the policy information. 
+ endpoint = f'https://{config["OntapAdminServer"]}/api/snapmirror/policies/{policyUUID}?fields=*' + response = http.request('GET', endpoint, headers=headers) + if response.status == 200: + data = json.loads(response.data) + if data.get('transfer_schedule') != None: + return data['transfer_schedule']['uuid'] + else: + return None + else: + logger.error(f'API call to {endpoint} failed. HTTP status code: {response.status}.') + return None + +################################################################################ +# This function is used to find the last time a SnapMirror relationship should +# have been updated. It returns the time in seconds since the UNIX epoch. +################################################################################ +def getLastScheduledUpdate(record): + global config, http, headers, clusterName, clusterVersion, logger + # + # First check to see if there is a schedule associated with the SM relationship. + if record.get("transfer_schedule") is not None: + lastRunTime = getLastRunTime(record["transfer_schedule"]["uuid"]) + else: + # + # If there is no schedule at the relationship level, check to see + # if the policy has one. + scheduleUUID = getPolicySchedule(record["policy"]["uuid"]) + if scheduleUUID is not None: + lastRunTime = getLastRunTime(scheduleUUID) + else: + lastRunTime = -1 + return lastRunTime + ################################################################################ # This function is used to check SnapMirror relationships. ################################################################################ def processSnapMirrorRelationships(service): - global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger + global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger, clusterTimezone # # Get the saved events so we can ensure we are only reporting on new ones. 
try: data = s3Client.get_object(Key=config["smEventsFilename"], Bucket=config["s3BucketName"]) except botocore.exceptions.ClientError as err: - # If the error is that the object doesn't exist, then it will get created once an alert it sent. + # If the error is that the object doesn't exist, then it will get created once an alert is sent. if err.response['Error']['Code'] == "NoSuchKey": events = [] else: @@ -437,7 +578,7 @@ def processSnapMirrorRelationships(service): try: data = s3Client.get_object(Key=config["smRelationshipsFilename"], Bucket=config["s3BucketName"]) except botocore.exceptions.ClientError as err: - # If the error is that the object doesn't exist, then it will get created once an alert it sent. + # If the error is that the object doesn't exist, then it will get created once an alert is sent. if err.response['Error']['Code'] == "NoSuchKey": smRelationships = [] else: @@ -452,112 +593,154 @@ def processSnapMirrorRelationships(service): updateRelationships = False # # Get the current time in seconds since UNIX epoch 01/01/1970. - curTime = int(datetime.datetime.now().timestamp()) + curTimeSeconds = int(datetime.datetime.now(pytz.timezone(clusterTimezone) if clusterTimezone != None else datetime.timezone.utc).timestamp()) + # + # Consolidate all the rules so we can decide how to process lagtime. 
+ maxLagtime = None + maxLagTimePercent = None + healthy = None + stalledTransferSeconds = None + offline = None + for rule in service["rules"]: + for key in rule.keys(): + lkey = key.lower() + if lkey == "maxlagtime": + maxLagTime = rule[key] + maxLagTimeKey = key + elif lkey == "maxlagtimepercent": + maxLagTimePercent = rule[key] + maxLagTimePercentKey = key + elif lkey == "healthy": + healthy = rule[key] + healthyKey = key + elif lkey == "stalledtransferseconds": + stalledTransferSeconds = rule[key] + stalledTransferSecondsKey = key + else: + logger.warning(f'Unknown snapmirror alert type: "{key}".') # # Run the API call to get the current state of all the snapmirror relationships. endpoint = f'https://{config["OntapAdminServer"]}/api/snapmirror/relationships?fields=*' response = http.request('GET', endpoint, headers=headers) if response.status == 200: data = json.loads(response.data) - for record in data["records"]: - for rule in service["rules"]: - for key in rule.keys(): - lkey = key.lower() - # - # If the source cluster isn't defined, then assume it is a local SM relationship. - sourceCluster = record['source'].get('cluster') - if sourceCluster == None: - sourceClusterName = clusterName - else: - sourceClusterName = sourceCluster['name'] - - if lkey == "maxlagtime": - if record.get("lag_time") != None: - lagSeconds = parseLagTime(record["lag_time"]) - if lagSeconds > rule["maxLagTime"]: - uniqueIdentifier = record["uuid"] + "_" + key + # + # Since there are multiple ways to process lag time, make sure to only do it one way for each relationship. + processedLagTime = False + # + # If the source cluster isn't defined, then assume it is a local SM relationship. 
+ if record['source'].get('cluster') is None: + sourceClusterName = clusterName + else: + sourceClusterName = record['source']['cluster']['name'] + # + # For lag time if maxLagTimePercent is defined check to see if there is a schedule, + # if there is a schedule alert on that otherrwise alert on the maxLagTime. + # But, first check that lag_time is defined, and that the state is not "uninitialized", + # since the lag_time is set to the oldest snapshot of the source volume which would + # cause a false positive. + if record.get("lag_time") is not None and record["state"].lower() != "uninitialized": + lagSeconds = parseLagTime(record["lag_time"]) + if maxLagTimePercent is not None: + lastScheduledUpdate = getLastScheduledUpdate(record) + if lastScheduledUpdate != -1: + processedLagTime = True + if lagSeconds > ((curTimeSeconds - lastScheduledUpdate) * maxLagTimePercent/100): + # + # If the transfer is in progress, and they have stalled transfer alert enabled, we don't need to alert on the lag time. + if not (record.get("transfer") is not None and record["transfer"]["state"].lower() in ["transferring", "finalizing", "preparing", "fasttransferring"] and stalledTransferSeconds is not None): + uniqueIdentifier = record["uuid"] + "_" + maxLagTimePercentKey if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. - message = f'Snapmirror Lag Alert: {sourceClusterName}::{record["source"]["path"]} -> {clusterName}::{record["destination"]["path"]} has a lag time of {lagSeconds} seconds.' 
- logger.warning(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + timeStr = lagTimeStr(lagSeconds) + asciiTime = datetime.datetime.fromtimestamp(lastScheduledUpdate).strftime('%Y-%m-%d %H:%M:%S') + message = f'Snapmirror Lag Alert: {sourceClusterName}::{record["source"]["path"]} -> {clusterName}::{record["destination"]["path"]} has a lag time of {lagSeconds} seconds ({timeStr}) which is more than {maxLagTimePercent}% of its last scheduled update at {asciiTime}.' + sendAlert(message, "WARNING") changedEvents=True event = { "index": uniqueIdentifier, "message": message, "refresh": eventResilience } - print(message) events.append(event) - elif lkey == "healthy": - if not record["healthy"]: - uniqueIdentifier = record["uuid"] + "_" + key - if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. - message = f'Snapmirror Health Alert: {sourceClusterName}::{record["source"]["path"]} {clusterName}::{record["destination"]["path"]} has a status of {record["healthy"]}' - logger.warning(message) # Intentionally put this before adding the reasons, since I'm not sure how syslog will handle a multi-line message. 
- for reason in record["unhealthy_reason"]: - message += "\n" + reason["message"] - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') - changedEvents=True - event = { - "index": uniqueIdentifier, - "message": message, - "refresh": eventResilience - } - print(message) - events.append(event) - elif lkey == "stalledtransferseconds": - if record.get('transfer') and record['transfer']['state'].lower() == "transferring": - sourcePath = record['source']['path'] - destPath = record['destination']['path'] - bytesTransferred = record['transfer']['bytes_transferred'] - - prevRec = getPreviousSMRecord(smRelationships, sourceClusterName, sourcePath, destPath) - - if prevRec != None: - timeDiff=curTime - prevRec["time"] - print(f'transfer bytes last time:{prevRec["bytesTransferred"]} this time:{bytesTransferred} and {timeDiff} > {rule[key]}') - if prevRec['bytesTransferred'] == bytesTransferred: - if (curTime - prevRec['time']) > rule[key]: - uniqueIdentifier = record['uuid'] + "_" + "transfer" - - if not eventExist(events, uniqueIdentifier): - message = f'Snapmiorror transfer has stalled: {sourceClusterName}::{sourcePath} -> {clusterName}::{destPath}.' 
- logger.warning(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject='Monitor ONTAP Services Alert for cluster {clusterName}') - changedEvents=True - event = { - "index": uniqueIdentifier, - "message": message, - "refresh": eventResilience - } - print(message) - events.append(event) - else: - prevRec['time'] = curTime - prevRec['refresh'] = True - prevRec['bytesTransferred'] = bytesTransferred - updateRelationships = True - else: - prevRec = { - "time": curTime, - "refresh": True, - "bytesTransferred": bytesTransferred, - "sourcePath": sourcePath, - "destPath": destPath, - "sourceCluster": sourceClusterName - } - updateRelationships = True - smRelationships.append(prevRec) + + if maxLagTime is not None and not processedLagTime: + if lagSeconds > maxLagTime: + uniqueIdentifier = record["uuid"] + "_" + maxLagTimeKey + if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. + timeStr = lagTimeStr(lagSeconds) + message = f'Snapmirror Lag Alert: {sourceClusterName}::{record["source"]["path"]} -> {clusterName}::{record["destination"]["path"]} has a lag time of {lagSeconds} seconds, or {timeStr} which is more than {maxLagTime}.' + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + + if healthy is not None: + if not healthy and not record["healthy"]: # Report on "not healthy" and the status is "not healthy" + uniqueIdentifier = record["uuid"] + "_" + healthyKey + if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. + message = f'Snapmirror Health Alert: {sourceClusterName}::{record["source"]["path"]} {clusterName}::{record["destination"]["path"]} has a status of {record["healthy"]}.' 
+ for reason in record["unhealthy_reason"]: + message += "\n" + reason["message"] + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + + if stalledTransferSeconds is not None: + if record.get('transfer') is not None and record['transfer']['state'].lower() == "transferring": + transferUuid = record['transfer']['uuid'] + bytesTransferred = record['transfer']['bytes_transferred'] + prevRec = getPreviousSMRecord(smRelationships, transferUuid) # This reset the "refresh" field if found. + if prevRec != None: + timeDiff=curTimeSeconds - prevRec["time"] + if prevRec['bytesTransferred'] == bytesTransferred: + if (curTimeSeconds - prevRec['time']) > stalledTransferSeconds: + uniqueIdentifier = record['uuid'] + "_" + "transfer" + + if not eventExist(events, uniqueIdentifier): + message = f"Snapmiorror transfer has stalled: {sourceClusterName}::{record['source']['path']} -> {clusterName}::{record['destination']['path']}." + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + else: + prevRec['time'] = curTimeSeconds + prevRec['refresh'] = True + prevRec['bytesTransferred'] = bytesTransferred + updateRelationships = True else: - message = f'Unknown snapmirror alert type: "{key}".' - logger.warning(message) - print(message) + prevRec = { + "time": curTimeSeconds, + "refresh": True, + "bytesTransferred": bytesTransferred, + "uuid": transferUuid + } + updateRelationships = True + smRelationships.append(prevRec) # # After processing the records, see if any SM relationships need to be removed. 
i = 0 while i < len(smRelationships): if not smRelationships[i]["refresh"]: + relationshipId = smRelationships[i].get("uuid") + if relationshipId is None: + id="Old format" + else: + id = relationshipId + logger.debug(f'Deleting smRelationship: {id}') del smRelationships[i] updateRelationships = True else: @@ -571,7 +754,7 @@ def processSnapMirrorRelationships(service): i = 0 while i < len(events): if events[i]["refresh"] <= 0: - print(f'Deleting event: {events[i]["message"]}') + logger.debug(f'Deleting event: {events[i]["message"]}') del events[i] changedEvents = True else: @@ -584,7 +767,7 @@ def processSnapMirrorRelationships(service): if(changedEvents): s3Client.put_object(Key=config["smEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8')) else: - print(f'API call to {endpoint} failed. HTTP status code {response.status}.') + logger.warning(f'API call to {endpoint} failed. HTTP status code {response.status}.') ################################################################################ # This function is used to check all the volume and aggregate utlization. @@ -609,71 +792,112 @@ def processStorageUtilization(service): # Decrement the refresh field to know if any records have really gone away. for event in events: event["refresh"] -= 1 + # + # Run the API call to get the physical storage used. + endpoint = f'https://{config["OntapAdminServer"]}/api/storage/aggregates?fields=space' + aggrResponse = http.request('GET', endpoint, headers=headers) + if aggrResponse.status != 200: + logger.error(f'API call to {endpoint} failed. HTTP status code {aggrResponse.status}.') + aggrResponse = None + # + # Run the API call to get the volume information. + endpoint = f'https://{config["OntapAdminServer"]}/api/storage/volumes?fields=space,files,svm,state' + volumeResponse = http.request('GET', endpoint, headers=headers) + if volumeResponse.status != 200: + logger.error(f'API call to {endpoint} failed. 
HTTP status code {volumeResponse.status}.') + volumeResponse = None + # + # If both API calls failed, no point on continuing. + if volumeResponse is None and aggrResponse is None: + return for rule in service["rules"]: for key in rule.keys(): lkey=key.lower() if lkey == "aggrwarnpercentused" or lkey == 'aggrcriticalpercentused': - # - # Run the API call to get the physical storage used. - endpoint = f'https://{config["OntapAdminServer"]}/api/storage/aggregates?fields=space' - response = http.request('GET', endpoint, headers=headers) - if response.status == 200: - data = json.loads(response.data) + if aggrResponse is not None: + data = json.loads(aggrResponse.data) for aggr in data["records"]: if aggr["space"]["block_storage"]["used_percent"] >= rule[key]: uniqueIdentifier = aggr["uuid"] + "_" + key if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. alertType = 'Warning' if lkey == "aggrwarnpercentused" else 'Critical' message = f'Aggregate {alertType} Alert: Aggregate {aggr["name"]} on {clusterName} is {aggr["space"]["block_storage"]["used_percent"]}% full, which is more or equal to {rule[key]}% full.' - logger.warning(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "WARNING") changedEvents = True event = { "index": uniqueIdentifier, "message": message, "refresh": eventResilience } - print(event) + logger.debug(event) events.append(event) - else: - print(f'API call to {endpoint} failed. HTTP status code {response.status}.') elif lkey == "volumewarnpercentused" or lkey == "volumecriticalpercentused": - # - # Run the API call to get the volume information. 
- endpoint = f'https://{config["OntapAdminServer"]}/api/storage/volumes?fields=space,svm' - response = http.request('GET', endpoint, headers=headers) - if response.status == 200: - data = json.loads(response.data) + if volumeResponse is not None: + data = json.loads(volumeResponse.data) for record in data["records"]: if record["space"].get("percent_used"): if record["space"]["percent_used"] >= rule[key]: uniqueIdentifier = record["uuid"] + "_" + key if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. alertType = 'Warning' if lkey == "volumewarnpercentused" else 'Critical' - message = f'Volume Usage {alertType} Alert: volume {record["svm"]["name"]}:/{record["name"]} on {clusterName} is {record["space"]["percent_used"]}% full, which is more or equal to {rule[key]}% full.' - logger.warning(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + message = f'Volume Usage {alertType} Alert: volume {record["svm"]["name"]}:{record["name"]} on {clusterName} is {record["space"]["percent_used"]}% full, which is more or equal to {rule[key]}% full.' + sendAlert(message, "WARNING") changedEvents = True event = { "index": uniqueIdentifier, "message": message, "refresh": eventResilience } - print(message) events.append(event) - else: - print(f'API call to {endpoint} failed. HTTP status code {response.status}.') + elif lkey == "volumewarnfilespercentused" or lkey == "volumecriticalfilespercentused": + if volumeResponse is not None: + data = json.loads(volumeResponse.data) + for record in data["records"]: + # + # If a volume is offline, the API will not report the "files" information. 
+ if record.get("files") is not None: + maxFiles = record["files"].get("maximum") + usedFiles = record["files"].get("used") + if maxFiles != None and usedFiles != None: + percentUsed = (usedFiles / maxFiles) * 100 + if percentUsed >= rule[key]: + uniqueIdentifier = record["uuid"] + "_" + key + if not eventExist(events, uniqueIdentifier): + alertType = 'Warning' if lkey == "volumewarnfilespercentused" else 'Critical' + message = f"Volume File (inode) Usage {alertType} Alert: volume {record['svm']['name']}:{record['name']} on {clusterName} is using {percentUsed:.0f}% of it's inodes, which is more or equal to {rule[key]}% utilization." + sendAlert(message, "WARNING") + changedEvents = True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + elif lkey == "offline": + data = json.loads(volumeResponse.data) + for record in data["records"]: + if rule[key] and record["state"].lower() == "offline": + uniqueIdentifier = f'{record["uuid"]}_{key}_{rule[key]}' + if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. + message = f"Volume Offline Alert: volume {record['svm']['name']}:{record['name']} on {clusterName} is offline." + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) else: message = f'Unknown storage alert type: "{key}".' logger.warning(message) - print(message) # # After processing the records, see if any events need to be removed. 
i = 0 while i < len(events): if events[i]["refresh"] <= 0: - print(f'Deleting event: {events[i]["message"]}') + logger.debug(f'Deleting event: {events[i]["message"]}') del events[i] changedEvents = True else: @@ -686,6 +910,56 @@ def processStorageUtilization(service): if(changedEvents): s3Client.put_object(Key=config["storageEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8')) +################################################################################ +# This function sends the message to the various alerting systems. +################################################################################ +def sendAlert(message, severity): + global config, snsClient, logger, cloudWatchClient + + + if severity == "CRITICAL": + logger.critical(message) + elif severity == "ERROR": + logger.error(message) + elif severity == "WARNING": + logger.warning(message) + elif severity == "INFO": + logger.info(message) + elif severity == "DEBUG": + logger.debug(message) + else: + logger.info(message) + + snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'{severity}: Monitor ONTAP Services Alert for cluster {clusterName}') + + if cloudWatchClient is not None: + # + # Create a new log stream for the current day if it doesn't exist. + dateStr = datetime.datetime.now().strftime("%Y-%m-%d") + logStreamName = f'{clusterName}-monitor-ontap-services-{dateStr}' + # + # Don't ask me why AWS puts a ":*" at the end of the log group ARN, but they do. + logGroupName = config["cloudWatchLogGroupArn"].split(":")[-2] if config["cloudWatchLogGroupArn"].endswith(":*") else config["cloudWatchLogGroupArn"].split(":")[-1] + # + # Check to see if the log stream already exists. 
+ logStreams = cloudWatchClient.describe_log_streams(logGroupName=logGroupName, logStreamNamePrefix=logStreamName) + if len(logStreams["logStreams"]) == 0: + cloudWatchClient.create_log_stream( + logGroupName=logGroupName, + logStreamName=logStreamName) + # + # Send the message to CloudWatch. + cloudWatchClient.put_log_events( + logGroupName=logGroupName, + logStreamName=logStreamName, + logEvents=[ + { + 'timestamp': int(datetime.datetime.now().timestamp() * 1000), + 'message': message + }, + ] + ) + ################################################################################ # This function is used to check utilization of quota limits. ################################################################################ @@ -723,18 +997,18 @@ def processQuotaUtilization(service): # # Since the quota report might not have the files key, and even if it does, it might not have # the hard_limit_percent" key, need to check for their existencae first. - if(record.get("files") != None and record["files"]["used"].get("hard_limit_percent") != None and + if(record.get("files") is not None and record["files"]["used"].get("hard_limit_percent") is not None and record["files"]["used"]["hard_limit_percent"] > rule[key]): uniqueIdentifier = str(record["index"]) + "_" + key if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. 
- if record.get("qtree") != None: + if record.get("qtree") is not None: qtree=f' under qtree: {record["qtree"]["name"]} ' else: qtree=' ' - if record.get("users") != None: + if record.get("users") is not None: users=None for user in record["users"]: - if users == None: + if users is None: users = user["name"] else: users += ',{user["name"]}' @@ -742,29 +1016,28 @@ def processQuotaUtilization(service): else: user='' message = f'Quota Inode Usage Alert: Quota of type "{record["type"]}" on {record["svm"]["name"]}:/{record["volume"]["name"]}{qtree}{user}on {clusterName} is using {record["files"]["used"]["hard_limit_percent"]}% which is more than {rule[key]}% of its inodes.' - logger.warning(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "WARNING") changedEvents=True event = { "index": uniqueIdentifier, "message": message, "refresh": eventResilience } - print(message) + logger.debug(message) events.append(event) elif lkey == "maxhardquotaspacepercentused": - if(record.get("space") != None and record["space"]["used"].get("hard_limit_percent") and + if(record.get("space") is not None and record["space"]["used"].get("hard_limit_percent") and record["space"]["used"]["hard_limit_percent"] >= rule[key]): uniqueIdentifier = str(record["index"]) + "_" + key if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. 
- if record.get("qtree") != None: + if record.get("qtree") is not None: qtree=f' under qtree: {record["qtree"]["name"]} ' else: qtree=" " - if record.get("users") != None: + if record.get("users") is not None: users=None for user in record["users"]: - if users == None: + if users is None: users = user["name"] else: users += ',{user["name"]}' @@ -772,29 +1045,28 @@ def processQuotaUtilization(service): else: user='' message = f'Quota Space Usage Alert: Hard quota of type "{record["type"]}" on {record["svm"]["name"]}:/{record["volume"]["name"]}{qtree}{user}on {clusterName} is using {record["space"]["used"]["hard_limit_percent"]}% which is more than {rule[key]}% of its allocaed space.' - logger.warning(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "WARNING") changedEvents=True event = { "index": uniqueIdentifier, "message": message, "refresh": eventResilience } - print(message) + logger.debug(message) events.append(event) elif lkey == "maxsoftquotaspacepercentused": - if(record.get("space") != None and record["space"]["used"].get("soft_limit_percent") and + if(record.get("space") is not None and record["space"]["used"].get("soft_limit_percent") and record["space"]["used"]["soft_limit_percent"] >= rule[key]): uniqueIdentifier = str(record["index"]) + "_" + key if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. 
- if record.get("qtree") != None: + if record.get("qtree") is not None: qtree=f' under qtree: {record["qtree"]["name"]} ' else: qtree=" " - if record.get("users") != None: + if record.get("users") is not None: users=None for user in record["users"]: - if users == None: + if users is None: users = user["name"] else: users += ',{user["name"]}' @@ -802,26 +1074,24 @@ def processQuotaUtilization(service): else: user='' message = f'Quota Space Usage Alert: Soft quota of type "{record["type"]}" on {record["svm"]["name"]}:/{record["volume"]["name"]}{qtree}{user}on {clusterName} is using {record["space"]["used"]["soft_limit_percent"]}% which is more than {rule[key]}% of its allocaed space.' - logger.info(message) - snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + sendAlert(message, "WARNING") changedEvents=True event = { "index": uniqueIdentifier, "message": message, "refresh": eventResilience } - print(message) + logger.debug(message) events.append(event) else: message = f'Unknown quota matching condition type "{key}".' logger.warning(message) - print(message) # # After processing the records, see if any events need to be removed. i=0 while i < len(events): if events[i]["refresh"] <= 0: - print(f'Deleting event: {events[i]["message"]}') + logger.debug(f'Deleting event: {events[i]["message"]}') del events[i] changedEvents = True else: @@ -834,7 +1104,135 @@ def processQuotaUtilization(service): if(changedEvents): s3Client.put_object(Key=config["quotaEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8')) else: - print(f'API call to {endpoint} failed. HTTP status code {response.status}.') + logger.error(f'API call to {endpoint} failed. 
HTTP status code {response.status}.') + +################################################################################ +################################################################################ +def processVserver(service): + global config, s3Client, snsClient, http, headers, clusterName, logger + + changedEvents=False + # + # Get the saved events so we can ensure we are only reporting on new ones. + try: + data = s3Client.get_object(Key=config["vserverEventsFilename"], Bucket=config["s3BucketName"]) + except botocore.exceptions.ClientError as err: + # If the error is that the object doesn't exist, then it will get created once an alert it sent. + if err.response['Error']['Code'] == "NoSuchKey": + events = [] + else: + raise err + else: + events = json.loads(data["Body"].read().decode('UTF-8')) + # + # Decrement the refresh field to know if any records have really gone away. + for event in events: + event["refresh"] -= 1 + # + # Consolidate the rules + vserverState = None + nfsProtocolState = None + cifsProtocolState = None + for rule in service["rules"]: + for key in rule.keys(): + lkey = key.lower() # Convert to all lower case so the key can be case insensitive. + if lkey == "vserverstate": + vserverState = rule[key] + vserverStateKey = key + elif lkey == "nfsprotocolstate": + nfsProtocolState = rule[key] + nfsProtocolStateKey = key + elif lkey == "cifsprotocolstate": + cifsProtocolState = rule[key] + cifsProtocolStateKey = key + # + # Check for any vservers that are down. + if vserverState is not None and vserverState: + # + # Run the API call to get the vserver state for each vserver. 
+ endpoint = f'https://{config["OntapAdminServer"]}/api/svm/svms?fields=state' + response = http.request('GET', endpoint, headers=headers) + if response.status == 200: + data = json.loads(response.data) + for record in data["records"]: + if record["state"].lower() != "running": + uniqueIdentifier = str(record["uuid"]) + "_" + vserverStateKey + if not eventExist(events, uniqueIdentifier): + message = f'SVM State Alert: SVM {record["name"]} on {clusterName} is not online.' + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + else: + logger.error(f'API call to {endpoint} failed. HTTP status code {response.status}.') + + if nfsProtocolState is not None and nfsProtocolState: + # + # Run the API call to get the NFS protocol state for each vserver. + endpoint = f'https://{config["OntapAdminServer"]}/api/protocols/nfs/services?fields=state' + response = http.request('GET', endpoint, headers=headers) + if response.status == 200: + data = json.loads(response.data) + for record in data["records"]: + if record["state"].lower() != "online": + uniqueIdentifier = str(record["svm"]["uuid"]) + "_" + nfsProtocolStateKey + if not eventExist(events, uniqueIdentifier): + message = f'NFS Protocol State Alert: NFS protocol on {record["svm"]["name"]} on {clusterName} is not online.' + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + else: + logger.error(f'API call to {endpoint} failed. HTTP status code {response.status}.') + + if cifsProtocolState is not None and cifsProtocolState: + # + # Run the API call to get the NFS protocol state for each vserver. 
+ endpoint = f'https://{config["OntapAdminServer"]}/api/protocols/cifs/services?fields=enabled' + response = http.request('GET', endpoint, headers=headers) + if response.status == 200: + data = json.loads(response.data) + for record in data["records"]: + if not record["enabled"]: + uniqueIdentifier = str(record["svm"]["uuid"]) + "_" + cifsProtocolStateKey + if not eventExist(events, uniqueIdentifier): + message = f'CIFS Protocol State Alert: CIFS protocol on {record["svm"]["name"]} on {clusterName} is not online.' + sendAlert(message, "WARNING") + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + events.append(event) + else: + logger.error(f'API call to {endpoint} failed. HTTP status code {response.status}.') + + # + # After processing the records, see if any events need to be removed. + i=0 + while i < len(events): + if events[i]["refresh"] <= 0: + logger.debug(f'Deleting event: {events[i]["message"]}') + del events[i] + changedEvents = True + else: + # If an event wasn't refreshed, then we need to save the new refresh count. + if events[i]["refresh"] != eventResilience: + changedEvents = True + i += 1 + # + # If the events array changed, save it. + if(changedEvents): + s3Client.put_object(Key=config["vserverEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8')) ################################################################################ # This function returns the index of the service in the conditions dictionary. 
@@ -846,7 +1244,7 @@ def getServiceIndex(targetService, conditions): if conditions["services"][i]["name"] == targetService: return i i += 1 - + return None ################################################################################ @@ -864,7 +1262,8 @@ def buildDefaultMatchingConditions(): {"name": "ems", "rules": []}, {"name": "snapmirror", "rules": []}, {"name": "storage", "rules": []}, - {"name": "quota", "rules": []} + {"name": "quota", "rules": []}, + {"name": "vserver", "rules": []} ]} # # Now, add rules based on the environment variables. @@ -896,6 +1295,10 @@ def buildDefaultMatchingConditions(): value = int(value) if value > 0: conditions["services"][getServiceIndex("snapmirror", conditions)]["rules"].append({"maxLagTime": value}) + elif name == "initialSnapMirrorLagTimePercentAlert": + value = int(value) + if value > 0: + conditions["services"][getServiceIndex("snapmirror", conditions)]["rules"].append({"maxLagTimePercent": value}) elif name == "initialSnapMirrorStalledAlert": value = int(value) if value > 0: @@ -916,6 +1319,19 @@ def buildDefaultMatchingConditions(): value = int(value) if value > 0: conditions["services"][getServiceIndex("storage", conditions)]["rules"].append({"volumeCriticalPercentUsed": value}) + elif name == "initialVolumeFileUtilizationWarnAlert": + value = int(value) + if value > 0: + conditions["services"][getServiceIndex("storage", conditions)]["rules"].append({"volumeWarnFilesPercentUsed": value}) + elif name == "initialVolumeFileUtilizationCriticalAlert": + value = int(value) + if value > 0: + conditions["services"][getServiceIndex("storage", conditions)]["rules"].append({"volumeCriticalFilesPercentUsed": value}) + elif name == "initialVolumeOfflineAlert": + if value == "true": + conditions["services"][getServiceIndex("storage", conditions)]["rules"].append({"offline": True}) + else: + conditions["services"][getServiceIndex("storage", conditions)]["rules"].append({"offline": False}) elif name == 
"initialSoftQuotaUtilizationAlert": value = int(value) if value > 0: @@ -928,6 +1344,21 @@ def buildDefaultMatchingConditions(): value = int(value) if value > 0: conditions["services"][getServiceIndex("quota", conditions)]["rules"].append({"maxQuotaInodesPercentUsed": value}) + elif name == "initialVserverStateAlert": + if value == "true": + conditions["services"][getServiceIndex("vserver", conditions)]["rules"].append({"vserverState": True}) + else: + conditions["services"][getServiceIndex("vserver", conditions)]["rules"].append({"vserverState": False}) + elif name == "initialVserverNFSProtocolStateAlert": + if value == "true": + conditions["services"][getServiceIndex("vserver", conditions)]["rules"].append({"nfsProtocolState": True}) + else: + conditions["services"][getServiceIndex("vserver", conditions)]["rules"].append({"nfsProtocolState": False}) + elif name == "initialVserverCIFSProtocolStateAlert": + if value == "true": + conditions["services"][getServiceIndex("vserver", conditions)]["rules"].append({"cifsProtocolState": True}) + else: + conditions["services"][getServiceIndex("vserver", conditions)]["rules"].append({"cifsProtocolState": False}) return conditions @@ -955,7 +1386,9 @@ def readInConfig(): "configFilename": None, "secretsManagerEndPointHostname": None, "snsEndPointHostname": None, + "cloudWatchLogsEndPointHostname": None, "syslogIP": None, + "cloudWatchLogGroupArn": None, "awsAccountId": None } @@ -966,7 +1399,8 @@ def readInConfig(): "conditionsFilename": None, "storageEventsFilename": None, "quotaEventsFilename": None, - "systemStatusFilename": None + "systemStatusFilename": None, + "vserverEventsFilename": None } config = { @@ -983,13 +1417,21 @@ def readInConfig(): for var in config: config[var] = os.environ.get(var) # - # Check to see if s3BacketArn was provided instead of s3BucketName. 
- if config["s3BucketName"] == None and os.environ.get("s3BucketArn") != None: + # Since the CloudFormation template will set the environment variables + # to an empty string if someone doesn't provide a value, reset the + # values back to None. + for var in config: + if config[var] == "": + config[var] = None + # + # Since CloudFormation has to pass an ARN, get the Bucket name from it. + # Too bad the bucket ARN doesn't include the region, like most (all?) the others do. + if config["s3BucketName"] is None and os.environ.get("s3BucketArn") is not None: config["s3BucketName"] = os.environ.get("s3BucketArn").split(":")[-1] # # Check that required environmental variables are there. for var in requiredEnvVariables: - if config[var] == None: + if config[var] is None: raise Exception (f'\n\nMissing required environment variable "{var}".') # # Open a client to the s3 service. @@ -997,14 +1439,9 @@ def readInConfig(): # # Calculate the config filename if it hasn't already been provided. defaultConfigFilename = config["OntapAdminServer"] + "-config" - if config["configFilename"] == None: + if config["configFilename"] is None: config["configFilename"] = defaultConfigFilename # - # Calculate the conditions filename if it hasn't already been provided. - defaultConditionsFilename = config["OntapAdminServer"] + "-conditions" - if config["conditionsFilename"] == None: - config["conditionsFilename"] = defaultConditionsFilename - # # Process the config file if it exist. 
try: lines = s3Client.get_object(Key=config["configFilename"], Bucket=config["s3BucketName"])['Body'].iter_lines() @@ -1013,7 +1450,7 @@ def readInConfig(): raise err else: if config["configFilename"] != defaultConfigFilename: - print(f"Warning, did not find file '{config['configFilename']}' in s3 bucket '{config['s3BucketName']}' in region '{config['s3BucketRegion']}'.") + logger.warning(f"Warning, did not find file '{config['configFilename']}' in s3 bucket '{config['s3BucketName']}' in region '{config['s3BucketRegion']}'.") else: # # While iterating through the file, get rid of any "export ", comments, blank lines, or anything else that isn't key=value. @@ -1028,42 +1465,39 @@ def readInConfig(): (key, value) = line.split("=") key = key.strip() value = value.strip() - # - # Preserve any environment variables settings. - if key in config: - if config[key] == None: - config[key] = value + if len(value) == 0: + logger.warning(f"Warning, empty value for key '{key}'. Ignored.") else: - print(f"Warning, unknown config parameter '{key}'.") + # + # Preserve any environment variables settings. + if key in config: + if config[key] is None: + config[key] = value + else: + logger.warning(f"Warning, unknown config parameter '{key}'.") # # Now, fill in the filenames for any that aren't already defined. for filename in filenameVariables: - if config[filename] == None: + if config[filename] is None: config[filename] = config["OntapAdminServer"] + "-" + filename.replace("Filename", "") # - # Define the endpoints if alternates weren't provided. - if config.get("secretArn") != None: + # Define endpoints if alternates weren't provided. + if config.get("secretArn") is not None and config["secretsManagerEndPointHostname"] is None: secretRegion = config["secretArn"].split(":")[3] - else: - # - # Give it a value so secretsManagerEndPointHostname can be set. The check for all variables will correctly error out because secretArn is missing. 
- secretRegion = "No-secretArn-was-provided" - if config["secretsManagerEndPointHostname"] == None or config["secretsManagerEndPointHostname"] == "": config["secretsManagerEndPointHostname"] = f'secretsmanager.{secretRegion}.amazonaws.com' - if config.get("snsTopicArn") != None: + if config.get("snsTopicArn") is not None and config["snsEndPointHostname"] is None: snsRegion = config["snsTopicArn"].split(":")[3] - else: - # - # Give it a value so snsEndPointHostname can be set. The check for all variables will correctly error out because snsTopicArn is missing. - snsRegion = "No-snsTopicArn-was-provided" - if config["snsEndPointHostname"] == None or config["snsEndPointHostname"] == "": config["snsEndPointHostname"] = f'sns.{snsRegion}.amazonaws.com' + + if config.get("cloudWatchLogGroupArn") is not None and config["cloudWatchLogsEndPointHostname"] is None: + cloudWatchRegion = config["cloudWatchLogGroupArn"].split(":")[3] + config["cloudWatchLogsEndPointHostname"] = f'logs.{cloudWatchRegion}.amazonaws.com' # # Now, check that all the configuration parameters have been set. for key in config: - if config[key] == None and key not in optionalVariables: - raise Exception(f'Missing configuration parameter "{key}".') + if config[key] is None and key not in optionalVariables: + raise Exception(f'\n\nMissing configuration parameter "{key}".\n\n') ################################################################################ # Main logic @@ -1071,18 +1505,30 @@ def readInConfig(): def lambda_handler(event, context): # # Define global variables so we don't have to pass them to all the functions. - global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger + global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger, cloudWatchClient, clusterTimezone + # + # Set up logging. 
+ logger = logging.getLogger("mon_fsxn_service") + if lambdaFunction: + logger.setLevel(logging.INFO) # Anything at this level and above this get logged. + else: # Assume we are running in a test environment. + logger.setLevel(logging.DEBUG) # Anything at this level and above this get logged. + formatter = logging.Formatter( + fmt="%(name)s:%(funcName)s - Level:%(levelname)s - Message:%(message)s", + datefmt="%Y-%m-%d %H:%M:%S" + ) + loggerscreen = logging.StreamHandler() + loggerscreen.setFormatter(formatter) + logger.addHandler(loggerscreen) # # Read in the configuraiton. readInConfig() # This defines the s3Client variable. # - # Set up loging. - logger = logging.getLogger("mon_fsxn_service") - logger.setLevel(logging.DEBUG) # Anything at this level and above this get logged. - if config["syslogIP"] != None: + # Set up the logger to log to a file and to syslog. + if config["syslogIP"] is not None: # # Due to a bug with the SysLogHandler() of not sending proper framing with a message - # when using TCP (it should end it with a LF and not a NUL like it does now) you must add + # when using TCP (it should end it with a LF and not a NUL like it does now) you must add # an additional frame delimiter to the receiving syslog server. With rsyslog, you add # a AddtlFrameDelimiter="0" directive to the "input()" line where they have it listen # to a TCP port. For example: @@ -1090,7 +1536,7 @@ def lambda_handler(event, context): # # provides TCP syslog reception # module(load="imtcp") # input(type="imtcp" port="514" AddtlFrameDelimiter="0") - # + # # Because of this bug, I am going to stick with UDP, the default protocol used by # the syslog handler. If TCP is required, then the above changes will have to be made # to the syslog server. Or, the program will have to handle closing and opening the @@ -1120,12 +1566,12 @@ def lambda_handler(event, context): # Get the username and password of the ONTAP/FSxN system. 
secretsInfo = client.get_secret_value(SecretId=config["secretArn"]) secrets = json.loads(secretsInfo['SecretString']) - if secrets.get(config['secretUsernameKey']) == None: - print(f'Error, "{config["secretUsernameKey"]}" not found in secret "{config["secretArn"]}".') + if secrets.get(config['secretUsernameKey']) is None: + logger.critical(f'Error, "{config["secretUsernameKey"]}" not found in secret "{config["secretArn"]}".') return - if secrets.get(config['secretPasswordKey']) == None: - print(f'Error, "{config["secretPasswordKey"]}" not found in secret "{config["secretArn"]}".') + if secrets.get(config['secretPasswordKey']) is None: + logger.critical(f'Error, "{config["secretPasswordKey"]}" not found in secret "{config["secretArn"]}".') return username = secrets[config['secretUsernameKey']] @@ -1135,6 +1581,10 @@ def lambda_handler(event, context): #s3Client = boto3.client('s3', config["s3BucketRegion"]) # Defined in readInConfig() snsRegion = config["snsTopicArn"].split(":")[3] snsClient = boto3.client('sns', region_name=snsRegion, endpoint_url=f'https://{config["snsEndPointHostname"]}') + cloudWatchClient = None + if config["cloudWatchLogGroupArn"] is not None: + cloudWatchRegion = config["cloudWatchLogGroupArn"].split(":")[3] + cloudWatchClient = boto3.client('logs', region_name=cloudWatchRegion, endpoint_url=f'https://{config["cloudWatchLogsEndPointHostname"]}') # # Create a http handle to make ONTAP/FSxN API calls with. auth = urllib3.make_headers(basic_auth=f'{username}:{password}') @@ -1148,15 +1598,17 @@ def lambda_handler(event, context): # Get the conditions we know what to alert on. 
try: data = s3Client.get_object(Key=config["conditionsFilename"], Bucket=config["s3BucketName"]) + matchingConditions = json.loads(data["Body"].read().decode('UTF-8')) except botocore.exceptions.ClientError as err: if err.response['Error']['Code'] != "NoSuchKey": - print(f'\n\nError, could not retrieve configuration file {config["conditionsFilename"]} from: s3://{config["s3BucketName"]}.\nBelow is additional information:\n\n') + logger.error(f'Error, could not retrieve configuration file {config["conditionsFilename"]} from: s3://{config["s3BucketName"]}.\nBelow is additional information:') raise err else: matchingConditions = buildDefaultMatchingConditions() s3Client.put_object(Key=config["conditionsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(matchingConditions, indent=4).encode('UTF-8')) - else: - matchingConditions = json.loads(data["Body"].read().decode('UTF-8')) + except json.decoder.JSONDecodeError as err: + logger.error(f'Error, could not decode JSON from configuration file "{config["conditionsFilename"]}". The error message from the decoder:\n{err}\n') + return if(checkSystem()): # @@ -1172,11 +1624,13 @@ def lambda_handler(event, context): processStorageUtilization(service) elif service["name"].lower() == "quota": processQuotaUtilization(service) + elif service["name"].lower() == "vserver": + processVserver(service) else: - print(f'Unknown service "{service["name"]}".') + logger.warning(f'Unknown service "{service["name"]}".') return -if os.environ.get('AWS_LAMBDA_FUNCTION_NAME') == None: +if os.environ.get('AWS_LAMBDA_FUNCTION_NAME') is None: lambdaFunction = False lambda_handler(None, None) else: