Skip to content

Commit 8d1b2a5

Browse files
authored
CAP-2761 Add recommended ecs_fargate monitors (DataDog#21015)
* Add ecs_fargate monitors * address validation issues * shorten monitor description * Link back to ecs explorer
1 parent ce344fb commit 8d1b2a5

6 files changed

Lines changed: 137 additions & 0 deletions

File tree

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"version": 2,
3+
"created_at": "2025-08-08",
4+
"last_updated_at": "2025-08-08",
5+
"title": "ECS Fargate CPU utilization exceeds threshold",
6+
"description": "CPU usage represents the percentage of CPU resources consumed by an ECS Fargate task relative to its allocated limit. This monitor tracks when CPU utilization exceeds the configured threshold to identify performance bottlenecks that could lead to increased response times and service disruptions.",
7+
"definition": {
8+
"name": "[ECS Fargate] AWS ECS Task CPU utilization is high",
9+
"type": "query alert",
10+
"query": "avg(last_15m):sum:ecs.fargate.cpu.usage{*} by {ecs_cluster,task_arn,ecs_service} / sum:ecs.fargate.cpu.task.limit{*} by {ecs_cluster,task_arn,ecs_service} * 100 > 80",
11+
"message": "{{#is_warning}}\nAWS ECS Task {{task_arn.name}} in service {{ecs_service.name}} (cluster {{ecs_cluster.name}}) is approaching CPU Utilization threshold\n{{/is_warning}}\n\n{{#is_alert}}\nAWS ECS Task {{task_arn.name}} in service {{ecs_service.name}} (cluster {{ecs_cluster.name}}) has crossed CPU Utilization threshold\n{{/is_alert}}\n\nTo investigate further, view the affected task in the [ECS Explorer](/orchestration/explorer/ecsTask?inspect={{task_arn.name}})",
12+
"tags": ["integration:ecs_fargate"],
13+
"options": {
14+
"thresholds": {
15+
"critical": 80
16+
},
17+
"notify_audit": false,
18+
"on_missing_data": "default",
19+
"include_tags": true,
20+
"new_group_delay": 300
21+
}
22+
},
23+
"tags": [
24+
"integration:aws-fargate"
25+
]
26+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"version": 2,
3+
"created_at": "2025-08-08",
4+
"last_updated_at": "2025-08-08",
5+
"title": "ECS Fargate ephemeral storage utilization exceeds threshold",
6+
"description": "Ephemeral storage utilization represents the percentage of temporary storage space consumed by an ECS Fargate task relative to its allocated limit. This monitor tracks when storage utilization exceeds the threshold to prevent storage exhaustion that could lead to task failures and data loss.",
7+
"definition": {
8+
"name": "[ECS Fargate] Ephemeral storage utilization is high for task {{task_arn.name}} in service {{ecs_service.name}} (cluster {{ecs_cluster.name}})",
9+
"type": "query alert",
10+
"query": "avg(last_15m):sum:ecs.fargate.ephemeral_storage.utilized{*} by {ecs_cluster,task_arn,ecs_service} / sum:ecs.fargate.ephemeral_storage.reserved{*} by {ecs_cluster,task_arn,ecs_service} * 100 > 80",
11+
"message": "{{#is_warning}}\nAWS ECS Fargate task {{task_arn.name}} in service {{ecs_service.name}} (cluster {{ecs_cluster.name}}) is approaching ephemeral storage utilization threshold\n\nCurrent Usage: {{value}}%\n{{/is_warning}}\n\n{{#is_alert}}\nAWS ECS Fargate task {{task_arn.name}} in service {{ecs_service.name}} (cluster {{ecs_cluster.name}}) has exceeded ephemeral storage utilization threshold\n\nCurrent Usage: {{value}}%\n{{/is_alert}}\n\nTo investigate further, view the affected task in the [ECS Explorer](/orchestration/explorer/ecsTask?inspect={{task_arn.name}})",
12+
"tags": ["integration:ecs_fargate"],
13+
"options": {
14+
"thresholds": {
15+
"critical": 80
16+
},
17+
"notify_audit": false,
18+
"on_missing_data": "default",
19+
"include_tags": true,
20+
"new_group_delay": 300
21+
}
22+
},
23+
"tags": [
24+
"integration:aws-fargate"
25+
]
26+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"version": 2,
3+
"created_at": "2025-08-08",
4+
"last_updated_at": "2025-08-08",
5+
"title": "ECS Fargate memory utilization exceeds threshold",
6+
"description": "Memory usage represents the percentage of memory resources consumed by an ECS Fargate task relative to its allocated limit. This monitor tracks when memory utilization exceeds the configured threshold to prevent out-of-memory errors that could lead to task crashes and service unavailability.",
7+
"definition": {
8+
"name": "[ECS Fargate] AWS ECS Task Memory utilization is high",
9+
"type": "query alert",
10+
"query": "avg(last_15m):sum:ecs.fargate.mem.usage{*} by {ecs_cluster,task_arn,ecs_service} / sum:ecs.fargate.mem.task.limit{*} by {ecs_cluster,task_arn,ecs_service} * 100 > 80",
11+
"message": "{{#is_warning}}\nAWS ECS Task {{task_arn.name}} in service {{ecs_service.name}} (cluster {{ecs_cluster.name}}) is approaching Memory Utilization threshold\n{{/is_warning}}\n\n{{#is_alert}}\nAWS ECS Task {{task_arn.name}} in service {{ecs_service.name}} (cluster {{ecs_cluster.name}}) has crossed Memory Utilization threshold\n{{/is_alert}}\n\nTo investigate further, view the affected task in the [ECS Explorer](/orchestration/explorer/ecsTask?inspect={{task_arn.name}})",
12+
"tags": ["integration:ecs_fargate"],
13+
"options": {
14+
"thresholds": {
15+
"critical": 80
16+
},
17+
"notify_audit": false,
18+
"on_missing_data": "default",
19+
"include_tags": true,
20+
"new_group_delay": 300
21+
}
22+
},
23+
"tags": [
24+
"integration:aws-fargate"
25+
]
26+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"version": 2,
3+
"created_at": "2025-08-08",
4+
"last_updated_at": "2025-08-08",
5+
"title": "ECS Fargate network received error rate exceeds threshold",
6+
"description": "Network received error rate represents the percentage of network packets that failed to be received successfully. This monitor tracks when the error rate exceeds the configured threshold to identify network connectivity issues that could lead to data transmission failures and service degradation.",
7+
"definition": {
8+
"name": "[ECS Fargate] Network received error rate is high for service: {{ecs_service.name}} in cluster: {{ecs_cluster.name}}",
9+
"type": "query alert",
10+
"query": "sum(last_5m):sum:ecs.fargate.net.rcvd_errors{*} by {ecs_service,ecs_cluster} / sum:ecs.fargate.net.bytes_rcvd{*} by {ecs_service,ecs_cluster} * 100 > 5",
11+
"message": "ECS Fargate service {{ecs_service.name}} in cluster {{ecs_cluster.name}} has a network received error rate of {{value}}%, which exceeds the 5% threshold.\n\nThis indicates potential network connectivity issues. To investigate further, view the affected service in the [ECS Explorer](/orchestration/explorer/ecsService?query=ecs_service:{{ecs_service.name}}+ecs_cluster:{{ecs_cluster.name}})",
12+
"tags": ["integration:ecs_fargate"],
13+
"options": {
14+
"thresholds": {
15+
"critical": 5
16+
},
17+
"notify_audit": false,
18+
"on_missing_data": "default",
19+
"include_tags": true,
20+
"new_group_delay": 60
21+
}
22+
},
23+
"tags": [
24+
"integration:aws-fargate"
25+
]
26+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"version": 2,
3+
"created_at": "2025-08-08",
4+
"last_updated_at": "2025-08-08",
5+
"title": "ECS Fargate network sent error rate exceeds threshold",
6+
"description": "Network sent error rate represents the percentage of network packets that failed to be transmitted successfully. This monitor tracks when the error rate exceeds the configured threshold to identify network connectivity issues that could lead to data transmission failures and service degradation.",
7+
"definition": {
8+
"name": "[ECS Fargate] Network sent error rate is high for service: {{ecs_service.name}} in cluster: {{ecs_cluster.name}}",
9+
"type": "query alert",
10+
"query": "sum(last_5m):sum:ecs.fargate.net.sent_errors{*} by {ecs_service,ecs_cluster} / sum:ecs.fargate.net.bytes_sent{*} by {ecs_service,ecs_cluster} * 100 > 5",
11+
"message": "ECS Fargate service {{ecs_service.name}} in cluster {{ecs_cluster.name}} has a network sent error rate of {{value}}%, which exceeds the 5% threshold.\n\nThis indicates potential network connectivity issues. To investigate further, view the affected service in the [ECS Explorer](/orchestration/explorer/ecsService?query=ecs_service:{{ecs_service.name}}+ecs_cluster:{{ecs_cluster.name}})",
12+
"tags": ["integration:ecs_fargate"],
13+
"options": {
14+
"thresholds": {
15+
"critical": 5
16+
},
17+
"notify_audit": false,
18+
"on_missing_data": "default",
19+
"include_tags": true,
20+
"new_group_delay": 60
21+
}
22+
},
23+
"tags": [
24+
"integration:aws-fargate"
25+
]
26+
}

ecs_fargate/manifest.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,13 @@
7979
},
8080
"dashboards": {
8181
"Amazon Fargate": "assets/dashboards/amazon_fargate_overview.json"
82+
},
83+
"monitors": {
84+
"ECS Fargate CPU Usage": "assets/monitors/ecs_fargate_cpu_usage.json",
85+
"ECS Fargate Memory Usage": "assets/monitors/ecs_fargate_mem_usage.json",
86+
"ECS Fargate Ephemeral Storage Utilization": "assets/monitors/ecs_fargate_ephemeral_storage.json",
87+
"ECS Fargate Network Received Error Rate": "assets/monitors/ecs_fargate_net_rcvd.json",
88+
"ECS Fargate Network Sent Error Rate": "assets/monitors/ecs_fargate_net_sent.json"
8289
}
8390
}
8491
}

0 commit comments

Comments
 (0)