Skip to content

Commit 8a9b324

Browse files
authored
feat(aws): Adding scheduled retry of failed events (#11)
* feat(aws): Adding scheduled retry of failed events Signed-off-by: Vincent Boutour <vincent.boutour@datadoghq.com> * docs(aws): Adding more details about the retry Signed-off-by: Vincent Boutour <vincent.boutour@datadoghq.com> --------- Signed-off-by: Vincent Boutour <vincent.boutour@datadoghq.com>
1 parent c80079e commit 8a9b324

3 files changed

Lines changed: 95 additions & 11 deletions

File tree

README.md

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ This Terraform module creates the Datadog Log Lambda Forwarder infrastructure in
1010
- **Lambda Permissions**: For invocation by CloudWatch Logs, S3, SNS, and EventBridge
1111
- **Secrets Management**: Support for storing Datadog API key in Secrets Manager or SSM Parameter Store
1212
- **VPC Support**: Deploy forwarder in VPC with proxy
13+
- **Scheduler**: For scheduled retry of stored failed events
1314

1415
## Usage
1516

@@ -120,17 +121,19 @@ For complete usage examples demonstrating different configuration scenarios, see
120121

121122
### Advanced Configuration
122123

123-
| Name | Description | Type | Default |
124-
| --------------------------------- | -------------------------------- | -------- | ------- |
125-
| dd_compression_level | Compression level (0-9) | `string` | `null` |
126-
| dd_max_workers | Max concurrent workers | `string` | `null` |
127-
| dd_log_level | Log level | `string` | `null` |
128-
| dd_store_failed_events | Store failed events in S3 | `bool` | `null` |
129-
| dd_forwarder_bucket_name | Custom S3 bucket name | `string` | `null` |
130-
| dd_forwarder_existing_bucket_name | Existing S3 bucket name | `string` | `null` |
131-
| dd_api_url | Custom API URL | `string` | `null` |
132-
| dd_trace_intake_url | Custom trace intake URL | `string` | `null` |
133-
| additional_target_lambda_arns | Additional Lambda ARNs to invoke | `string` | `null` |
124+
| Name | Description | Type | Default |
125+
| --------------------------------- | ------------------------------------------------------ | -------- | ------- |
126+
| dd_compression_level | Compression level (0-9) | `string` | `null` |
127+
| dd_max_workers | Max concurrent workers | `string` | `null` |
128+
| dd_log_level | Log level | `string` | `null` |
129+
| dd_store_failed_events | Store failed events in S3 | `bool` | `null` |
130+
| dd_schedule_retry_failed_events | Periodically retry failed events (via AWS EventBridge) | `bool` | `null` |
131+
| dd_schedule_retry_interval | Retry interval in hours for failed events | `number` | `6` |
132+
| dd_forwarder_bucket_name | Custom S3 bucket name | `string` | `null` |
133+
| dd_forwarder_existing_bucket_name | Existing S3 bucket name | `string` | `null` |
134+
| dd_api_url | Custom API URL | `string` | `null` |
135+
| dd_trace_intake_url | Custom trace intake URL | `string` | `null` |
136+
| additional_target_lambda_arns | Additional Lambda ARNs to invoke | `string` | `null` |
134137

135138
### IAM Configuration
136139

@@ -273,6 +276,14 @@ module "datadog_forwarder_us_west_2" {
273276
- Your IAM role must have appropriate permissions for resources in each target region
274277
- Secrets/parameters containing the Datadog API key should exist in each target region
275278

279+
## Scheduled retry
280+
281+
When you enable `dd_store_failed_events`, the Lambda forwarder stores any events that couldn’t be sent to Datadog in an S3 bucket. These events can be logs, metrics, or traces. They aren’t automatically re‑processed on each Lambda invocation; instead, you must trigger a [manual Lambda run](https://docs.datadoghq.com/logs/guide/forwarder/?tab=manual) to process them again.
282+
283+
You can automate this re‑processing by enabling the `dd_schedule_retry_failed_events` parameter, which creates a scheduled Lambda invocation through [AWS EventBridge](https://docs.aws.amazon.com/lambda/latest/dg/with-eventbridge-scheduler.html). The schedule is only created when `dd_store_failed_events` is also enabled. By default, the forwarder attempts re‑processing every six hours; use `dd_schedule_retry_interval` to change the interval.
284+
285+
Keep in mind that log events can only be submitted with [timestamps up to 18 hours in the past](https://docs.datadoghq.com/logs/log_collection/?tab=host#custom-log-forwarding); older timestamps will cause the events to be discarded.
286+
276287
## Troubleshooting
277288

278289
### Common Issues

main.tf

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,3 +290,64 @@ resource "aws_cloudwatch_log_group" "forwarder_log_group" {
290290

291291
tags = var.tags
292292
}
293+
294+
# Scheduled retry
295+
296+
# IAM role assumed by the scheduler service to trigger the scheduled retry.
# Created only when both failed-event storage and scheduled retry are enabled.
resource "aws_iam_role" "scheduled_retry" {
  # Both flags default to null and Terraform's `&&` rejects null operands,
  # so coalesce() maps an unset flag to false before combining them.
  count = coalesce(var.dd_store_failed_events, false) && coalesce(var.dd_schedule_retry_failed_events, false) ? 1 : 0

  name = "${var.function_name}-${local.region}-retry"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          # The China partition uses a different service principal suffix.
          Service = data.aws_partition.current.partition == "aws-cn" ? "scheduler.amazonaws.com.cn" : "scheduler.amazonaws.com"
        }
      }
    ]
  })

  # A null ARN leaves the role without a permissions boundary.
  permissions_boundary = var.permissions_boundary_arn

  tags = var.tags
}
318+
319+
# Inline policy allowing the scheduled-retry role to invoke the forwarder Lambda.
resource "aws_iam_role_policy" "scheduled_retry" {
  # Both flags default to null and Terraform's `&&` rejects null operands,
  # so coalesce() maps an unset flag to false before combining them.
  count = coalesce(var.dd_store_failed_events, false) && coalesce(var.dd_schedule_retry_failed_events, false) ? 1 : 0

  name = "${var.function_name}-${local.region}-retry-policy"
  role = aws_iam_role.scheduled_retry[0].id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = [
          "lambda:InvokeFunction",
        ]
        Effect   = "Allow"
        Resource = aws_lambda_function.forwarder.arn
      },
    ]
  })
}
338+
339+
# Schedule that periodically invokes the forwarder Lambda with a retry payload.
resource "aws_scheduler_schedule" "scheduled_retry" {
  # Both flags default to null and Terraform's `&&` rejects null operands,
  # so coalesce() maps an unset flag to false before combining them.
  count = coalesce(var.dd_store_failed_events, false) && coalesce(var.dd_schedule_retry_failed_events, false) ? 1 : 0

  name        = "${var.function_name}-${local.region}-retry"
  description = "Retry the failed events from the Datadog Lambda Forwarder ${var.function_name}"

  # Interval is expressed in hours (see var.dd_schedule_retry_interval).
  schedule_expression = "rate(${var.dd_schedule_retry_interval} hours)"

  flexible_time_window {
    mode = "OFF"
  }

  target {
    arn      = aws_lambda_function.forwarder.arn
    role_arn = aws_iam_role.scheduled_retry[0].arn
    # Payload delivered to the forwarder on each scheduled invocation.
    input = jsonencode({ retry = true })
  }
}

variables.tf

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,18 @@ variable "dd_store_failed_events" {
370370
description = "Set to true to enable the forwarder to store events that failed to send to Datadog."
371371
}
372372

373+
# Opt-in flag for the scheduled retry of stored failed events.
variable "dd_schedule_retry_failed_events" {
  description = "Set to true to enable a scheduled forwarder invocation (via AWS EventBridge) to process stored failed events."
  type        = bool
  default     = null
}
378+
379+
# Number of hours between two scheduled retry invocations of the forwarder.
variable "dd_schedule_retry_interval" {
  type        = number
  default     = 6
  description = "Interval in hours for scheduled forwarder invocation (via AWS EventBridge)."

  # The value is interpolated into a rate(<n> hours) expression, so reject
  # zero, negative, and fractional values at plan time. An explicit null is
  # tolerated so callers that pass the variable through unset keep working.
  validation {
    condition     = var.dd_schedule_retry_interval == null ? true : (var.dd_schedule_retry_interval >= 1 && floor(var.dd_schedule_retry_interval) == var.dd_schedule_retry_interval)
    error_message = "The dd_schedule_retry_interval value must be a whole number of hours greater than or equal to 1."
  }
}
384+
373385
variable "dd_forwarder_existing_bucket_name" {
374386
type = string
375387
default = null

0 commit comments

Comments
 (0)