From dd4ca382051b61fe7349f2400203a2a5fbe89289 Mon Sep 17 00:00:00 2001 From: Vincent Boutour Date: Wed, 26 Nov 2025 16:43:54 +0100 Subject: [PATCH 1/7] feat(aws): Add scheduled invocation for retry event Signed-off-by: Vincent Boutour --- aws/logs_monitoring/README.md | 8 ++- aws/logs_monitoring/lambda_function.py | 12 ++++- aws/logs_monitoring/template.yaml | 68 ++++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 8 deletions(-) diff --git a/aws/logs_monitoring/README.md b/aws/logs_monitoring/README.md index 493385d7d..1afae95a9 100644 --- a/aws/logs_monitoring/README.md +++ b/aws/logs_monitoring/README.md @@ -92,10 +92,14 @@ If you can't install the Forwarder using the provided CloudFormation template, y 5. Some AWS accounts are configured such that triggers will not automatically create resource-based policies allowing Cloudwatch log groups to invoke the forwarder. Reference the [CloudWatchLogPermissions][103] to see which permissions are required for the forwarder to be invoked by Cloudwatch Log Events. 6. [Configure triggers][104]. 7. Create an S3 bucket, and set environment variable `DD_S3_BUCKET_NAME` to the bucket name. Also provide `s3:GetObject`, `s3:PutObject`, `s3:ListBucket`, and `s3:DeleteObject` permissions on this bucket to the Lambda execution role. This bucket is used to store the different tags cache i.e. Lambda, S3, Step Function and Log Group. Additionally, this bucket will be used to store unforwarded events incase of forwarding exceptions. -8. Set environment variable `DD_STORE_FAILED_EVENTS` to `true` to enable the forwarder to also store event data in the S3 bucket. In case of exceptions when sending logs, metrics or traces to intake, the forwarder will store relevant data in the S3 bucket. On custom invocations i.e. on receiving an event with the `retry` keyword set to a non empty string (which can be manually triggered - see below), the forwarder will retry sending the stored events. 
When successful it will clear up the storage in the bucket. +8. Set environment variable `DD_STORE_FAILED_EVENTS` to `true` to enable the forwarder to also store event data in the S3 bucket. In case of exceptions when sending logs, metrics or traces to intake, the forwarder will store relevant data in the S3 bucket. On custom invocations i.e. on receiving an event with only the `retry` keyword set to true, the forwarder will retry sending the stored events. When successful it will clear up the storage in the bucket. ```bash -aws lambda invoke --function-name --payload '{"retry":"true"}' --cli-binary-format raw-in-base64-out --log-type Tail /dev/stdout +aws lambda invoke --function-name \ + --payload '{"retry":true}' \ + --cli-binary-format raw-in-base64-out \ + --log-type Tail /dev/stdout | + jq -r 'select(.LogResult) | .LogResult' | base64 -d | xargs -0 printf "%s" ```
diff --git a/aws/logs_monitoring/lambda_function.py b/aws/logs_monitoring/lambda_function.py index 154ff6c84..1ca6be234 100644 --- a/aws/logs_monitoring/lambda_function.py +++ b/aws/logs_monitoring/lambda_function.py @@ -62,6 +62,15 @@ def datadog_forwarder(event, context): init_cache_layer(function_prefix) init_forwarder(function_prefix) + if len(event) == 1 and str(event.get(DD_RETRY_KEYWORD, "false")).lower() == "true": + try: + forwarder.retry() + except Exception as e: + if logger.isEnabledFor(logging.DEBUG): + logger.debug(f"Failed to retry forwarding {e}") + + return + parsed = parse(event, context, cache_layer) enriched = enrich(parsed, cache_layer) transformed = transform(enriched) @@ -71,12 +80,11 @@ def datadog_forwarder(event, context): parse_and_submit_enhanced_metrics(logs, cache_layer) try: - if bool(event.get(DD_RETRY_KEYWORD, False)) is True: + if str(event.get(DD_RETRY_KEYWORD, "false")).lower() == "true": forwarder.retry() except Exception as e: if logger.isEnabledFor(logging.DEBUG): logger.debug(f"Failed to retry forwarding {e}") - pass def init_cache_layer(function_prefix): diff --git a/aws/logs_monitoring/template.yaml b/aws/logs_monitoring/template.yaml index 05758fa9f..2a037d12c 100644 --- a/aws/logs_monitoring/template.yaml +++ b/aws/logs_monitoring/template.yaml @@ -261,6 +261,17 @@ Parameters: - true - false Description: Set to true to enable the forwarder to store events that failed to send to Datadog. + DdScheduleRetryFailedEvents: + Type: String + Default: false + AllowedValues: + - true + - false + Description: Set to true to enable a scheduled forwarder invocation (via AWS EventBridge) to process stored failed events. + DdScheduleRetryInterval: + Type: Number + Default: 6 + Description: Interval in hours for scheduled forwarder invocation (via AWS EventBridge). 
DdForwarderExistingBucketName: Type: String Default: "" @@ -292,7 +303,7 @@ Parameters: KmsKeyList: Type: CommaDelimitedList Default: "" - Description: List of KMS Key ARNs the Lambda forwarder function can use to decrypt, seperated by comma + Description: List of KMS Key ARNs the Lambda forwarder function can use to decrypt, seperated by comma Conditions: IsAWSChina: !Equals [!Ref "AWS::Partition", aws-cn] IsGovCloud: !Equals [!Ref "AWS::Partition", aws-us-gov] @@ -348,7 +359,8 @@ Conditions: SetLayerARN: !Not - !Equals [!Ref LayerARN, ""] SetDdForwardLog: !Equals [!Ref DdForwardLog, false] - SetDdStepFunctionsTraceEnabled: !Equals [!Ref DdStepFunctionsTraceEnabled, true] + SetDdStepFunctionsTraceEnabled: + !Equals [!Ref DdStepFunctionsTraceEnabled, true] SetDdUseCompression: !Equals [!Ref DdUseCompression, false] SetDdCompressionLevel: !Not - !Equals [!Ref DdCompressionLevel, 6] @@ -384,6 +396,7 @@ Conditions: - !Equals [!Ref DdLogLevel, ""] SetDdForwarderDecryptKeys: !Not - !Equals [!Join ["", !Ref KmsKeyList], ""] + CreateRetryScheduler: !Equals [!Ref DdScheduleRetryFailedEvents, true] Rules: MustSetDdApiKey: Assertions: @@ -431,7 +444,10 @@ Resources: - !Ref DdForwarderExistingBucketName S3Key: !Sub - "aws-dd-forwarder-${DdForwarderVersion}.zip" - - {DdForwarderVersion: !FindInMap [Constants, DdForwarder, Version]} + - { + DdForwarderVersion: + !FindInMap [Constants, DdForwarder, Version], + } - ZipFile: " " MemorySize: !Ref MemorySize Runtime: python3.13 @@ -831,7 +847,7 @@ Resources: - !Ref SourceZipUrl - !Sub - "https://github.com/DataDog/datadog-serverless-functions/releases/download/aws-dd-forwarder-${DdForwarderVersion}/aws-dd-forwarder-${DdForwarderVersion}.zip" - - {DdForwarderVersion: !FindInMap [Constants, DdForwarder, Version]} + - { DdForwarderVersion: !FindInMap [Constants, DdForwarder, Version] } # The Forwarder's source code is too big to fit the inline code size limit for CloudFormation. 
In most of AWS # partitions and regions, the Forwarder is able to load its source code from a Lambda layer attached to it. # In places where Datadog can't/doesn't yet publish Lambda layers, use another Lambda to copy the source code @@ -970,6 +986,50 @@ Resources: - - "arn:*:s3:::" - !Select [1, !Split ["s3://", !Ref SourceZipUrl]] - !Ref AWS::NoValue + SchedulerRole: + Type: AWS::IAM::Role + Condition: CreateRetryScheduler + Properties: + AssumeRolePolicyDocument: + Version: "2012-10-17" + Statement: + - Action: + - sts:AssumeRole + Effect: Allow + Principal: + Service: !If + - IsAWSChina + - "scheduler.amazonaws.com.cn" + - "scheduler.amazonaws.com" + PermissionsBoundary: !If + - SetPermissionsBoundary + - !Ref PermissionsBoundaryArn + - !Ref AWS::NoValue + Policies: + - PolicyName: ForwarderZipCopierRolePolicy0 + PolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - lambda:InvokeFunction + Resource: + - !GetAtt + - Forwarder + - Arn + Scheduler: + Type: AWS::Scheduler::Schedule + Condition: CreateRetryScheduler + Properties: + Name: !Sub "${AWS::StackName}-retry" + Description: Retry the failed events from the Datadog Lambda Forwarder + ScheduleExpression: !Sub "rate(${DdScheduleRetryInterval} hours)" + FlexibleTimeWindow: + Mode: "OFF" + Target: + Arn: !GetAtt "Forwarder.Arn" + RoleArn: !GetAtt "SchedulerRole.Arn" + Input: '{"retry": true}' Outputs: DatadogForwarderArn: Description: Datadog Forwarder Lambda Function ARN From c366dfe1723c1cae4e108feaf370361bf2cd0aa9 Mon Sep 17 00:00:00 2001 From: Vincent Boutour Date: Thu, 27 Nov 2025 11:11:06 +0100 Subject: [PATCH 2/7] fixup! 
feat(aws): Add scheduled invocation for retry event Signed-off-by: Vincent Boutour --- aws/logs_monitoring/lambda_function.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aws/logs_monitoring/lambda_function.py b/aws/logs_monitoring/lambda_function.py index 1ca6be234..5b7592a23 100644 --- a/aws/logs_monitoring/lambda_function.py +++ b/aws/logs_monitoring/lambda_function.py @@ -63,6 +63,8 @@ def datadog_forwarder(event, context): init_forwarder(function_prefix) if len(event) == 1 and str(event.get(DD_RETRY_KEYWORD, "false")).lower() == "true": + logger.info("Retry-only invocation") + try: forwarder.retry() except Exception as e: From ca4f03326cb41f8aa05c031c1ff4aff8fa4cc15b Mon Sep 17 00:00:00 2001 From: Vincent Boutour Date: Thu, 27 Nov 2025 14:53:46 +0100 Subject: [PATCH 3/7] docs(aws): Update the doc related to retry Co-authored-by: Georgi --- aws/logs_monitoring/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws/logs_monitoring/README.md b/aws/logs_monitoring/README.md index 1afae95a9..30730b8cd 100644 --- a/aws/logs_monitoring/README.md +++ b/aws/logs_monitoring/README.md @@ -92,7 +92,7 @@ If you can't install the Forwarder using the provided CloudFormation template, y 5. Some AWS accounts are configured such that triggers will not automatically create resource-based policies allowing Cloudwatch log groups to invoke the forwarder. Reference the [CloudWatchLogPermissions][103] to see which permissions are required for the forwarder to be invoked by Cloudwatch Log Events. 6. [Configure triggers][104]. 7. Create an S3 bucket, and set environment variable `DD_S3_BUCKET_NAME` to the bucket name. Also provide `s3:GetObject`, `s3:PutObject`, `s3:ListBucket`, and `s3:DeleteObject` permissions on this bucket to the Lambda execution role. This bucket is used to store the different tags cache i.e. Lambda, S3, Step Function and Log Group. Additionally, this bucket will be used to store unforwarded events incase of forwarding exceptions. 
-8. Set environment variable `DD_STORE_FAILED_EVENTS` to `true` to enable the forwarder to also store event data in the S3 bucket. In case of exceptions when sending logs, metrics or traces to intake, the forwarder will store relevant data in the S3 bucket. On custom invocations i.e. on receiving an event with only the `retry` keyword set to true, the forwarder will retry sending the stored events. When successful it will clear up the storage in the bucket. +8. Set environment variable `DD_STORE_FAILED_EVENTS` to `true` to enable the forwarder to also store event data in the S3 bucket. In case of exceptions when sending logs, metrics or traces to intake, the forwarder will store relevant data in the S3 bucket. On custom invocations i.e. on receiving an event with the `retry` keyword explicitly set to `true`, the forwarder will retry sending the stored events. Stored logs will be cleanup upon a successful forwarding. ```bash aws lambda invoke --function-name \ From b1f552eefd4872fda41653a7fef913dc436c42de Mon Sep 17 00:00:00 2001 From: Vincent Boutour Date: Thu, 27 Nov 2025 16:54:26 +0100 Subject: [PATCH 4/7] fixup! 
feat(aws): Add scheduled invocation for retry event Signed-off-by: Vincent Boutour --- aws/logs_monitoring/template.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws/logs_monitoring/template.yaml b/aws/logs_monitoring/template.yaml index 2a037d12c..f608da81e 100644 --- a/aws/logs_monitoring/template.yaml +++ b/aws/logs_monitoring/template.yaml @@ -1006,7 +1006,7 @@ Resources: - !Ref PermissionsBoundaryArn - !Ref AWS::NoValue Policies: - - PolicyName: ForwarderZipCopierRolePolicy0 + - PolicyName: SchedulerRolePolicy0 PolicyDocument: Version: "2012-10-17" Statement: From 30116514054c15da805be2f53b844e026fbfcfd8 Mon Sep 17 00:00:00 2001 From: Vincent Boutour Date: Thu, 27 Nov 2025 18:19:27 +0100 Subject: [PATCH 5/7] docs(aws): Reword section on store failed events Co-authored-by: Janine Chan <64388808+janine-c@users.noreply.github.com> --- aws/logs_monitoring/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws/logs_monitoring/README.md b/aws/logs_monitoring/README.md index 30730b8cd..a21fa3235 100644 --- a/aws/logs_monitoring/README.md +++ b/aws/logs_monitoring/README.md @@ -92,7 +92,7 @@ If you can't install the Forwarder using the provided CloudFormation template, y 5. Some AWS accounts are configured such that triggers will not automatically create resource-based policies allowing Cloudwatch log groups to invoke the forwarder. Reference the [CloudWatchLogPermissions][103] to see which permissions are required for the forwarder to be invoked by Cloudwatch Log Events. 6. [Configure triggers][104]. 7. Create an S3 bucket, and set environment variable `DD_S3_BUCKET_NAME` to the bucket name. Also provide `s3:GetObject`, `s3:PutObject`, `s3:ListBucket`, and `s3:DeleteObject` permissions on this bucket to the Lambda execution role. This bucket is used to store the different tags cache i.e. Lambda, S3, Step Function and Log Group. 
Additionally, this bucket will be used to store unforwarded events incase of forwarding exceptions. -8. Set environment variable `DD_STORE_FAILED_EVENTS` to `true` to enable the forwarder to also store event data in the S3 bucket. In case of exceptions when sending logs, metrics or traces to intake, the forwarder will store relevant data in the S3 bucket. On custom invocations i.e. on receiving an event with the `retry` keyword explicitly set to `true`, the forwarder will retry sending the stored events. Stored logs will be cleanup upon a successful forwarding. +8. Set the environment variable `DD_STORE_FAILED_EVENTS` to `true`, so you can enable the forwarder to also store event data in the S3 bucket. If an exception occurs when sending logs, metrics, or traces to intake, the forwarder stores relevant data in the S3 bucket. On custom invocations, such as on receiving an event with the `retry` keyword explicitly set to `true`, the forwarder retries sending the stored events. Upon a successful forwarding, the forwarder cleans up the stored logs. ```bash aws lambda invoke --function-name \ From f2298b24566e3518ff0dfaaf141f28e459c39d36 Mon Sep 17 00:00:00 2001 From: Vincent Boutour Date: Fri, 28 Nov 2025 10:59:23 +0100 Subject: [PATCH 6/7] docs(aws): Adding more documentation on the retry mechanism Signed-off-by: Vincent Boutour --- aws/logs_monitoring/README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/aws/logs_monitoring/README.md b/aws/logs_monitoring/README.md index a21fa3235..c28a38467 100644 --- a/aws/logs_monitoring/README.md +++ b/aws/logs_monitoring/README.md @@ -316,6 +316,14 @@ Otherwise, if you are using Web Proxy: 7. Set `DdNoSsl` to `true` if connecting to the proxy using `http`. 8. Set `DdSkipSslValidation` to `true` if connecting to the proxy using `https` with a self-signed certificate. 
+### Scheduled retry + +When you enable `DdStoreFailedEvents`, the Lambda forwarder stores any events that couldn’t be sent to Datadog in an S3 bucket. These events can be logs, metrics, or traces. They aren’t automatically re‑processed on each Lambda invocation; instead, you must trigger a [manual Lambda run](https://docs.datadoghq.com/logs/guide/forwarder/?tab=manual) to process them again. + +You can automate this re‑processing by enabling the `DdScheduleRetryFailedEvents` parameter, which creates a scheduled Lambda invocation through [AWS EventBridge](https://docs.aws.amazon.com/lambda/latest/dg/with-eventbridge-scheduler.html). By default, the forwarder attempts re‑processing every six hours. + +Keep in mind that log events can only be submitted with [timestamps up to 18 hours in the past](https://docs.datadoghq.com/logs/log_collection/?tab=host#custom-log-forwarding); older timestamps will cause the events to be discarded. + ### Code signing The Datadog Forwarder is signed by Datadog. To verify the integrity of the Forwarder, use the manual installation method. [Create a Code Signing Configuration][19] that includes Datadog’s Signing Profile ARN (`arn:aws:signer:us-east-1:464622532012:/signing-profiles/DatadogLambdaSigningProfile/9vMI9ZAGLc`) and associate it with the Forwarder Lambda function before uploading the Forwarder ZIP file. @@ -460,6 +468,15 @@ To test different patterns against your logs, turn on [debug logs](#troubleshoot `AdditionalTargetLambdaArns` : Comma separated list of Lambda ARNs that will get called asynchronously with the same `event` the Datadog Forwarder receives. +`DdStoreFailedEvents` +: Set to true to enable the forwarder to store events that failed to send to Datadog. + +`DdScheduleRetryFailedEvents` +: Set to true to enable a scheduled forwarder invocation (via AWS EventBridge) to process stored failed events. + +`DdScheduleRetryInterval` +: Interval in hours for scheduled forwarder invocation (via AWS EventBridge). Defaults to 6. 
+ `InstallAsLayer` : Whether to use the layer-based installation flow. Set to false to use the legacy installation flow, which installs a second function that copies the forwarder code from GitHub to an S3 bucket. Defaults to true. @@ -626,6 +643,9 @@ To test different patterns against your logs, turn on [debug logs](#troubleshoot `ADDITIONAL_TARGET_LAMBDA_ARNS` : Comma separated list of Lambda ARNs that will get called asynchronously with the same `event` the Datadog Forwarder receives. +`DD_STORE_FAILED_EVENTS` +: Set to true to enable the forwarder to store events that failed to send to Datadog. + `INSTALL_AS_LAYER` : Whether to use the layer-based installation flow. Set to false to use the legacy installation flow, which installs a second function that copies the forwarder code from GitHub to an S3 bucket. Defaults to true. From d4ee4ea4be3646a46e8888ec26f4430b0aab7eb3 Mon Sep 17 00:00:00 2001 From: Vincent Boutour Date: Fri, 28 Nov 2025 11:09:55 +0100 Subject: [PATCH 7/7] feat(aws): Ensure both storage and retry are enabled for creating scheduler Signed-off-by: Vincent Boutour --- aws/logs_monitoring/template.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aws/logs_monitoring/template.yaml b/aws/logs_monitoring/template.yaml index f608da81e..ca10346ad 100644 --- a/aws/logs_monitoring/template.yaml +++ b/aws/logs_monitoring/template.yaml @@ -396,7 +396,9 @@ Conditions: - !Equals [!Ref DdLogLevel, ""] SetDdForwarderDecryptKeys: !Not - !Equals [!Join ["", !Ref KmsKeyList], ""] - CreateRetryScheduler: !Equals [!Ref DdScheduleRetryFailedEvents, true] + CreateRetryScheduler: !And + - !Equals [!Ref DdStoreFailedEvents, true] + - !Equals [!Ref DdScheduleRetryFailedEvents, true] Rules: MustSetDdApiKey: Assertions: