Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions .github/workflows/e2e.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
name: e2e

on:
push:
branches: [main]
paths:
- 'src/**'
- 'e2e/**'
- 'package.json'
- 'yarn.lock'
- '.github/workflows/e2e.yml'
pull_request:
paths:
- 'src/**'
- 'e2e/**'
- 'package.json'
- 'yarn.lock'
- '.github/workflows/e2e.yml'
workflow_dispatch:

# One in-flight e2e run per ref; cancel older runs so ephemeral resources don't pile up.
concurrency:
group: e2e-${{ github.ref }}
cancel-in-progress: true

permissions:
contents: read
id-token: write

jobs:
check-lambda-changes:
name: Check lambda changes
runs-on: ubuntu-latest
outputs:
should_run: ${{ steps.filter.outputs.lambda }}
steps:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
- uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4
id: filter
with:
filters: |
lambda:
- 'src/**'
- 'e2e/**'
- '.github/workflows/e2e.yml'

e2e-lambda:
name: Lambda e2e (Node ${{ matrix.node-version }})
runs-on: ubuntu-latest
needs: [check-lambda-changes]
strategy:
matrix:
node-version: [20]
permissions:
contents: read
id-token: write
env:
FORCE_COLOR: 1
steps:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4

- name: Set up Node ${{ matrix.node-version }}
uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
with:
node-version: ${{ matrix.node-version }}

- name: Set up Yarn
run: corepack enable && corepack prepare yarn@4.10.3 --activate

- name: Install plugin dependencies
run: yarn install --immutable

# OIDC federation: GitHub -> AWS. The role is assumable only from this repo's
# workflows and scoped to the e2e sandbox account.
- name: AWS auth (OIDC)
if: needs.check-lambda-changes.outputs.should_run == 'true'
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
role-to-assume: ${{ vars.AWS_ROLE_ARN_E2E }}
aws-region: ${{ vars.AWS_REGION_E2E || 'us-east-1' }}

- name: Install e2e dependencies
run: npm install
working-directory: e2e

# Short-lived Datadog API + App keys via OIDC federation (dd-sts), governed by
# the serverless-plugin-datadog-e2e policy. No static Datadog keys in this repo.
- name: Get Datadog credentials (dd-sts)
id: dd-sts
if: needs.check-lambda-changes.outputs.should_run == 'true'
uses: DataDog/dd-sts-action@2e8187910199bd93129520183c093e19aa585c75 # v1.0.0
with:
policy: serverless-plugin-datadog-e2e

- name: Run e2e suite
run: npm test
working-directory: e2e
env:
DATADOG_API_KEY: ${{ steps.dd-sts.outputs.api_key }}
DATADOG_APP_KEY: ${{ steps.dd-sts.outputs.app_key }}
DATADOG_SITE: ${{ vars.DD_SITE_E2E || 'datadoghq.com' }}
AWS_REGION: ${{ vars.AWS_REGION_E2E || 'us-east-1' }}
# Skipped only when nothing relevant changed. When relevant files change the
# suite runs for real and the dd-sts / AWS OIDC steps must succeed -- an
# auth/federation failure fails the job loudly rather than skipping green.
SKIP_LAMBDA_TESTS: ${{ needs.check-lambda-changes.outputs.should_run != 'true' }}
17 changes: 17 additions & 0 deletions e2e/.env.local.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copy to e2e/.env.local (gitignored) for local runs. Real shell env vars take
# precedence, so this file is purely a convenience.

# Datadog API + APP keys. The API key is wired into the Lambda extension AND used
# to authenticate the API client; the APP key is needed to poll spans/logs.
DATADOG_API_KEY=
DATADOG_APP_KEY=

# Datadog site the org lives in (datadoghq.com, datadoghq.eu, us3.datadoghq.com, ...).
DATADOG_SITE=datadoghq.com

# AWS region to deploy the ephemeral Lambda into. Must have pinned layers in
# ../src/layers.json (us-east-1 is the default).
AWS_REGION=us-east-1

# Set to skip the suite entirely.
# SKIP_LAMBDA_TESTS=true
7 changes: 7 additions & 0 deletions e2e/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
node_modules
.env.local
fixtures/**/node_modules
fixtures/**/.serverless
fixtures/**/package-lock.json
serverless-plugin-datadog-*.tgz
*.log
95 changes: 95 additions & 0 deletions e2e/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# serverless-plugin-datadog e2e suite

End-to-end coverage for the AWS Lambda instrumentation this plugin performs. It
deploys a real, ephemeral Lambda with the plugin enabled, verifies the deployed
config and the telemetry it produces in Datadog, proves re-deploy is idempotent,
then tears the stack down and verifies a clean end-state.

Conforms to the shared contract in `serverless-ci/e2e/spec.md`; mirrors the
`datadog-ci` reference suite (`e2e/cloud-run.test.ts` + `e2e/helpers/*`).

## What it does

```
sls deploy (APPLY: provision + instrument) -> verify CONFIG
-> aws lambda invoke (trigger) -> verify TELEMETRY (traces + logs)
-> sls deploy again -> assert IDEMPOTENT (no diff/dup)
-> sls remove (REMOVE) -> verify CLEAN (function gone)
-> teardown (always, even on failure)
```

For this tool the plugin runs as part of `sls deploy`, so provisioning the
uninstrumented workload and APPLY are the same step. REMOVE deletes the whole
CloudFormation stack, so the clean end-state is the function (and all its DD
config) being absent -- asserted explicitly.

**Config verified** (`helpers/lambda-verifier.ts`): the pinned Datadog Node layer
and extension layer (versions read from `../src/layers.json`, so drift blames the
plugin), the redirected handler with the original preserved in `DD_LAMBDA_HANDLER`,
the required `DD_*` env vars, and the `service` / `env` / `version` / `dd_sls_plugin`
tags. Identity (run-id service name, env, version) is asserted -- not mere presence.

**Telemetry verified** (`helpers/lambda-telemetry-checker.ts`): spans and logs are
polled (15s × 20) filtered by the unique service name, and the matched records must
carry the full identity (service + env + version), not just exist.

## Resource hygiene

Every run uses a unique name `one-e2e-slsplugin-lambda-<runid>` and stamps a
`one_e2e_created:<unix-ts>` tag at creation (`helpers/naming.ts`). The shared
cross-repo sweeper ages out anything older than the grace window. In-test teardown
runs in `afterAll` regardless of outcome.

## Prerequisites

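- **Node 20** and **npm** (the suite is a standalone npm project, isolated from the
plugin's Yarn Berry setup). **Yarn** (via `corepack enable`) must also be available,
because `pretest` builds the plugin with `yarn build` at the repo root.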
- The plugin is built and the fixture is installed automatically by `pretest`
(`npm test` runs `yarn build` at the repo root, then `npm install` in the fixture).
- **AWS auth** with permission to deploy Lambda / CloudFormation in the target
account. Locally, wrap the run with `aws-vault`:
```
aws-vault exec sso-serverless-sandbox-account-admin -- npm test
```
In CI, credentials come from GitHub→AWS OIDC (no static keys).
- **Datadog keys**: `DATADOG_API_KEY` (wired into the extension and used for the API
client) and `DATADOG_APP_KEY` (used to poll spans/logs).

## Run locally

```
cd e2e
cp .env.local.example .env.local # fill in DATADOG_API_KEY / DATADOG_APP_KEY
npm install
aws-vault exec sso-serverless-sandbox-account-admin -- npm test
```

`.env.local` is loaded automatically (real env vars win). Set `SKIP_LAMBDA_TESTS=true`
to skip the suite.

## Configuration

| Env var | Required | Default | Purpose |
| ------------------ | -------- | ---------------- | ---------------------------------------------- |
| `DATADOG_API_KEY` | yes | -- | Wired into the extension + API-client auth |
| `DATADOG_APP_KEY` | yes | -- | API-client auth for span/log polling |
| `DATADOG_SITE` | no | `datadoghq.com` | Datadog site |
| `AWS_REGION` | no | `us-east-1` | Deploy region (must be pinned in `layers.json`)|
| `SKIP_LAMBDA_TESTS`| no | -- | `true` skips the suite |

(AWS credentials come from the ambient AWS env / `aws-vault` / OIDC.)

## CI

`.github/workflows/e2e.yml` runs the suite behind a `dorny/paths-filter` gate
(`src/**`, `e2e/**`, the workflow file) and the `SKIP_LAMBDA_TESTS` flag, with
GitHub→AWS OIDC (`aws-actions/configure-aws-credentials`). Required repo settings:

- Datadog auth (dd-sts): short-lived API + App keys minted at runtime via
[`DataDog/dd-sts-action`](https://github.com/DataDog/dd-sts-action) under the
`serverless-plugin-datadog-e2e` policy -- no static Datadog keys in this repo
- Variables: `AWS_ROLE_ARN_E2E` (the OIDC deploy role), `AWS_REGION_E2E` (default
`us-east-1`), optionally `DD_SITE_E2E`

The OIDC deploy role and the policy backing it are cataloged in
`serverless-ci/e2e/iam-infra.md`.
16 changes: 16 additions & 0 deletions e2e/fixtures/lambda-node/handler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Minimal Node.js workload for the e2e suite. Duplicated from the default
// handler in serverless-self-monitoring (lambda-managed-instances/handlers/default/nodejs),
// with one log line added so a log record is emitted on every invocation.
//
// No tracer setup lives here on purpose: the serverless-plugin-datadog wiring
// (Datadog Node layer + extension + redirected handler) auto-instruments the
// invocation and auto-collects logs. The e2e suite tests that wiring, not the
// runtime, so this handler stays trivial.
exports.handler = async function (_event, _context) {
console.log(`one-e2e serverless-plugin-datadog lambda invocation service=${process.env.DD_SERVICE}`);

return {
statusCode: 200,
body: "hello, world",
};
};
9 changes: 9 additions & 0 deletions e2e/fixtures/lambda-node/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"name": "one-e2e-lambda-node-workload",
"version": "0.0.0",
"private": true,
"description": "Ephemeral Lambda workload deployed by the serverless-plugin-datadog e2e suite. The serverless-plugin-datadog dependency is installed from a packed tarball by ../../setup.sh (not listed here, to avoid a recursive file: link).",
"dependencies": {
"serverless": "3.39.0"
}
}
49 changes: 49 additions & 0 deletions e2e/fixtures/lambda-node/serverless.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Uninstrumented workload + the serverless-plugin-datadog instrumentation it plugs
# into. The plugin runs as part of `sls deploy` (APPLY) and is torn down with the
# whole stack on `sls remove` (REMOVE). Everything that has to be unique or stamped
# per run is injected via env so this file stays static (see e2e/helpers/naming.ts).
#
# frameworkVersion is pinned to 3 to avoid the Serverless Framework v4 login/license
# flow in CI. The runtime is pinned to one canonical Node.js version (see RULES in
# the spec: one canonical runtime per platform).
service: ${env:E2E_SERVICE_NAME}

frameworkVersion: "3"

plugins:
- serverless-plugin-datadog

provider:
name: aws
runtime: nodejs20.x
region: ${env:AWS_REGION, "us-east-1"}
stage: e2e
# Hygiene: freshness tag set atomically at creation, on both the CloudFormation
# stack and every resource, so the cross-repo sweeper can age it out.
stackTags:
one_e2e_created: ${env:E2E_CREATED_TS}
tags:
one_e2e_created: ${env:E2E_CREATED_TS}

custom:
datadog:
# API key wiring + site: required for the extension to ship telemetry.
apiKey: ${env:DD_API_KEY}
site: ${env:DD_SITE, "datadoghq.com"}
# Identity: service carries the unique run id, env + version are fixed markers.
# These flow onto ingested telemetry and let us assert identity, not existence.
service: ${env:E2E_SERVICE_NAME}
env: e2e
version: "1.0.0"
# The mechanism under test: library layer + extension layer + tracing + logs.
addLayers: true
addExtension: true
enableDDTracing: true
enableDDLogs: true
# Keep the deploy hermetic: no git metadata upload (needs app key + git ctx).
enableSourceCodeIntegration: false
uploadGitMetadata: false

functions:
hello:
handler: handler.handler
89 changes: 89 additions & 0 deletions e2e/helpers/exec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import child_process from 'node:child_process';

// Runner-agnostic: no jest/vitest imports here so the same helpers can back any
// test runner. Mirrors the datadog-ci reference helper (e2e/helpers/exec.ts).

/** Outcome of a shell command run through `execPromise`. */
export interface ExecResult {
  /**
   * Exit status of the command: 0 on success, the numeric exit code on a normal
   * failure, and 1 when the failure carries no numeric code (signal, output
   * overflow, spawn error).
   */
  exitCode: number;
  /** Captured standard output, trimmed of surrounding whitespace. */
  stdout: string;
  /**
   * Captured standard error, trimmed. When the failure carries no numeric exit
   * code, the failure reason is appended so it is not lost.
   */
  stderr: string;
}

/** Optional settings for `execPromise`. */
export interface ExecOptions {
  /** Extra environment variables, spread over `process.env` (these keys win). */
  env?: Record<string, string | undefined>;
  /** Working directory the command is run from. */
  cwd?: string;
  /**
   * Upper bound passed to `child_process.exec` for captured output. Defaults to
   * 50 MiB: Serverless / AWS calls can be very verbose, so the default is
   * generous but still bounded.
   */
  maxBuffer?: number;
}

/**
 * Runs `command` through `child_process.exec` and always resolves (never
 * rejects) with its exit code and trimmed output.
 *
 * @param command - Shell command line to execute.
 * @param options - Environment overrides, working directory and output cap.
 * @returns The exit code plus trimmed stdout/stderr of the command.
 */
export const execPromise = async (command: string, options: ExecOptions = {}): Promise<ExecResult> => {
  const {env, cwd, maxBuffer = 50 * 1024 * 1024} = options;

  return new Promise((resolve) => {
    child_process.exec(command, {env: {...process.env, ...env}, cwd, maxBuffer}, (error, stdout, stderr) => {
      const out = stdout.trim();
      const err = stderr.trim();

      if (!error) {
        resolve({exitCode: 0, stdout: out, stderr: err});

        return;
      }

      if (typeof error.code === 'number') {
        // Ordinary non-zero exit: the command's own stderr already explains it.
        resolve({exitCode: error.code, stdout: out, stderr: err});

        return;
      }

      // No numeric exit status: the child was killed by a signal, overflowed
      // maxBuffer, or could not be spawned. The command's stderr usually says
      // nothing about that, so surface the reason instead of dropping it.
      const reason = error.signal ? `Command terminated by signal ${error.signal}` : error.message.trim();
      resolve({exitCode: 1, stdout: out, stderr: err ? `${err}\n${reason}` : reason});
    });
  });
};

// Output fragments that mark a transient cloud-provider failure (throttling,
// timeouts, conflicts). Only the cloud call is retried on these -- assertions
// never are. The list is the shared cross-cloud set followed by AWS-specific ones.
const GENERIC_RETRYABLE_PATTERNS = [
  'GatewayTimeout',
  'Operation was canceled',
  'ETIMEDOUT',
  'ECONNRESET',
  'temporarily unavailable',
];

// AWS Lambda / CloudFormation / STS
const AWS_RETRYABLE_PATTERNS = [
  'ThrottlingException',
  'TooManyRequestsException',
  'Rate exceeded',
  'RequestLimitExceeded',
  'ResourceConflictException',
  'ServiceException',
  'InternalFailure',
  'ServiceUnavailable',
  'is in progress', // CloudFormation stack op already running
  'ProvisionedConcurrencyConfig', // eventual-consistency churn on update
];

const RETRYABLE_PATTERNS = [...GENERIC_RETRYABLE_PATTERNS, ...AWS_RETRYABLE_PATTERNS];

/**
 * Reports whether a command result looks like a transient failure: true when
 * stdout and stderr, joined by a single space, contain any retryable pattern
 * (case-sensitive substring match).
 */
const isRetryable = (result: ExecResult): boolean => {
  const combined = result.stdout + ' ' + result.stderr;

  for (const needle of RETRYABLE_PATTERNS) {
    if (combined.includes(needle)) {
      return true;
    }
  }

  return false;
};

/** Returns a promise that resolves (with no value) once `seconds` seconds have elapsed. */
const waitFor = (seconds: number): Promise<void> =>
  new Promise((done) => {
    setTimeout(done, seconds * 1000);
  });

/**
 * Runs `command` via `execPromise`, re-running it when it fails with output
 * that `isRetryable` recognises as transient.
 *
 * @param command - Shell command line to execute.
 * @param options - Passed through unchanged to every `execPromise` call.
 * @param retry - `maxAttempts` (default 3) caps the total number of runs;
 *   `delaySeconds` (default 10) is the pause between runs.
 * @returns The first successful result, otherwise the result of the last run.
 *   If no run happens at all (`maxAttempts` below 1), a placeholder failure
 *   with stderr `'not run'`.
 */
export const execPromiseWithRetries = async (
  command: string,
  options: ExecOptions = {},
  {maxAttempts = 3, delaySeconds = 10}: {maxAttempts?: number; delaySeconds?: number} = {},
): Promise<ExecResult> => {
  let lastResult: ExecResult = {exitCode: 1, stdout: '', stderr: 'not run'};
  let attempt = 1;

  while (attempt <= maxAttempts) {
    lastResult = await execPromise(command, options);

    const succeeded = lastResult.exitCode === 0;
    const attemptsLeft = attempt < maxAttempts;
    // Stop on success, on the final attempt, or on a failure that is not transient.
    if (succeeded || !attemptsLeft || !isRetryable(lastResult)) {
      return lastResult;
    }

    const report = [
      `Command failed with retryable error (attempt ${attempt}/${maxAttempts}), retrying in ${delaySeconds}s...`,
      `stdout: ${lastResult.stdout}`,
      `stderr: ${lastResult.stderr}`,
    ];
    for (const line of report) {
      // eslint-disable-next-line no-console
      console.log(line);
    }

    await waitFor(delaySeconds);
    attempt += 1;
  }

  return lastResult;
};
Loading
Loading