diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml new file mode 100644 index 00000000..5e5e45bc --- /dev/null +++ b/.github/workflows/e2e.yml @@ -0,0 +1,106 @@ +name: e2e + +on: + push: + branches: [main] + paths: + - 'src/**' + - 'e2e/**' + - 'package.json' + - 'yarn.lock' + - '.github/workflows/e2e.yml' + pull_request: + paths: + - 'src/**' + - 'e2e/**' + - 'package.json' + - 'yarn.lock' + - '.github/workflows/e2e.yml' + workflow_dispatch: + +# One in-flight e2e run per ref; cancel older runs so ephemeral resources don't pile up. +concurrency: + group: e2e-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + id-token: write + +jobs: + check-lambda-changes: + name: Check lambda changes + runs-on: ubuntu-latest + outputs: + should_run: ${{ steps.filter.outputs.lambda }} + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4 + id: filter + with: + filters: | + lambda: + - 'src/**' + - 'e2e/**' + - '.github/workflows/e2e.yml' + + e2e-lambda: + name: Lambda e2e (Node ${{ matrix.node-version }}) + runs-on: ubuntu-latest + needs: [check-lambda-changes] + strategy: + matrix: + node-version: [20] + permissions: + contents: read + id-token: write + env: + FORCE_COLOR: 1 + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - name: Set up Node ${{ matrix.node-version }} + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + with: + node-version: ${{ matrix.node-version }} + + - name: Set up Yarn + run: corepack enable && corepack prepare yarn@4.10.3 --activate + + - name: Install plugin dependencies + run: yarn install --immutable + + # OIDC federation: GitHub -> AWS. The role is assumable only from this repo's + # workflows and scoped to the e2e sandbox account. 
+ - name: AWS auth (OIDC) + if: needs.check-lambda-changes.outputs.should_run == 'true' + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 + with: + role-to-assume: ${{ vars.AWS_ROLE_ARN_E2E }} + aws-region: ${{ vars.AWS_REGION_E2E || 'us-east-1' }} + + - name: Install e2e dependencies + run: npm install + working-directory: e2e + + # Short-lived Datadog API + App keys via OIDC federation (dd-sts), governed by + # the serverless-plugin-datadog-e2e policy. No static Datadog keys in this repo. + - name: Get Datadog credentials (dd-sts) + id: dd-sts + if: needs.check-lambda-changes.outputs.should_run == 'true' + uses: DataDog/dd-sts-action@2e8187910199bd93129520183c093e19aa585c75 # v1.0.0 + with: + policy: serverless-plugin-datadog-e2e + + - name: Run e2e suite + run: npm test + working-directory: e2e + env: + DATADOG_API_KEY: ${{ steps.dd-sts.outputs.api_key }} + DATADOG_APP_KEY: ${{ steps.dd-sts.outputs.app_key }} + DATADOG_SITE: ${{ vars.DD_SITE_E2E || 'datadoghq.com' }} + AWS_REGION: ${{ vars.AWS_REGION_E2E || 'us-east-1' }} + # Skipped only when nothing relevant changed. When relevant files change the + # suite runs for real and the dd-sts / AWS OIDC steps must succeed -- an + # auth/federation failure fails the job loudly rather than skipping green. + SKIP_LAMBDA_TESTS: ${{ needs.check-lambda-changes.outputs.should_run != 'true' }} diff --git a/e2e/.env.local.example b/e2e/.env.local.example new file mode 100644 index 00000000..5f0b76ae --- /dev/null +++ b/e2e/.env.local.example @@ -0,0 +1,17 @@ +# Copy to e2e/.env.local (gitignored) for local runs. Real shell env vars take +# precedence, so this file is purely a convenience. + +# Datadog API + APP keys. The API key is wired into the Lambda extension AND used +# to authenticate the API client; the APP key is needed to poll spans/logs. +DATADOG_API_KEY= +DATADOG_APP_KEY= + +# Datadog site the org lives in (datadoghq.com, datadoghq.eu, us3.datadoghq.com, ...). 
+DATADOG_SITE=datadoghq.com + +# AWS region to deploy the ephemeral Lambda into. Must have pinned layers in +# ../src/layers.json (us-east-1 is the default). +AWS_REGION=us-east-1 + +# Set to skip the suite entirely. +# SKIP_LAMBDA_TESTS=true diff --git a/e2e/.gitignore b/e2e/.gitignore new file mode 100644 index 00000000..3c16a963 --- /dev/null +++ b/e2e/.gitignore @@ -0,0 +1,7 @@ +node_modules +.env.local +fixtures/**/node_modules +fixtures/**/.serverless +fixtures/**/package-lock.json +serverless-plugin-datadog-*.tgz +*.log diff --git a/e2e/README.md b/e2e/README.md new file mode 100644 index 00000000..73a6b31c --- /dev/null +++ b/e2e/README.md @@ -0,0 +1,95 @@ +# serverless-plugin-datadog e2e suite + +End-to-end coverage for the AWS Lambda instrumentation this plugin performs. It +deploys a real, ephemeral Lambda with the plugin enabled, verifies the deployed +config and the telemetry it produces in Datadog, proves re-deploy is idempotent, +then tears the stack down and verifies a clean end-state. + +Conforms to the shared contract in `serverless-ci/e2e/spec.md`; mirrors the +`datadog-ci` reference suite (`e2e/cloud-run.test.ts` + `e2e/helpers/*`). + +## What it does + +``` +sls deploy (APPLY: provision + instrument) -> verify CONFIG + -> aws lambda invoke (trigger) -> verify TELEMETRY (traces + logs) + -> sls deploy again -> assert IDEMPOTENT (no diff/dup) + -> sls remove (REMOVE) -> verify CLEAN (function gone) + -> teardown (always, even on failure) +``` + +For this tool the plugin runs as part of `sls deploy`, so provisioning the +uninstrumented workload and APPLY are the same step. REMOVE deletes the whole +CloudFormation stack, so the clean end-state is the function (and all its DD +config) being absent -- asserted explicitly. 
+ +**Config verified** (`helpers/lambda-verifier.ts`): the pinned Datadog Node layer +and extension layer (versions read from `../src/layers.json`, so drift blames the +plugin), the redirected handler with the original preserved in `DD_LAMBDA_HANDLER`, +the required `DD_*` env vars, and the `dd_sls_plugin` / `one_e2e_created` function +tags. Identity (run-id `DD_SERVICE`, `DD_ENV`, `DD_VERSION`) is asserted -- not mere presence. + +**Telemetry verified** (`helpers/lambda-telemetry-checker.ts`): spans and logs are +polled (15s × 20) filtered by the unique service name, and the matched records must +carry the full identity (service + env, plus version on spans), not just exist. + +## Resource hygiene + +Every run uses a unique name `one-e2e-slsplugin-lambda-<run-id>` and stamps a +`one_e2e_created:<epoch-seconds>` tag at creation (`helpers/naming.ts`). The shared +cross-repo sweeper ages out anything older than the grace window. In-test teardown +runs in `afterAll` regardless of outcome. + +## Prerequisites + +- **Node 20** and **npm** (the suite is a standalone npm project, isolated from the + plugin's Yarn Berry setup). +- The plugin is built and the fixture is installed automatically by `pretest` + (`npm test` runs `yarn build` at the repo root, then `npm install` in the fixture). +- **AWS auth** with permission to deploy Lambda / CloudFormation in the target + account. Locally, wrap the run with `aws-vault`: + ``` + aws-vault exec sso-serverless-sandbox-account-admin -- npm test + ``` + In CI, credentials come from GitHub→AWS OIDC (no static keys). +- **Datadog keys**: `DATADOG_API_KEY` (wired into the extension and used for the API + client) and `DATADOG_APP_KEY` (used to poll spans/logs). + +## Run locally + +``` +cd e2e +cp .env.local.example .env.local # fill in DATADOG_API_KEY / DATADOG_APP_KEY +npm install +aws-vault exec sso-serverless-sandbox-account-admin -- npm test +``` + +`.env.local` is loaded automatically (real env vars win). Set `SKIP_LAMBDA_TESTS=true` +to skip the suite. 
+ +## Configuration + +| Env var | Required | Default | Purpose | +| ------------------ | -------- | ---------------- | ---------------------------------------------- | +| `DATADOG_API_KEY` | yes | -- | Wired into the extension + API-client auth | +| `DATADOG_APP_KEY` | yes | -- | API-client auth for span/log polling | +| `DATADOG_SITE` | no | `datadoghq.com` | Datadog site | +| `AWS_REGION` | no | `us-east-1` | Deploy region (must be pinned in `layers.json`)| +| `SKIP_LAMBDA_TESTS`| no | -- | `true` skips the suite | + +(AWS credentials come from the ambient AWS env / `aws-vault` / OIDC.) + +## CI + +`.github/workflows/e2e.yml` runs the suite behind a `dorny/paths-filter` gate +(`src/**`, `e2e/**`, the workflow file) and the `SKIP_LAMBDA_TESTS` flag, with +GitHub→AWS OIDC (`aws-actions/configure-aws-credentials`). Required repo settings: + +- Datadog auth (dd-sts): short-lived API + App keys minted at runtime via + [`DataDog/dd-sts-action`](https://github.com/DataDog/dd-sts-action) under the + `serverless-plugin-datadog-e2e` policy -- no static Datadog keys in this repo +- Variables: `AWS_ROLE_ARN_E2E` (the OIDC deploy role), `AWS_REGION_E2E` (default + `us-east-1`), optionally `DD_SITE_E2E` + +The OIDC deploy role and the policy backing it are cataloged in +`serverless-ci/e2e/iam-infra.md`. diff --git a/e2e/fixtures/lambda-node/handler.js b/e2e/fixtures/lambda-node/handler.js new file mode 100644 index 00000000..ad3214bc --- /dev/null +++ b/e2e/fixtures/lambda-node/handler.js @@ -0,0 +1,16 @@ +// Minimal Node.js workload for the e2e suite. Duplicated from the default +// handler in serverless-self-monitoring (lambda-managed-instances/handlers/default/nodejs), +// with one log line added so a log record is emitted on every invocation. +// +// No tracer setup lives here on purpose: the serverless-plugin-datadog wiring +// (Datadog Node layer + extension + redirected handler) auto-instruments the +// invocation and auto-collects logs. 
The e2e suite tests that wiring, not the +// runtime, so this handler stays trivial. +exports.handler = async function (_event, _context) { + console.log(`one-e2e serverless-plugin-datadog lambda invocation service=${process.env.DD_SERVICE}`); + + return { + statusCode: 200, + body: "hello, world", + }; +}; diff --git a/e2e/fixtures/lambda-node/package.json b/e2e/fixtures/lambda-node/package.json new file mode 100644 index 00000000..c4817b59 --- /dev/null +++ b/e2e/fixtures/lambda-node/package.json @@ -0,0 +1,9 @@ +{ + "name": "one-e2e-lambda-node-workload", + "version": "0.0.0", + "private": true, + "description": "Ephemeral Lambda workload deployed by the serverless-plugin-datadog e2e suite. The serverless-plugin-datadog dependency is installed from a packed tarball by ../../setup.sh (not listed here, to avoid a recursive file: link).", + "dependencies": { + "serverless": "3.39.0" + } +} diff --git a/e2e/fixtures/lambda-node/serverless.yml b/e2e/fixtures/lambda-node/serverless.yml new file mode 100644 index 00000000..b794e094 --- /dev/null +++ b/e2e/fixtures/lambda-node/serverless.yml @@ -0,0 +1,49 @@ +# Uninstrumented workload + the serverless-plugin-datadog instrumentation it plugs +# into. The plugin runs as part of `sls deploy` (APPLY) and is torn down with the +# whole stack on `sls remove` (REMOVE). Everything that has to be unique or stamped +# per run is injected via env so this file stays static (see e2e/helpers/naming.ts). +# +# frameworkVersion is pinned to 3 to avoid the Serverless Framework v4 login/license +# flow in CI. The runtime is pinned to one canonical Node.js version (see RULES in +# the spec: one canonical runtime per platform). 
+service: ${env:E2E_SERVICE_NAME} + +frameworkVersion: "3" + +plugins: + - serverless-plugin-datadog + +provider: + name: aws + runtime: nodejs20.x + region: ${env:AWS_REGION, "us-east-1"} + stage: e2e + # Hygiene: freshness tag set atomically at creation, on both the CloudFormation + # stack and every resource, so the cross-repo sweeper can age it out. + stackTags: + one_e2e_created: ${env:E2E_CREATED_TS} + tags: + one_e2e_created: ${env:E2E_CREATED_TS} + +custom: + datadog: + # API key wiring + site: required for the extension to ship telemetry. + apiKey: ${env:DD_API_KEY} + site: ${env:DD_SITE, "datadoghq.com"} + # Identity: service carries the unique run id, env + version are fixed markers. + # These flow onto ingested telemetry and let us assert identity, not existence. + service: ${env:E2E_SERVICE_NAME} + env: e2e + version: "1.0.0" + # The mechanism under test: library layer + extension layer + tracing + logs. + addLayers: true + addExtension: true + enableDDTracing: true + enableDDLogs: true + # Keep the deploy hermetic: no git metadata upload (needs app key + git ctx). + enableSourceCodeIntegration: false + uploadGitMetadata: false + +functions: + hello: + handler: handler.handler diff --git a/e2e/helpers/exec.ts b/e2e/helpers/exec.ts new file mode 100644 index 00000000..277299a5 --- /dev/null +++ b/e2e/helpers/exec.ts @@ -0,0 +1,89 @@ +import child_process from 'node:child_process'; + +// Runner-agnostic: no jest/vitest imports here so the same helpers can back any +// test runner. Mirrors the datadog-ci reference helper (e2e/helpers/exec.ts). + +export interface ExecResult { + exitCode: number; + stdout: string; + stderr: string; +} + +export interface ExecOptions { + env?: Record; + // Serverless / AWS calls can run long; default generous but bounded. 
+ cwd?: string; + maxBuffer?: number; +} + +export const execPromise = async (command: string, options: ExecOptions = {}): Promise => { + const {env, cwd, maxBuffer = 50 * 1024 * 1024} = options; + + return new Promise((resolve) => { + child_process.exec(command, {env: {...process.env, ...env}, cwd, maxBuffer}, (error, stdout, stderr) => { + resolve({ + exitCode: error ? (typeof error.code === 'number' ? error.code : 1) : 0, + stdout: stdout.trim(), + stderr: stderr.trim(), + }); + }); + }); +}; + +// Transient cloud-provider errors that are safe to retry. "Retry the cloud, not +// the assertions" -- these are throttling/timeout/conflict signals, never real +// failures. AWS-specific patterns are added on top of the shared cross-cloud set. +const RETRYABLE_PATTERNS = [ + // Generic / cross-cloud + 'GatewayTimeout', + 'Operation was canceled', + 'ETIMEDOUT', + 'ECONNRESET', + 'temporarily unavailable', + // AWS Lambda / CloudFormation / STS + 'ThrottlingException', + 'TooManyRequestsException', + 'Rate exceeded', + 'RequestLimitExceeded', + 'ResourceConflictException', + 'ServiceException', + 'InternalFailure', + 'ServiceUnavailable', + 'is in progress', // CloudFormation stack op already running + 'ProvisionedConcurrencyConfig', // eventual-consistency churn on update +]; + +const isRetryable = (result: ExecResult): boolean => { + const output = `${result.stdout} ${result.stderr}`; + + return RETRYABLE_PATTERNS.some((pattern) => output.includes(pattern)); +}; + +const waitFor = (seconds: number): Promise => new Promise((resolve) => setTimeout(resolve, seconds * 1000)); + +export const execPromiseWithRetries = async ( + command: string, + options: ExecOptions = {}, + {maxAttempts = 3, delaySeconds = 10}: {maxAttempts?: number; delaySeconds?: number} = {}, +): Promise => { + let result: ExecResult = {exitCode: 1, stdout: '', stderr: 'not run'}; + for (let attempt = 1; attempt <= maxAttempts; attempt++) { + result = await execPromise(command, options); + if 
(result.exitCode === 0) { + return result; + } + if (attempt < maxAttempts && isRetryable(result)) { + // eslint-disable-next-line no-console + console.log(`Command failed with retryable error (attempt ${attempt}/${maxAttempts}), retrying in ${delaySeconds}s...`); + // eslint-disable-next-line no-console + console.log(`stdout: ${result.stdout}`); + // eslint-disable-next-line no-console + console.log(`stderr: ${result.stderr}`); + await waitFor(delaySeconds); + } else { + return result; + } + } + + return result; +}; diff --git a/e2e/helpers/lambda-telemetry-checker.ts b/e2e/helpers/lambda-telemetry-checker.ts new file mode 100644 index 00000000..dcda235a --- /dev/null +++ b/e2e/helpers/lambda-telemetry-checker.ts @@ -0,0 +1,118 @@ +import {client, v2} from '@datadog/datadog-api-client'; + +// Runner-agnostic telemetry poller. Mirrors the datadog-ci reference +// (cloud-run-telemetry-checker.ts): poll spans + logs on a bounded budget, then +// assert *identity* on the matched records, not mere existence. + +const POLL_INTERVAL_SECONDS = 15; +const MAX_ATTEMPTS = 20; + +const waitFor = (seconds: number): Promise => new Promise((resolve) => setTimeout(resolve, seconds * 1000)); + +const buildConfiguration = (): client.Configuration => { + const configuration = client.createConfiguration({ + authMethods: { + apiKeyAuth: process.env.DATADOG_API_KEY ?? process.env.DD_API_KEY, + appKeyAuth: process.env.DATADOG_APP_KEY ?? process.env.DD_APP_KEY, + }, + }); + const site = process.env.DATADOG_SITE ?? process.env.DD_SITE; + if (site) { + configuration.setServerVariables({site}); + } + + return configuration; +}; + +// Poll until at least one returned record carries every identity marker. We filter +// in-process (rather than trusting the query alone) by substring-matching each marker +// against the serialized record. NOTE(review): the env marker `e2e` is itself a substring of the `one-e2e-*` service name, so it cannot fail on its own -- consider matching `env:e2e`. 
+const pollUntilIdentity = async ( + label: string, + query: () => Promise, + markers: string[], +): Promise => { + for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) { + // eslint-disable-next-line no-console + console.log(`[${label}] attempt ${attempt}/${MAX_ATTEMPTS}`); + try { + const results = await query(); + const matching = results.filter((record) => { + const serialized = JSON.stringify(record); + + return markers.every((marker) => serialized.includes(marker)); + }); + if (matching.length > 0) { + // eslint-disable-next-line no-console + console.log(`[${label}] found ${matching.length} record(s) with identity [${markers.join(', ')}]`); + + return; + } + } catch (error) { + // eslint-disable-next-line no-console + console.error(`[${label}] query error:`, error); + } + + if (attempt < MAX_ATTEMPTS) { + await waitFor(POLL_INTERVAL_SECONDS); + } + } + throw new Error( + `[${label}] timed out after ${MAX_ATTEMPTS} attempts (${MAX_ATTEMPTS * POLL_INTERVAL_SECONDS}s) ` + + `waiting for telemetry with identity [${markers.join(', ')}]`, + ); +}; + +const recentWindow = (): {from: string; to: string} => { + const now = new Date(); + const from = new Date(now.getTime() - 15 * 60 * 1000); + + return {from: from.toISOString(), to: now.toISOString()}; +}; + +const querySpans = async (configuration: client.Configuration, serviceName: string): Promise => { + const api = new v2.SpansApi(configuration); + const {from, to} = recentWindow(); + const response = await api.listSpans({ + body: { + data: { + attributes: { + filter: {query: `@service:${serviceName}`, from, to}, + page: {limit: 25}, + }, + type: 'search_request', + }, + }, + }); + + return response.data ?? 
[]; +}; + +const queryLogs = async (configuration: client.Configuration, serviceName: string): Promise => { + const api = new v2.LogsApi(configuration); + const {from, to} = recentWindow(); + const response = await api.listLogs({ + body: { + filter: {query: `service:${serviceName}`, from, to}, + page: {limit: 25}, + }, + }); + + return response.data ?? []; +}; + +export interface TelemetryIdentity { + serviceName: string; + env: string; + version: string; +} + +export const checkTelemetryFlowing = async ({serviceName, env, version}: TelemetryIdentity): Promise => { + const configuration = buildConfiguration(); + await Promise.all([ + // Traces carry service + env + version identity. + pollUntilIdentity('spans', () => querySpans(configuration, serviceName), [serviceName, env, version]), + // Logs carry service + env identity. + pollUntilIdentity('logs', () => queryLogs(configuration, serviceName), [serviceName, env]), + ]); +}; diff --git a/e2e/helpers/lambda-verifier.ts b/e2e/helpers/lambda-verifier.ts new file mode 100644 index 00000000..93db3316 --- /dev/null +++ b/e2e/helpers/lambda-verifier.ts @@ -0,0 +1,186 @@ +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import {fileURLToPath} from 'node:url'; + +import {execPromise} from './exec'; +import {FRESHNESS_TAG_KEY} from './naming'; + +// Runner-agnostic config verifier: drives the AWS CLI and asserts the deployed +// function's identity with node:assert. "Config present" for Lambda (per spec) = +// DD layers + extension layer + DD_* env vars + tags. + +// One canonical runtime for the platform. +export const RUNTIME = 'nodejs20.x'; + +// The handler the plugin redirects to when wrapping a Node function with layers, +// and the env var that stores the user's original handler. See src/wrapper.ts. 
+const REDIRECT_HANDLER = '/opt/nodejs/node_modules/datadog-lambda-js/handler.handler'; +const ORIGINAL_HANDLER = 'handler.handler'; + +// DD_* env vars the plugin must wire for a traced + logging Node function with the +// extension. We assert identity on DD_SERVICE/DD_ENV/DD_VERSION, presence on the +// rest. API key wiring is one of several mutually-exclusive forms. +const API_KEY_VARS = ['DD_API_KEY', 'DD_API_KEY_SECRET_ARN', 'DD_KMS_API_KEY', 'DD_API_KEY_SSM_ARN']; +const REQUIRED_PRESENT = ['DD_SITE', 'DD_TRACE_ENABLED', 'DD_SERVERLESS_LOGS_ENABLED']; + +// Tag keys. With the extension enabled, the plugin carries service/env/version +// identity as DD_* env vars (asserted below + on telemetry), and tags the function +// with its own version marker only. See src/index.ts: addExtension routes to +// addDDEnvVars (env-var identity), and addTags(shouldAddTags = addExtension !== true) +// applies just the plugin tag. +const TAG_PLUGIN = 'dd_sls_plugin'; + +interface LambdaLayer { + Arn: string; +} +interface LambdaConfiguration { + FunctionArn: string; + Handler: string; + Runtime: string; + Layers?: LambdaLayer[]; + Environment?: {Variables?: Record}; +} + +export interface FunctionSnapshot { + handler: string; + layerArns: string[]; + ddEnv: Record; +} + +// `sls deploy` names functions `<service>-<stage>-<function>`. Stage is pinned to `e2e` +// and the only function is `hello`. +export const functionName = (serviceName: string): string => `${serviceName}-e2e-hello`; + +interface ExpectedLayers { + node: string; + extension: string; +} + +// Pinned artifact versions come from the plugin's own src/layers.json, so a version +// mismatch blames the plugin/registry, not upstream drift. 
+export const expectedLayerArns = (region: string): ExpectedLayers => { + const layersPath = fileURLToPath(new URL('../../src/layers.json', import.meta.url)); + const layers = JSON.parse(fs.readFileSync(layersPath, 'utf-8')) as { + regions: Record>; + }; + const regionLayers = layers.regions[region]; + assert.ok(regionLayers, `region ${region} not present in src/layers.json`); + const node = regionLayers[RUNTIME]; + const extension = regionLayers.extension; + assert.ok(node, `no ${RUNTIME} layer pinned for ${region} in src/layers.json`); + assert.ok(extension, `no extension layer pinned for ${region} in src/layers.json`); + + return {node, extension}; +}; + +const getConfiguration = async (fnName: string, region: string): Promise => { + const result = await execPromise( + `aws lambda get-function-configuration --function-name "${fnName}" --region "${region}" --output json`, + ); + assert.equal(result.exitCode, 0, `get-function-configuration failed: ${result.stderr}`); + + return JSON.parse(result.stdout) as LambdaConfiguration; +}; + +const getTags = async (functionArn: string, region: string): Promise> => { + const result = await execPromise( + `aws lambda list-tags --resource "${functionArn}" --region "${region}" --output json`, + ); + assert.equal(result.exitCode, 0, `list-tags failed: ${result.stderr}`); + + return (JSON.parse(result.stdout).Tags ?? {}) as Record; +}; + +// A normalized view used to assert idempotency (re-apply produces no diff). +export const functionSnapshot = async (fnName: string, region: string): Promise => { + const config = await getConfiguration(fnName, region); + const vars = config.Environment?.Variables ?? {}; + const ddEnv: Record = {}; + for (const key of Object.keys(vars).sort()) { + if (key.startsWith('DD_')) { + ddEnv[key] = vars[key]; + } + } + + return { + handler: config.Handler, + layerArns: (config.Layers ?? 
[]).map((l) => l.Arn).sort(), + ddEnv, + }; +}; + +export const verifyInstrumented = async (serviceName: string, region: string): Promise => { + const fnName = functionName(serviceName); + // eslint-disable-next-line no-console + console.log(`Verifying instrumented state of "${fnName}"...`); + const config = await getConfiguration(fnName, region); + const vars = config.Environment?.Variables ?? {}; + const layerArns = (config.Layers ?? []).map((l) => l.Arn); + const expected = expectedLayerArns(region); + + // Handler is redirected to the Datadog wrapper, original is preserved. + assert.equal(config.Handler, REDIRECT_HANDLER, 'handler not redirected to the Datadog wrapper'); + assert.equal( + vars.DD_LAMBDA_HANDLER, + ORIGINAL_HANDLER, + 'DD_LAMBDA_HANDLER should hold the original handler (a different value means a double-wrap)', + ); + + // Layers: library layer + extension layer, each present exactly once (no dup), + // pinned to the versions in src/layers.json. + assert.ok( + layerArns.includes(expected.node), + `missing pinned Node layer ${expected.node}; got ${JSON.stringify(layerArns)}`, + ); + assert.ok( + layerArns.includes(expected.extension), + `missing pinned extension layer ${expected.extension}; got ${JSON.stringify(layerArns)}`, + ); + assert.equal(layerArns.filter((a) => a === expected.node).length, 1, 'Node layer attached more than once'); + assert.equal(layerArns.filter((a) => a === expected.extension).length, 1, 'extension layer attached more than once'); + + // Env: API key wiring + required DD_* vars + identity. 
+ assert.ok( + API_KEY_VARS.some((k) => vars[k]), + `no API key wiring env var set (one of ${API_KEY_VARS.join(', ')})`, + ); + for (const key of REQUIRED_PRESENT) { + assert.ok(vars[key], `missing required env var ${key}`); + } + assert.equal(vars.DD_TRACE_ENABLED, 'true', 'DD_TRACE_ENABLED should be true'); + assert.equal(vars.DD_SERVERLESS_LOGS_ENABLED, 'true', 'DD_SERVERLESS_LOGS_ENABLED should be true'); + assert.equal(vars.DD_SERVICE, serviceName, 'DD_SERVICE should carry the run-id service name'); + assert.equal(vars.DD_ENV, 'e2e', 'DD_ENV should be e2e'); + assert.equal(vars.DD_VERSION, '1.0.0', 'DD_VERSION should be 1.0.0'); + + // Tags: plugin marker (proof the plugin tagged the function) + freshness tag + // (set atomically at creation for the sweeper). Service/env/version identity is + // carried by the DD_* env vars above and on the ingested telemetry. + const tags = await getTags(config.FunctionArn, region); + assert.match(tags[TAG_PLUGIN] ?? '', /^v\d+\.\d+\.\d+/, 'dd_sls_plugin tag should be v'); + assert.ok(tags[FRESHNESS_TAG_KEY], `missing freshness tag ${FRESHNESS_TAG_KEY}`); + + // eslint-disable-next-line no-console + console.log('All instrumented checks passed.'); +}; + +// After `sls remove` the whole stack is torn down -- the function itself is gone, +// which is the clean end-state for this mechanism (no per-resource un-instrument). +// Assert absence explicitly. 
+export const verifyUninstrumented = async (serviceName: string, region: string): Promise => { + const fnName = functionName(serviceName); + // eslint-disable-next-line no-console + console.log(`Verifying clean (removed) state of "${fnName}"...`); + const result = await execPromise( + `aws lambda get-function-configuration --function-name "${fnName}" --region "${region}" --output json`, + ); + assert.notEqual(result.exitCode, 0, 'function still exists after remove'); + assert.match( + `${result.stdout} ${result.stderr}`, + /ResourceNotFoundException|Function not found/, + `expected ResourceNotFoundException, got: ${result.stderr || result.stdout}`, + ); + + // eslint-disable-next-line no-console + console.log('Clean-state check passed (function and its DD config are gone).'); +}; diff --git a/e2e/helpers/naming.ts b/e2e/helpers/naming.ts new file mode 100644 index 00000000..40dc2426 --- /dev/null +++ b/e2e/helpers/naming.ts @@ -0,0 +1,20 @@ +import crypto from 'node:crypto'; + +// Resource-hygiene convention shared across the e2e suites (see spec "Resource +// Hygiene"). The name prefix is the identity + blast-radius guard the sweeper +// keys on; the freshness tag lets it age resources out safely. + +export const TOOL = 'slsplugin'; +export const PLATFORM = 'lambda'; + +// `one` = team marker (`dd-` implied). Prefix is set atomically at creation. +// Lambda function names (max 64) end up as `<service>-<stage>-<function>`; with an +// 8-char run id this stays well under the limit. +export const namePrefix = (runId: string): string => `one-e2e-${TOOL}-${PLATFORM}-${runId}`; + +export const newRunId = (): string => crypto.randomBytes(4).toString('hex'); + +// Freshness tag value. Native creation time isn't usable cross-cloud, so we stamp +// it ourselves at create time. Key is `one_e2e_created`. 
+export const FRESHNESS_TAG_KEY = 'one_e2e_created';
+export const freshnessTimestamp = (): string => `${Math.floor(Date.now() / 1000)}`;
diff --git a/e2e/lambda.test.ts b/e2e/lambda.test.ts
new file mode 100644
index 00000000..564be7c1
--- /dev/null
+++ b/e2e/lambda.test.ts
@@ -0,0 +1,140 @@
+import assert from 'node:assert/strict';
+import os from 'node:os';
+import path from 'node:path';
+import {fileURLToPath} from 'node:url';
+
+import {afterAll, beforeAll, describe, it} from 'vitest';
+
+import {execPromise, execPromiseWithRetries} from './helpers/exec';
+import {checkTelemetryFlowing} from './helpers/lambda-telemetry-checker';
+import {functionName, functionSnapshot, verifyInstrumented, verifyUninstrumented, type FunctionSnapshot} from './helpers/lambda-verifier';
+import {freshnessTimestamp, namePrefix, newRunId} from './helpers/naming';
+
+// Full lifecycle for the serverless-plugin-datadog AWS Lambda instrumentation:
+//
+// sls deploy (APPLY: provision + instrument) -> verify CONFIG
+// -> invoke (trigger) -> verify TELEMETRY flows
+// -> sls deploy again -> assert IDEMPOTENT (no diff)
+// -> sls remove (REMOVE) -> verify CLEAN end-state
+// -> teardown (afterAll, always)
+//
+// For this tool the plugin runs as part of `sls deploy`, so provisioning the
+// uninstrumented workload and APPLY coincide -- there is no separately-deployed
+// uninstrumented state to instrument later. REMOVE tears down the whole stack,
+// so the clean end-state is the function (and all its DD config) being gone.
+
+const e2eDir = path.dirname(fileURLToPath(import.meta.url));
+const fixtureDir = path.join(e2eDir, 'fixtures', 'lambda-node');
+
+const DEPLOY_TIMEOUT_MS = 900_000;
+const TELEMETRY_TIMEOUT_MS = 600_000;
+const ENV_VERSION = '1.0.0';
+const ENV_NAME = 'e2e';
+
+const describeOrSkip = process.env.SKIP_LAMBDA_TESTS === 'true' ? describe.skip : describe;
+
+describeOrSkip('serverless-plugin-datadog lambda e2e', () => {
+  const region = process.env.AWS_REGION ?? 'us-east-1';
+  const runId = newRunId();
+  const serviceName = namePrefix(runId);
+  const apiKey = process.env.DATADOG_API_KEY ?? process.env.DD_API_KEY;
+  const appKey = process.env.DATADOG_APP_KEY ?? process.env.DD_APP_KEY;
+  const site = process.env.DATADOG_SITE ?? process.env.DD_SITE ?? 'datadoghq.com';
+
+  // Injected into `sls deploy` so serverless.yml stays static. Carries the unique
+  // name + freshness stamp (set atomically at creation) and the DD wiring inputs.
+  const deployEnv: Record<string, string | undefined> = {
+    E2E_SERVICE_NAME: serviceName,
+    E2E_CREATED_TS: freshnessTimestamp(),
+    AWS_REGION: region,
+    DD_API_KEY: apiKey,
+    DD_SITE: site,
+  };
+  const slsOptions = {env: deployEnv, cwd: fixtureDir};
+
+  const deploy = () =>
+    execPromiseWithRetries('npx --no-install serverless deploy --stage e2e --conceal', slsOptions, {
+      maxAttempts: 2,
+      delaySeconds: 20,
+    });
+
+  let firstSnapshot: FunctionSnapshot;
+
+  beforeAll(() => {
+    assert.ok(apiKey, 'DATADOG_API_KEY (or DD_API_KEY) must be set: used to wire the extension and authenticate the API client');
+    assert.ok(appKey, 'DATADOG_APP_KEY (or DD_APP_KEY) must be set: used to poll spans/logs from the Datadog API');
+    // eslint-disable-next-line no-console
+    console.log(`Run id ${runId} -> service "${serviceName}" in ${region} (site ${site})`);
+  });
+
+  afterAll(async () => {
+    // Teardown always runs, even if a test above failed mid-lifecycle.
+    const result = await execPromise('npx --no-install serverless remove --stage e2e', slsOptions);
+    if (result.exitCode !== 0) {
+      // eslint-disable-next-line no-console
+      console.warn(`Teardown remove returned ${result.exitCode} (ok if already removed): ${result.stderr}`);
+    }
+  });
+
+  it(
+    'deploys and instruments the function',
+    async () => {
+      const result = await deploy();
+      assert.equal(result.exitCode, 0, `sls deploy failed: ${result.stderr || result.stdout}`);
+
+      await verifyInstrumented(serviceName, region);
+      firstSnapshot = await functionSnapshot(functionName(serviceName), region);
+    },
+    DEPLOY_TIMEOUT_MS,
+  );
+
+  it(
+    'flows traces and logs after invocation',
+    async () => {
+      const outFile = path.join(os.tmpdir(), `${serviceName}-invoke.json`);
+      // A few invocations to give the extension something to flush promptly.
+      for (let i = 0; i < 3; i++) {
+        const result = await execPromiseWithRetries(
+          `aws lambda invoke --function-name "${functionName(serviceName)}" --region "${region}"` +
+            ` --payload '{}' --cli-binary-format raw-in-base64-out --output json "${outFile}"`,
+        );
+        assert.equal(result.exitCode, 0, `lambda invoke failed: ${result.stderr}`);
+        const meta = JSON.parse(result.stdout) as {StatusCode?: number; FunctionError?: string};
+        assert.equal(meta.StatusCode, 200, `unexpected invoke status: ${result.stdout}`);
+        assert.ok(!meta.FunctionError, `invocation errored: ${meta.FunctionError}`);
+      }
+
+      await checkTelemetryFlowing({serviceName, env: ENV_NAME, version: ENV_VERSION});
+    },
+    TELEMETRY_TIMEOUT_MS,
+  );
+
+  it(
+    're-applies idempotently (no diff, no duplicate)',
+    async () => {
+      const result = await deploy();
+      assert.equal(result.exitCode, 0, `re-deploy failed: ${result.stderr || result.stdout}`);
+
+      // Still instrumented, still no double-wrap / duplicate layers...
+      await verifyInstrumented(serviceName, region);
+      // ...and byte-for-byte the same instrumentation as the first apply.
+      const secondSnapshot = await functionSnapshot(functionName(serviceName), region);
+      assert.deepEqual(secondSnapshot, firstSnapshot, 're-apply changed the function config');
+    },
+    DEPLOY_TIMEOUT_MS,
+  );
+
+  it(
+    'removes cleanly with no residue',
+    async () => {
+      const result = await execPromiseWithRetries('npx --no-install serverless remove --stage e2e', slsOptions, {
+        maxAttempts: 2,
+        delaySeconds: 20,
+      });
+      assert.equal(result.exitCode, 0, `sls remove failed: ${result.stderr || result.stdout}`);
+
+      await verifyUninstrumented(serviceName, region);
+    },
+    DEPLOY_TIMEOUT_MS,
+  );
+});
diff --git a/e2e/load-env.ts b/e2e/load-env.ts
new file mode 100644
index 00000000..14185af5
--- /dev/null
+++ b/e2e/load-env.ts
@@ -0,0 +1,28 @@
+import fs from 'node:fs';
+import path from 'node:path';
+import {fileURLToPath} from 'node:url';
+
+// Local convenience: load e2e/.env.local (gitignored) into process.env without a
+// dependency. Real environment variables always win, so this is a no-op in CI.
+const envPath = path.join(path.dirname(fileURLToPath(import.meta.url)), '.env.local');
+
+if (fs.existsSync(envPath)) {
+  for (const line of fs.readFileSync(envPath, 'utf-8').split('\n')) {
+    const trimmed = line.trim();
+    if (!trimmed || trimmed.startsWith('#')) {
+      continue;
+    }
+    const idx = trimmed.indexOf('=');
+    if (idx === -1) {
+      continue;
+    }
+    const key = trimmed.slice(0, idx).trim();
+    let value = trimmed.slice(idx + 1).trim();
+    if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
+      value = value.slice(1, -1);
+    }
+    if (!(key in process.env)) {
+      process.env[key] = value;
+    }
+  }
+}
diff --git a/e2e/package.json b/e2e/package.json
new file mode 100644
index 00000000..68a89623
--- /dev/null
+++ b/e2e/package.json
@@ -0,0 +1,19 @@
+{
+  "name": "serverless-plugin-datadog-e2e",
+  "version": "0.0.0",
+  "private": true,
+  "type": "module",
+  "description": "End-to-end suite for the serverless-plugin-datadog AWS Lambda instrumentation.",
+  "scripts": {
+    "setup": "bash setup.sh",
+    "pretest": "npm run setup",
+    "test": "vitest run",
+    "typecheck": "tsc --noEmit"
+  },
+  "devDependencies": {
+    "@datadog/datadog-api-client": "^1.40.0",
+    "@types/node": "^20.19.0",
+    "typescript": "^5.9.3",
+    "vitest": "^2.1.9"
+  }
+}
diff --git a/e2e/setup.sh b/e2e/setup.sh
new file mode 100644
index 00000000..bd8191ca
--- /dev/null
+++ b/e2e/setup.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Builds the plugin, packs it into a tarball, and installs it into the workload
+# fixture alongside the Serverless Framework.
+#
+# Why a tarball and not `file:../../..`: npm's file: protocol whole-dir-links the
+# target, and the repo root contains this fixture -- which would link back to the
+# repo, recursing forever. A packed tarball respects .npmignore (dist only) and
+# extracts cleanly, the same approach the datadog-ci e2e suite uses for artifacts.
+set -euo pipefail
+
+cd "$(dirname "$0")"
+E2E_DIR="$PWD"
+
+echo "==> Building plugin"
+(cd .. && COREPACK_ENABLE_DOWNLOAD_PROMPT=0 yarn build)
+
+echo "==> Packing plugin"
+rm -f "$E2E_DIR"/serverless-plugin-datadog-*.tgz
+TARBALL_NAME=$(cd .. && npm pack --silent --pack-destination "$E2E_DIR")
+TARBALL="$E2E_DIR/$TARBALL_NAME"
+echo " packed $TARBALL_NAME"
+
+echo "==> Installing workload fixture"
+cd fixtures/lambda-node
+npm install --no-audit --no-fund
+# --no-save so the committed fixture package.json stays free of a local tarball path.
+npm install --no-audit --no-fund --no-save "$TARBALL"
+
+echo "==> Setup complete"
diff --git a/e2e/tsconfig.json b/e2e/tsconfig.json
new file mode 100644
index 00000000..f7650ff9
--- /dev/null
+++ b/e2e/tsconfig.json
@@ -0,0 +1,16 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ESNext",
+    "moduleResolution": "Bundler",
+    "lib": ["ES2022"],
+    "resolveJsonModule": true,
+    "esModuleInterop": true,
+    "strict": true,
+    "skipLibCheck": true,
+    "noEmit": true,
+    "types": ["node"]
+  },
+  "include": ["**/*.ts"],
+  "exclude": ["node_modules", "fixtures"]
+}
diff --git a/e2e/vitest.config.ts b/e2e/vitest.config.ts
new file mode 100644
index 00000000..c6e7a56e
--- /dev/null
+++ b/e2e/vitest.config.ts
@@ -0,0 +1,15 @@
+import {defineConfig} from 'vitest/config';
+
+export default defineConfig({
+  test: {
+    include: ['*.test.ts'],
+    // Cloud round-trips are slow; defaults are overridden per test/hook too.
+    testTimeout: 600_000,
+    hookTimeout: 900_000,
+    // The lifecycle tests share deployed state and must run in declared order.
+    fileParallelism: false,
+    pool: 'forks',
+    // Loads e2e/.env.local for local runs (no-op in CI, real env wins).
+    setupFiles: ['./load-env.ts'],
+  },
+});