Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions .github/scripts/retry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env bash
#
# Retry a command with linear backoff, tolerating "already gone" outcomes.
#
# Usage: retry.sh <max_attempts> <base_delay_seconds> <description> -- <command> [args...]
#
# Arguments:
#   max_attempts        Non-negative integer; how many times the command may
#                       fail before giving up (0 behaves like 1: the command
#                       is still run once).
#   base_delay_seconds  Non-negative integer; the pause before retry N is
#                       base_delay_seconds * N.
#   description         Human-readable label used in the log annotations.
#   command [args...]   The command to run; the leading "--" is optional.
#
# Behaviour:
#   - Runs the command, streaming its output live.
#   - Exit 0 -> success.
#   - Output matches a "benign" regex -> the target is already gone; treated as
#     success (e.g. PR closed before deploy).
#   - Otherwise -> retry after (base_delay * attempt) seconds.
#   - After <max_attempts> failures -> propagate the last non-zero exit code so
#     the workflow step (and job) fails.
#   - Malformed arguments (non-numeric counts, no command) -> exit 2 without
#     running anything.
#
# This exists because AWS throttles destructive API calls aggressively
# (e.g. ApiGateway DeleteRestApi at ~1 req / 30s), which makes CloudFormation
# stack deletes return DELETE_FAILED with HTTP 429 "Too Many Requests".
# Re-running the destroy after a short pause lets the throttle window reset.
set -uo pipefail

max_attempts="${1:?max_attempts required}"; shift
base_delay="${1:?base_delay required}"; shift
desc="${1:?description required}"; shift
if [ "${1:-}" = "--" ]; then shift; fi

# Report a usage problem as a workflow error annotation and stop.
usage_error() {
  echo "::error::retry.sh: $1"
  exit 2
}

# Both numeric arguments feed `[ -ge ]` and `$(( ))` below. A non-numeric
# max_attempts would make the give-up test error out (status 2) on every
# iteration, so the loop would never terminate; reject it up front instead.
case "$max_attempts" in
  *[!0-9]*) usage_error "max_attempts must be a non-negative integer, got '${max_attempts}'" ;;
esac
case "$base_delay" in
  *[!0-9]*) usage_error "base_delay must be a non-negative integer, got '${base_delay}'" ;;
esac

# Force base 10 so values such as "08" are not parsed as (invalid) octal.
max_attempts=$(( 10#$max_attempts ))
base_delay=$(( 10#$base_delay ))

# Without a command, `"$@"` expands to nothing and the pipeline below would
# "succeed" without doing any work.
if [ "$#" -eq 0 ]; then
  usage_error "no command given for '${desc}'"
fi

# Outputs that mean "there is nothing left to delete" -> not a failure.
benign_re='does not exist|ResourceNotFoundException|RepositoryNotFoundException|NoSuchEntity|could not be found|No stack|Stack .* does not exist'

# One capture file for the whole run: tee truncates it on every attempt, and
# the EXIT trap removes it whichever `exit` below ends the script.
if ! tmp="$(mktemp)"; then
  echo "::error::${desc}: could not create a temporary file"
  exit 1
fi
trap 'rm -f "$tmp"' EXIT

attempt=1
while true; do
  # Stream the output live while keeping a copy to inspect afterwards.
  "$@" 2>&1 | tee "$tmp"
  # PIPESTATUS[0] is the command's own exit code, not tee's.
  status="${PIPESTATUS[0]}"

  if [ "$status" -eq 0 ]; then
    exit 0
  fi

  if grep -qiE "$benign_re" "$tmp"; then
    echo "::notice::${desc}: nothing to delete (already gone); treating as success"
    exit 0
  fi

  if [ "$attempt" -ge "$max_attempts" ]; then
    echo "::error::${desc} failed after ${attempt} attempts (exit ${status})"
    exit "$status"
  fi

  # Linear backoff: the pause grows by base_delay with every failed attempt.
  delay=$(( base_delay * attempt ))
  echo "::warning::${desc} attempt ${attempt}/${max_attempts} failed (exit ${status}); retrying in ${delay}s"
  sleep "${delay}"
  attempt=$(( attempt + 1 ))
done
36 changes: 30 additions & 6 deletions .github/workflows/cleanup-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ jobs:
environment: staging
env:
RESOURCE_PREFIX: pr-${{ github.event.pull_request.number }}-
# Make the AWS CLI / SDK back off and retry on throttling (HTTP 429)
# instead of failing immediately.
AWS_RETRY_MODE: adaptive
AWS_MAX_ATTEMPTS: "10"
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
Expand All @@ -25,55 +29,75 @@ jobs:
with:
role-to-assume: ${{ vars.AWS_ROLE_ARN }}
aws-region: ${{ vars.AWS_REGION }}

# From here on every step runs even if a previous one failed (so we never
# leak resources because of one early failure), but any failure still
# fails the overall job.
- name: Install CDK dependencies
if: ${{ !cancelled() }}
working-directory: integration-tests/iac
run: npm ci
- name: Destroy CDK stack
if: ${{ !cancelled() }}
working-directory: integration-tests/iac
run: npx cdk destroy ${RESOURCE_PREFIX}IntegrationTestsStack --force || true
run: |
bash "$GITHUB_WORKSPACE/.github/scripts/retry.sh" 6 30 "CDK destroy" -- \
npx cdk destroy "${RESOURCE_PREFIX}IntegrationTestsStack" --force
env:
DASH0_DEV_API_TOKEN: ${{ secrets.DASH0_DEV_API_TOKEN }}
TEST_POSTGRESS_PASSWORD: ${{ secrets.TEST_POSTGRESS_PASSWORD }}
TEST_MYSQL_PASSWORD: ${{ secrets.TEST_MYSQL_PASSWORD }}

- name: Install plugin dependencies
if: ${{ !cancelled() }}
working-directory: sls-plugin
run: npm ci
- name: Build plugin
if: ${{ !cancelled() }}
working-directory: sls-plugin
run: npm run build
- name: Resolve layer version
if: ${{ !cancelled() }}
run: |
DASH0_LAYER_VERSION=$(aws lambda list-layer-versions \
--layer-name "${RESOURCE_PREFIX}dash0-extension-node" \
--query 'LayerVersions[0].Version' --output text 2>/dev/null || echo "1")
echo "DASH0_LAYER_VERSION=$DASH0_LAYER_VERSION" >> "$GITHUB_ENV"
- name: Destroy SLS v3 stack
if: ${{ !cancelled() }}
working-directory: integration-tests/sls/v3
run: |
npm ci
npx serverless remove || true
bash "$GITHUB_WORKSPACE/.github/scripts/retry.sh" 5 20 "SLS v3 remove" -- \
npx serverless remove
env:
DASH0_DEV_API_TOKEN: ${{ secrets.DASH0_DEV_API_TOKEN }}
DASH0_LAYER_VERSION: ${{ env.DASH0_LAYER_VERSION }}
- name: Destroy SLS v4 stack
if: ${{ !cancelled() }}
working-directory: integration-tests/sls/v4
run: |
npm ci
npx serverless remove || true
bash "$GITHUB_WORKSPACE/.github/scripts/retry.sh" 5 20 "SLS v4 remove" -- \
npx serverless remove
env:
DASH0_DEV_API_TOKEN: ${{ secrets.DASH0_DEV_API_TOKEN }}
DASH0_LAYER_VERSION: ${{ env.DASH0_LAYER_VERSION }}
SERVERLESS_ACCESS_KEY: ${{ secrets.SERVERLESS_ACCESS_KEY }}
- name: Delete PR-specific Lambda layers
if: ${{ !cancelled() }}
run: |
for layer in ${RESOURCE_PREFIX}dash0-extension-python ${RESOURCE_PREFIX}dash0-extension-node ${RESOURCE_PREFIX}dash0-extension-java ${RESOURCE_PREFIX}dash0-extension-manual; do
versions=$(aws lambda list-layer-versions --layer-name $layer --query 'LayerVersions[].Version' --output text 2>/dev/null || true)
versions=$(aws lambda list-layer-versions --layer-name "$layer" --query 'LayerVersions[].Version' --output text 2>/dev/null || true)
for v in $versions; do
aws lambda delete-layer-version --layer-name $layer --version-number $v || true
bash "$GITHUB_WORKSPACE/.github/scripts/retry.sh" 5 10 "Delete layer $layer v$v" -- \
aws lambda delete-layer-version --layer-name "$layer" --version-number "$v"
done
done
- name: Delete PR-specific ECR repos
if: ${{ !cancelled() }}
run: |
for repo in ${RESOURCE_PREFIX}dash0-extension-python ${RESOURCE_PREFIX}dash0-extension-node ${RESOURCE_PREFIX}dash0-extension-java; do
aws ecr delete-repository --repository-name $repo --force || true
bash "$GITHUB_WORKSPACE/.github/scripts/retry.sh" 5 10 "Delete ECR repo $repo" -- \
aws ecr delete-repository --repository-name "$repo" --force
done
16 changes: 10 additions & 6 deletions integration-tests/tests/src/test-python-outofmemory.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,15 @@ import {
} from "./utils";


// Python 3.14+ reports out-of-memory as `Runtime.ExitError` instead of
// `Runtime.OutOfMemory`. Older runtimes keep the `Runtime.OutOfMemory` label.
const expectedOomExceptionType = (functionName: string): string => {
// Python 3.14's runtime is inconsistent about out-of-memory: it reports the
// exception as either `Runtime.OutOfMemory` or `Runtime.ExitError`. Older
// runtimes always use `Runtime.OutOfMemory`.
const acceptedOomExceptionTypes = (functionName: string): string[] => {
const match = functionName.match(/python3-(\d+)/);
const minor = match ? parseInt(match[1], 10) : 0;
return minor >= 14 ? 'Runtime.ExitError' : 'Runtime.OutOfMemory';
return minor >= 14
? ['Runtime.OutOfMemory', 'Runtime.ExitError']
: ['Runtime.OutOfMemory'];
};

const verifySuccessInvocation = async (functionName: string, invocationEnd: boolean, traced: boolean) => {
Expand All @@ -27,8 +30,9 @@ const verifySuccessInvocation = async (functionName: string, invocationEnd: bool
handlerScopeName: 'opentelemetry.instrumentation.aws_lambda',
});

const exceptionType = expectedOomExceptionType(functionName);
checkException(handlerSpan, exceptionType);
// checkException returns whichever accepted type the span actually reported,
// so the log check below stays consistent with this invocation.
const exceptionType = checkException(handlerSpan, acceptedOomExceptionTypes(functionName));

const logsToBeChecked: LogToCheck[] = [
{ message: 'START RequestId: ' },
Expand Down
9 changes: 6 additions & 3 deletions integration-tests/tests/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,8 @@ export const checkMetrics = async ({
}
}

export const checkException = (span: any, exception_type: string) => {
export const checkException = (span: any, exception_type: string | string[]): string => {
const acceptedTypes = Array.isArray(exception_type) ? exception_type : [exception_type];
const events = span.events;
expect(events.length).toEqual(1);
const exceptionEvent = events[0];
Expand All @@ -529,9 +530,11 @@ export const checkException = (span: any, exception_type: string) => {
for (const attr of eventAttributes) {
eventAttrMap[attr.key] = attr.value;
}
expect(eventAttrMap['exception.type'].stringValue).toEqual(exception_type);
const actualType = eventAttrMap['exception.type'].stringValue;
expect(acceptedTypes, `Unexpected exception.type: ${actualType}`).toContain(actualType);
expect(span.status.code).toEqual(2); // 2 = ERROR
expect(span.status.message).toEqual(exception_type);
expect(span.status.message).toEqual(actualType);
return actualType;
}

export const runAllTests = (scenario: string, runtimes: readonly string[], verifySuccessInvocation: Function) => {
Expand Down
Loading