Skip to content

Commit b62ad1e

Browse files
committed
feat(otel): Add X-Ray e2e integration tests for span validation
1 parent 78987e9 commit b62ad1e

9 files changed

Lines changed: 645 additions & 1 deletion

File tree

.github/workflows/delete-stack.yml

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
name: Delete CFN Stack

# Manually triggered workflow that deletes the CloudFormation stack named
# JavaSDKCloudBasedIntegrationTestStack-Java<version>Runtime, first recovering
# it from a failed update / failed rollback / failed delete state if needed.
on:
  workflow_dispatch:
    inputs:
      java_version:
        description: 'Java version for the stack to delete (17, 21, or 25)'
        required: true
        default: '17'
        type: choice
        options:
          - '17'
          - '21'
          - '25'

permissions:
  id-token: write
  contents: read

jobs:
  delete-stack:
    env:
      AWS_REGION: us-west-2
    runs-on: ubuntu-latest
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e7f100cf4c008499ea8adda475de1042d6975c7b # v6.2.0
        with:
          role-to-assume: "${{ secrets.ACTIONS_INTEGRATION_ROLE_NAME }}"
          role-session-name: java-language-sdk-delete-stack
          aws-region: ${{ env.AWS_REGION }}
      - name: Delete CloudFormation stack
        env:
          # Passed through the environment instead of being expanded into the
          # script text, so the value is never interpreted as shell code.
          JAVA_VERSION: ${{ inputs.java_version }}
        run: |
          STACK_NAME="JavaSDKCloudBasedIntegrationTestStack-Java${JAVA_VERSION}Runtime"

          # Prints the current stack status, or NOT_FOUND when describe-stacks fails.
          stack_status() {
            aws cloudformation describe-stacks --stack-name "${STACK_NAME}" \
              --query 'Stacks[0].StackStatus' --output text 2>/dev/null || echo "NOT_FOUND"
          }

          # Prints the logical IDs of the stack resources whose status equals $1.
          resources_with_status() {
            aws cloudformation describe-stack-resources --stack-name "${STACK_NAME}" \
              --query "StackResources[?ResourceStatus=='$1'].LogicalResourceId" --output text
          }

          echo "Checking stack status..."
          STATUS=$(stack_status)
          echo "Stack status: ${STATUS}"

          case "${STATUS}" in
            NOT_FOUND)
              echo "Stack does not exist, nothing to delete"
              exit 0
              ;;
            UPDATE_ROLLBACK_FAILED)
              # A failed rollback has to be resumed before the stack is stable again.
              echo "Stack is in UPDATE_ROLLBACK_FAILED state, continuing rollback..."
              FAILED_RESOURCES=$(resources_with_status UPDATE_FAILED)
              echo "Skipping failed resources: ${FAILED_RESOURCES}"
              read -r -a SKIP_ARGS <<< "${FAILED_RESOURCES}"
              # --resources-to-skip needs at least one value; omit the flag when
              # no resource is in UPDATE_FAILED.
              if [ "${#SKIP_ARGS[@]}" -gt 0 ]; then
                aws cloudformation continue-update-rollback --stack-name "${STACK_NAME}" \
                  --resources-to-skip "${SKIP_ARGS[@]}"
              else
                aws cloudformation continue-update-rollback --stack-name "${STACK_NAME}"
              fi
              ;;
            UPDATE_FAILED)
              # continue-update-rollback is only accepted in UPDATE_ROLLBACK_FAILED;
              # a stack left in UPDATE_FAILED is rolled back with rollback-stack.
              echo "Stack is in UPDATE_FAILED state, rolling back..."
              aws cloudformation rollback-stack --stack-name "${STACK_NAME}"
              ;;
          esac

          if [ "${STATUS}" = "UPDATE_ROLLBACK_FAILED" ] || [ "${STATUS}" = "UPDATE_FAILED" ]; then
            echo "Waiting for rollback to complete..."
            aws cloudformation wait stack-rollback-complete --stack-name "${STACK_NAME}"
            echo "Rollback complete, proceeding to delete..."
          fi

          # Refresh status after potential rollback
          STATUS=$(stack_status)
          echo "Current stack status: ${STATUS}"

          if [ "${STATUS}" = "NOT_FOUND" ]; then
            echo "Stack no longer exists"
            exit 0
          fi

          # After a failed delete, retry while retaining the resources that could
          # not be deleted; otherwise issue a plain delete.
          RETAIN_ARGS=()
          if [ "${STATUS}" = "DELETE_FAILED" ]; then
            echo "Stack is in DELETE_FAILED state, retrying delete with --retain-resources..."
            FAILED_RESOURCES=$(resources_with_status DELETE_FAILED)
            echo "Retaining failed resources: ${FAILED_RESOURCES}"
            read -r -a RETAIN_ARGS <<< "${FAILED_RESOURCES}"
          fi

          # --retain-resources needs at least one value; fall back to a plain
          # delete when there is nothing to retain.
          if [ "${#RETAIN_ARGS[@]}" -gt 0 ]; then
            aws cloudformation delete-stack --stack-name "${STACK_NAME}" \
              --retain-resources "${RETAIN_ARGS[@]}"
          else
            echo "Deleting stack: ${STACK_NAME}"
            aws cloudformation delete-stack --stack-name "${STACK_NAME}"
          fi

          echo "Waiting for stack deletion to complete..."
          aws cloudformation wait stack-delete-complete --stack-name "${STACK_NAME}"
          echo "Stack ${STACK_NAME} deleted successfully"

.github/workflows/e2e-tests.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,45 @@ jobs:
7171
sam build --debug --parameter-overrides \
7272
'ParameterKey=Architecture,ParameterValue=x86_64 ParameterKey=JavaVersion,ParameterValue=java${{ matrix.java }} ParameterKey=RoleArn,ParameterValue=${{ secrets.DURABLE_INTEGRATION_TEST_ROLE_ARN }}'
7373
working-directory: ./examples
74+
- name: Clean up stuck stack if needed
  # Recovers the integration test stack from UPDATE_FAILED,
  # UPDATE_ROLLBACK_FAILED or DELETE_FAILED by rolling back and/or deleting it.
  # Any other status (including a missing stack) is left untouched.
  run: |
    STACK_NAME="JavaSDKCloudBasedIntegrationTestStack-Java${{ matrix.java }}Runtime"
    STATUS=$(aws cloudformation describe-stacks --stack-name "${STACK_NAME}" --query 'Stacks[0].StackStatus' --output text 2>/dev/null || echo "NOT_FOUND")
    echo "Stack status: ${STATUS}"

    if [[ "${STATUS}" == "UPDATE_FAILED" || "${STATUS}" == "UPDATE_ROLLBACK_FAILED" ]]; then
      echo "Stack is in ${STATUS} state, attempting rollback..."
      if [ "${STATUS}" = "UPDATE_FAILED" ]; then
        # continue-update-rollback is only accepted in UPDATE_ROLLBACK_FAILED;
        # a stack left in UPDATE_FAILED is rolled back with rollback-stack.
        aws cloudformation rollback-stack --stack-name "${STACK_NAME}"
      else
        # Get resources that failed to update
        FAILED_RESOURCES=$(aws cloudformation describe-stack-resources --stack-name "${STACK_NAME}" \
          --query "StackResources[?ResourceStatus=='UPDATE_FAILED'].LogicalResourceId" --output text)
        echo "Failed resources: ${FAILED_RESOURCES}"
        read -r -a SKIP_ARGS <<< "${FAILED_RESOURCES}"
        # --resources-to-skip needs at least one value; omit it when empty.
        if [ "${#SKIP_ARGS[@]}" -gt 0 ]; then
          aws cloudformation continue-update-rollback --stack-name "${STACK_NAME}" \
            --resources-to-skip "${SKIP_ARGS[@]}"
        else
          aws cloudformation continue-update-rollback --stack-name "${STACK_NAME}"
        fi
      fi
      echo "Waiting for rollback to complete..."
      aws cloudformation wait stack-rollback-complete --stack-name "${STACK_NAME}"
      echo "Rollback complete, deleting stack..."
      aws cloudformation delete-stack --stack-name "${STACK_NAME}"
      aws cloudformation wait stack-delete-complete --stack-name "${STACK_NAME}"
      echo "Stack deleted, SAM will create fresh"
    elif [ "${STATUS}" = "DELETE_FAILED" ]; then
      echo "Stack is in DELETE_FAILED state, retrying with retain..."
      FAILED_RESOURCES=$(aws cloudformation describe-stack-resources --stack-name "${STACK_NAME}" \
        --query "StackResources[?ResourceStatus=='DELETE_FAILED'].LogicalResourceId" --output text)
      read -r -a RETAIN_ARGS <<< "${FAILED_RESOURCES}"
      # --retain-resources needs at least one value; fall back to a plain
      # delete when no resource is reported as DELETE_FAILED.
      if [ "${#RETAIN_ARGS[@]}" -gt 0 ]; then
        aws cloudformation delete-stack --stack-name "${STACK_NAME}" \
          --retain-resources "${RETAIN_ARGS[@]}"
      else
        aws cloudformation delete-stack --stack-name "${STACK_NAME}"
      fi
      aws cloudformation wait stack-delete-complete --stack-name "${STACK_NAME}"
      echo "Stack deleted, SAM will create fresh"
    fi
74113
- name: sam deploy
75114
run: |
76115
sam deploy --stack-name JavaSDKCloudBasedIntegrationTestStack-Java${{ matrix.java }}Runtime \

examples/pom.xml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,18 @@
5050
<version>1.63.0</version>
5151
</dependency>
5252

53+
<!-- OTLP gRPC exporter (required for X-Ray via ADOT collector) -->
54+
<dependency>
55+
<groupId>io.opentelemetry</groupId>
56+
<artifactId>opentelemetry-exporter-otlp</artifactId>
57+
<version>1.63.0</version>
58+
</dependency>
59+
<dependency>
60+
<groupId>io.grpc</groupId>
61+
<artifactId>grpc-netty-shaded</artifactId>
62+
<version>1.72.0</version>
63+
</dependency>
64+
5365
<!-- AWS Lambda Java Core -->
5466
<dependency>
5567
<groupId>com.amazonaws</groupId>
@@ -91,6 +103,11 @@
91103
<artifactId>sts</artifactId>
92104
<scope>test</scope>
93105
</dependency>
106+
<dependency>
107+
<groupId>software.amazon.awssdk</groupId>
108+
<artifactId>xray</artifactId>
109+
<scope>test</scope>
110+
</dependency>
94111
<dependency>
95112
<groupId>org.junit.jupiter</groupId>
96113
<artifactId>junit-jupiter</artifactId>
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
package software.amazon.lambda.durable.examples.otel;
4+
5+
import io.opentelemetry.exporter.otlp.trace.OtlpGrpcSpanExporter;
6+
import io.opentelemetry.sdk.trace.SdkTracerProvider;
7+
import io.opentelemetry.sdk.trace.export.SimpleSpanProcessor;
8+
import software.amazon.lambda.durable.DurableConfig;
9+
import software.amazon.lambda.durable.DurableContext;
10+
import software.amazon.lambda.durable.DurableHandler;
11+
import software.amazon.lambda.durable.examples.types.GreetingRequest;
12+
import software.amazon.lambda.durable.otel.OpenTelemetryDurablePlugin;
13+
14+
/**
15+
* OTel + X-Ray example: simple steps in a single invocation.
16+
*
17+
* <p>Exports spans via OTLP gRPC to the ADOT collector extension (Lambda layer), which forwards to X-Ray. Used by
18+
* {@code OtelXRayIntegrationTest} to verify spans appear correctly in X-Ray.
19+
*
20+
* <p>Expected trace structure in X-Ray:
21+
*
22+
* <pre>
23+
* durable.invocation
24+
* ├── durable.step:create-greeting
25+
* │ └── durable.step:create-greeting [attempt 1]
26+
* └── durable.step:transform
27+
* └── durable.step:transform [attempt 1]
28+
* </pre>
29+
*/
30+
public class OtelXRayStepExample extends DurableHandler<GreetingRequest, String> {
31+
32+
@Override
33+
protected DurableConfig createConfiguration() {
34+
var otlpExporter = OtlpGrpcSpanExporter.builder()
35+
.setEndpoint("http://localhost:4317")
36+
.build();
37+
38+
var otelPlugin = new OpenTelemetryDurablePlugin(
39+
SdkTracerProvider.builder().addSpanProcessor(SimpleSpanProcessor.create(otlpExporter)));
40+
41+
return DurableConfig.builder().withPlugins(otelPlugin).build();
42+
}
43+
44+
@Override
45+
public String handleRequest(GreetingRequest input, DurableContext context) {
46+
context.getLogger().info("Starting OTel X-Ray step example for {}", input.getName());
47+
48+
var greeting = context.step("create-greeting", String.class, stepCtx -> "Hello, " + input.getName());
49+
50+
var result = context.step("transform", String.class, stepCtx -> greeting.toUpperCase() + "!");
51+
52+
context.getLogger().info("OTel X-Ray step example complete: {}", result);
53+
return result;
54+
}
55+
}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
package software.amazon.lambda.durable.examples.otel;
4+
5+
import io.opentelemetry.exporter.otlp.trace.OtlpGrpcSpanExporter;
6+
import io.opentelemetry.sdk.trace.SdkTracerProvider;
7+
import io.opentelemetry.sdk.trace.export.SimpleSpanProcessor;
8+
import java.time.Duration;
9+
import software.amazon.lambda.durable.DurableConfig;
10+
import software.amazon.lambda.durable.DurableContext;
11+
import software.amazon.lambda.durable.DurableHandler;
12+
import software.amazon.lambda.durable.examples.types.GreetingRequest;
13+
import software.amazon.lambda.durable.otel.OpenTelemetryDurablePlugin;
14+
15+
/**
16+
* OTel + X-Ray example: step → wait → step pattern that forces multiple Lambda invocations.
17+
*
18+
* <p>This handler exercises the critical multi-invocation tracing scenario:
19+
*
20+
* <ol>
21+
* <li>Invocation 1: "before-wait" step completes → wait suspends execution
22+
* <li>Invocation 2: replays "before-wait" (no-op) → wait completes → "after-wait" step runs
23+
* </ol>
24+
*
25+
* <p>Exports spans via OTLP gRPC to the ADOT collector extension (Lambda layer), which forwards to X-Ray.
26+
*
27+
* <p>Used by {@code OtelXRayIntegrationTest} to verify that deterministic trace IDs correctly stitch spans from
28+
* multiple invocations into a single X-Ray trace.
29+
*
30+
* <p>Expected trace structure in X-Ray:
31+
*
32+
* <pre>
33+
* Trace (single trace ID across both invocations)
34+
* ├── durable.invocation (invocation 1)
35+
* │ ├── durable.step:before-wait
36+
* │ │ └── durable.step:before-wait [attempt 1]
37+
* │ └── durable.wait:pause (ended as PENDING)
38+
* └── durable.invocation (invocation 2)
39+
* ├── durable.wait:pause (completed)
40+
* └── durable.step:after-wait
41+
* └── durable.step:after-wait [attempt 1]
42+
* </pre>
43+
*
44+
* <p>All spans share the same deterministic trace ID derived from the execution ARN.
45+
*/
46+
public class OtelXRayWaitExample extends DurableHandler<GreetingRequest, String> {
47+
48+
@Override
49+
protected DurableConfig createConfiguration() {
50+
var otlpExporter = OtlpGrpcSpanExporter.builder()
51+
.setEndpoint("http://localhost:4317")
52+
.build();
53+
54+
var otelPlugin = new OpenTelemetryDurablePlugin(
55+
SdkTracerProvider.builder().addSpanProcessor(SimpleSpanProcessor.create(otlpExporter)));
56+
57+
return DurableConfig.builder().withPlugins(otelPlugin).build();
58+
}
59+
60+
@Override
61+
public String handleRequest(GreetingRequest input, DurableContext context) {
62+
context.getLogger().info("Starting OTel X-Ray wait example for {}", input.getName());
63+
64+
var before = context.step("before-wait", String.class, stepCtx -> "Prepared: " + input.getName());
65+
66+
// This wait forces Lambda to suspend and re-invoke after the duration
67+
context.wait("pause", Duration.ofSeconds(5));
68+
69+
var after = context.step("after-wait", String.class, stepCtx -> before + " | Resumed and completed");
70+
71+
context.getLogger().info("OTel X-Ray wait example complete: {}", after);
72+
return after;
73+
}
74+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Collector configuration with a single traces pipeline: OTLP in, awsxray out.
receivers:
  otlp:
    protocols:
      # OTLP over gRPC, bound to localhost only.
      grpc:
        endpoint: 'localhost:4317'
      # OTLP over HTTP, bound to localhost only.
      http:
        endpoint: 'localhost:4318'

exporters:
  # No options are set for this exporter.
  awsxray:

service:
  pipelines:
    traces:
      receivers:
        - otlp
      exporters:
        - awsxray

0 commit comments

Comments
 (0)