diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..2645f55
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,89 @@
+# Integration-test pipeline: builds the Lambda code, starts LocalStack via
+# docker compose, deploys the sample solutions and runs the pytest suite.
+name: Deploy using AWS CLI
+
+on:
+  push:
+    paths-ignore:
+      - 'README.md'
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  integration-tests:
+    name: Run Integration Tests
+    runs-on: ubuntu-latest
+    # `make ready` polls LocalStack in a loop; cap the job so a container
+    # that never becomes ready cannot hold a runner for the 6-hour default.
+    timeout-minutes: 30
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - uses: actions/setup-java@v4
+        with:
+          distribution: 'temurin'
+          java-version: '21'
+
+      - name: Install dependencies
+        run: make install
+
+      # The auth token is handed to docker compose by the `start` target.
+      - name: Spin up LocalStack
+        run: |
+          make start
+          make ready
+        env:
+          LOCALSTACK_AUTH_TOKEN: ${{ secrets.LOCALSTACK_AUTH_TOKEN }}
+
+      - name: Setup the solutions
+        run: |
+          pip install awscli-local
+          make deploy
+
+      - name: Run Integration Tests
+        run: |
+          make test
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          AWS_REGION: us-east-1
+          AWS_ACCESS_KEY_ID: test
+          AWS_SECRET_ACCESS_KEY: test
+
+      # always(): capture the container logs even when an earlier step failed.
+      - name: Logs out of LocalStack
+        if: always()
+        run: |
+          make logs
+          cat logs.txt
+
+      # Runs on any failure and on every non-PR event; NOTE(review): `notify_when`
+      # is expected to limit the message actually sent to failed runs - confirm.
+      - name: Send a Slack notification
+        if: failure() || github.event_name != 'pull_request'
+        uses: ravsamhq/notify-slack-action@v2
+        with:
+          status: ${{ job.status }}
+          token: ${{ secrets.GITHUB_TOKEN }}
+          notification_title: "{workflow} has {status_message}"
+          message_format: "{emoji} *{workflow}* {status_message} in <{repo_url}|{repo}>"
+          footer: "Linked Repo <{repo_url}|{repo}> | <{run_url}|View Workflow run>"
+          notify_when: "failure"
+        env:
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
+
+      # On failure, save LocalStack's diagnose output for the upload step below.
+      - name: Generate a Diagnostic Report
+        if: failure()
+        run: |
+          curl -s localhost:4566/_localstack/diagnose | gzip -cf > diagnose.json.gz
+
+      - name: Upload the Diagnostic Report
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: diagnose.json.gz
+          path: ./diagnose.json.gz
diff --git a/.github/workflows/keepalive.yml b/.github/workflows/keepalive.yml
new file mode 100644
index 0000000..76b4b40
--- /dev/null
+++ b/.github/workflows/keepalive.yml
@@ -0,0 +1,19 @@
+# Scheduled workflow. NOTE(review): liskin/gh-workflow-keepalive is presumably
+# here to stop GitHub disabling the schedule on an idle repository - confirm.
+name: Keep Alive
+on:
+  schedule:
+    - cron: "0 0 * * *"  # every day at 00:00
+jobs:
+  main-job:
+    name: Main Job
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+  workflow-keepalive:
+    if: github.event_name == 'schedule'
+    runs-on: ubuntu-latest
+    permissions:
+      actions: write
+    steps:
+      - uses: liskin/gh-workflow-keepalive@v1
diff --git a/.gitignore b/.gitignore
index 878b046..7e5a8e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,28 +1,7 @@
-
-FIS-experiments/.idea/
-route53-failover/.idea/
-extension-outages/.idea/
-
-FIS-experiments/volume/
-route53-failover/volume/
-extension-outages/volume/
-
-FIS-experiments/lambda-functions/.idea/
-route53-failover/lambda-functions/.idea/
-extension-outages/lambda-functions/.idea/
-
-FIS-experiments/lambda-functions/.aws-sam/
-route53-failover/lambda-functions/.aws-sam/
-extension-outages/lambda-functions/.aws-sam/
-
-
-FIS-experiments/lambda-functions/product-lambda.iml
-route53-failover/lambda-functions/product-lambda.iml
-
-FIS-experiments/lambda-functions/lambda-functions.iml
-route53-failover/lambda-functions/lambda-functions.iml
-/extension-outages/terraform/.terraform
-/extension-outages/terraform/.terraform.lock.hcl
-/extension-outages/terraform/terraform.tfstate
-
 target/
+.idea/
+.vscode/
+.DS_Store
+volume/
+.pytest_cache/
+__pycache__/
diff --git a/FIS-experiments/README.md b/FIS-experiments/README.md
deleted file mode 100644
index a8ef832..0000000
--- a/FIS-experiments/README.md
+++ /dev/null
@@ -1,270 +0,0 @@
-
-# Product Managing Sample with API Gateway, Lambda, DynamoDB and FIS
-
-
-| Environment | |
-|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| __Services__ | Amazon API Gateway, Lambda, DynamoDB, SNS, SQS, FIS | -| __Categories__ | LocalStack Pro, Init Hooks, Java SDK | - - -## Description - -In this example of utilizing AWS Fault Injection Simulator (FIS) to cause controlled outages to a DynamoDB database we will -demonstrate testing software behavior and error handling. This kind of test helps to ensure that the software can handle -database downtime gracefully by implementing strategies such as queuing requests to prevent data loss. This proactive error -handling ensures that the system can maintain its operations despite partial failures. 
- -> [!WARNING] -> This sample uses the FIS action `localstack:generic:api-error` which is deprecated and marked for removal. -> Please migrate to the Chaos API which supports this capability and more. - -![fis-experiment-1](images/fis-experiment-1.png) - -## Prerequisites - -- [Maven 3.8.5](https://maven.apache.org/install.html) & [Java 17](https://www.java.com/en/download/help/download_options.html) -- [LocalStack](https://localstack.cloud/) -- [Docker](https://docs.docker.com/get-docker/) - for running LocalStack - -## Before starting - -Make sure to build the Lambda function by running the following command in the root folder - -``` -cd lambda-functions && mvn clean package shade:shade -``` - -### Starting LocalStack - -```bash -export LOCALSTACK_AUTH_TOKEN = -docker compose up -``` - -### Creating the resources - -The resources are created via `init hooks` at startup, using the `init-resources.sh` file. - -### Creating a Product - -Using cURL we can create a Product entity: - -```bash -curl --location 'http://12345.execute-api.localhost.localstack.cloud:4566/dev/productApi' \ ---header 'Content-Type: application/json' \ ---data '{ - "id": "prod-2004", - "name": "Ultimate Gadget", - "price": "49.99", - "description": "The Ultimate Gadget is the perfect tool for tech enthusiasts looking for the next level in gadgetry. Compact, powerful, and loaded with features." -} -' - -Product added/updated successfully. -``` - -### Creating an experiment - -There's a file containing the experiment called `experiment-ddb.json`. This has a JSON configuration that will be utilized -during the subsequent invocation of the `CreateExperimentTemplate` API. 
- -```bash - cat experiment-ddb.json -{ - "actions": { - "Test action 1": { - "actionId": "localstack:generic:api-error", - "parameters": { - "service": "dynamodb", - "api": "all", - "percentage": "100", - "exception": "DynamoDbException", - "errorCode": "500" - } - } - }, - "description": "Template for interfering with the DynamoDB service", - "stopConditions": [{ - "source": "none" - }], - "roleArn": "arn:aws:iam:000000000000:role/ExperimentRole" -} -``` - -Here we are targeting all APIs of the DynamoDb resource. Specific operations, such as `PutItem` or `GetItem` could also -be specified, but in this case, we just want to cut off the database completely. This configuration will result in a 100% failure rate -for all API calls, each accompanied by an HTTP 500 status code, with a DynamoDbException. - -```bash -awslocal fis create-experiment-template --cli-input-json file://experiment-ddb.json -{ - "experimentTemplate": { - "id": "895591e8-11e6-44c4-adc3-86592010562b", - "description": "Template for interfering with the DynamoDB service", - "actions": { - "Test action 1": { - "actionId": "localstack:generic:api-error", - "parameters": { - "service": "dynamodb", - "api": "all", - "percentage": "100", - "exception": "DynamoDbException", - "errorCode": "500" - } - } - }, - "stopConditions": [ - { - "source": "none" - } - ], - "creationTime": 1699308754.415716, - "lastUpdateTime": 1699308754.415716, - "roleArn": "arn:aws:iam:000000000000:role/ExperimentRole" - } -} -``` -We take note of the template ID for the next command: - -```bash - awslocal fis start-experiment --experiment-template-id 895591e8-11e6-44c4-adc3-86592010562b -{ - "experiment": { - "id": "1b1238fd-316d-4956-93e7-5ada677a6f69", - "experimentTemplateId": "895591e8-11e6-44c4-adc3-86592010562b", - "roleArn": "arn:aws:iam:000000000000:role/ExperimentRole", - "state": { - "status": "running" - }, - "actions": { - "Test action 1": { - "actionId": "localstack:generic:api-error", - "parameters": { - "service": 
"dynamodb", - "api": "all", - "percentage": "100", - "exception": "DynamoDbException", - "errorCode": "500" - } - } - }, - "stopConditions": [ - { - "source": "none" - } - ], - "creationTime": 1699308823.74327, - "startTime": 1699308823.74327 - } -} -``` - -Now that the experiment is started, the database will be inaccessible, meaning the user can't get and can't post any new -product. The API Gateway will return an Internal Server Error. This is obviously problematic, but luckily, this potential issue -has been caught early enough in the development phase, that the engineer can include proper error handling and a mechanism -that prevents data loss in case of an outage of the database. This of course is not limited to DynamoDB, an outage can be -simulated for any storage resource. - - -![fis-experiment-2](images/fis-experiment-2.png) - -The solution includes an SNS topic, an SQS queue and a Lambda function that will pick up the queued element and retry the -`PutItem` on the database. In case DynamoDB is still unavailable, the item will be re-queued. - -```bash -curl --location 'http://12345.execute-api.localhost.localstack.cloud:4566/dev/productApi' \ - --header 'Content-Type: application/json' \ - --data '{ - "id": "prod-1003", - "name": "Super Widget", - "price": "29.99", - "description": "A versatile widget that can be used for a variety of purposes. Durable, reliable, and affordable." - } - ' - -A DynamoDB error occurred. Message sent to queue.⏎ - -``` - -Now this element sits in the queue, until the outage is over. 
-We can stop the experiment by using the following command: - -```bash - awslocal fis stop-experiment --id 1b1238fd-316d-4956-93e7-5ada677a6f69 -{ - "experiment": { - "id": "1b1238fd-316d-4956-93e7-5ada677a6f69", - "experimentTemplateId": "895591e8-11e6-44c4-adc3-86592010562b", - "roleArn": "arn:aws:iam:000000000000:role/ExperimentRole", - "state": { - "status": "stopped" - }, - "actions": { - "Test action 1": { - "actionId": "localstack:generic:api-error", - "parameters": { - "service": "dynamodb", - "api": "all", - "percentage": "100", - "exception": "DynamoDbException", - "errorCode": "500" - }, - "startTime": 1699308823.750742, - "endTime": 1699309736.259625 - } - }, - "stopConditions": [ - { - "source": "none" - } - ], - "creationTime": 1699308823.74327, - "startTime": 1699308823.74327, - "endTime": 1699309736.259646 - } -} -``` - -The experiment ID comes from the prior used `start-experiment` command. -The experiment has been stopped, meaning that the Product that initially has not reached the database, has finally reached -the destination. We can verify that by scanning the database: - -```bash -awslocal dynamodb scan --table-name Products -{ - "Items": [ - { - "name": { - "S": "Super Widget" - }, - "description": { - "S": "A versatile widget that can be used for a variety of purposes. Durable, reliable, and affordable." - }, - "id": { - "S": "prod-1003" - }, - "price": { - "N": "29.99" - } - }, - { - "name": { - "S": "Ultimate Gadget" - }, - "description": { - "S": "The Ultimate Gadget is the perfect tool for tech enthusiasts looking for the next level in gadgetry. Compact, powerful, and loaded with features." 
- }, - "id": { - "S": "prod-2004" - }, - "price": { - "N": "49.99" - } - } - ], - "Count": 2, - "ScannedCount": 2, - "ConsumedCapacity": null -} -``` diff --git a/FIS-experiments/docker-compose.yml b/FIS-experiments/docker-compose.yml deleted file mode 100644 index cc3f858..0000000 --- a/FIS-experiments/docker-compose.yml +++ /dev/null @@ -1,32 +0,0 @@ -version: "3.9" - -services: - localstack: - networks: - - ls_network - container_name: localstack - image: localstack/localstack-pro:latest - ports: - - "127.0.0.1:4566:4566" # LocalStack Gateway - - "127.0.0.1:4510-4559:4510-4559" # external services port range - - "127.0.0.1:53:53" - - "127.0.0.1:53:53/udp" - environment: - - DEBUG=1 # enable more verbose logs - - DOCKER_HOST=unix:///var/run/docker.sock #unix socket to communicate with the docker daemon - - LOCALSTACK_HOST=localstack # where services are available from other containers - - ENFORCE_IAM=0 # enforce IAM policies - - LAMBDA_DOCKER_NETWORK=ls_network - - LOCALSTACK_AUTH_TOKEN=${LOCALSTACK_AUTH_TOKEN} - - LAMBDA_RUNTIME_ENVIRONMENT_TIMEOUT=60 - - PERSIST_ALL=false - - volumes: - - "./volume:/var/lib/localstack" - - "/var/run/docker.sock:/var/run/docker.sock" - - "./lambda-functions/target/product-lambda.jar:/etc/localstack/init/ready.d/target/product-lambda.jar" - - "./init-resources.sh:/etc/localstack/init/ready.d/init-resources.sh" - -networks: - ls_network: - name: ls_network diff --git a/FIS-experiments/experiment-ddb.json b/FIS-experiments/experiment-ddb.json deleted file mode 100644 index fccc57b..0000000 --- a/FIS-experiments/experiment-ddb.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "actions": { - "Test action 1": { - "actionId": "localstack:generic:api-error", - "parameters": { - "service": "dynamodb", - "api": "all", - "percentage": "100", - "exception": "DynamoDbException", - "errorCode": "500" - } - } - }, - "description": "Template for interfering with the DynamoDB service", - "stopConditions": [{ - "source": "none" - }], - "roleArn": 
"arn:aws:iam:000000000000:role/ExperimentRole" -} diff --git a/FIS-experiments/experiment-lambda.json b/FIS-experiments/experiment-lambda.json deleted file mode 100644 index 2d1ba55..0000000 --- a/FIS-experiments/experiment-lambda.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "actions": { - "Some test action": { - "actionId": "localstack:generic:api-error", - "parameters": { - "service": "lambda", - "operation": "Invoke", - "percentage": "100", - "exception": "Internal Server Error", - "errorCode": "500" - } - } - }, - "description": "Template for error return on Lambda invoke.", - "stopConditions": [{ - "source": "none" - }], - "roleArn": "arn:aws:iam:000000000000:role/ExperimentRole" -} diff --git a/FIS-experiments/images/dark-fis-after-sqs.png b/FIS-experiments/images/dark-fis-after-sqs.png deleted file mode 100644 index 3a7ce54..0000000 Binary files a/FIS-experiments/images/dark-fis-after-sqs.png and /dev/null differ diff --git a/FIS-experiments/images/dark-fis-before-sqs.png b/FIS-experiments/images/dark-fis-before-sqs.png deleted file mode 100644 index 984b23a..0000000 Binary files a/FIS-experiments/images/dark-fis-before-sqs.png and /dev/null differ diff --git a/FIS-experiments/images/fis-experiment-1.png b/FIS-experiments/images/fis-experiment-1.png deleted file mode 100644 index a2d832f..0000000 Binary files a/FIS-experiments/images/fis-experiment-1.png and /dev/null differ diff --git a/FIS-experiments/images/fis-experiment-2.png b/FIS-experiments/images/fis-experiment-2.png deleted file mode 100644 index e1b6650..0000000 Binary files a/FIS-experiments/images/fis-experiment-2.png and /dev/null differ diff --git a/FIS-experiments/init-resources.sh b/FIS-experiments/init-resources.sh deleted file mode 100755 index 40a8cb0..0000000 --- a/FIS-experiments/init-resources.sh +++ /dev/null @@ -1,144 +0,0 @@ -#!/bin/sh - -LAMBDAS_DIR=/etc/localstack/init/ready.d -if [[ ! 
-e $LAMBDAS_DIR ]]; then - # for local testing, running the script directly on the host (without init hooks) - LAMBDAS_DIR=./lambda-functions -fi - -# set region globally -export AWS_DEFAULT_REGION=us-east-1 - -# install `jq`, if not yet available -which jq || apt-get -y install jq - -# create table -echo "Create DynamoDB table..." -awslocal dynamodb create-table \ - --table-name Products \ - --attribute-definitions AttributeName=id,AttributeType=S \ - --key-schema AttributeName=id,KeyType=HASH \ - --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5 - - -# create Lambdas - -echo "Add Product Lambda..." -awslocal lambda create-function \ - --function-name add-product \ - --runtime java17 \ - --handler lambda.AddProduct::handleRequest \ - --memory-size 1024 \ - --timeout 45 \ - --zip-file fileb://$LAMBDAS_DIR/target/product-lambda.jar \ - --role arn:aws:iam::000000000000:role/productRole \ - --environment Variables={AWS_REGION=$AWS_DEFAULT_REGION} - - -echo "Get Product Lambda..." -awslocal lambda create-function \ - --function-name get-product \ - --runtime java17 \ - --handler lambda.GetProduct::handleRequest \ - --memory-size 1024 \ - --timeout 45 \ - --zip-file fileb://$LAMBDAS_DIR/target/product-lambda.jar \ - --role arn:aws:iam::000000000000:role/productRole \ - --environment Variables={AWS_REGION=$AWS_DEFAULT_REGION} - -export REST_API_ID=12345 - -# create rest api gateway -echo "Create Rest API..." -awslocal apigateway create-rest-api --name quote-api-gateway --tags '{"_custom_id_":"12345"}' - -# get parent id of resource -echo "Export Parent ID..." -export PARENT_ID=$(awslocal apigateway get-resources --rest-api-id $REST_API_ID | jq -r '.items[0].id') - -# get resource id -echo "Export Resource ID..." -export RESOURCE_ID=$(awslocal apigateway create-resource --rest-api-id $REST_API_ID --parent-id $PARENT_ID --path-part "productApi" | jq -r '.id') - -echo "RESOURCE ID: $RESOURCE_ID" - -echo "Put GET Method..." 
-awslocal apigateway put-method \ ---rest-api-id $REST_API_ID \ ---resource-id $RESOURCE_ID \ ---http-method GET \ ---request-parameters "method.request.path.productApi=true" \ ---authorization-type "NONE" - -echo "Put POST Method..." -awslocal apigateway put-method \ ---rest-api-id $REST_API_ID \ ---resource-id $RESOURCE_ID \ ---http-method POST \ ---request-parameters "method.request.path.productApi=true" \ ---authorization-type "NONE" - - -echo "Update GET Method..." -awslocal apigateway update-method \ - --rest-api-id $REST_API_ID \ - --resource-id $RESOURCE_ID \ - --http-method GET \ - --patch-operations "op=replace,path=/requestParameters/method.request.querystring.param,value=true" - - -echo "Put POST Method Integration..." -awslocal apigateway put-integration \ - --rest-api-id $REST_API_ID \ - --resource-id $RESOURCE_ID \ - --http-method POST \ - --type AWS_PROXY \ - --integration-http-method POST \ - --uri arn:aws:apigateway:$AWS_DEFAULT_REGION:lambda:path/2015-03-31/functions/arn:aws:lambda:$AWS_DEFAULT_REGION:000000000000:function:add-product/invocations \ - --passthrough-behavior WHEN_NO_MATCH - -echo "Put GET Method Integration..." -awslocal apigateway put-integration \ - --rest-api-id $REST_API_ID \ - --resource-id $RESOURCE_ID \ - --http-method GET \ - --type AWS_PROXY \ - --integration-http-method GET \ - --uri arn:aws:apigateway:$AWS_DEFAULT_REGION:lambda:path/2015-03-31/functions/arn:aws:lambda:$AWS_DEFAULT_REGION:000000000000:function:get-product/invocations \ - --passthrough-behavior WHEN_NO_MATCH - -echo "Create DEV Deployment..." 
-awslocal apigateway create-deployment \ - --rest-api-id $REST_API_ID \ - --stage-name dev - -awslocal sns create-topic --name ProductEventsTopic - -awslocal sqs create-queue --queue-name ProductEventsQueue - -awslocal sqs get-queue-attributes --queue-url http://localhost:4566/000000000000/ProductEventsQueue --attribute-names QueueArn - -awslocal sns subscribe \ - --topic-arn arn:aws:sns:$AWS_DEFAULT_REGION:000000000000:ProductEventsTopic \ - --protocol sqs \ - --notification-endpoint arn:aws:sqs:$AWS_DEFAULT_REGION:000000000000:ProductEventsQueue - -awslocal lambda create-function \ - --function-name process-product-events \ - --runtime java17 \ - --handler lambda.DynamoDBWriterLambda::handleRequest \ - --memory-size 1024 \ - --timeout 20 \ - --zip-file fileb://$LAMBDAS_DIR/target/product-lambda.jar \ - --role arn:aws:iam::000000000000:role/productRole - -awslocal lambda create-event-source-mapping \ - --function-name process-product-events \ - --batch-size 10 \ - --event-source-arn arn:aws:sqs:$AWS_DEFAULT_REGION:000000000000:ProductEventsQueue - -awslocal sqs set-queue-attributes \ - --queue-url http://localhost:4566/000000000000/ProductEventsQueue \ - --attributes VisibilityTimeout=10 - - diff --git a/FIS-experiments/lambda-functions/pom.xml b/FIS-experiments/lambda-functions/pom.xml deleted file mode 100644 index 420160f..0000000 --- a/FIS-experiments/lambda-functions/pom.xml +++ /dev/null @@ -1,128 +0,0 @@ - - - - 4.0.0 - - product-lambda - cloud.localstack - jar - 1.0-SNAPSHOT - - - 11 - 11 - false - - - - - software.amazon.awssdk - lambda - - - com.amazonaws - aws-lambda-java-core - 1.2.2 - - - software.amazon.awssdk - protocol-core - 2.20.69 - - - software.amazon.awssdk - s3 - - - software.amazon.awssdk - dynamodb - 2.20.68 - - - com.amazonaws - aws-lambda-java-events - 3.11.3 - - - - com.fasterxml.jackson.core - jackson-core - 2.13.3 - - - - - com.fasterxml.jackson.core - jackson-databind - 2.13.3 - - - - - com.fasterxml.jackson.core - jackson-annotations 
- 2.15.1 - - - software.amazon.awssdk - sns - 2.20.69 - - - - org.slf4j - slf4j-api - 2.0.7 - - - - ch.qos.logback - logback-classic - 1.4.7 - - - - - - - - software.amazon.awssdk - bom - 2.20.47 - pom - import - - - - - - product-lambda - - - src/main/resources - true - - - - - - org.apache.maven.plugins - maven-shade-plugin - 2.4.3 - - false - - - - package - - shade - - - - - - - \ No newline at end of file diff --git a/FIS-experiments/lambda-functions/src/main/java/lambda/ProductApi.java b/FIS-experiments/lambda-functions/src/main/java/lambda/ProductApi.java deleted file mode 100644 index eeafcae..0000000 --- a/FIS-experiments/lambda-functions/src/main/java/lambda/ProductApi.java +++ /dev/null @@ -1,47 +0,0 @@ -package lambda; - -import com.fasterxml.jackson.databind.ObjectMapper; -import java.net.URI; - -import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; -import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; -import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration; -import software.amazon.awssdk.core.retry.RetryPolicy; -import software.amazon.awssdk.regions.Region; -import software.amazon.awssdk.services.dynamodb.DynamoDbClient; -import software.amazon.awssdk.services.sns.SnsClient; - -public class ProductApi { - - protected static final String LOCALSTACK_HOSTNAME = System.getenv("LOCALSTACK_HOSTNAME"); - protected static final String AWS_REGION = System.getenv("AWS_REGION"); - protected static final String topicArn = "arn:aws:sns:us-east-1:000000000000:ProductEventsTopic"; - protected ObjectMapper objectMapper = new ObjectMapper(); - - // Define a custom retry policy - // Set maximum number of retries - RetryPolicy customRetryPolicy = RetryPolicy.builder() - .numRetries(3) - .build(); - - // Apply the custom retry policy to ClientOverrideConfiguration - ClientOverrideConfiguration clientOverrideConfig = ClientOverrideConfiguration.builder() - .retryPolicy(customRetryPolicy) - .build(); - - protected 
SnsClient snsClient = SnsClient.builder() - .endpointOverride(URI.create(String.format("http://%s:4566", LOCALSTACK_HOSTNAME))) - .credentialsProvider( - StaticCredentialsProvider.create(AwsBasicCredentials.create("test", "test"))) - .region(Region.of(AWS_REGION)) - .build(); - - protected DynamoDbClient ddb = DynamoDbClient.builder() - .endpointOverride(URI.create(String.format("http://%s:4566", LOCALSTACK_HOSTNAME))) - .credentialsProvider( - StaticCredentialsProvider.create(AwsBasicCredentials.create("test", "test"))) - .region(Region.of(AWS_REGION)) - .endpointDiscoveryEnabled(true) - .overrideConfiguration(clientOverrideConfig) - .build(); -} diff --git a/FIS-experiments/lambda-functions/target/product-lambda.jar b/FIS-experiments/lambda-functions/target/product-lambda.jar deleted file mode 100644 index 158257c..0000000 Binary files a/FIS-experiments/lambda-functions/target/product-lambda.jar and /dev/null differ diff --git a/FIS-experiments/latency-experiment.json b/FIS-experiments/latency-experiment.json deleted file mode 100644 index f6f196a..0000000 --- a/FIS-experiments/latency-experiment.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "description": "template for testing delays in API calls", - "actions": { - "latency": { - "actionId": "localstack:generic:api-error", - "parameters": { - "latency": "4" - } - } - }, - "stopConditions": [ - { - "source": "none" - } - ], - "roleArn": "arn:aws:iam:000000000000:role/ExperimentRole" -} \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..9fefa59 --- /dev/null +++ b/Makefile @@ -0,0 +1,57 @@ +export AWS_ACCESS_KEY_ID ?= test +export AWS_SECRET_ACCESS_KEY ?= test +export AWS_DEFAULT_REGION=us-east-1 +SHELL := /bin/bash + +usage: ## Show this help in table format + @echo "| Target | Description |" + @echo "|------------------------|-------------------------------------------------------------------|" + @fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/:.*##\s*/##/g' 
| awk -F'##' '{ printf "| %-22s | %-65s |\n", $$1, $$2 }' + + +check: ## Check if all required prerequisites are installed + @command -v docker > /dev/null 2>&1 || { echo "Docker is not installed. Please install Docker and try again."; exit 1; } + @command -v localstack > /dev/null 2>&1 || { echo "LocalStack is not installed. Please install LocalStack and try again."; exit 1; } + @command -v terraform > /dev/null 2>&1 || { echo "Terraform is not installed. Please install Terraform and try again."; exit 1; } + @command -v mvn > /dev/null 2>&1 || { echo "Maven is not installed. Please install Maven and try again."; exit 1; } + @command -v java > /dev/null 2>&1 || { echo "Java is not installed. Please install Java and try again."; exit 1; } + @command -v aws > /dev/null 2>&1 || { echo "AWS CLI is not installed. Please install AWS CLI and try again."; exit 1; } + @command -v awslocal > /dev/null 2>&1 || { echo "awslocal is not installed. Please install awslocal and try again."; exit 1; } + @command -v python3 > /dev/null 2>&1 || { echo "Python 3 is not installed. Please install Python 3 and try again."; exit 1; } + @echo "All required prerequisites are available." + +install: ## Install all required dependencies + @echo "Installing all required dependencies..." + cd lambda-functions && mvn clean package shade:shade; + cd tests && pip install -r requirements-dev.txt; + @echo "All required dependencies installed successfully." + +test: ## Run all tests + @echo "Running all tests..." + pytest tests/ + @echo "All tests completed successfully." + +deploy: ## Deploy all solutions + @echo "Deploying all solutions..." + ./solutions/dynamodb-outage.sh + ./solutions/route53-failover.sh + @echo "All solutions deployed successfully." 
+
+start: ## Start localstack
+	LOCALSTACK_AUTH_TOKEN=$(LOCALSTACK_AUTH_TOKEN) docker compose up --build --detach --wait
+
+stop: ## Stop localstack
+	docker compose down
+
+logs: ## Show logs from LocalStack
+	docker compose logs > logs.txt
+
+# Polls /_localstack/init/ready every 2s, READY_ATTEMPTS times (default 150, ~5 min), then fails.
+ready: ## Wait until LocalStack is ready
+	@echo "Waiting for LocalStack to be ready..."
+	@for attempt in $$(seq 1 "$${READY_ATTEMPTS:-150}"); do \
+		[[ "$$(curl -s localhost:4566/_localstack/init/ready | jq -r .completed)" == "true" ]] && { echo "LocalStack is ready!"; exit 0; }; \
+		echo "LocalStack not ready yet, waiting..."; sleep 2; \
+	done; echo "LocalStack did not become ready in time." >&2; exit 1
+
+.PHONY: usage check install start ready deploy test logs stop
diff --git a/README.md b/README.md
index b9e3317..e69de29 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +0,0 @@
-# Chaos Engineering with LocalStack
-
-Chaos engineering with LocalStack presents a proactive approach to building resilient systems by introducing controlled disruptions.
-This versatile practice varies in its application; for software developers, it might mean application behavior and error handling, for architects, ensuring the robustness of system design, and for operations teams, examining the reliability of infrastructure provisioning.
-By integrating chaos experiments early in the development cycle, teams can uncover and address potential weaknesses, forging systems that withstand turbulent conditions.
-
-This repo contains various samples that demonstrate the capabilities of LocalStack Chaos Engineering tool suite.
-It suppliments the documentation available [here](https://docs.localstack.cloud/user-guide/chaos-engineering/).
diff --git a/chaos-api/README.md b/chaos-api/README.md deleted file mode 100644 index 33e44f6..0000000 --- a/chaos-api/README.md +++ /dev/null @@ -1,138 +0,0 @@ -# Chaos API - -| Environment | | -|------------------|-----------------------------------------| -| __Services__ | API Gateway, Lambda, DynamoDB, SNS, SQS | -| __Categories__ | LocalStack Pro, Init Hooks, Java SDK | - - -## Description - -In this sample, we use LocalStack Chaos API to cause controlled outages in the DynamoDB service to study the resiliency of the architecture and improve fault tolerance. -This kind of test helps to ensure that the software can handle database downtime gracefully by implementing strategies such as queuing requests to prevent data loss. -This proactive error handling ensures that the system can maintain its operations despite partial failures. - -![arch-1](images/arch-1.png) - - -## Prerequisites - -- [Maven 3.8.5](https://maven.apache.org/install.html) & [Java 17](https://www.java.com/en/download/help/download_options.html) -- [LocalStack](https://localstack.cloud/) -- [Docker](https://docs.docker.com/get-docker/) - for running LocalStack - - -## Before starting - -Make sure to build the Lambda function by running the following command in the root folder - -``` -cd lambda-functions && mvn clean package shade:shade -``` - - -### Starting LocalStack - -```bash -export LOCALSTACK_AUTH_TOKEN = -docker compose up -``` - -### Creating the resources - -The resources are created via `init hooks` at startup, using the `init-resources.sh` file. - -### Creating a Product - -Using cURL we can create a Product entity: - -```text -$ curl --location 'http://12345.execute-api.localhost.localstack.cloud:4566/dev/productApi' \ ---header 'Content-Type: application/json' \ ---data '{ - "id": "prod-2004", - "name": "Ultimate Gadget", - "price": "49.99", - "description": "The Ultimate Gadget is the perfect tool for tech enthusiasts looking for the next level in gadgetry. 
Compact, powerful, and loaded with features." -}' -⏎ -Product added/updated successfully. -``` - -### Creating an experiment - -The shell script [outage-dynamodb-start.sh](./outage-dynamodb-start.sh) configures the Chaos API to cause faults within DynamoDB. -The configuration targets all operations in the DynamoDB service. -If required, you may filter specific operations such as `PutItem` or `GetItem`, but in this case we just want to cut off the database service completely. -This configuration will result in a 100% failure rate for all API calls to DynamoDB, each accompanied by an HTTP 500 status code with a `DatacentreNotFound` error. - -When the script is run, the database becomes inaccessible not only for external clients but also for all services within LocalStack. -This means that service integrations can no longer retrieve or create new products. -API Gateway will return an Internal Server Error. -This is obviously problematic, but luckily, this potential issue has been caught early enough in the development phase, that the engineer can include proper error handling and a mechanism -that prevents data loss in case of an outage of the database. - -![arch-2.png](images/arch-2.png) - -At this point, we can try to make the architecture more resilient to such failures. -The solution includes an SNS topic, an SQS queue and a Lambda function that will pick up the queued element and retry the `PutItem` on the database. -In case DynamoDB is still unavailable, the item will be re-queued. - -```text -$ curl --location 'http://12345.execute-api.localhost.localstack.cloud:4566/dev/productApi' \ - --header 'Content-Type: application/json' \ - --data '{ - "id": "prod-1003", - "name": "Super Widget", - "price": "29.99", - "description": "A versatile widget that can be used for a variety of purposes. Durable, reliable, and affordable." - }' -⏎ -A DynamoDB error occurred. Message sent to queue. 
-``` - -Now this element sits in the queue, until the outage is over and the database is accessible again. - -The outage can be ended by running the shell script [outage-dynamodb-end.sh](./outage-dynamodb-end.sh). -Now, the Product element that initially has not reached the database, should reach its destination. -This can be verified by scanning the database: - -```text -$ awslocal dynamodb scan --table-name Products -⏎ -{ - "Items": [ - { - "name": { - "S": "Super Widget" - }, - "description": { - "S": "A versatile widget that can be used for a variety of purposes. Durable, reliable, and affordable." - }, - "id": { - "S": "prod-1003" - }, - "price": { - "N": "29.99" - } - }, - { - "name": { - "S": "Ultimate Gadget" - }, - "description": { - "S": "The Ultimate Gadget is the perfect tool for tech enthusiasts looking for the next level in gadgetry. Compact, powerful, and loaded with features." - }, - "id": { - "S": "prod-2004" - }, - "price": { - "N": "49.99" - } - } - ], - "Count": 2, - "ScannedCount": 2, - "ConsumedCapacity": null -} -``` diff --git a/chaos-api/docker-compose.yml b/chaos-api/docker-compose.yml deleted file mode 100644 index cc3f858..0000000 --- a/chaos-api/docker-compose.yml +++ /dev/null @@ -1,32 +0,0 @@ -version: "3.9" - -services: - localstack: - networks: - - ls_network - container_name: localstack - image: localstack/localstack-pro:latest - ports: - - "127.0.0.1:4566:4566" # LocalStack Gateway - - "127.0.0.1:4510-4559:4510-4559" # external services port range - - "127.0.0.1:53:53" - - "127.0.0.1:53:53/udp" - environment: - - DEBUG=1 # enable more verbose logs - - DOCKER_HOST=unix:///var/run/docker.sock #unix socket to communicate with the docker daemon - - LOCALSTACK_HOST=localstack # where services are available from other containers - - ENFORCE_IAM=0 # enforce IAM policies - - LAMBDA_DOCKER_NETWORK=ls_network - - LOCALSTACK_AUTH_TOKEN=${LOCALSTACK_AUTH_TOKEN} - - LAMBDA_RUNTIME_ENVIRONMENT_TIMEOUT=60 - - PERSIST_ALL=false - - volumes: - - 
"./volume:/var/lib/localstack" - - "/var/run/docker.sock:/var/run/docker.sock" - - "./lambda-functions/target/product-lambda.jar:/etc/localstack/init/ready.d/target/product-lambda.jar" - - "./init-resources.sh:/etc/localstack/init/ready.d/init-resources.sh" - -networks: - ls_network: - name: ls_network diff --git a/chaos-api/images/arch-1.png b/chaos-api/images/arch-1.png deleted file mode 100644 index 1833d94..0000000 Binary files a/chaos-api/images/arch-1.png and /dev/null differ diff --git a/chaos-api/images/arch-2.png b/chaos-api/images/arch-2.png deleted file mode 100644 index 46d4e4a..0000000 Binary files a/chaos-api/images/arch-2.png and /dev/null differ diff --git a/chaos-api/init-resources.sh b/chaos-api/init-resources.sh deleted file mode 100755 index 044807b..0000000 --- a/chaos-api/init-resources.sh +++ /dev/null @@ -1,144 +0,0 @@ -#!/bin/sh -xeu - -LAMBDAS_DIR=/etc/localstack/init/ready.d -if [[ ! -e $LAMBDAS_DIR ]]; then - # for local testing, running the script directly on the host (without init hooks) - LAMBDAS_DIR=./lambda-functions -fi - -# set region globally -export AWS_DEFAULT_REGION=us-east-1 - -# install `jq`, if not yet available -which jq || apt-get -y install jq - -# create table -echo "Create DynamoDB table..." -awslocal dynamodb create-table \ - --table-name Products \ - --attribute-definitions AttributeName=id,AttributeType=S \ - --key-schema AttributeName=id,KeyType=HASH \ - --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5 - - -# create Lambdas - -echo "Add Product Lambda..." -awslocal lambda create-function \ - --function-name add-product \ - --runtime java17 \ - --handler lambda.AddProduct::handleRequest \ - --memory-size 1024 \ - --timeout 45 \ - --zip-file fileb://$LAMBDAS_DIR/target/product-lambda.jar \ - --role arn:aws:iam::000000000000:role/productRole \ - --environment Variables={AWS_REGION=$AWS_DEFAULT_REGION} - - -echo "Get Product Lambda..." 
-awslocal lambda create-function \ - --function-name get-product \ - --runtime java17 \ - --handler lambda.GetProduct::handleRequest \ - --memory-size 1024 \ - --timeout 45 \ - --zip-file fileb://$LAMBDAS_DIR/target/product-lambda.jar \ - --role arn:aws:iam::000000000000:role/productRole \ - --environment Variables={AWS_REGION=$AWS_DEFAULT_REGION} - -export REST_API_ID=12345 - -# create rest api gateway -echo "Create Rest API..." -awslocal apigateway create-rest-api --name quote-api-gateway --tags '{"_custom_id_":"12345"}' - -# get parent id of resource -echo "Export Parent ID..." -export PARENT_ID=$(awslocal apigateway get-resources --rest-api-id $REST_API_ID | jq -r '.items[0].id') - -# get resource id -echo "Export Resource ID..." -export RESOURCE_ID=$(awslocal apigateway create-resource --rest-api-id $REST_API_ID --parent-id $PARENT_ID --path-part "productApi" | jq -r '.id') - -echo "RESOURCE ID: $RESOURCE_ID" - -echo "Put GET Method..." -awslocal apigateway put-method \ ---rest-api-id $REST_API_ID \ ---resource-id $RESOURCE_ID \ ---http-method GET \ ---request-parameters "method.request.path.productApi=true" \ ---authorization-type "NONE" - -echo "Put POST Method..." -awslocal apigateway put-method \ ---rest-api-id $REST_API_ID \ ---resource-id $RESOURCE_ID \ ---http-method POST \ ---request-parameters "method.request.path.productApi=true" \ ---authorization-type "NONE" - - -echo "Update GET Method..." -awslocal apigateway update-method \ - --rest-api-id $REST_API_ID \ - --resource-id $RESOURCE_ID \ - --http-method GET \ - --patch-operations "op=replace,path=/requestParameters/method.request.querystring.param,value=true" - - -echo "Put POST Method Integration..." 
-awslocal apigateway put-integration \ - --rest-api-id $REST_API_ID \ - --resource-id $RESOURCE_ID \ - --http-method POST \ - --type AWS_PROXY \ - --integration-http-method POST \ - --uri arn:aws:apigateway:$AWS_DEFAULT_REGION:lambda:path/2015-03-31/functions/arn:aws:lambda:$AWS_DEFAULT_REGION:000000000000:function:add-product/invocations \ - --passthrough-behavior WHEN_NO_MATCH - -echo "Put GET Method Integration..." -awslocal apigateway put-integration \ - --rest-api-id $REST_API_ID \ - --resource-id $RESOURCE_ID \ - --http-method GET \ - --type AWS_PROXY \ - --integration-http-method GET \ - --uri arn:aws:apigateway:$AWS_DEFAULT_REGION:lambda:path/2015-03-31/functions/arn:aws:lambda:$AWS_DEFAULT_REGION:000000000000:function:get-product/invocations \ - --passthrough-behavior WHEN_NO_MATCH - -echo "Create DEV Deployment..." -awslocal apigateway create-deployment \ - --rest-api-id $REST_API_ID \ - --stage-name dev - -awslocal sns create-topic --name ProductEventsTopic - -awslocal sqs create-queue --queue-name ProductEventsQueue - -awslocal sqs get-queue-attributes --queue-url http://localhost:4566/000000000000/ProductEventsQueue --attribute-names QueueArn - -awslocal sns subscribe \ - --topic-arn arn:aws:sns:$AWS_DEFAULT_REGION:000000000000:ProductEventsTopic \ - --protocol sqs \ - --notification-endpoint arn:aws:sqs:$AWS_DEFAULT_REGION:000000000000:ProductEventsQueue - -awslocal lambda create-function \ - --function-name process-product-events \ - --runtime java17 \ - --handler lambda.DynamoDBWriterLambda::handleRequest \ - --memory-size 1024 \ - --timeout 20 \ - --zip-file fileb://$LAMBDAS_DIR/target/product-lambda.jar \ - --role arn:aws:iam::000000000000:role/productRole - -awslocal lambda create-event-source-mapping \ - --function-name process-product-events \ - --batch-size 10 \ - --event-source-arn arn:aws:sqs:$AWS_DEFAULT_REGION:000000000000:ProductEventsQueue - -awslocal sqs set-queue-attributes \ - --queue-url 
http://localhost:4566/000000000000/ProductEventsQueue \ - --attributes VisibilityTimeout=10 - - diff --git a/chaos-api/lambda-functions/pom.xml b/chaos-api/lambda-functions/pom.xml deleted file mode 100644 index 420160f..0000000 --- a/chaos-api/lambda-functions/pom.xml +++ /dev/null @@ -1,128 +0,0 @@ - - - - 4.0.0 - - product-lambda - cloud.localstack - jar - 1.0-SNAPSHOT - - - 11 - 11 - false - - - - - software.amazon.awssdk - lambda - - - com.amazonaws - aws-lambda-java-core - 1.2.2 - - - software.amazon.awssdk - protocol-core - 2.20.69 - - - software.amazon.awssdk - s3 - - - software.amazon.awssdk - dynamodb - 2.20.68 - - - com.amazonaws - aws-lambda-java-events - 3.11.3 - - - - com.fasterxml.jackson.core - jackson-core - 2.13.3 - - - - - com.fasterxml.jackson.core - jackson-databind - 2.13.3 - - - - - com.fasterxml.jackson.core - jackson-annotations - 2.15.1 - - - software.amazon.awssdk - sns - 2.20.69 - - - - org.slf4j - slf4j-api - 2.0.7 - - - - ch.qos.logback - logback-classic - 1.4.7 - - - - - - - - software.amazon.awssdk - bom - 2.20.47 - pom - import - - - - - - product-lambda - - - src/main/resources - true - - - - - - org.apache.maven.plugins - maven-shade-plugin - 2.4.3 - - false - - - - package - - shade - - - - - - - \ No newline at end of file diff --git a/chaos-api/lambda-functions/src/main/java/lambda/AddProduct.java b/chaos-api/lambda-functions/src/main/java/lambda/AddProduct.java deleted file mode 100644 index 95c5511..0000000 --- a/chaos-api/lambda-functions/src/main/java/lambda/AddProduct.java +++ /dev/null @@ -1,102 +0,0 @@ -package lambda; - -import com.amazonaws.services.lambda.runtime.Context; -import com.amazonaws.services.lambda.runtime.RequestHandler; -import com.amazonaws.services.lambda.runtime.events.APIGatewayProxyRequestEvent; -import com.amazonaws.services.lambda.runtime.events.APIGatewayProxyResponseEvent; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.JsonMappingException; 
-import java.util.HashMap; -import java.util.Map; -import software.amazon.awssdk.awscore.exception.AwsServiceException; -import software.amazon.awssdk.services.dynamodb.model.AttributeValue; -import software.amazon.awssdk.services.dynamodb.model.ConditionalCheckFailedException; -import software.amazon.awssdk.services.dynamodb.model.DynamoDbException; -import software.amazon.awssdk.services.dynamodb.model.PutItemRequest; -import software.amazon.awssdk.services.sns.model.PublishRequest; - -public class AddProduct extends ProductApi implements - RequestHandler { - - private static final String TABLE_NAME = "Products"; - private static final String PRODUCT_ID = "id"; - - @Override - public APIGatewayProxyResponseEvent handleRequest(APIGatewayProxyRequestEvent requestEvent, - Context context) { - - Map productData; - try { - productData = objectMapper.readValue(requestEvent.getBody(), HashMap.class); - } catch (JsonMappingException e) { - throw new RuntimeException(e); - } catch (com.fasterxml.jackson.core.JsonProcessingException e) { - throw new RuntimeException(e); - } - - HashMap itemValues = new HashMap<>(); - itemValues.put("id", AttributeValue.builder().s(productData.get("id")).build()); - itemValues.put("name", AttributeValue.builder().s(productData.get("name")).build()); - itemValues.put("price", AttributeValue.builder().n(productData.get("price")).build()); - itemValues.put("description", - AttributeValue.builder().s(productData.get("description")).build()); - - PutItemRequest putItemRequest = PutItemRequest.builder() - .tableName(TABLE_NAME) - .item(itemValues) - .conditionExpression("attribute_not_exists(id) OR id = :id") - .expressionAttributeValues( - Map.of(":id", AttributeValue.builder().s(productData.get("id")).build())) - .build(); - - Map headers = new HashMap<>(); - headers.put("Content-Type", "application/json"); - - try { - ddb.putItem(putItemRequest); - return new APIGatewayProxyResponseEvent().withStatusCode(200) - .withBody("Product added/updated 
successfully.") - .withIsBase64Encoded(false).withHeaders(headers); - } catch (ConditionalCheckFailedException e) { - return new APIGatewayProxyResponseEvent().withStatusCode(409) - .withBody("Product with the given ID already exists.") - .withIsBase64Encoded(false).withHeaders(headers); - } catch (DynamoDbException e) { - context.getLogger().log("Error: " + e.getMessage()); - // Publish message to SNS topic if DynamoDB operation fails. - String productDataJson; - try { - productDataJson = objectMapper.writeValueAsString(productData); - } catch (JsonProcessingException ex) { - throw new RuntimeException(ex); - } - PublishRequest publishRequest = PublishRequest.builder() - .message(productDataJson) - .topicArn(topicArn) - .build(); - context.getLogger().log("Sending to queue: " + productDataJson); - - snsClient.publish(publishRequest); - - return new APIGatewayProxyResponseEvent().withStatusCode(200) - .withBody("A DynamoDB error occurred. Message sent to queue.") - .withIsBase64Encoded(false).withHeaders(headers); - } catch (AwsServiceException ex) { - context.getLogger().log("AwsServiceException exception: " + ex.getMessage()); - return new APIGatewayProxyResponseEvent().withStatusCode(500) - .withBody(ex.getMessage()) - .withIsBase64Encoded(false).withHeaders(headers); - } catch (RuntimeException e) { - context.getLogger().log("Runtime exception: " + e.getMessage()); - return new APIGatewayProxyResponseEvent().withStatusCode(500) - .withBody(e.getMessage()) - .withIsBase64Encoded(false).withHeaders(headers); - } catch (Exception e) { - context.getLogger().log("Generic exception: " + e.getMessage()); - return new APIGatewayProxyResponseEvent().withStatusCode(500) - .withBody(e.getMessage()) - .withIsBase64Encoded(false).withHeaders(headers); - } - - } -} \ No newline at end of file diff --git a/chaos-api/lambda-functions/src/main/java/lambda/DynamoDBWriterLambda.java b/chaos-api/lambda-functions/src/main/java/lambda/DynamoDBWriterLambda.java deleted file mode 
100644 index 5a6af98..0000000 --- a/chaos-api/lambda-functions/src/main/java/lambda/DynamoDBWriterLambda.java +++ /dev/null @@ -1,64 +0,0 @@ -package lambda; - -import com.amazonaws.services.lambda.runtime.Context; -import com.amazonaws.services.lambda.runtime.RequestHandler; -import com.amazonaws.services.lambda.runtime.events.SQSEvent; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; -import java.util.HashMap; -import java.util.Map; -import software.amazon.awssdk.services.dynamodb.model.AttributeValue; -import software.amazon.awssdk.services.dynamodb.model.DynamoDbException; -import software.amazon.awssdk.services.dynamodb.model.PutItemRequest; -import software.amazon.awssdk.services.dynamodb.model.PutItemResponse; - -public class DynamoDBWriterLambda extends ProductApi implements RequestHandler { - - private final ObjectMapper objectMapper = new ObjectMapper(); - - private static final String TABLE_NAME = "Products"; - - @Override - public Void handleRequest(SQSEvent event, Context context) { - - for (SQSEvent.SQSMessage msg : event.getRecords()) { - try { - JsonNode rootNode = objectMapper.readTree(msg.getBody()); - String messageContent = rootNode.get("Message").asText(); - - Map productData; - try { - productData = objectMapper.readValue(messageContent, HashMap.class); - } catch (JsonProcessingException e) { - throw new RuntimeException(e); - } - HashMap itemValues = new HashMap<>(); - itemValues.put("id", AttributeValue.builder().s(productData.get("id")).build()); - itemValues.put("name", AttributeValue.builder().s(productData.get("name")).build()); - itemValues.put("price", AttributeValue.builder().n(productData.get("price")).build()); - itemValues.put("description", - AttributeValue.builder().s(productData.get("description")).build()); - - // Put the item into the DynamoDB table - PutItemRequest putItemRequest = PutItemRequest.builder() - 
.tableName(TABLE_NAME) - .item(itemValues) - .build(); - PutItemResponse putItemResult = ddb.putItem(putItemRequest); - context.getLogger().log("Successfully processed message, result: " + putItemResult); - - } catch (DynamoDbException dbe) { - // Service unavailable, let the message go back to the queue after visibility timeout - context.getLogger().log( - "DynamoDB service is unavailable, message will be retried. Error: " - + dbe.getMessage()); - throw dbe; - } catch (Exception e) { - context.getLogger().log("Exception: Error processing the message: " + e.getMessage()); - } - } - return null; - } - -} diff --git a/chaos-api/lambda-functions/src/main/java/lambda/GetProduct.java b/chaos-api/lambda-functions/src/main/java/lambda/GetProduct.java deleted file mode 100644 index ef9558d..0000000 --- a/chaos-api/lambda-functions/src/main/java/lambda/GetProduct.java +++ /dev/null @@ -1,71 +0,0 @@ -package lambda; - -import com.amazonaws.services.lambda.runtime.Context; -import com.amazonaws.services.lambda.runtime.RequestHandler; -import com.amazonaws.services.lambda.runtime.events.APIGatewayProxyRequestEvent; -import com.amazonaws.services.lambda.runtime.events.APIGatewayProxyResponseEvent; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import java.util.HashMap; -import java.util.Map; -import software.amazon.awssdk.services.dynamodb.model.AttributeValue; -import software.amazon.awssdk.services.dynamodb.model.DynamoDbException; -import software.amazon.awssdk.services.dynamodb.model.GetItemRequest; -import software.amazon.awssdk.services.dynamodb.model.GetItemResponse; - -public class GetProduct extends ProductApi implements - RequestHandler { - - private static final String TABLE_NAME = "Products"; - private static final String PRODUCT_ID = "id"; - private final ObjectMapper objectMapper = new ObjectMapper(); - - - @Override - public APIGatewayProxyResponseEvent handleRequest(APIGatewayProxyRequestEvent 
requestEvent, - Context context) { - String productId = requestEvent.getQueryStringParameters().get(PRODUCT_ID); - System.out.println(requestEvent); - System.out.println("PRODUCT ID: " + productId); - - HashMap valueMap = new HashMap<>(); - valueMap.put("id", AttributeValue.fromS(productId)); - - GetItemRequest getItemRequest = GetItemRequest.builder() - .tableName(TABLE_NAME) - .key(valueMap) - .build(); - - try { - GetItemResponse getItemResponse = ddb.getItem(getItemRequest); - if (getItemResponse.item() != null && !getItemResponse.item().isEmpty()) { - // Convert the result to JSON format - - Map responseBody = new HashMap<>(); - getItemResponse.item().forEach((k, v) -> responseBody.put(k, convertAttributeValue(v))); - - return new APIGatewayProxyResponseEvent().withStatusCode(200) - .withBody(objectMapper.writeValueAsString(responseBody)); - } else { - return new APIGatewayProxyResponseEvent().withStatusCode(404).withBody("Product not found"); - } - } catch (DynamoDbException | JsonProcessingException e) { - context.getLogger().log("Error: " + e.getMessage()); - return new APIGatewayProxyResponseEvent().withStatusCode(500) - .withBody("Internal server error"); - } - } - - private Object convertAttributeValue(AttributeValue value) { - if (value.s() != null) { - return value.s(); - } - if (value.n() != null) { - return value.n(); - } - if (value.b() != null) { - return value.b(); - } - return null; - } -} \ No newline at end of file diff --git a/chaos-api/lambda-functions/src/main/java/lambda/ProductApi.java b/chaos-api/lambda-functions/src/main/java/lambda/ProductApi.java deleted file mode 100644 index eeafcae..0000000 --- a/chaos-api/lambda-functions/src/main/java/lambda/ProductApi.java +++ /dev/null @@ -1,47 +0,0 @@ -package lambda; - -import com.fasterxml.jackson.databind.ObjectMapper; -import java.net.URI; - -import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; -import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; -import 
software.amazon.awssdk.core.client.config.ClientOverrideConfiguration; -import software.amazon.awssdk.core.retry.RetryPolicy; -import software.amazon.awssdk.regions.Region; -import software.amazon.awssdk.services.dynamodb.DynamoDbClient; -import software.amazon.awssdk.services.sns.SnsClient; - -public class ProductApi { - - protected static final String LOCALSTACK_HOSTNAME = System.getenv("LOCALSTACK_HOSTNAME"); - protected static final String AWS_REGION = System.getenv("AWS_REGION"); - protected static final String topicArn = "arn:aws:sns:us-east-1:000000000000:ProductEventsTopic"; - protected ObjectMapper objectMapper = new ObjectMapper(); - - // Define a custom retry policy - // Set maximum number of retries - RetryPolicy customRetryPolicy = RetryPolicy.builder() - .numRetries(3) - .build(); - - // Apply the custom retry policy to ClientOverrideConfiguration - ClientOverrideConfiguration clientOverrideConfig = ClientOverrideConfiguration.builder() - .retryPolicy(customRetryPolicy) - .build(); - - protected SnsClient snsClient = SnsClient.builder() - .endpointOverride(URI.create(String.format("http://%s:4566", LOCALSTACK_HOSTNAME))) - .credentialsProvider( - StaticCredentialsProvider.create(AwsBasicCredentials.create("test", "test"))) - .region(Region.of(AWS_REGION)) - .build(); - - protected DynamoDbClient ddb = DynamoDbClient.builder() - .endpointOverride(URI.create(String.format("http://%s:4566", LOCALSTACK_HOSTNAME))) - .credentialsProvider( - StaticCredentialsProvider.create(AwsBasicCredentials.create("test", "test"))) - .region(Region.of(AWS_REGION)) - .endpointDiscoveryEnabled(true) - .overrideConfiguration(clientOverrideConfig) - .build(); -} diff --git a/chaos-api/lambda-functions/src/main/resources/lambda_update_script.sh b/chaos-api/lambda-functions/src/main/resources/lambda_update_script.sh deleted file mode 100644 index 38b7d93..0000000 --- a/chaos-api/lambda-functions/src/main/resources/lambda_update_script.sh +++ /dev/null @@ -1,4 +0,0 @@ - 
-awslocal lambda update-function-code --function-name process-product-events \ - --zip-file fileb://target/product-lambda.jar \ - --region us-east-1 diff --git a/chaos-api/latency-5-sec.sh b/chaos-api/latency-5-sec.sh deleted file mode 100755 index 2f1fade..0000000 --- a/chaos-api/latency-5-sec.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -xeu - -curl --location --request POST 'http://localhost.localstack.cloud:4566/_localstack/chaos/effects' \ ---header 'Content-Type: application/json' \ ---data '{ - "latency": 5000 -}' diff --git a/chaos-api/outage-dynamodb-end.sh b/chaos-api/outage-dynamodb-end.sh deleted file mode 100755 index 7392ec0..0000000 --- a/chaos-api/outage-dynamodb-end.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -xeu - -curl --location --request DELETE 'http://localhost.localstack.cloud:4566/_localstack/chaos/faults' \ ---header 'Content-Type: application/json' \ ---data ' -[ - { - "service": "dynamodb", - "probability": 1.0, - "error": { - "statusCode": 500, - "code": "DatacentreNotFound" - } - } -]' diff --git a/chaos-api/outage-dynamodb-start.sh b/chaos-api/outage-dynamodb-start.sh deleted file mode 100755 index 83fe5a6..0000000 --- a/chaos-api/outage-dynamodb-start.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -xeu - -curl --location --request PATCH 'http://localhost.localstack.cloud:4566/_localstack/chaos/faults' \ ---header 'Content-Type: application/json' \ ---data ' -[ - { - "service": "dynamodb", - "probability": 1.0, - "error": { - "statusCode": 500, - "code": "DatacentreNotFound" - } - } -]' diff --git a/chaos-api/outage-lambda-invoke-end.sh b/chaos-api/outage-lambda-invoke-end.sh deleted file mode 100755 index 892921d..0000000 --- a/chaos-api/outage-lambda-invoke-end.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -xeu - -curl --location --request DELETE 'http://localhost.localstack.cloud:4566/_localstack/chaos/faults' \ ---header 'Content-Type: application/json' \ ---data ' -[ - { - "service": "lambda", - "operation": "Invoke", - 
"probability": 1.0, - "error": { - "statusCode": 500, - "code": "InternalServerError" - } - } -]' diff --git a/chaos-api/outage-lambda-invoke-start.sh b/chaos-api/outage-lambda-invoke-start.sh deleted file mode 100755 index bb58202..0000000 --- a/chaos-api/outage-lambda-invoke-start.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -xeu - -curl --location --request PATCH 'http://localhost.localstack.cloud:4566/_localstack/chaos/faults' \ ---header 'Content-Type: application/json' \ ---data ' -[ - { - "service": "lambda", - "operation": "Invoke", - "probability": 1.0, - "error": { - "statusCode": 500, - "code": "InternalServerError" - } - } -]' diff --git a/check.sh b/check.sh new file mode 100644 index 0000000..8317a6c --- /dev/null +++ b/check.sh @@ -0,0 +1,51 @@ +echo "--- Step 7: Verify Initial DNS Resolution (Primary) ---" +echo "Waiting a bit for DNS changes to apply..." +sleep 15 +echo "Querying $FAILOVER_RECORD_NAME (should point to primary CNAME/target):" +dig @127.0.0.1 "$FAILOVER_RECORD_NAME" CNAME +short +echo + +echo "--- Step 8: Simulate Primary API Gateway & Lambda Failure in $PRIMARY_API_REGION using Chaos API ---" +echo "Injecting faults for apigateway and lambda services in $PRIMARY_API_REGION..." +curl -L --request POST 'http://localhost:4566/_localstack/chaos/faults' \ +--header 'Content-Type: application/json' \ +--data "[ + {\"service\": \"apigateway\", \"region\": \"${PRIMARY_API_REGION}\"}, + {\"service\": \"lambda\", \"region\": \"${PRIMARY_API_REGION}\"} +]" +echo # for newline +echo + +echo "Waiting for Route 53 to detect health check failure and failover (approx 30-40s)..." 
+sleep 40 + +echo "--- Step 9: Verify DNS Failover to Secondary ---" +echo "Querying $FAILOVER_RECORD_NAME (should now point to secondary CNAME/target):" +dig @127.0.0.1 "$FAILOVER_RECORD_NAME" CNAME +short +echo +echo "You can also try fetching the health check status again:" +awslocal route53 get-health-check-status --health-check-id "$HEALTH_CHECK_ID" --region "$HEALTH_CHECK_RESOURCE_REGION" +echo +echo + +echo "--- Step 10: Clear Service-Specific Faults (Simulate Primary Recovery) ---" +echo "Clearing faults for apigateway and lambda services in $PRIMARY_API_REGION..." +curl --location --request POST 'http://localhost.localstack.cloud:4566/_localstack/chaos/faults' \ +--header 'Content-Type: application/json' \ +--data '[]' + +echo # for newline +echo + +echo "Waiting for Route 53 to detect health check recovery and failback (approx 30-40s)..." +sleep 40 + +echo "--- Step 11: Verify DNS Failback to Primary ---" +echo "Querying $FAILOVER_RECORD_NAME (should point back to primary CNAME/target):" +dig @127.0.0.1 "$FAILOVER_RECORD_NAME" CNAME +short +echo +echo "Final health check status:" +awslocal route53 get-health-check-status --health-check-id "$HEALTH_CHECK_ID" --region "$HEALTH_CHECK_RESOURCE_REGION" +echo + +echo "Script finished." 
diff --git a/route53-failover/docker-compose.yml b/docker-compose.yml similarity index 100% rename from route53-failover/docker-compose.yml rename to docker-compose.yml diff --git a/route53-failover/images/fis-api-gw-region.png b/images/fis-api-gw-region.png similarity index 100% rename from route53-failover/images/fis-api-gw-region.png rename to images/fis-api-gw-region.png diff --git a/route53-failover/images/route53-product-stack.png b/images/route53-product-stack.png similarity index 100% rename from route53-failover/images/route53-product-stack.png rename to images/route53-product-stack.png diff --git a/route53-failover/init-resources.sh b/init-resources.sh similarity index 72% rename from route53-failover/init-resources.sh rename to init-resources.sh index df2a43d..fd9cd0d 100755 --- a/route53-failover/init-resources.sh +++ b/init-resources.sh @@ -2,38 +2,43 @@ apt-get -y install jq -# Create resources in the 1st region +# --------------------------------------------------- +# Region: us-east-1 +# --------------------------------------------------- +# Create DynamoDB table echo "Create DynamoDB table..." 
awslocal dynamodb create-table \ - --table-name Products \ - --attribute-definitions AttributeName=id,AttributeType=S \ - --key-schema AttributeName=id,KeyType=HASH \ - --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5 \ - --region us-east-1 + --table-name Products \ + --attribute-definitions AttributeName=id,AttributeType=S \ + --key-schema AttributeName=id,KeyType=HASH \ + --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5 \ + --region us-east-1 +# Enable DynamoDB Streams awslocal dynamodb update-table \ - --table-name Products \ - --stream-specification StreamEnabled=true,StreamViewType=NEW_AND_OLD_IMAGES \ - --region us-east-1 + --table-name Products \ + --stream-specification StreamEnabled=true,StreamViewType=NEW_AND_OLD_IMAGES \ + --region us-east-1 +# Create Lambda for DynamoDB Stream awslocal lambda create-function \ - --function-name dynamodb-streams-to-lambda \ - --runtime java17 \ - --handler dynamodb_streams.DynamoDBStreamHandler::handleRequest \ - --memory-size 256 \ - --zip-file fileb:///etc/localstack/init/ready.d/target/product-lambda.jar \ - --role arn:aws:iam::000000000000:role/productRole \ - --region us-east-1 + --function-name dynamodb-streams-to-lambda \ + --runtime java17 \ + --handler dynamodb_streams.DynamoDBStreamHandler::handleRequest \ + --memory-size 256 \ + --zip-file fileb:///etc/localstack/init/ready.d/target/product-lambda.jar \ + --role arn:aws:iam::000000000000:role/productRole \ + --region us-east-1 +# Get stream ARN and create mapping export STREAM_ARN=$(awslocal dynamodb describe-table --table-name Products --region us-east-1 | jq -r '.Table.LatestStreamArn') - awslocal lambda create-event-source-mapping \ - --function-name dynamodb-streams-to-lambda \ - --event-source-arn $STREAM_ARN \ - --starting-position LATEST - + --function-name dynamodb-streams-to-lambda \ + --event-source-arn $STREAM_ARN \ + --starting-position LATEST +# Create Lambdas echo "Add Product Lambda..." 
awslocal lambda create-function \ --function-name add-product \ @@ -45,7 +50,6 @@ awslocal lambda create-function \ --role arn:aws:iam::000000000000:role/productRole \ --environment Variables={AWS_REGION=us-east-1} - echo "Get Product Lambda..." awslocal lambda create-function \ --function-name get-product \ @@ -65,11 +69,11 @@ awslocal lambda create-function \ --memory-size 512 \ --zip-file fileb:///etc/localstack/init/ready.d/healthcheck.zip \ --region us-east-1 \ - --role arn:aws:iam::000000000000:role/productRole \ + --role arn:aws:iam::000000000000:role/productRole +# Create API Gateway export REST_API_ID=12345 -# create rest api gateway echo "Create Rest API..." awslocal apigateway create-rest-api --name quote-api-gateway --tags '{"_custom_id_":"12345"}' --region us-east-1 @@ -84,28 +88,27 @@ export HEALTHCHECK_RESOURCE_ID=$(awslocal apigateway create-resource --rest-api- echo "HEALTH CHECK ID 1:" echo $HEALTHCHECK_RESOURCE - echo "RESOURCE ID:" echo $RESOURCE +# Setup API Methods echo "Put GET Method..." awslocal apigateway put-method \ ---rest-api-id $REST_API_ID \ ---resource-id $RESOURCE_ID \ ---http-method GET \ ---request-parameters "method.request.path.productApi=true" \ ---authorization-type "NONE" \ ---region=us-east-1 + --rest-api-id $REST_API_ID \ + --resource-id $RESOURCE_ID \ + --http-method GET \ + --request-parameters "method.request.path.productApi=true" \ + --authorization-type "NONE" \ + --region=us-east-1 echo "Put POST Method..." awslocal apigateway put-method \ ---rest-api-id $REST_API_ID \ ---resource-id $RESOURCE_ID \ ---http-method POST \ ---request-parameters "method.request.path.productApi=true" \ ---authorization-type "NONE" \ ---region=us-east-1 - + --rest-api-id $REST_API_ID \ + --resource-id $RESOURCE_ID \ + --http-method POST \ + --request-parameters "method.request.path.productApi=true" \ + --authorization-type "NONE" \ + --region=us-east-1 echo "Update GET Method..." 
awslocal apigateway update-method \ @@ -115,7 +118,7 @@ awslocal apigateway update-method \ --patch-operations "op=replace,path=/requestParameters/method.request.querystring.param,value=true" \ --region=us-east-1 - +# Integrations echo "Put POST Method Integration..." awslocal apigateway put-integration \ --rest-api-id $REST_API_ID \ @@ -138,23 +141,22 @@ awslocal apigateway put-integration \ --passthrough-behavior WHEN_NO_MATCH \ --region=us-east-1 -echo "Put GET Method that returns 200 for HealthCheck..." +echo "Put GET Method for HealthCheck..." awslocal apigateway put-method \ ---rest-api-id $REST_API_ID \ ---resource-id $HEALTHCHECK_RESOURCE_ID \ ---http-method GET \ ---request-parameters "method.request.path.healthcheck=true" \ ---authorization-type "NONE" \ ---region=us-east-1 + --rest-api-id $REST_API_ID \ + --resource-id $HEALTHCHECK_RESOURCE_ID \ + --http-method GET \ + --request-parameters "method.request.path.healthcheck=true" \ + --authorization-type "NONE" \ + --region=us-east-1 echo "Put GET Method Integration for HealthCheck..." awslocal apigateway put-integration \ --rest-api-id $REST_API_ID \ --resource-id $HEALTHCHECK_RESOURCE_ID \ --http-method GET \ - --type HTTP \ - --integration-http-method GET \ --type AWS_PROXY \ + --integration-http-method POST \ --uri arn:aws:apigateway:us-east-1:lambda:path/2015-03-31/functions/arn:aws:lambda:us-east-1:000000000000:function:healthcheck/invocations \ --passthrough-behavior WHEN_NO_MATCH \ --region=us-east-1 @@ -165,18 +167,20 @@ awslocal apigateway create-deployment \ --stage-name dev \ --region=us-east-1 -# -------------------------------------------------------------------------------------------------------------------- - -# Create resources for 2nd region +# --------------------------------------------------- +# Region: us-west-1 +# --------------------------------------------------- +# Create DynamoDB table echo "Create DynamoDB table..." 
awslocal dynamodb create-table \ - --table-name Products \ - --attribute-definitions AttributeName=id,AttributeType=S \ - --key-schema AttributeName=id,KeyType=HASH \ - --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5 \ - --region us-west-1 + --table-name Products \ + --attribute-definitions AttributeName=id,AttributeType=S \ + --key-schema AttributeName=id,KeyType=HASH \ + --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5 \ + --region us-west-1 +# Create Lambdas echo "Add Product Lambda..." awslocal lambda create-function \ --function-name add-product \ @@ -188,7 +192,6 @@ awslocal lambda create-function \ --role arn:aws:iam::000000000000:role/productRole \ --environment Variables={AWS_REGION=us-west-1} - echo "Get Product Lambda..." awslocal lambda create-function \ --function-name get-product \ @@ -208,11 +211,11 @@ awslocal lambda create-function \ --memory-size 512 \ --zip-file fileb:///etc/localstack/init/ready.d/healthcheck.zip \ --region us-west-1 \ - --role arn:aws:iam::000000000000:role/productRole \ + --role arn:aws:iam::000000000000:role/productRole +# Create API Gateway export REST_API_ID=67890 -# create rest api gateway echo "Create Rest API..." awslocal apigateway create-rest-api --name quote-api-gateway --tags '{"_custom_id_":"67890"}' --region us-west-1 @@ -227,28 +230,27 @@ export HEALTHCHECK_RESOURCE_ID=$(awslocal apigateway create-resource --rest-api- echo "HEALTH CHECK ID 1:" echo $HEALTHCHECK_RESOURCE - echo "RESOURCE ID:" echo $RESOURCE +# Setup API Methods echo "Put GET Method..." 
awslocal apigateway put-method \ ---rest-api-id $REST_API_ID \ ---resource-id $RESOURCE_ID \ ---http-method GET \ ---request-parameters "method.request.path.productApi=true" \ ---authorization-type "NONE" \ ---region=us-west-1 + --rest-api-id $REST_API_ID \ + --resource-id $RESOURCE_ID \ + --http-method GET \ + --request-parameters "method.request.path.productApi=true" \ + --authorization-type "NONE" \ + --region=us-west-1 echo "Put POST Method..." awslocal apigateway put-method \ ---rest-api-id $REST_API_ID \ ---resource-id $RESOURCE_ID \ ---http-method POST \ ---request-parameters "method.request.path.productApi=true" \ ---authorization-type "NONE" \ ---region=us-west-1 - + --rest-api-id $REST_API_ID \ + --resource-id $RESOURCE_ID \ + --http-method POST \ + --request-parameters "method.request.path.productApi=true" \ + --authorization-type "NONE" \ + --region=us-west-1 echo "Update GET Method..." awslocal apigateway update-method \ @@ -258,7 +260,7 @@ awslocal apigateway update-method \ --patch-operations "op=replace,path=/requestParameters/method.request.querystring.param,value=true" \ --region=us-west-1 - +# Integrations echo "Put POST Method Integration..." awslocal apigateway put-integration \ --rest-api-id $REST_API_ID \ @@ -281,24 +283,22 @@ awslocal apigateway put-integration \ --passthrough-behavior WHEN_NO_MATCH \ --region=us-west-1 -echo "Put GET Method that returns 200 for HealthCheck..." +echo "Put GET Method for HealthCheck..." awslocal apigateway put-method \ ---rest-api-id $REST_API_ID \ ---resource-id $HEALTHCHECK_RESOURCE_ID \ ---http-method GET \ ---request-parameters "method.request.path.healthcheck=true" \ ---authorization-type "NONE" \ ---region=us-west-1 - + --rest-api-id $REST_API_ID \ + --resource-id $HEALTHCHECK_RESOURCE_ID \ + --http-method GET \ + --request-parameters "method.request.path.healthcheck=true" \ + --authorization-type "NONE" \ + --region=us-west-1 echo "Put GET Method Integration for HealthCheck..." 
awslocal apigateway put-integration \ --rest-api-id $REST_API_ID \ --resource-id $HEALTHCHECK_RESOURCE_ID \ --http-method GET \ - --type HTTP \ - --integration-http-method GET \ --type AWS_PROXY \ + --integration-http-method POST \ --uri arn:aws:apigateway:us-west-1:lambda:path/2015-03-31/functions/arn:aws:lambda:us-west-1:000000000000:function:healthcheck/invocations \ --passthrough-behavior WHEN_NO_MATCH \ --region=us-west-1 @@ -307,4 +307,4 @@ echo "Create DEV Deployment..." awslocal apigateway create-deployment \ --rest-api-id $REST_API_ID \ --stage-name dev \ - --region=us-west-1 \ No newline at end of file + --region=us-west-1 diff --git a/route53-failover/lambda-functions/README.md b/lambda-functions/README.md similarity index 100% rename from route53-failover/lambda-functions/README.md rename to lambda-functions/README.md diff --git a/route53-failover/lambda-functions/lambda-python/healthcheck.py b/lambda-functions/lambda-python/healthcheck.py similarity index 100% rename from route53-failover/lambda-functions/lambda-python/healthcheck.py rename to lambda-functions/lambda-python/healthcheck.py diff --git a/route53-failover/lambda-functions/lambda-python/healthcheck.zip b/lambda-functions/lambda-python/healthcheck.zip similarity index 100% rename from route53-failover/lambda-functions/lambda-python/healthcheck.zip rename to lambda-functions/lambda-python/healthcheck.zip diff --git a/route53-failover/lambda-functions/pom.xml b/lambda-functions/pom.xml similarity index 100% rename from route53-failover/lambda-functions/pom.xml rename to lambda-functions/pom.xml diff --git a/route53-failover/lambda-functions/src/main/java/dynamodb_streams/DynamoDBStreamHandler.java b/lambda-functions/src/main/java/dynamodb_streams/DynamoDBStreamHandler.java similarity index 100% rename from route53-failover/lambda-functions/src/main/java/dynamodb_streams/DynamoDBStreamHandler.java rename to lambda-functions/src/main/java/dynamodb_streams/DynamoDBStreamHandler.java diff --git 
a/FIS-experiments/lambda-functions/src/main/java/lambda/AddProduct.java b/lambda-functions/src/main/java/lambda/AddProduct.java similarity index 100% rename from FIS-experiments/lambda-functions/src/main/java/lambda/AddProduct.java rename to lambda-functions/src/main/java/lambda/AddProduct.java diff --git a/FIS-experiments/lambda-functions/src/main/java/lambda/DynamoDBWriterLambda.java b/lambda-functions/src/main/java/lambda/DynamoDBWriterLambda.java similarity index 100% rename from FIS-experiments/lambda-functions/src/main/java/lambda/DynamoDBWriterLambda.java rename to lambda-functions/src/main/java/lambda/DynamoDBWriterLambda.java diff --git a/FIS-experiments/lambda-functions/src/main/java/lambda/GetProduct.java b/lambda-functions/src/main/java/lambda/GetProduct.java similarity index 100% rename from FIS-experiments/lambda-functions/src/main/java/lambda/GetProduct.java rename to lambda-functions/src/main/java/lambda/GetProduct.java diff --git a/route53-failover/lambda-functions/src/main/java/lambda/ProductApi.java b/lambda-functions/src/main/java/lambda/ProductApi.java similarity index 100% rename from route53-failover/lambda-functions/src/main/java/lambda/ProductApi.java rename to lambda-functions/src/main/java/lambda/ProductApi.java diff --git a/FIS-experiments/lambda-functions/src/main/resources/lambda_update_script.sh b/lambda-functions/src/main/resources/lambda_update_script.sh similarity index 100% rename from FIS-experiments/lambda-functions/src/main/resources/lambda_update_script.sh rename to lambda-functions/src/main/resources/lambda_update_script.sh diff --git a/route53-failover/README.md b/route53-failover/README.md deleted file mode 100644 index 6b62612..0000000 --- a/route53-failover/README.md +++ /dev/null @@ -1,278 +0,0 @@ - -# Route53 Failover - - -| Environment | | 
-|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| __Services__ | Amazon API Gateway, Lambda, DynamoDB, SNS, SQS, Route53 | -| __Categories__ | LocalStack Pro, Init Hooks, Java SDK | - -## Introduction - -LocalStack allows you to integrate and test Chaos Plugin with Route53 to automatically divert users to a healthy secondary zone if the primary region fails, ensuring system availability and responsiveness. -Route53's health checks and traffic redirection enhance architecture resilience and ensure service continuity during regional outages, crucial for uninterrupted user experiences. - -> [!NOTE] -> Route53 Failover and Chaos API is currently available as part of the LocalStack Enterprise plan. 
-> If you'd like to try it out, please [contact us](https://www.localstack.cloud/demo) to request access. - -## Getting started - -This tutorial is designed for users new to the Route53 and LocalStack Chaos plugin services. -In this example, there's an active-primary and passive-standby configuration. -Route53 routes traffic to the primary region, which processes product-related requests through API Gateway and Lambda functions, with data stored in DynamoDB. -If the primary region fails, Route53 redirects to the standby region, maintained in sync by a replication Lambda function. - -For this particular example, we'll be using a [sample application repository](https://github.com/localstack-samples/samples-chaos-engineering/tree/main/route53-failover). -Clone the repository, and follow the instructions below to get started. - -### Prerequisites - -The general prerequisites for this guide are: - -- LocalStack Pro with LocalStack Auth Token](#) -- [AWS CLI](#) with the [`awslocal` wrapper](#) -- [Docker](https://docs.docker.com/get-docker/) and [Docker Compose](https://docs.docker.com/compose/install/) -- [Python-3](https://www.python.org/downloads/) -- `dig` - -Start LocalStack by using the `docker-compose.yml` file from the repository. -Ensure to set your Auth Token as an environment variable during this process. - -``` -$ LOCALSTACK_AUTH_TOKEN= -$ docker compose up -``` - -### Architecture - -The following diagram shows the architecture that this application builds and deploys: - -{{< figure src="route53-failover-1.png" width="800">}} - -### Creating the resources - -To begin, deploy the same services in both `us-west-1` and `us-east-1` regions. -The resources specified in the `init-resources.sh` file will be created when the LocalStack container starts, using [Initialization Hooks](#) and the `awslocal` CLI tool. - -The objective is to have a backup system in case of a regional outage in the primary availability zone (`us-west-1`). 
-We'll focus on this region to examine the existing resilience mechanisms. - -{{< figure src="route53-failover-2.png" width="800">}} - -- The primary API Gateway includes a health check endpoint that returns a 200 HTTP status code, serving as a basic check for its availability. -- Data synchronization across regions can be achieved with AWS-native tools like DynamoDB Streams and AWS Lambda. - Here, any changes to the primary table trigger a Lambda function, replicating these changes to a secondary table. - This configuration is essential for high availability and disaster recovery. - -### Configuring a Route53 hosted zone - -Let's begin by setting up a hosted zone in Route53 named `hello-localstack.com` and retrieved the hosted zone ID: - -``` -$ HOSTED_ZONE_NAME=hello-localstack.com -$ HOSTED_ZONE_ID=$(awslocal route53 create-hosted-zone --name $HOSTED_ZONE_NAME --caller-reference foo | jq -r .HostedZone.Id) -``` - -Then, define the health check ID for the API Gateway available in the `us-west-1` region: - -``` -$ HEALTH_CHECK_ID=$( -awslocal route53 create-health-check \ ---caller-reference foobar \ ---health-check-config '{ - "FullyQualifiedDomainName": "12345.execute-api.localhost.localstack.cloud", - "Port": 4566, - "ResourcePath": "/dev/healthcheck", - "Type": "HTTP", - "RequestInterval": 10 -}' | jq -r .HealthCheck.Id -) -``` - -This command creates a Route 53 health check for an HTTP endpoint (`12345.execute-api.localhost.localstack.cloud:4566/dev/healthcheck`) with a 10-second request interval and captures the health check's ID. -The caller reference identifier in AWS resource creation or updates prevents accidental duplication if requests are repeated. - -To update DNS records in the specified Route53 hosted zone (`$HOSTED_ZONE_ID`), add two CNAME records: `12345.$HOSTED_ZONE_NAME` pointing to `12345.execute-api.localhost.localstack.cloud`, and `67890.$HOSTED_ZONE_NAME` pointing to `67890.execute-api.localhost.localstack.cloud`. 
-Set a TTL (Time to Live) of 60 seconds for these records. - -``` -$ awslocal route53 change-resource-record-sets \ ---hosted-zone $HOSTED_ZONE_ID \ ---change-batch '{ - "Changes": [ - { - "Action": "CREATE", - "ResourceRecordSet": { - "Name": "12345.'$HOSTED_ZONE_NAME'", - "Type": "CNAME", - "TTL": 60, - "ResourceRecords": [ - {"Value": "12345.execute-api.localhost.localstack.cloud"} - ] - } - }, - { - "Action": "CREATE", - "ResourceRecordSet": { - "Name": "67890.'$HOSTED_ZONE_NAME'", - "Type": "CNAME", - "TTL": 60, - "ResourceRecords": [ - {"Value": "67890.execute-api.localhost.localstack.cloud"} - ] - } - } - ] -}' -``` - -Finally, we'll update the DNS records in the Route53 hosted zone identified by `$HOSTED_ZONE_ID`. -We're adding two CNAME records for the subdomain `test.$HOSTED_ZONE_NAME`. -The first record points to `12345.$HOSTED_ZONE_NAME` and is linked with the earlier created health check, designated as the primary failover target. -The second record points to `67890.$HOSTED_ZONE_NAME` and is set as the secondary failover target. - -``` -$ awslocal route53 change-resource-record-sets \ ---hosted-zone-id $HOSTED_ZONE_ID \ ---change-batch '{ - "Changes": [ - { - "Action": "CREATE", - "ResourceRecordSet": { - "Name": "test.'$HOSTED_ZONE_NAME'", - "Type": "CNAME", - "SetIdentifier": "12345", - "AliasTarget": { - "HostedZoneId": "'$HOSTED_ZONE_ID'", - "DNSName": "12345.'$HOSTED_ZONE_NAME'", - "EvaluateTargetHealth": true - }, - "HealthCheckId": "'$HEALTH_CHECK_ID'", - "Failover": "PRIMARY" - } - }, - { - "Action": "CREATE", - "ResourceRecordSet": { - "Name": "test.'$HOSTED_ZONE_NAME'", - "Type": "CNAME", - "SetIdentifier": "67890", - "AliasTarget": { - "HostedZoneId": "'$HOSTED_ZONE_ID'", - "DNSName": "67890.'$HOSTED_ZONE_NAME'", - "EvaluateTargetHealth": true - }, - "Failover": "SECONDARY" - } - } -] -}' -``` - -This setup represents the basic failover configuration where traffic is redirected to different endpoints based on their health check status. 
-To confirm that the CNAME record for `test.hello-localstack.com` points to `12345.execute-api.localhost.localstack.cloud`, you can use the following `dig` command: - -``` -$ dig @localhost test.hello-localstack.com CNAME - -..... -;; QUESTION SECTION: -;test.hello-localstack.com. IN CNAME - -;; ANSWER SECTION: -test.hello-localstack.com. 300 IN CNAME 12345.execute-api.localhost.localstack.cloud. -..... - -``` - -### Creating a controlled outage - -Our setup is now complete and ready for testing. -To mimic a regional outage in the `us-west-1` region, we'll configure the [Chaos Plugin](#) to halt all service invocations in this region, including the health check function. -Once the primary region becomes non-functional, Route 53's health checks will fail. -This failure will activate the failover policy, redirecting traffic to the corresponding services in the secondary region, thus maintaining service continuity. - -``` -$ curl -L -X POST 'http://localhost.localstack.cloud:4566/_localstack/chaos/faults' \ --H 'Content-Type: application/json' \ --d ' -[ - { - "region": "us-west-1" - } -]' -``` - -This will cause all services to fail in the `us-west-1` region with a 503 Service Unavailable error. -Because of this, Route 53's health checks will detect the failure and redirect traffic to the standby region as per the failover setup. - -Confirm this redirection with: - -``` -$ dig @localhost test.hello-localstack.com CNAME - -..... -;; QUESTION SECTION: -;test.hello-localstack.com. IN CNAME - -;; ANSWER SECTION: -test.hello-localstack.com. 300 IN CNAME 67890.execute-api.localhost.localstack.cloud. -..... - -``` - -This indicates that the hosted zone name now points to the secondary API Gateway, and `us-east-1` services are in use. 
- -A Python script can simulate backend handling of this switch: - -```python -import dns.resolver -import requests - -# Set the Route53 DNS resolver to use -dns_resolver_ip = '127.0.0.1' - -# Domain to resolve -domain_to_resolve = 'test.hello-localstack.com' - -# Resolve the CNAME record using the specified DNS server -resolver = dns.resolver.Resolver(configure=False) -resolver.nameservers = [dns_resolver_ip] - -try: - cname_record = resolver.query(domain_to_resolve, rdtype=dns.rdatatype.CNAME) - resolved_domain = str(cname_record[0].target) - - # Construct the full URL with the resolved domain - resolved_url = f'http://{resolved_domain}:4566/dev/productApi?id=prod-1088' - - # Make an HTTP request to the resolved URL - response = requests.get(resolved_url) - - # Print the response - print(response.text) - -except dns.resolver.NXDOMAIN: - print(f"CNAME record not found for {domain_to_resolve}") - -except Exception as e: - print(f"Error: {e}") -``` - -Running the script will resolve the CNAME record for 'test.hello-localstack.com', make an HTTP request to the resolved URL, and print the response, which fetches a Product object from DynamoDB in the `us-east-1` region. - -``` -$ python3 dns-resolver.py - -{"price":"29.99","name":"Super Widget","description":"A versatile widget that can be used for a variety of purposes. -Durable, reliable, and affordable.","id":"prod-1088"} - -``` - -The LocalStack logs will confirm which API Gateway was called based on the resolved domain. - -```bash -2023-11-07T11:59:28.292 DEBUG --- [ asgi_gw_9] l.s.l.i.version_manager : > {resource: /productApi,path: /productApi,httpMethod: GET,headers: {Host=67890.execute-api.localhost.localstack.cloud:4566, -User-Agent=python-requests/2.31.0, accept-encoding=gzip, deflate, accept=*/*, Connection=keep-alive, x-localstack-tgt-api=apigateway .... 
-``` diff --git a/route53-failover/add-route53.sh b/route53-failover/add-route53.sh deleted file mode 100644 index d5578d3..0000000 --- a/route53-failover/add-route53.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env bash - -# This is a demo script that showcases Route53 DNS failover in LocalStack -# Make sure the Docker Compose setup is running before executing this script - -set -eux - -HOSTED_ZONE_NAME=hello-localstack.com - -# Create a hosted zone -HOSTED_ZONE_ID=$(awslocal route53 create-hosted-zone --name $HOSTED_ZONE_NAME --caller-reference foo | jq -r .HostedZone.Id) - -# Create a health check that runs against the `http_echo` container -HEALTH_CHECK_ID=$(awslocal route53 create-health-check --caller-reference foobar --health-check-config '{ - "FullyQualifiedDomainName": "12345.execute-api.localhost.localstack.cloud", - "Port": 4566, - "ResourcePath": "/dev/healthcheck", - "Type": "HTTP", - "RequestInterval": 10 -}' | jq -r .HealthCheck.Id) - -# Create RRSets -awslocal route53 change-resource-record-sets --hosted-zone ${HOSTED_ZONE_ID#/hostedzone/} --change-batch '{ -"Changes": [ - { - "Action": "CREATE", - "ResourceRecordSet": { - "Name": "12345.'$HOSTED_ZONE_NAME'", - "Type": "CNAME", - "TTL": 60, - "ResourceRecords": [{"Value": "12345.execute-api.localhost.localstack.cloud"}] - } - }, - { - "Action": "CREATE", - "ResourceRecordSet": { - "Name": "67890.'$HOSTED_ZONE_NAME'", - "Type": "CNAME", - "TTL": 60, - "ResourceRecords": [{"Value": "67890.execute-api.localhost.localstack.cloud"}] - } - } -]}' -awslocal route53 change-resource-record-sets --hosted-zone-id ${HOSTED_ZONE_ID#/hostedzone/} --change-batch '{ -"Changes": [ - { - "Action": "CREATE", - "ResourceRecordSet": { - "Name": "test.'$HOSTED_ZONE_NAME'", - "Type": "CNAME", - "SetIdentifier": "12345", - "AliasTarget": { - "HostedZoneId": "'${HOSTED_ZONE_ID#/hostedzone/}'", - "DNSName": "12345.'$HOSTED_ZONE_NAME'", - "EvaluateTargetHealth": true - }, - "HealthCheckId": "'${HEALTH_CHECK_ID}'", - 
"Failover": "PRIMARY" - } - }, - { - "Action": "CREATE", - "ResourceRecordSet": { - "Name": "test.'$HOSTED_ZONE_NAME'", - "Type": "CNAME", - "SetIdentifier": "67890", - "AliasTarget": { - "HostedZoneId": "'${HOSTED_ZONE_ID#/hostedzone/}'", - "DNSName": "67890.'$HOSTED_ZONE_NAME'", - "EvaluateTargetHealth": true - }, - "Failover": "SECONDARY" - } - } -]}' - -# Get the IP address of the LocalStack container on the Docker bridge -#LOCALSTACK_DNS_SERVER=$(docker inspect localstack | jq -r '.[0].NetworkSettings.Networks."ls_network".IPAddress') -LOCALSTACK_DNS_SERVER=localhost - -# This IP address is used to query the LocalStack DNS server -# This should return `12345.execute-api.localhost.localstack.cloud` as the healthcheck is currently passing -dig @$LOCALSTACK_DNS_SERVER +noall +answer test.hello-localstack.com CNAME - diff --git a/route53-failover/dns-resolver.py b/route53-failover/dns-resolver.py deleted file mode 100644 index b68b2f9..0000000 --- a/route53-failover/dns-resolver.py +++ /dev/null @@ -1,32 +0,0 @@ -import dns.resolver -import requests - -# Set the Route53 DNS resolver to use -dns_resolver_ip = '127.0.0.1' - -# Domain to resolve -domain_to_resolve = 'test.hello-localstack.com' - - -# Resolve the CNAME record using the specified DNS server -resolver = dns.resolver.Resolver(configure=False) -resolver.nameservers = [dns_resolver_ip] - -try: - cname_record = resolver.query(domain_to_resolve, rdtype=dns.rdatatype.CNAME) - resolved_domain = str(cname_record[0].target) - - # Construct the full URL with the resolved domain - resolved_url = f'http://{resolved_domain}:4566/dev/productApi?id=prod-1088' - - # Make an HTTP request to the resolved URL - response = requests.get(resolved_url) - - # Print the response - print(response.text) - -except dns.resolver.NXDOMAIN: - print(f"CNAME record not found for {domain_to_resolve}") - -except Exception as e: - print(f"Error: {e}") diff --git a/route53-failover/experiment-lambda.json 
b/route53-failover/experiment-lambda.json deleted file mode 100644 index 2d1ba55..0000000 --- a/route53-failover/experiment-lambda.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "actions": { - "Some test action": { - "actionId": "localstack:generic:api-error", - "parameters": { - "service": "lambda", - "operation": "Invoke", - "percentage": "100", - "exception": "Internal Server Error", - "errorCode": "500" - } - } - }, - "description": "Template for error return on Lambda invoke.", - "stopConditions": [{ - "source": "none" - }], - "roleArn": "arn:aws:iam:000000000000:role/ExperimentRole" -} diff --git a/route53-failover/lambda-functions/src/main/java/lambda/AddProduct.java b/route53-failover/lambda-functions/src/main/java/lambda/AddProduct.java deleted file mode 100644 index 95c5511..0000000 --- a/route53-failover/lambda-functions/src/main/java/lambda/AddProduct.java +++ /dev/null @@ -1,102 +0,0 @@ -package lambda; - -import com.amazonaws.services.lambda.runtime.Context; -import com.amazonaws.services.lambda.runtime.RequestHandler; -import com.amazonaws.services.lambda.runtime.events.APIGatewayProxyRequestEvent; -import com.amazonaws.services.lambda.runtime.events.APIGatewayProxyResponseEvent; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.JsonMappingException; -import java.util.HashMap; -import java.util.Map; -import software.amazon.awssdk.awscore.exception.AwsServiceException; -import software.amazon.awssdk.services.dynamodb.model.AttributeValue; -import software.amazon.awssdk.services.dynamodb.model.ConditionalCheckFailedException; -import software.amazon.awssdk.services.dynamodb.model.DynamoDbException; -import software.amazon.awssdk.services.dynamodb.model.PutItemRequest; -import software.amazon.awssdk.services.sns.model.PublishRequest; - -public class AddProduct extends ProductApi implements - RequestHandler { - - private static final String TABLE_NAME = "Products"; - private static final String PRODUCT_ID = 
"id"; - - @Override - public APIGatewayProxyResponseEvent handleRequest(APIGatewayProxyRequestEvent requestEvent, - Context context) { - - Map productData; - try { - productData = objectMapper.readValue(requestEvent.getBody(), HashMap.class); - } catch (JsonMappingException e) { - throw new RuntimeException(e); - } catch (com.fasterxml.jackson.core.JsonProcessingException e) { - throw new RuntimeException(e); - } - - HashMap itemValues = new HashMap<>(); - itemValues.put("id", AttributeValue.builder().s(productData.get("id")).build()); - itemValues.put("name", AttributeValue.builder().s(productData.get("name")).build()); - itemValues.put("price", AttributeValue.builder().n(productData.get("price")).build()); - itemValues.put("description", - AttributeValue.builder().s(productData.get("description")).build()); - - PutItemRequest putItemRequest = PutItemRequest.builder() - .tableName(TABLE_NAME) - .item(itemValues) - .conditionExpression("attribute_not_exists(id) OR id = :id") - .expressionAttributeValues( - Map.of(":id", AttributeValue.builder().s(productData.get("id")).build())) - .build(); - - Map headers = new HashMap<>(); - headers.put("Content-Type", "application/json"); - - try { - ddb.putItem(putItemRequest); - return new APIGatewayProxyResponseEvent().withStatusCode(200) - .withBody("Product added/updated successfully.") - .withIsBase64Encoded(false).withHeaders(headers); - } catch (ConditionalCheckFailedException e) { - return new APIGatewayProxyResponseEvent().withStatusCode(409) - .withBody("Product with the given ID already exists.") - .withIsBase64Encoded(false).withHeaders(headers); - } catch (DynamoDbException e) { - context.getLogger().log("Error: " + e.getMessage()); - // Publish message to SNS topic if DynamoDB operation fails. 
- String productDataJson; - try { - productDataJson = objectMapper.writeValueAsString(productData); - } catch (JsonProcessingException ex) { - throw new RuntimeException(ex); - } - PublishRequest publishRequest = PublishRequest.builder() - .message(productDataJson) - .topicArn(topicArn) - .build(); - context.getLogger().log("Sending to queue: " + productDataJson); - - snsClient.publish(publishRequest); - - return new APIGatewayProxyResponseEvent().withStatusCode(200) - .withBody("A DynamoDB error occurred. Message sent to queue.") - .withIsBase64Encoded(false).withHeaders(headers); - } catch (AwsServiceException ex) { - context.getLogger().log("AwsServiceException exception: " + ex.getMessage()); - return new APIGatewayProxyResponseEvent().withStatusCode(500) - .withBody(ex.getMessage()) - .withIsBase64Encoded(false).withHeaders(headers); - } catch (RuntimeException e) { - context.getLogger().log("Runtime exception: " + e.getMessage()); - return new APIGatewayProxyResponseEvent().withStatusCode(500) - .withBody(e.getMessage()) - .withIsBase64Encoded(false).withHeaders(headers); - } catch (Exception e) { - context.getLogger().log("Generic exception: " + e.getMessage()); - return new APIGatewayProxyResponseEvent().withStatusCode(500) - .withBody(e.getMessage()) - .withIsBase64Encoded(false).withHeaders(headers); - } - - } -} \ No newline at end of file diff --git a/route53-failover/lambda-functions/src/main/java/lambda/DynamoDBWriterLambda.java b/route53-failover/lambda-functions/src/main/java/lambda/DynamoDBWriterLambda.java deleted file mode 100644 index 5a6af98..0000000 --- a/route53-failover/lambda-functions/src/main/java/lambda/DynamoDBWriterLambda.java +++ /dev/null @@ -1,64 +0,0 @@ -package lambda; - -import com.amazonaws.services.lambda.runtime.Context; -import com.amazonaws.services.lambda.runtime.RequestHandler; -import com.amazonaws.services.lambda.runtime.events.SQSEvent; -import com.fasterxml.jackson.core.JsonProcessingException; -import 
com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; -import java.util.HashMap; -import java.util.Map; -import software.amazon.awssdk.services.dynamodb.model.AttributeValue; -import software.amazon.awssdk.services.dynamodb.model.DynamoDbException; -import software.amazon.awssdk.services.dynamodb.model.PutItemRequest; -import software.amazon.awssdk.services.dynamodb.model.PutItemResponse; - -public class DynamoDBWriterLambda extends ProductApi implements RequestHandler { - - private final ObjectMapper objectMapper = new ObjectMapper(); - - private static final String TABLE_NAME = "Products"; - - @Override - public Void handleRequest(SQSEvent event, Context context) { - - for (SQSEvent.SQSMessage msg : event.getRecords()) { - try { - JsonNode rootNode = objectMapper.readTree(msg.getBody()); - String messageContent = rootNode.get("Message").asText(); - - Map productData; - try { - productData = objectMapper.readValue(messageContent, HashMap.class); - } catch (JsonProcessingException e) { - throw new RuntimeException(e); - } - HashMap itemValues = new HashMap<>(); - itemValues.put("id", AttributeValue.builder().s(productData.get("id")).build()); - itemValues.put("name", AttributeValue.builder().s(productData.get("name")).build()); - itemValues.put("price", AttributeValue.builder().n(productData.get("price")).build()); - itemValues.put("description", - AttributeValue.builder().s(productData.get("description")).build()); - - // Put the item into the DynamoDB table - PutItemRequest putItemRequest = PutItemRequest.builder() - .tableName(TABLE_NAME) - .item(itemValues) - .build(); - PutItemResponse putItemResult = ddb.putItem(putItemRequest); - context.getLogger().log("Successfully processed message, result: " + putItemResult); - - } catch (DynamoDbException dbe) { - // Service unavailable, let the message go back to the queue after visibility timeout - context.getLogger().log( - "DynamoDB service is unavailable, message will be 
retried. Error: " - + dbe.getMessage()); - throw dbe; - } catch (Exception e) { - context.getLogger().log("Exception: Error processing the message: " + e.getMessage()); - } - } - return null; - } - -} diff --git a/route53-failover/lambda-functions/src/main/java/lambda/GetProduct.java b/route53-failover/lambda-functions/src/main/java/lambda/GetProduct.java deleted file mode 100644 index ef9558d..0000000 --- a/route53-failover/lambda-functions/src/main/java/lambda/GetProduct.java +++ /dev/null @@ -1,71 +0,0 @@ -package lambda; - -import com.amazonaws.services.lambda.runtime.Context; -import com.amazonaws.services.lambda.runtime.RequestHandler; -import com.amazonaws.services.lambda.runtime.events.APIGatewayProxyRequestEvent; -import com.amazonaws.services.lambda.runtime.events.APIGatewayProxyResponseEvent; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import java.util.HashMap; -import java.util.Map; -import software.amazon.awssdk.services.dynamodb.model.AttributeValue; -import software.amazon.awssdk.services.dynamodb.model.DynamoDbException; -import software.amazon.awssdk.services.dynamodb.model.GetItemRequest; -import software.amazon.awssdk.services.dynamodb.model.GetItemResponse; - -public class GetProduct extends ProductApi implements - RequestHandler { - - private static final String TABLE_NAME = "Products"; - private static final String PRODUCT_ID = "id"; - private final ObjectMapper objectMapper = new ObjectMapper(); - - - @Override - public APIGatewayProxyResponseEvent handleRequest(APIGatewayProxyRequestEvent requestEvent, - Context context) { - String productId = requestEvent.getQueryStringParameters().get(PRODUCT_ID); - System.out.println(requestEvent); - System.out.println("PRODUCT ID: " + productId); - - HashMap valueMap = new HashMap<>(); - valueMap.put("id", AttributeValue.fromS(productId)); - - GetItemRequest getItemRequest = GetItemRequest.builder() - .tableName(TABLE_NAME) - 
.key(valueMap) - .build(); - - try { - GetItemResponse getItemResponse = ddb.getItem(getItemRequest); - if (getItemResponse.item() != null && !getItemResponse.item().isEmpty()) { - // Convert the result to JSON format - - Map responseBody = new HashMap<>(); - getItemResponse.item().forEach((k, v) -> responseBody.put(k, convertAttributeValue(v))); - - return new APIGatewayProxyResponseEvent().withStatusCode(200) - .withBody(objectMapper.writeValueAsString(responseBody)); - } else { - return new APIGatewayProxyResponseEvent().withStatusCode(404).withBody("Product not found"); - } - } catch (DynamoDbException | JsonProcessingException e) { - context.getLogger().log("Error: " + e.getMessage()); - return new APIGatewayProxyResponseEvent().withStatusCode(500) - .withBody("Internal server error"); - } - } - - private Object convertAttributeValue(AttributeValue value) { - if (value.s() != null) { - return value.s(); - } - if (value.n() != null) { - return value.n(); - } - if (value.b() != null) { - return value.b(); - } - return null; - } -} \ No newline at end of file diff --git a/route53-failover/lambda-functions/src/main/resources/lambda_update_script.sh b/route53-failover/lambda-functions/src/main/resources/lambda_update_script.sh deleted file mode 100644 index 38b7d93..0000000 --- a/route53-failover/lambda-functions/src/main/resources/lambda_update_script.sh +++ /dev/null @@ -1,4 +0,0 @@ - -awslocal lambda update-function-code --function-name process-product-events \ - --zip-file fileb://target/product-lambda.jar \ - --region us-east-1 diff --git a/solutions/dynamodb-outage.sh b/solutions/dynamodb-outage.sh new file mode 100755 index 0000000..830973b --- /dev/null +++ b/solutions/dynamodb-outage.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +set -e +set -o pipefail + +AWS_ENDPOINT_URL=${AWS_ENDPOINT_URL:-"http://localhost:4566"} + +# Colors for logging +GREEN='\033[0;32m' +BLUE='\033[0;34m' +RED='\033[0;31m' +NC='\033[0m' + +# Logging functions +log() { + echo -e 
"${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')]${NC} $1" >&2 +} + +error_log() { + echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR:${NC} $1" >&2 +} + +trap 'error_log "An error occurred. Exiting..."; exit 1' ERR + +# Setup SNS Topic +log "Creating SNS topic 'ProductEventsTopic'..." +SNS_TOPIC_ARN=$(awslocal sns create-topic --name ProductEventsTopic --output json | jq -r '.TopicArn') +log "SNS topic created. ARN: $SNS_TOPIC_ARN" + +# Setup SQS Queue +log "Creating SQS queue 'ProductEventsQueue'..." +QUEUE_URL=$(awslocal sqs create-queue --queue-name ProductEventsQueue --output json | jq -r '.QueueUrl') +QUEUE_ARN=$(awslocal sqs get-queue-attributes \ + --queue-url $QUEUE_URL \ + --attribute-names QueueArn \ + --query 'Attributes.QueueArn' --output text) +log "SQS queue created. ARN: $QUEUE_ARN" + +# Subscribe SQS Queue to SNS Topic +log "Subscribing SQS queue to SNS topic..." +awslocal sns subscribe \ + --topic-arn $SNS_TOPIC_ARN \ + --protocol sqs \ + --notification-endpoint $QUEUE_ARN >/dev/null +log "SQS queue subscribed to SNS topic." + +# Create Lambda Function +log "Creating Lambda function 'process-product-events'..." +awslocal lambda create-function \ + --function-name process-product-events \ + --runtime java17 \ + --handler lambda.DynamoDBWriterLambda::handleRequest \ + --memory-size 1024 \ + --timeout 20 \ + --zip-file fileb://lambda-functions/target/product-lambda.jar \ + --role arn:aws:iam::000000000000:role/productRole >/dev/null +log "Lambda function created." + +# Create Event Source Mapping from SQS to Lambda +log "Creating event source mapping from SQS to Lambda..." +awslocal lambda create-event-source-mapping \ + --function-name process-product-events \ + --batch-size 10 \ + --event-source-arn $QUEUE_ARN >/dev/null +log "Event source mapping created." + +# Set Queue Attributes +log "Setting SQS queue attributes..." 
+awslocal sqs set-queue-attributes \ + --queue-url $QUEUE_URL \ + --attributes VisibilityTimeout=10 >/dev/null +log "SQS queue attributes set." + +# Final Output +echo +echo -e "${BLUE}Setup completed successfully.${NC}" +echo -e "${BLUE}SNS Topic ARN:${NC} $SNS_TOPIC_ARN" +echo -e "${BLUE}SQS Queue ARN:${NC} $QUEUE_ARN" diff --git a/solutions/route53-failover.sh b/solutions/route53-failover.sh new file mode 100755 index 0000000..52973e0 --- /dev/null +++ b/solutions/route53-failover.sh @@ -0,0 +1,165 @@ +#!/bin/bash + +set -e +set -o pipefail + +# Colors for logging +GREEN='\033[0;32m' +BLUE='\033[0;34m' +RED='\033[0;31m' +NC='\033[0m' + +# Logging functions +log() { + echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')]${NC} $1" >&2 +} + +error_log() { + echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR:${NC} $1" >&2 +} + +trap 'error_log "An error occurred. Exiting..."; exit 1' ERR + +# Step 1: Define Hosted Zone +log "Defining hosted zone..." +HOSTED_ZONE_NAME="hello-localstack.com" +RAW_HOSTED_ZONE_ID=$(awslocal route53 create-hosted-zone \ + --name "$HOSTED_ZONE_NAME" \ + --caller-reference "zone-$(date +%s)" | jq -r .HostedZone.Id) +CLEANED_HOSTED_ZONE_ID="${RAW_HOSTED_ZONE_ID#/hostedzone/}" + +log "Hosted Zone Name: $HOSTED_ZONE_NAME" +log "Raw Hosted Zone ID: $RAW_HOSTED_ZONE_ID" +export HOSTED_ZONE_NAME RAW_HOSTED_ZONE_ID + +# Step 2: Define API Gateway and Health Check Parameters +log "Defining API Gateway and health check parameters..." 
+PRIMARY_API_ID="12345" +SECONDARY_API_ID="67890" +PRIMARY_API_REGION="us-east-1" +HEALTH_CHECK_RESOURCE_PATH="/dev/healthcheck" +PRIMARY_API_GATEWAY_FQDN="${PRIMARY_API_ID}.execute-api.localhost.localstack.cloud" +HEALTH_CHECK_PORT=4566 + +log "Primary API ID: $PRIMARY_API_ID" +log "Primary API FQDN: $PRIMARY_API_GATEWAY_FQDN" +log "Health Check Port: $HEALTH_CHECK_PORT" +log "Health Check Path: $HEALTH_CHECK_RESOURCE_PATH" + +# Step 3: Create Health Check for the Primary API Gateway +log "Creating Route 53 health check..." +HEALTH_CHECK_RESOURCE_REGION="us-west-1" +HEALTH_CHECK_ID=$(awslocal route53 create-health-check \ + --caller-reference "hc-app-${PRIMARY_API_ID}-$(date +%s)" \ + --region "$HEALTH_CHECK_RESOURCE_REGION" \ + --health-check-config "{\"FullyQualifiedDomainName\": \"${PRIMARY_API_GATEWAY_FQDN}\", \"Port\": ${HEALTH_CHECK_PORT}, \"ResourcePath\": \"${HEALTH_CHECK_RESOURCE_PATH}\", \"Type\": \"HTTP\", \"RequestInterval\": 10, \"FailureThreshold\": 2}" | jq -r .HealthCheck.Id) + +log "Health check created with ID: $HEALTH_CHECK_ID in region $HEALTH_CHECK_RESOURCE_REGION" +export HEALTH_CHECK_ID +sleep 5 + +# Step 4: Verify Initial Health +log "Verifying primary health check endpoint (expect HTTP 200)..." +curl --connect-timeout 5 -v "http://${PRIMARY_API_GATEWAY_FQDN}:${HEALTH_CHECK_PORT}${HEALTH_CHECK_RESOURCE_PATH}" || true + +log "Fetching health check status from Route 53 (may take a few seconds)..." +sleep 25 +awslocal route53 get-health-check-status \ + --health-check-id "$HEALTH_CHECK_ID" \ + --region "$HEALTH_CHECK_RESOURCE_REGION" >/dev/null + +# Step 5: Create CNAME Records +log "Creating CNAME records for regional endpoints..." 
+PRIMARY_REGIONAL_DNS_NAME="${PRIMARY_API_ID}.${HOSTED_ZONE_NAME}" +SECONDARY_REGIONAL_DNS_NAME="${SECONDARY_API_ID}.${HOSTED_ZONE_NAME}" +PRIMARY_API_TARGET_FQDN="${PRIMARY_API_ID}.execute-api.localhost.localstack.cloud" +SECONDARY_API_TARGET_FQDN="${SECONDARY_API_ID}.execute-api.localhost.localstack.cloud" + +CHANGE_BATCH_REGIONAL_CNAMES_JSON=$(cat </dev/null +log "CNAME records created." + +# Step 6: Create Failover Alias Records +log "Creating failover alias records..." +FAILOVER_RECORD_NAME="test.${HOSTED_ZONE_NAME}" +PRIMARY_FAILOVER_SET_ID="primary-app-${PRIMARY_API_ID}" +SECONDARY_FAILOVER_SET_ID="secondary-app-${SECONDARY_API_ID}" + +CHANGE_BATCH_FAILOVER_ALIASES_JSON=$(cat </dev/null +log "Failover alias records created." + +# Final Output +echo +echo -e "${BLUE}Route 53 and failover setup completed successfully.${NC}" +echo -e "${BLUE}Hosted Zone:${NC} $HOSTED_ZONE_NAME" +echo -e "${BLUE}Primary API FQDN:${NC} $PRIMARY_API_GATEWAY_FQDN" +echo -e "${BLUE}Health Check ID:${NC} $HEALTH_CHECK_ID" +echo -e "${BLUE}Failover Domain:${NC} $FAILOVER_RECORD_NAME" diff --git a/tests/__pycache__/test_failover.cpython-311-pytest-8.3.5.pyc b/tests/__pycache__/test_failover.cpython-311-pytest-8.3.5.pyc new file mode 100644 index 0000000..8632f0c Binary files /dev/null and b/tests/__pycache__/test_failover.cpython-311-pytest-8.3.5.pyc differ diff --git a/tests/__pycache__/test_outage.cpython-311-pytest-8.3.5.pyc b/tests/__pycache__/test_outage.cpython-311-pytest-8.3.5.pyc new file mode 100644 index 0000000..2d3e7bf Binary files /dev/null and b/tests/__pycache__/test_outage.cpython-311-pytest-8.3.5.pyc differ diff --git a/tests/requirements-dev.txt b/tests/requirements-dev.txt new file mode 100644 index 0000000..8b6ee54 --- /dev/null +++ b/tests/requirements-dev.txt @@ -0,0 +1,4 @@ +boto3 +pytest +requests +dnspython diff --git a/tests/test_failover.py b/tests/test_failover.py new file mode 100644 index 0000000..445f521 --- /dev/null +++ b/tests/test_failover.py @@ -0,0 
+1,165 @@ +import pytest +import time +import requests +import dns.resolver +import os +import boto3 + +LOCALSTACK_ENDPOINT_URL = os.environ.get( + "LOCALSTACK_ENDPOINT_URL", "http://localhost:4566" +) +CHAOS_ENDPOINT = f"{LOCALSTACK_ENDPOINT_URL}/_localstack/chaos/faults" + +HOSTED_ZONE_NAME = "hello-localstack.com" +PRIMARY_API_ID = "12345" +SECONDARY_API_ID = "67890" +PRIMARY_API_REGION = "us-east-1" +HEALTH_CHECK_RESOURCE_REGION = "us-west-1" +HEALTH_CHECK_PORT = 4566 +HEALTH_CHECK_RESOURCE_PATH = "/dev/healthcheck" + +PRIMARY_API_GATEWAY_FQDN = f"{PRIMARY_API_ID}.execute-api.localhost.localstack.cloud" +SECONDARY_API_GATEWAY_FQDN = ( + f"{SECONDARY_API_ID}.execute-api.localhost.localstack.cloud" +) +FAILOVER_RECORD_NAME = f"test.{HOSTED_ZONE_NAME}" + +HEALTH_CHECK_INTERVAL = 10 +HEALTH_CHECK_FAILURE_THRESHOLD = 2 +INITIAL_DNS_WAIT_PERIOD = 10 +DNS_CHECK_RETRIES = 4 +DNS_CHECK_DELAY = 5 +FAILOVER_REACTION_WAIT = (HEALTH_CHECK_INTERVAL * HEALTH_CHECK_FAILURE_THRESHOLD) + 25 + + +def get_cname_target(hostname, dns_server="127.0.0.1", port=53, max_cname_hops=5): + resolver = dns.resolver.Resolver() + resolver.nameservers = [dns_server] + resolver.port = port + resolver.timeout = 2 + resolver.lifetime = 5 + + current_hostname = hostname + + for hop in range(max_cname_hops): + if ".execute-api.localhost.localstack.cloud" in current_hostname: + return current_hostname + + try: + answers = resolver.resolve(current_hostname, "CNAME") + if answers and len(answers) > 0: + new_target = str(answers[0].target).rstrip(".") + if not new_target or new_target == current_hostname: + return current_hostname + current_hostname = new_target + if ".execute-api.localhost.localstack.cloud" in current_hostname: + return current_hostname + else: + return current_hostname + except dns.resolver.NoAnswer: + return current_hostname + except dns.resolver.NXDOMAIN: + return "NXDOMAIN" + except dns.exception.Timeout: + return "TIMEOUT" + except Exception as e: + return f"ERROR_RESOLVING" + 
return current_hostname + + +@pytest.fixture(scope="session") +def route53_client(): + return boto3.client( + "route53", + endpoint_url=LOCALSTACK_ENDPOINT_URL, + region_name=HEALTH_CHECK_RESOURCE_REGION, + ) + + +@pytest.fixture(scope="session") +def health_check_id(route53_client): + try: + paginator = route53_client.get_paginator("list_health_checks") + for page in paginator.paginate(): + for hc in page.get("HealthChecks", []): + config = hc.get("HealthCheckConfig", {}) + if ( + config.get("FullyQualifiedDomainName") == PRIMARY_API_GATEWAY_FQDN + and config.get("Port") == HEALTH_CHECK_PORT + and config.get("ResourcePath") == HEALTH_CHECK_RESOURCE_PATH + ): + found_id = hc["Id"] + return found_id + pytest.fail( + f"Could not find an existing health check for {PRIMARY_API_GATEWAY_FQDN}:{HEALTH_CHECK_PORT}{HEALTH_CHECK_RESOURCE_PATH}" + ) + except Exception as e: + pytest.fail(f"Error trying to find health check ID: {e}") + return None + + +def perform_dns_check_with_retry(fqdn_to_check, expected_target_fqdn, step_name): + print(f"\n{step_name} (expecting: {expected_target_fqdn})...") + current_target = None + for i in range(DNS_CHECK_RETRIES): + current_target = get_cname_target(fqdn_to_check) + if current_target == expected_target_fqdn: + return current_target + if ( + current_target == "TIMEOUT" + or "ERROR_RESOLVING" in str(current_target) + or ("FAILED_ALL_RETRIES_FOR" in str(current_target)) + ): + break + time.sleep(DNS_CHECK_DELAY) + + assert ( + current_target == expected_target_fqdn + ), f"Expected DNS resolution for {fqdn_to_check} to be {expected_target_fqdn}, but got {current_target} after {DNS_CHECK_RETRIES} retries." + return current_target + + +def test_dns_failover_cycle(route53_client, health_check_id): + time.sleep(INITIAL_DNS_WAIT_PERIOD) + + perform_dns_check_with_retry( + FAILOVER_RECORD_NAME, + PRIMARY_API_GATEWAY_FQDN, + "1. Verifying initial DNS resolution", + ) + + print( + f"\n2. 
Inducing chaos for 'apigateway' and 'lambda' in region '{PRIMARY_API_REGION}'..." + ) + fault_payload = [ + {"service": "apigateway", "region": PRIMARY_API_REGION}, + {"service": "lambda", "region": PRIMARY_API_REGION}, + ] + try: + response = requests.post(CHAOS_ENDPOINT, json=fault_payload, timeout=10) + response.raise_for_status() + except requests.exceptions.RequestException as e: + pytest.fail(f"Failed to inject chaos: {e}") + time.sleep(FAILOVER_REACTION_WAIT) + + perform_dns_check_with_retry( + FAILOVER_RECORD_NAME, + SECONDARY_API_GATEWAY_FQDN, + "3. Verifying DNS failover to secondary", + ) + + print( + f"\n4. Clearing chaos for 'apigateway' and 'lambda' in region '{PRIMARY_API_REGION}'..." + ) + try: + response = requests.delete(CHAOS_ENDPOINT, json=fault_payload, timeout=10) + response.raise_for_status() + except requests.exceptions.RequestException as e: + pytest.fail(f"Failed to clear chaos: {e}") + time.sleep(FAILOVER_REACTION_WAIT) + + perform_dns_check_with_retry( + FAILOVER_RECORD_NAME, + PRIMARY_API_GATEWAY_FQDN, + "5. 
Verifying DNS failback to primary", + ) diff --git a/tests/test_outage.py b/tests/test_outage.py new file mode 100644 index 0000000..a5ae85f --- /dev/null +++ b/tests/test_outage.py @@ -0,0 +1,270 @@ +import pytest +import time +import boto3 +import requests +import os +import json +import botocore + +LOCALSTACK_ENDPOINT_URL = os.environ.get( + "LOCALSTACK_ENDPOINT_URL", "http://localhost:4566" +) +CHAOS_ENDPOINT = f"{LOCALSTACK_ENDPOINT_URL}/_localstack/chaos/faults" + +DYNAMODB_TABLE_NAME = "Products" +SERVICE_REGION = "us-east-1" +PRIMARY_API_ID = "12345" +API_GATEWAY_PORT = 4566 +ADD_PRODUCT_URL = f"http://{PRIMARY_API_ID}.execute-api.localhost.localstack.cloud:{API_GATEWAY_PORT}/dev/productApi" + +BASE_PRODUCT_DATA = { + "name": "Pytest Widget", + "price": "25.99", + "description": "A widget specifically for Pytest scenarios.", +} + +DYNAMODB_OUTAGE_REACTION_WAIT = 10 +SERVICE_RECOVERY_WAIT = 30 + + +def manage_chaos(service_name, region_name, induce=True, timeout=10): + fault_payload = [{"service": service_name, "region": region_name}] + action_str = "Inducing" if induce else "Clearing" + try: + if induce: + response = requests.post( + CHAOS_ENDPOINT, json=fault_payload, timeout=timeout + ) + else: + response = requests.delete( + CHAOS_ENDPOINT, json=fault_payload, timeout=timeout + ) + response.raise_for_status() + if induce: + return response.json() + else: + return [] + except requests.exceptions.RequestException as e: + pytest.fail( + f"Failed to {action_str.lower()} chaos for {service_name} in {region_name}: {e}" + ) + except json.JSONDecodeError as e: + current_response_text = ( + response.text + if "response" in locals() + else "Response object not available or no text." + ) + pytest.fail( + f"Failed to parse JSON response when {action_str.lower()} chaos: {e}. 
Response text: {current_response_text}" + ) + return None + + +def check_active_faults(expected_to_be_present_or_absent, present=True, timeout=5): + try: + response = requests.get(CHAOS_ENDPOINT, timeout=timeout) + response.raise_for_status() + active_faults = response.json() + + normalize_fault = lambda d: tuple(sorted(d.items())) + normalized_active_set = {normalize_fault(f) for f in active_faults} + + if not expected_to_be_present_or_absent and not present: + assert ( + not active_faults + ), f"Expected no active faults, but got: {active_faults}" + return + + normalized_expected_set = { + normalize_fault(f) for f in expected_to_be_present_or_absent + } + + if present: + assert normalized_expected_set.issubset( + normalized_active_set + ), f"Expected faults {expected_to_be_present_or_absent} to be active, but active set is {active_faults}" + else: + assert not normalized_expected_set.intersection( + normalized_active_set + ), f"Expected faults {expected_to_be_present_or_absent} to be cleared, but some were found in active set: {active_faults}" + + except requests.exceptions.RequestException as e: + pytest.fail(f"Failed to GET active faults: {e}") + except json.JSONDecodeError as e: + current_response_text = ( + response.text + if "response" in locals() + else "Response object not available or no text." + ) + pytest.fail( + f"Failed to parse JSON response from GET /faults: {e}. 
Response text: {current_response_text}" + ) + + +@pytest.fixture(scope="session") +def dynamodb_resource(): + return boto3.resource( + "dynamodb", endpoint_url=LOCALSTACK_ENDPOINT_URL, region_name=SERVICE_REGION + ) + + +@pytest.fixture(scope="session") +def lambda_client(): + return boto3.client( + "lambda", endpoint_url=LOCALSTACK_ENDPOINT_URL, region_name=SERVICE_REGION + ) + + +def test_dynamodb_table_exists(dynamodb_resource): + try: + table = dynamodb_resource.Table(DYNAMODB_TABLE_NAME) + table.load() + except Exception as e: + pytest.fail( + f"DynamoDB table '{DYNAMODB_TABLE_NAME}' not found or not accessible in region {SERVICE_REGION}: {e}" + ) + + +def test_add_product_lambda_exists(lambda_client): + try: + lambda_client.get_function(FunctionName="add-product") + except Exception as e: + pytest.fail( + f"Lambda function 'add-product' not found in region {SERVICE_REGION}: {e}" + ) + + +def test_dynamodb_outage_impacts_add_product(dynamodb_resource): + headers = {"Content-Type": "application/json"} + expected_plain_text_success_message = "Product added/updated successfully." + expected_outage_message = "A DynamoDB error occurred. Message sent to queue." 
+ + ts = int(time.time()) + normal_product_id = f"prod-normal-{ts}" + normal_data = { + "id": normal_product_id, + "name": f"Normal Widget {ts}", + "price": BASE_PRODUCT_DATA["price"], + "description": f"{BASE_PRODUCT_DATA['description']} (Normal Operation)", + } + + outage_attempt_product_id = f"prod-outage-{ts}" + outage_data = { + "id": outage_attempt_product_id, + "name": f"Outage Attempt Widget {ts}", + "price": "0.00", + "description": f"{BASE_PRODUCT_DATA['description']} (During Outage Attempt)", + } + + restored_product_id = f"prod-restored-{ts + 1}" + restored_data = { + "id": restored_product_id, + "name": f"Restored Widget {ts+1}", + "price": "26.99", + "description": f"{BASE_PRODUCT_DATA['description']} (Post Recovery)", + } + + response_normal = None + try: + response_normal = requests.post( + ADD_PRODUCT_URL, headers=headers, json=normal_data, timeout=10 + ) + response_normal.raise_for_status() + assert expected_plain_text_success_message in response_normal.text + table = dynamodb_resource.Table(DYNAMODB_TABLE_NAME) + item = table.get_item(Key={"id": normal_product_id}).get("Item") + assert item is not None and item["name"] == normal_data["name"] + except requests.exceptions.HTTPError as http_err: + pytest.fail( + f"HTTP error during normal operation: {http_err} - Response: {http_err.response.text if http_err.response else 'N/A'}" + ) + except requests.exceptions.RequestException as e: + pytest.fail(f"Network/Request error during normal operation: {e}") + except Exception as e: + response_text = ( + response_normal.text if response_normal else "Response object not available" + ) + pytest.fail( + f"Error during normal operation verification: {e} (Response text was: '{response_text}')" + ) + + faults_to_induce = [{"service": "dynamodb", "region": SERVICE_REGION}] + manage_chaos(service_name="dynamodb", region_name=SERVICE_REGION, induce=True) + check_active_faults(expected_to_be_present_or_absent=faults_to_induce, present=True) + 
time.sleep(DYNAMODB_OUTAGE_REACTION_WAIT) + + response_outage = None + try: + response_outage = requests.post( + ADD_PRODUCT_URL, headers=headers, json=outage_data, timeout=10 + ) + assert ( + response_outage.status_code == 200 + ), f"Expected status code 200 during graceful handling, got {response_outage.status_code}." + assert ( + expected_outage_message in response_outage.text + ), f"Expected outage message '{expected_outage_message}' not found. Got: '{response_outage.text}'" + + table = dynamodb_resource.Table(DYNAMODB_TABLE_NAME) + try: + item_response_during_outage = table.get_item( + Key={"id": outage_attempt_product_id} + ) + item_during_outage = item_response_during_outage.get("Item") + if item_during_outage is not None: + pytest.fail( + f"Product '{outage_attempt_product_id}' WAS FOUND in DynamoDB during outage, which is unexpected." + ) + except botocore.exceptions.ClientError as ce: + error_code = ce.response.get("Error", {}).get("Code") + assert ( + "ServiceUnavailable" in str(ce) + or "ProvisionedThroughputExceededException" in str(ce) + or error_code == "ServiceUnavailable" + ), f"Expected ServiceUnavailable from DynamoDB due to chaos, but got: {ce}" + + except requests.exceptions.RequestException as e: + pytest.fail( + f"Request to API Gateway failed during outage test, unexpected if Lambda handles gracefully: {e}" + ) + except Exception as e: + response_text = ( + response_outage.text if response_outage else "Response object not available" + ) + pytest.fail( + f"Unexpected generic error during outage product addition test step: {e} (API Response text: '{response_text}')" + ) + + manage_chaos(service_name="dynamodb", region_name=SERVICE_REGION, induce=False) + check_active_faults( + expected_to_be_present_or_absent=faults_to_induce, present=False + ) + time.sleep(SERVICE_RECOVERY_WAIT) + + response_restored = None + try: + response_restored = requests.post( + ADD_PRODUCT_URL, headers=headers, json=restored_data, timeout=10 + ) + 
response_restored.raise_for_status() + assert expected_plain_text_success_message in response_restored.text + table = dynamodb_resource.Table(DYNAMODB_TABLE_NAME) + item_restored = table.get_item(Key={"id": restored_product_id}).get("Item") + assert ( + item_restored is not None and item_restored["name"] == restored_data["name"] + ) + except requests.exceptions.HTTPError as http_err: + pytest.fail( + f"HTTP error during post-recovery: {http_err} - Response: {http_err.response.text if http_err.response else 'N/A'}" + ) + except requests.exceptions.RequestException as e: + pytest.fail(f"Network/Request error during post-recovery: {e}") + except Exception as e: + response_text = ( + response_restored.text + if response_restored + else "Response object not available" + ) + pytest.fail( + f"Error during post-recovery: {e} (Response text: '{response_text}')" + )