From 4b1c7235f3a6ad7fc3c79c870ded4c0a7474d06f Mon Sep 17 00:00:00 2001 From: Christopher Butler Date: Fri, 22 May 2026 13:24:05 -0400 Subject: [PATCH 1/5] Update and Improve CDN Publish Why these changes are being made: This started as just a simple "update the version of third party actions" and then turned into a refactor. The refactor cleans up the logic and DRYs the code a bit and requires NO changes from any caller workflow. How this addresses that need: * Update actions/checkout and aws-actions/configure-aws-credentials to the most recent versions * Clean up usage of environment variables in the various bash commands throughout the workflow * Add a "polling" step (using the `aws cloudfront wait` command) to ensure that the cache invalidation completes before the workflow finishes --- .github/workflows/cdn-shared-publish.yml | 166 ++++++++++++++--------- 1 file changed, 100 insertions(+), 66 deletions(-) diff --git a/.github/workflows/cdn-shared-publish.yml b/.github/workflows/cdn-shared-publish.yml index d5f595c..1db304f 100644 --- a/.github/workflows/cdn-shared-publish.yml +++ b/.github/workflows/cdn-shared-publish.yml @@ -31,97 +31,131 @@ defaults: jobs: publish: - name: Publish content to CDN + name: Publish to CDN runs-on: ubuntu-latest permissions: id-token: write contents: read steps: - - uses: actions/checkout@v4 - - - name: DEV Configure AWS credentials - # Only run this step if the environment is "dev" - if: ${{ inputs.ENVIRONMENT == 'dev' }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: actions/checkout@v6 with: - role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCT_DEV }}:role/${{ inputs.GHA_ROLE }} - aws-region: ${{ inputs.AWS_REGION }} + persist-credentials: false - - name: STAGE Configure AWS credentials - # Only run this step if the environment is "stage" - if: ${{ inputs.ENVIRONMENT == 'stage' }} - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCT_STAGE }}:role/${{ 
inputs.GHA_ROLE }} - aws-region: ${{ inputs.AWS_REGION }} + - name: Set Environment + id: aws_env + env: + ENVIRONMENT: ${{ inputs.ENVIRONMENT }} + GHA_ROLE: ${{ inputs.GHA_ROLE }} + AWS_DEV_ACCT: ${{ secrets.AWS_ACCT_DEV }} + AWS_STAGE_ACCT: ${{ secrets.AWS_ACCT_STAGE }} + AWS_PROD_ACCT: ${{ secrets.AWS_ACCT_PROD }} + run: | + if [ "$ENVIRONMENT" == "dev" ]; then + echo "ROLE=arn:aws:iam::$AWS_DEV_ACCT:role/$GHA_ROLE" >> $GITHUB_ENV + echo "CDN_DOMAIN=dev1.mitlibrary.net" >> $GITHUB_ENV + elif [ "$ENVIRONMENT" == "stage" ]; then + echo "ROLE=arn:aws:iam::$AWS_STAGE_ACCT:role/$GHA_ROLE" >> $GITHUB_ENV + echo "CDN_DOMAIN=stage.mitlibrary.net" >> $GITHUB_ENV + elif [ "$ENVIRONMENT" == "prod" ]; then + echo "ROLE=arn:aws:iam::$AWS_PROD_ACCT:role/$GHA_ROLE" >> $GITHUB_ENV + echo "CDN_DOMAIN=libraries.mit.edu" >> $GITHUB_ENV + else + echo "ERROR: Incorrect environment was set" >> $GITHUB_STEP_SUMMARY + exit 1 + fi - - name: PROD Configure AWS credentials - # Only run this step if the environment is "prod" - if: ${{ inputs.ENVIRONMENT == 'prod' }} - uses: aws-actions/configure-aws-credentials@v4 + - name: Configure AWS Credentials + id: aws_credentials + uses: aws-actions/configure-aws-credentials@v6 with: - role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCT_PROD }}:role/${{ inputs.GHA_ROLE }} + role-to-assume: ${{ env.ROLE }} aws-region: ${{ inputs.AWS_REGION }} - - name: Sync custom domain CDN S3 content + - name: Sync To Custom Domain CDN # Only run this step if this is custom domain content (e.g., a folder at the root of bucket) if: ${{ inputs.DOMAIN == 'custom' }} + env: + SYNC_PARAMS: ${{ inputs.SYNC_PARAMS }} + S3_URI: ${{ inputs.S3URI }} run: | - if [ '${{ inputs.SYNC_PARAMS }}' != '' ]; then - aws s3 sync . ${{ inputs.S3URI }} --delete --exclude ".github/*" --exclude ".git/*" --exclude ".gitignore" ${{ inputs.SYNC_PARAMS }} + echo "### Content synchronization to $S3_URI." >> $GITHUB_STEP_SUMMARY + if [ "$SYNC_PARAMS" != "" ]; then + aws s3 sync . 
$S3_URI \ + --delete \ + --exclude ".github/*" \ + --exclude ".git/*" \ + --exclude ".gitignore" \ + $SYNC_PARAMS else - aws s3 sync . ${{ inputs.S3URI }} --delete --exclude ".github/*" --exclude ".git/*" --exclude ".gitignore" + aws s3 sync . $S3_URI \ + --delete \ + --exclude ".github/*" \ + --exclude ".git/*" + --exclude ".gitignore" fi - echo "Content is synchronized to ${{ inputs.S3URI }}" >> $GITHUB_STEP_SUMMARY + echo "Content is synchronized to $S3_URI." >> $GITHUB_STEP_SUMMARY - - name: Sync standard CDN S3 content + - name: Sync To Standard Domain CDN # Only run this step if this is standard content (e.g., a subfolder of the cdn/ folder) if: ${{ inputs.DOMAIN == 'standard' }} + env: + SYNC_PARAMS: ${{ inputs.SYNC_PARAMS }} + S3_URI: ${{ inputs.S3URI }} run: | - if [ '${{ inputs.SYNC_PARAMS }}' != '' ]; then - aws s3 sync ./$(echo ${{ inputs.S3URI }} | awk -F/ '{print $5}') ${{ inputs.S3URI }} --delete --exclude ".github/*" --exclude ".git/*" --exclude ".gitignore" ${{ inputs.SYNC_PARAMS }} + echo "### Content synchronization" >> $GITHUB_STEP_SUMMARY + if [ "$SYNC_PARAMS" != "" ]; then + aws s3 sync ./$(echo $S3_URI | awk -F/ '{print $5}') $S3_URI \ + --delete \ + --exclude ".github/*" \ + --exclude ".git/*" \ + --exclude ".gitignore" \ + $SYNC_PARAMS else - aws s3 sync ./$(echo ${{ inputs.S3URI }} | awk -F/ '{print $5}') ${{ inputs.S3URI }} --delete --exclude ".github/*" --exclude ".git/*" --exclude ".gitignore" + aws s3 sync ./$(echo $S3_URI | awk -F/ '{print $5}') $S3_URI \ + --delete \ + --exclude ".github/*" \ + --exclude ".git/*" \ + --exclude ".gitignore" fi - echo "Content is synchronized to ${{ inputs.S3URI }}" >> $GITHUB_STEP_SUMMARY + echo "Content is synchronized to $S3_URI" >> $GITHUB_STEP_SUMMARY - name: Invalidate cache + env: + DOMAIN: ${{ inputs.DOMAIN }} + S3_URI: ${{ inputs.S3URI }} run: | - if [ '${{ inputs.DOMAIN }}' == 'standard' ]; then - aws cloudfront create-invalidation --distribution-id $(aws ssm get-parameter --name 
"/tfvars/libraries-website/standard-cdn-id" --query 'Parameter.Value' --output text) --paths "/*" - echo "The cache for the $(echo ${{ inputs.S3URI }} | awk -F/ '{print $5}') folder has been cleared." >> $GITHUB_STEP_SUMMARY - else - aws cloudfront create-invalidation --distribution-id $(aws ssm get-parameter --name "/tfvars/libraries-website/custom-cdn-id" --query 'Parameter.Value' --output text) --paths "/$(echo ${{ inputs.S3URI }} | awk -F/ '{print $5}')/*" - echo "The cache for the $(echo ${{ inputs.S3URI }} | awk -F/ '{print $4}') site has been cleared." >> $GITHUB_STEP_SUMMARY - fi - - - name: Generate DEV Summary - # Only run this step if the environment is "dev" - if: ${{ inputs.ENVIRONMENT == 'dev' }} - run: | - if [ '${{ inputs.DOMAIN }}' == 'standard' ]; then - echo "The updates to https://cdn.dev1.mitlibrary.net/$(echo ${{ inputs.S3URI }} | awk -F/ '{print $5}') are now available" >> $GITHUB_STEP_SUMMARY - else - echo "The updates to https://$(echo ${{ inputs.S3URI }} | awk -F/ '{print $4}').dev1.mitlibrary.net site are now available" >> $GITHUB_STEP_SUMMARY - fi - - - name: Generate STAGE Summary - # Only run this step if the environment is "stage" - if: ${{ inputs.ENVIRONMENT == 'stage' }} - run: | - if [ '${{ inputs.DOMAIN }}' == 'standard' ]; then - echo "The updates to https://cdn.stage.mitlibrary.net/$(echo ${{ inputs.S3URI }} | awk -F/ '{print $5}') are now available" >> $GITHUB_STEP_SUMMARY - else - echo "The updates to https://$(echo ${{ inputs.S3URI }} | awk -F/ '{print $4}').stage.mitlibrary.net site are now available" >> $GITHUB_STEP_SUMMARY - fi - - - name: Generate PROD Summary - # Only run this step if the environment is "prod" - if: ${{ inputs.ENVIRONMENT == 'prod' }} - run: | - if [ '${{ inputs.DOMAIN }}' == 'standard' ]; then - echo "The updates to https://cdn.libraries.mit.edu/$(echo ${{ inputs.S3URI }} | awk -F/ '{print $5}') are now available" >> $GITHUB_STEP_SUMMARY + echo "### CDN cache invalidation" >> $GITHUB_STEP_SUMMARY + if [ 
$DOMAIN == "standard" ]; then + DISTRIBUTION_ID=$(aws ssm get-parameter \ + --name "/tfvars/libraries-website/standard-cdn-id" \ + --query 'Parameter.Value' \ + --output text) + INVALIDATION_ID=$(aws cloudfront create-invalidation \ + --distribution-id $DISTRIBUTION_ID \ + --paths "/*" \ + --query 'Invalidation.Id' \ + --output text) + echo "Start CDN Cache invalidation" + aws cloudfront wait invalidation-completed \ + --distribution-id $DISTRIBUTION_ID \ + --id $INVALIDATION_ID + echo "The cache for the $(echo $S3_URI | awk -F/ '{print $5}') folder has been cleared." >> $GITHUB_STEP_SUMMARY + echo "The updates to https://cdn.$CDN_DOMAIN/$(echo $S3_URI | awk -F/ '{print $5}') are now available." >> $GITHUB_STEP_SUMMARY else - echo "The updates to https://$(echo ${{ inputs.S3URI }} | awk -F/ '{print $4}').libraries.mit.edu site are now available" >> $GITHUB_STEP_SUMMARY + DISTRIBUTION_ID=$(aws ssm get-parameter \ + --name "/tfvars/libraries-website/custom-cdn-id" \ + --query 'Parameter.Value' \ + --output text) + INVALIDATION_ID=$(aws cloudfront create-invalidation \ + --distribution-id $DISTRIBUTION_ID \ + --paths "/$(echo $S3_URI | awk -F/ '{print $5}')/*" \ + --query 'Invalidation.Id' \ + --output text) + echo "Start CDN Cache invalidation." + aws cloudfront wait invalidation-completed \ + --distribution-id $DISTRIBUTION_ID \ + --id $INVALIDATION_ID + echo "The cache for the $(echo $S3_URI | awk -F/ '{print $4}') site has been cleared." >> $GITHUB_STEP_SUMMARY + echo "The updates to the https://$(echo $S3_URI | awk -F/ '{print $4}').$CDN_DOMAIN site are now available." >> $GITHUB_STEP_SUMMARY fi From f011a62fe7ee487763c035164a09176d38f59d63 Mon Sep 17 00:00:00 2001 From: Christopher Butler Date: Tue, 26 May 2026 16:55:34 -0400 Subject: [PATCH 2/5] Enhance the CDN Publishing Workflow Why these changes are being introduced: The previous commit was a simple cleanup of the shared CDN publishing workflow. 
This commit extends that work with a full refactor of the workflow that remains compatible with our existing caller workflows and sets the stage for future repositories that might need to publish content to the CDN (in particular, the future work to publish the NDE-tacos content to the CDN). How this addresses that need: * Create additional inputs to allow for more options for the source and target of the `aws s3 sync` command * Update existing inputs with corrected "required" versus "optional" and default values * Update all the inputs with descriptions * Add a "validation" step to ensure that some of the freeform inputs are valid and fail the job quickly if they are not * Refine the "environment" and "domain" verifications and set environment variables with the correct combination of environment, domain, S3 bucket name, and CloudFront domain name (this also allows future caller workflows to not have to pass the bucket name when calling this workflow) * Using the new environment variables, simplify both the `aws s3 sync` command and the cache invalidation command Side effects of this change: None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/NDE-96 --- .github/workflows/cdn-shared-publish.yml | 237 ++++++++++++++--------- README.md | 32 +-- 2 files changed, 161 insertions(+), 108 deletions(-) diff --git a/.github/workflows/cdn-shared-publish.yml b/.github/workflows/cdn-shared-publish.yml index 1db304f..9c9cf78 100644 --- a/.github/workflows/cdn-shared-publish.yml +++ b/.github/workflows/cdn-shared-publish.yml @@ -5,24 +5,47 @@ on: workflow_call: inputs: AWS_REGION: - required: true + description: "Region for AWS resources." + required: false type: string - GHA_ROLE: - required: true + default: us-east-1 + DOMAIN: + description: "Indicates the standard CDN or a custom domain for the URL to the CDN (only standard or custom accepted)." 
+ required: false type: string + default: standard ENVIRONMENT: + description: "The AWS environment where the resources will be deployed." required: true type: string - S3URI: + GHA_ROLE: + description: "The IAM Role linked to the OIDC connection." required: true type: string - DOMAIN: + S3URI: + description: "Legacy (deprecated) full S3 URI for the sync target in AWS." required: false type: string - default: standard + SOURCE_PATH: + description: "The path in the caller repository containing the files to sync to the S3 bucket." + required: false + type: string + default: . SYNC_PARAMS: + description: "Additional parameters for the aws s3 sync command, specific to the caller repository." + required: false + type: string + TARGET_PATH: + description: "The prefix in the S3 bucket to which the repository files should be synced (must start with slash)." required: false type: string + default: / + + +permissions: + # These are the minimum permissions to allow for OIDC connection to AWS + id-token: write + contents: read # Set defaults defaults: @@ -30,132 +53,162 @@ defaults: shell: bash jobs: - publish: - name: Publish to CDN + prep: + # Start with validating the inputs from the caller workflow and prepping + # environment variables for the synchronization job. + name: Prep and Validate runs-on: ubuntu-latest - permissions: - id-token: write - contents: read steps: - uses: actions/checkout@v6 with: persist-credentials: false - - name: Set Environment - id: aws_env + - name: Validate + # Verify that the DOMAIN & ENVIRONMENT inputs are using the correct + # values. Verify that the SOURCE_PATH and TARGET_PATH inputs are + # formatted correctly. 
+ id: validate env: + DOMAIN: ${{ inputs.DOMAIN }} ENVIRONMENT: ${{ inputs.ENVIRONMENT }} - GHA_ROLE: ${{ inputs.GHA_ROLE }} - AWS_DEV_ACCT: ${{ secrets.AWS_ACCT_DEV }} - AWS_STAGE_ACCT: ${{ secrets.AWS_ACCT_STAGE }} - AWS_PROD_ACCT: ${{ secrets.AWS_ACCT_PROD }} + SOURCE_PATH: ${{ inputs.SOURCE_PATH }} + TARGET_PATH: ${{ inputs.TARGET_PATH }} run: | - if [ "$ENVIRONMENT" == "dev" ]; then - echo "ROLE=arn:aws:iam::$AWS_DEV_ACCT:role/$GHA_ROLE" >> $GITHUB_ENV - echo "CDN_DOMAIN=dev1.mitlibrary.net" >> $GITHUB_ENV - elif [ "$ENVIRONMENT" == "stage" ]; then - echo "ROLE=arn:aws:iam::$AWS_STAGE_ACCT:role/$GHA_ROLE" >> $GITHUB_ENV - echo "CDN_DOMAIN=stage.mitlibrary.net" >> $GITHUB_ENV - elif [ "$ENVIRONMENT" == "prod" ]; then - echo "ROLE=arn:aws:iam::$AWS_PROD_ACCT:role/$GHA_ROLE" >> $GITHUB_ENV - echo "CDN_DOMAIN=libraries.mit.edu" >> $GITHUB_ENV + case "$DOMAIN" in + standard|custom) + echo "Valid DOMAIN=$DOMAIN input, proceed." + ;; + *) + echo "Invalid DOMAIN=$DOMAIN input, exiting." + exit 1 + ;; + esac + case "$ENVIRONMENT" in + dev|stage|prod) + echo "Valid ENVIRONMENT=$ENVIRONMENT input, proceed." + ;; + *) + echo "Invalid ENVIRONMENT=$ENVIRONMENT input, exiting." + exit 1 + ;; + esac + if [[ "${SOURCE_PATH:0:1}" == "." ]]; then + echo "Valid SOURCE_PATH=$SOURCE_PATH, proceed." else - echo "ERROR: Incorrect environment was set" >> $GITHUB_STEP_SUMMARY + echo "Invalid SOURCE_PATH=$SOURCE_PATH, exiting." exit 1 fi + if [[ "${TARGET_PATH:0:1}" == "/" ]]; then + echo "Valid TARGET_PATH=$TARGET_PATH, proceed." + else + echo "Invalid TARGET_PATH=$TARGET_PATH, exiting." + exit 1 + fi + + - name: Set Environment + # Prepare environment variables for the synchronization job. 
+ id: env + env: + AWS_DEV_ACCT: ${{ secrets.AWS_ACCT_DEV }} + AWS_STAGE_ACCT: ${{ secrets.AWS_ACCT_STAGE }} + AWS_PROD_ACCT: ${{ secrets.AWS_ACCT_PROD }} + ENVIRONMENT: ${{ inputs.ENVIRONMENT }} + GHA_ROLE: ${{ inputs.GHA_ROLE }} + run: | + case "$ENVIRONMENT" in + dev) + echo "AWS_ROLE=arn:aws:iam::$AWS_DEV_ACCT:role/$GHA_ROLE" >> $GITHUB_ENV + echo "CDN_DOMAIN=dev1.mitlibrary.net" >> $GITHUB_ENV + echo "AWS_ROLE and CDN_DOMAIN set for synchronization job to Dev1" + ;; + stage) + echo "AWS_ROLE=arn:aws:iam::$AWS_STAGE_ACCT:role/$GHA_ROLE" >> $GITHUB_ENV + echo "CDN_DOMAIN=stage.mitlibrary.net" >> $GITHUB_ENV + echo "AWS_ROLE and CDN_DOMAIN set for synchronization job to Stage-Workloads" + ;; + prod) + echo "AWS_ROLE=arn:aws:iam::$AWS_PROD_ACCT:role/$GHA_ROLE" >> $GITHUB_ENV + echo "CDN_DOMAIN=libraries.mit.edu" >> $GITHUB_ENV + echo "AWS_ROLE and CDN_DOMAIN set for synchronization job to Prod-Workloads" + ;; + esac - name: Configure AWS Credentials id: aws_credentials uses: aws-actions/configure-aws-credentials@v6 with: - role-to-assume: ${{ env.ROLE }} aws-region: ${{ inputs.AWS_REGION }} + role-to-assume: ${{ env.AWS_ROLE }} - - name: Sync To Custom Domain CDN - # Only run this step if this is custom domain content (e.g., a folder at the root of bucket) - if: ${{ inputs.DOMAIN == 'custom' }} + - name: Get AWS Information + # Set the correct S3 URI for the synchronization job + id: aws_info env: - SYNC_PARAMS: ${{ inputs.SYNC_PARAMS }} - S3_URI: ${{ inputs.S3URI }} + AWS_REGION: ${{ inputs.AWS_REGION }} + DOMAIN: ${{ inputs.DOMAIN }} + TARGET_PATH: ${{ inputs.TARGET_PATH }} run: | - echo "### Content synchronization to $S3_URI." >> $GITHUB_STEP_SUMMARY - if [ "$SYNC_PARAMS" != "" ]; then - aws s3 sync . 
$S3_URI \ - --delete \ - --exclude ".github/*" \ - --exclude ".git/*" \ - --exclude ".gitignore" \ - $SYNC_PARAMS + BUCKET=$(aws ssm get-parameter \ + --region "$AWS_REGION" \ + --name "/tfvars/libraries-website/cdn-origin-bucket-name" \ + --query 'Parameter.Value' \ + --output text) + if [[ "$DOMAIN" == "standard" ]]; then + echo "DISTRIBUTION_ID=$(aws ssm get-parameter \ + --name "/tfvars/libraries-website/standard-cdn-id" \ + --query 'Parameter.Value' \ + --output text)" >> $GITHUB_ENV + echo "S3_URI=s3://$BUCKET/cdn$TARGET_PATH/" >> $GITHUB_ENV else - aws s3 sync . $S3_URI \ - --delete \ - --exclude ".github/*" \ - --exclude ".git/*" - --exclude ".gitignore" + echo "DISTRIBUTION_ID=$(aws ssm get-parameter \ + --name "/tfvars/libraries-website/custom-cdn-id" \ + --query 'Parameter.Value' \ + --output text)" >> $GITHUB_ENV + echo "S3_URI=s3://$BUCKET$TARGET_PATH/" >> $GITHUB_ENV fi - echo "Content is synchronized to $S3_URI." >> $GITHUB_STEP_SUMMARY - - name: Sync To Standard Domain CDN - # Only run this step if this is standard content (e.g., a subfolder of the cdn/ folder) - if: ${{ inputs.DOMAIN == 'standard' }} + - name: Sync To CDN S3 Bucket env: + S3_URI: ${{ env.S3_URI }} + SOURCE_PATH: ${{ inputs.SOURCE_PATH }} SYNC_PARAMS: ${{ inputs.SYNC_PARAMS }} - S3_URI: ${{ inputs.S3URI }} run: | - echo "### Content synchronization" >> $GITHUB_STEP_SUMMARY - if [ "$SYNC_PARAMS" != "" ]; then - aws s3 sync ./$(echo $S3_URI | awk -F/ '{print $5}') $S3_URI \ + echo "### Content synchronization to $S3_URI." 
>> $GITHUB_STEP_SUMMARY + if [[ "$S3_URI" == *"cdn/"* ]]; then + echo "Standard CDN content is synchronizing" + else + echo "Custom CDN content is synchronizing" + fi + cd "$GITHUB_WORKSPACE" + aws s3 sync "$SOURCE_PATH" "$S3_URI" \ --delete \ --exclude ".github/*" \ --exclude ".git/*" \ --exclude ".gitignore" \ $SYNC_PARAMS - else - aws s3 sync ./$(echo $S3_URI | awk -F/ '{print $5}') $S3_URI \ - --delete \ - --exclude ".github/*" \ - --exclude ".git/*" \ - --exclude ".gitignore" - fi - echo "Content is synchronized to $S3_URI" >> $GITHUB_STEP_SUMMARY + echo "Content is synchronized to $S3_URI." >> $GITHUB_STEP_SUMMARY - name: Invalidate cache env: + CDN_DOMAIN: ${{ env.CDN_DOMAIN }} + DISTRIBUTION_ID: ${{ env.DISTRIBUTION_ID }} DOMAIN: ${{ inputs.DOMAIN }} - S3_URI: ${{ inputs.S3URI }} + TARGET_PATH: ${{ inputs.TARGET_PATH }} run: | echo "### CDN cache invalidation" >> $GITHUB_STEP_SUMMARY - if [ $DOMAIN == "standard" ]; then - DISTRIBUTION_ID=$(aws ssm get-parameter \ - --name "/tfvars/libraries-website/standard-cdn-id" \ - --query 'Parameter.Value' \ - --output text) - INVALIDATION_ID=$(aws cloudfront create-invalidation \ - --distribution-id $DISTRIBUTION_ID \ - --paths "/*" \ + echo "Start CDN Cache invalidation." + INVALIDATION_ID=$(aws cloudfront create-invalidation \ + --distribution-id "$DISTRIBUTION_ID" \ + --paths "$TARGET_PATH" \ --query 'Invalidation.Id' \ --output text) - echo "Start CDN Cache invalidation" - aws cloudfront wait invalidation-completed \ - --distribution-id $DISTRIBUTION_ID \ - --id $INVALIDATION_ID - echo "The cache for the $(echo $S3_URI | awk -F/ '{print $5}') folder has been cleared." >> $GITHUB_STEP_SUMMARY - echo "The updates to https://cdn.$CDN_DOMAIN/$(echo $S3_URI | awk -F/ '{print $5}') are now available." >> $GITHUB_STEP_SUMMARY + aws cloudfront wait invalidation-completed \ + --distribution-id "$DISTRIBUTION_ID" \ + --id "$INVALIDATION_ID" + echo "The cache has been cleared." 
>> $GITHUB_STEP_SUMMARY + if [[ "$DOMAIN" == "standard" ]]; then + echo "The updates to https://cdn.$CDN_DOMAIN$TARGET_PATH are now available." >> $GITHUB_STEP_SUMMARY else - DISTRIBUTION_ID=$(aws ssm get-parameter \ - --name "/tfvars/libraries-website/custom-cdn-id" \ - --query 'Parameter.Value' \ - --output text) - INVALIDATION_ID=$(aws cloudfront create-invalidation \ - --distribution-id $DISTRIBUTION_ID \ - --paths "/$(echo $S3_URI | awk -F/ '{print $5}')/*" \ - --query 'Invalidation.Id' \ - --output text) - echo "Start CDN Cache invalidation." - aws cloudfront wait invalidation-completed \ - --distribution-id $DISTRIBUTION_ID \ - --id $INVALIDATION_ID - echo "The cache for the $(echo $S3_URI | awk -F/ '{print $4}') site has been cleared." >> $GITHUB_STEP_SUMMARY - echo "The updates to the https://$(echo $S3_URI | awk -F/ '{print $4}').$CDN_DOMAIN site are now available." >> $GITHUB_STEP_SUMMARY + echo "The updates to the https://$TARGET_PATH.$CDN_DOMAIN site are now available." >> $GITHUB_STEP_SUMMARY fi diff --git a/README.md b/README.md index c923c86..79b3d74 100644 --- a/README.md +++ b/README.md @@ -266,34 +266,34 @@ It also assumes that the appropriate infrastructure is in place, particularly th ## Automated Publishing to CDN -There are multiple static HTML repositories (future-of-libraries and open-access-task-force) that will benefit from automated publishing to the S3-based CDN in our AWS Organization. The publishing automation (for both stage & prod) is handled by one shared workflow, [cdn-shared-publish.yml](./.github/workflows/cdn-shared-publish.yml), that covers all three tiers (dev/stage/prod) as well as both the standard CDN and the custom domain CDN. +There are multiple static HTML repositories ([future-of-libraries](https://github.com/MITLibraries/future-of-libraries-static) and [grandchallenges](https://github.com/MITLibraries/grandchallenges-static)) that benefit from automated publishing to the S3-based CDN in our AWS Organization. 
Additionally, the [web-images-static](https://github.com/MITLibraries/web-images-static) also benefits from automated publishing to the S3-based CDN. The publishing automation is handled by one shared workflow, [cdn-shared-publish.yml](./.github/workflows/cdn-shared-publish.yml) that covers all three tiers (dev/stage/prod) as well as both the standard CDN and the custom domain CDN. -This workflow assumes that the calling repository is structured in a very particular way! - -- For custom domain repos, all the content to be published to the `` folder in the S3 bucket **must** live at the root of the repository. -- For standard domain repos, all the content to be published to the `cdn/` folder in the S3 bucket **must** live in a top level folder named ``. - - For a custom domain example see [future-of-libraries-static](https://github.com/mitlibraries/future-of-libraries-static). - - For a standard CDN example see [web-images-static](https://github.com/mitlibraries/web-images-static). +- For a custom domain example see [future-of-libraries-static](https://github.com/mitlibraries/future-of-libraries-static). +- For a standard CDN example see [web-images-static](https://github.com/mitlibraries/web-images-static). ### CDN Requirements -The following values must be passed in to the shared workflow from the caller workflow: +There are a number of inputs to the shared workflow, some optional and some required. Here's a summary of the inputs. -- `AWS_REGION` (*string*, **required**): the region where the S3 bucket lives -- `DOMAIN` (*string*, **optional**): the default value is `standard` which refers to the standard CDN. If the content in question is associated with the custom domain CDN, then the caller workflow must pass the value `custom` instead of relying on the default. 
-- `ENVIRONMENT` (*string*, **required**): either `stage` or `prod` (this workflow is not intended for the `dev` environment) +- `AWS_REGION` (*string*, **optional**, default = `us-east-1`): the region where the S3 bucket lives +- `DOMAIN` (*string*, **optional**, default = `standard`): the default value of `standard` refers to the standard CDN. If the content in question is associated with the custom domain CDN, then the caller workflow must pass the value `custom` instead of relying on the default. +- `ENVIRONMENT` (*string*, **required**): one of `dev`, `stage`, or `prod` - `GHA_ROLE` (*string*, **required**): the OIDC role (managed by the [mitlib-tf-workloads-libraries-website](https://github.com/MITLibraries/mitlib-tf-workloads-libraries-website) repository) -- `SYNC_PARAMS` (*string*, **optional**): this is a string that is appended to the `aws s3 sync` command. If nothing is passed from the caller workflow, it is ignored. This is intended to be used for adding additional `--exclude` arguments for any other files/folders in the web content repo that shouldn't be published to the S3 bucket for the site. - - The typical use for the web dev is to exclude additional top level folders (e.g., `--exclude "docs/*"`) or exclude the top level README (`--exclude "README.md"`). - - It **can** be used to exclude everything except for one top level folder (e.g., `--exclude "*" --include "use_only_this_folder/*"`). +- **[deprecated]** `S3URI` (*string*, **optional**, **no default**): the full S3 URI (including the path) where the files should be uploaded. This was the old way of handling the target for the content synchronization. The new method for handling the sources & target are handled with `SOURCE_PATH` and `TARGET_PATH` detailed below +- `SOURCE_PATH` (*string*, **optional**, default = `.`): this is the relative path in the caller repository to the content that should be synced to the S3 bucket. The default value of `.` references the root of the repository. 
The combination of `SOURCE_PATH` and `TARGET_PATH` (see below) fully replace the `S3URI` input +- `SYNC_PARAMS` (*string*, **optional**, **no default**): this is a string that is appended to the `aws s3 sync` command. If nothing is passed from the caller workflow, the value for this in the workflow is `""`. This is intended to be used for adding additional `--exclude` arguments for any other files/folders in the web content repo that shouldn't be published to the S3 bucket for the site. + - The typical use for the web developer is to exclude additional top level folders (e.g., `--exclude "docs/*"`) or exclude the top level README (`--exclude "README.md"`). + - It **can** be used to exclude everything except for one top level folder (e.g., `--exclude "*" --include "use_only_this_folder/*"`) - for more details on the additional parameters that can be used for `SYNC_PARAMS` see - [AWS CLI s3 reference](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html) - [AWS CLI s3 sync reference](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/sync.html) - - The fixed behavior of this workflow is to ignore the `.gitignore` file, the `.git` directory, and the `.github` directory. -- `S3URI` (*string*, **required**): the full S3 URI (including the path) where the files should be uploaded + - The fixed behavior of this workflow is to ignore the `.gitignore` file, the `.git` directory, and the `.github` directory +- `TARGET_PATH` (*string*, **optional**, default = `/`): this is the prefix in the S3 bucket where the caller repository content should be synchronized. The combination of `SOURCE_PATH` (see above) and `TARGET_PATH` fully replace the `S3URI` input To make life easy for the web developers, the [mitlib-tf-workloads-libraries-website](https://github.com/MITLibraries/mitlib-tf-workloads-libraries-website) repository generates the correct caller workflow for the custom domain sites and stores it as a Terraform output in TfCloud. 
This can be copy/pasted into the repository containing the content to be published to the CDN. +**NOTE**: The `S3URI` input is deprecated (replaced by `SOURCE_PATH` and `TARGET_PATH`) and will be removed once all the legacy caller workflows are updated. The default values for `SOURCE_PATH` and `TARGET_PATH` match the behavior of the `S3URI` method. + ## Automated Lambda@Edge Deployments There are multiple Lambda@Edge functions in our CloudFront distributions. The Lambda update & deployment as well as the CloudFront re-deployment (via Terraform) are centralized here to make it easier to add additional Lambda functions in the future. See [cf-lambda-deploy.yml](./.github/workflows/cf-lambda-shared-deploy.yml) for the actual workflow. See [Lambda@Edge CloudFront Deployment Model](https://mitlibraries.atlassian.net/l/cp/SP3QNj1s) for an overview of the deployment process. From 98f7d25ac8706761ba0b2a1e0ab49e26cabd4f6a Mon Sep 17 00:00:00 2001 From: Christopher Butler Date: Fri, 29 May 2026 13:26:52 -0400 Subject: [PATCH 3/5] Ensure Backwards Compatibility Why these changes are being made: Eventually, we will update the caller workflows, but for now we want to ensure that the old caller workflows still work even with all the changes to this shared workflow. How these changes are implemented: * Add a block to the validation step to parse the deprecated S3_URI input to set "legacy" environment variables for later steps and set a boolean to indicate if this workflow has been called by a legacy caller workflow * Update later steps with a conditional for the SOURCE_PATH variable to set it correctly for legacy and non-legacy workflows Side effects: None. 
--- .github/workflows/cdn-shared-publish.yml | 30 +++++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/.github/workflows/cdn-shared-publish.yml b/.github/workflows/cdn-shared-publish.yml index 9c9cf78..e89aa5b 100644 --- a/.github/workflows/cdn-shared-publish.yml +++ b/.github/workflows/cdn-shared-publish.yml @@ -53,10 +53,10 @@ defaults: shell: bash jobs: - prep: + publish: # Start with validating the inputs from the caller workflow and prepping # environment variables for the synchronization job. - name: Prep and Validate + name: Publish runs-on: ubuntu-latest steps: @@ -73,6 +73,7 @@ jobs: DOMAIN: ${{ inputs.DOMAIN }} ENVIRONMENT: ${{ inputs.ENVIRONMENT }} SOURCE_PATH: ${{ inputs.SOURCE_PATH }} + S3URI: ${{ inputs.S3URI }} TARGET_PATH: ${{ inputs.TARGET_PATH }} run: | case "$DOMAIN" in @@ -99,11 +100,22 @@ jobs: echo "Invalid SOURCE_PATH=$SOURCE_PATH, exiting." exit 1 fi - if [[ "${TARGET_PATH:0:1}" == "/" ]]; then - echo "Valid TARGET_PATH=$TARGET_PATH, proceed." + if [[ "$S3URI" == "" ]]; then + if [[ "${TARGET_PATH:0:1}" == "/" ]]; then + echo "Valid TARGET_PATH=$TARGET_PATH, proceed." + else + echo "Invalid TARGET_PATH=$TARGET_PATH, exiting." + exit 1 + fi else - echo "Invalid TARGET_PATH=$TARGET_PATH, exiting." - exit 1 + echo "Legacy caller workflow that passed ab S3_URI value." 
+ if [[ "$DOMAIN" == "standard" ]]; then + echo "LEGACY_TARGET_PATH=/$(echo "$S3URI" | awk -F/ '{print $5}')" >> $GITHUB_ENV + echo "LEGACY_SOURCE_PATH=$(echo "$S3URI" | awk -F/ '{print $5}')" >> $GITHUB_ENV + else + echo "LEGACY_TARGET_PATH=/$(echo "$S3URI" | awk -F/ '{print $4}')" >> $GITHUB_ENV + fi + echo "LEGACY=true" >> $GITHUB_ENV fi - name: Set Environment @@ -147,7 +159,7 @@ jobs: env: AWS_REGION: ${{ inputs.AWS_REGION }} DOMAIN: ${{ inputs.DOMAIN }} - TARGET_PATH: ${{ inputs.TARGET_PATH }} + TARGET_PATH: ${{ env.LEGACY && env.LEGACY_TARGET_PATH || inputs.TARGET_PATH }} run: | BUCKET=$(aws ssm get-parameter \ --region "$AWS_REGION" \ @@ -171,7 +183,7 @@ jobs: - name: Sync To CDN S3 Bucket env: S3_URI: ${{ env.S3_URI }} - SOURCE_PATH: ${{ inputs.SOURCE_PATH }} + SOURCE_PATH: ${{ env.LEGACY && env.LEGACY_SOURCE_PATH || inputs.SOURCE_PATH }} SYNC_PARAMS: ${{ inputs.SYNC_PARAMS }} run: | echo "### Content synchronization to $S3_URI." >> $GITHUB_STEP_SUMMARY @@ -194,7 +206,7 @@ jobs: CDN_DOMAIN: ${{ env.CDN_DOMAIN }} DISTRIBUTION_ID: ${{ env.DISTRIBUTION_ID }} DOMAIN: ${{ inputs.DOMAIN }} - TARGET_PATH: ${{ inputs.TARGET_PATH }} + TARGET_PATH: ${{ env.LEGACY && env.LEGACY_TARGET_PATH || inputs.TARGET_PATH }} run: | echo "### CDN cache invalidation" >> $GITHUB_STEP_SUMMARY echo "Start CDN Cache invalidation." From 2a4e2ba9baa4e9067b3dd6fc9b8ef3fbbcdc0170 Mon Sep 17 00:00:00 2001 From: Christopher Butler Date: Fri, 29 May 2026 16:25:56 -0400 Subject: [PATCH 4/5] Validate and Construct sync Parameters Why these changes are being introduced: First, we want to ensure that the only additional parameters supplied by the caller workflow are `--include` and `--exclude` parameters. Second, since we pass the parameters via an `env:` block, we have to be extra careful with double-quotes (which are required by the `aws s3 sync` command, but typically get stripped when expanded in a bash script). 
How these changes are implemented: * Add a block to the validation step to throw an error if there is any other parameter in the SYNC_PARAMS input outside of `--include` and `--exclude` * If the SYNC_PARAMS is valid, merge it together with the other stock `--exclude` parameters and set one string in the GITHUB_ENV with the full list of inclues and excludes * Add the `eval` command to properly expand the `aws s3 sync` command and preserve the double-quotes where they are required Side effects: None. --- .github/workflows/cdn-shared-publish.yml | 38 +++++++++++++++++------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/.github/workflows/cdn-shared-publish.yml b/.github/workflows/cdn-shared-publish.yml index e89aa5b..e1b803c 100644 --- a/.github/workflows/cdn-shared-publish.yml +++ b/.github/workflows/cdn-shared-publish.yml @@ -67,7 +67,10 @@ jobs: - name: Validate # Verify that the DOMAIN & ENVIRONMENT inputs are using the correct # values. Verify that the SOURCE_PATH and TARGET_PATH inputs are - # formatted correctly. + # formatted correctly and ensure this supports legacy caller workflows. + # Validate the SYNC_PARAMS input to only allow `--exclude` and + # `--include` parameters and then construct a single VALID_SYNC_PARAMS + # environment variable with proper quoting for use in the sync step. id: validate env: DOMAIN: ${{ inputs.DOMAIN }} @@ -75,6 +78,7 @@ jobs: SOURCE_PATH: ${{ inputs.SOURCE_PATH }} S3URI: ${{ inputs.S3URI }} TARGET_PATH: ${{ inputs.TARGET_PATH }} + SYNC_PARAMS: ${{ inputs.SYNC_PARAMS }} run: | case "$DOMAIN" in standard|custom) @@ -100,6 +104,7 @@ jobs: echo "Invalid SOURCE_PATH=$SOURCE_PATH, exiting." exit 1 fi + if [[ "$S3URI" == "" ]]; then if [[ "${TARGET_PATH:0:1}" == "/" ]]; then echo "Valid TARGET_PATH=$TARGET_PATH, proceed." @@ -108,7 +113,7 @@ jobs: exit 1 fi else - echo "Legacy caller workflow that passed ab S3_URI value." + echo "Legacy caller workflow that passed an S3_URI value." 
if [[ "$DOMAIN" == "standard" ]]; then echo "LEGACY_TARGET_PATH=/$(echo "$S3URI" | awk -F/ '{print $5}')" >> $GITHUB_ENV echo "LEGACY_SOURCE_PATH=$(echo "$S3URI" | awk -F/ '{print $5}')" >> $GITHUB_ENV @@ -118,6 +123,20 @@ jobs: echo "LEGACY=true" >> $GITHUB_ENV fi + if [[ -n "$SYNC_PARAMS" ]]; then + temp_params="${SYNC_PARAMS//--include/}" + temp_params="${temp_params//--exclude/}" + # If there's still a -- in there, it's an invalid flag + if [[ $temp_params =~ -- ]]; then + echo "Invalid SYNC_PARAMS: only --include and --exclude parameters are allowed, exiting." + exit 1 + fi + echo "Valid SYNC_PARAMS, proceed." + echo "VALID_SYNC_PARAMS=--exclude \".github/*\" --exclude \".git/*\" --exclude \".gitignore\" $SYNC_PARAMS" >> $GITHUB_ENV + else + echo "VALID_SYNC_PARAMS=--exclude \".github/*\" --exclude \".git/*\" --exclude \".gitignore\"" >> $GITHUB_ENV + fi + - name: Set Environment # Prepare environment variables for the synchronization job. id: env @@ -153,9 +172,9 @@ jobs: aws-region: ${{ inputs.AWS_REGION }} role-to-assume: ${{ env.AWS_ROLE }} - - name: Get AWS Information + - name: Set S3 Target URI # Set the correct S3 URI for the synchronization job - id: aws_info + id: s3_target env: AWS_REGION: ${{ inputs.AWS_REGION }} DOMAIN: ${{ inputs.DOMAIN }} @@ -184,7 +203,7 @@ jobs: env: S3_URI: ${{ env.S3_URI }} SOURCE_PATH: ${{ env.LEGACY && env.LEGACY_SOURCE_PATH || inputs.SOURCE_PATH }} - SYNC_PARAMS: ${{ inputs.SYNC_PARAMS }} + VALID_SYNC_PARAMS: ${{ env.VALID_SYNC_PARAMS }} run: | echo "### Content synchronization to $S3_URI." >> $GITHUB_STEP_SUMMARY if [[ "$S3_URI" == *"cdn/"* ]]; then @@ -193,12 +212,9 @@ jobs: echo "Custom CDN content is synchronizing" fi cd "$GITHUB_WORKSPACE" - aws s3 sync "$SOURCE_PATH" "$S3_URI" \ - --delete \ - --exclude ".github/*" \ - --exclude ".git/*" \ - --exclude ".gitignore" \ - $SYNC_PARAMS + eval "aws s3 sync \"$SOURCE_PATH\" \"$S3_URI\" \ + --delete \ + $VALID_SYNC_PARAMS" echo "Content is synchronized to $S3_URI." 
>> $GITHUB_STEP_SUMMARY - name: Invalidate cache From 530239bf049a6bfdae3be0fd66e854e5ea57f881 Mon Sep 17 00:00:00 2001 From: Christopher Butler Date: Fri, 12 Jun 2026 07:59:22 -0400 Subject: [PATCH 5/5] Address GitHub Copilot Review Why these changes are being introduced: GitHub Copilot reviewed the pull request and noted a few issues in the shared workflow code. After reviewing the comments by Copilot, it was appropriate to address all of the concerns. How this addresses that need: * Update the Validate step to ensure that the SOURCE_PATH does not start with `..` (preventing a directory outside of the repository itself from being the source) * Update the Validate step to fail quickly if there are any newline characters in the SYNC_PARAMS input * Update the Validate step to add validation of the value passed in to the GHA_ROLE input to ensure that it is formatted correctly as an IAM Role name (using a grep filter provided by AWS) * Update the Sync to CDN S3 Bucket step to prevent command injection; the previous version used `eval` to handle the command and preserve the double-quotes. The new version uses a bash array and the `read` command to parse the VALID_SYNC_PARAMS and then just executes the combined array (this technique was suggested by GitHub Copilot) * Update the Invalidate Cache step to correct the mistake in the --paths argument (the "*" was missing previously) * Update the README with more details on the workflow inputs and a clearer explanation of how this shared workflow actually works Side effects of this change: None.
--- .github/workflows/cdn-shared-publish.yml | 38 ++++++++++++++++------ README.md | 41 ++++++++++++++++++------ 2 files changed, 60 insertions(+), 19 deletions(-) diff --git a/.github/workflows/cdn-shared-publish.yml b/.github/workflows/cdn-shared-publish.yml index e1b803c..29434a8 100644 --- a/.github/workflows/cdn-shared-publish.yml +++ b/.github/workflows/cdn-shared-publish.yml @@ -70,15 +70,18 @@ jobs: # formatted correctly and ensure this supports legacy caller workflows. # Validate the SYNC_PARAMS input to only allow `--exclude` and # `--include` parameters and then construct a single VALID_SYNC_PARAMS - # environment variable with proper quoting for use in the sync step. + # environment variable with proper quoting for use in the sync step. + # Verify that the GHA_ROLE is a properly formatted IAM role name (using + # the grep pattern provided by AWS) id: validate env: DOMAIN: ${{ inputs.DOMAIN }} ENVIRONMENT: ${{ inputs.ENVIRONMENT }} + GHA_ROLE: ${{ inputs.GHA_ROLE }} SOURCE_PATH: ${{ inputs.SOURCE_PATH }} S3URI: ${{ inputs.S3URI }} - TARGET_PATH: ${{ inputs.TARGET_PATH }} SYNC_PARAMS: ${{ inputs.SYNC_PARAMS }} + TARGET_PATH: ${{ inputs.TARGET_PATH }} run: | case "$DOMAIN" in standard|custom) @@ -98,7 +101,7 @@ jobs: exit 1 ;; esac - if [[ "${SOURCE_PATH:0:1}" == "." ]]; then + if [[ "${SOURCE_PATH:0:1}" == "." && "${SOURCE_PATH:0:2}" != ".." ]]; then echo "Valid SOURCE_PATH=$SOURCE_PATH, proceed." else echo "Invalid SOURCE_PATH=$SOURCE_PATH, exiting." @@ -127,7 +130,7 @@ jobs: temp_params="${SYNC_PARAMS//--include/}" temp_params="${temp_params//--exclude/}" # If there's still a -- in there, it's an invalid flag - if [[ $temp_params =~ -- ]]; then + if [[ $temp_params =~ -- || $temp_params =~ $'\n' ]]; then echo "Invalid SYNC_PARAMS: only --include and --exclude parameters are allowed, exiting."
exit 1 fi @@ -137,6 +140,13 @@ jobs: echo "VALID_SYNC_PARAMS=--exclude \".github/*\" --exclude \".git/*\" --exclude \".gitignore\"" >> $GITHUB_ENV fi + if grep -qE '^[a-zA-Z0-9+=,.@_-]+$' <<< "$GHA_ROLE"; then + echo "Valid GHA_ROLE, proceed" + else + echo "Invalid GHA_ROLE, exiting." + exit 1 + fi + - name: Set Environment # Prepare environment variables for the synchronization job. id: env @@ -173,7 +183,8 @@ jobs: role-to-assume: ${{ env.AWS_ROLE }} - name: Set S3 Target URI - # Set the correct S3 URI for the synchronization job + # Set the correct S3 URI for the synchronization job, differentiating + # between the general CDN prefix and the custom domain CDN prefix id: s3_target env: AWS_REGION: ${{ inputs.AWS_REGION }} @@ -200,6 +211,11 @@ jobs: fi - name: Sync To CDN S3 Bucket + # Build the command as a bash array instead of using `eval` to avoid the + # command injection risk. The built-in `read` splits VALID_SYNC_PARAMS + # into words; the literal double-quotes are stripped first because + # `read` does not interpret quoting and the AWS CLI would otherwise get + # them as part of each pattern (so patterns cannot contain spaces). env: S3_URI: ${{ env.S3_URI }} SOURCE_PATH: ${{ env.LEGACY && env.LEGACY_SOURCE_PATH || inputs.SOURCE_PATH }} @@ -212,9 +228,11 @@ jobs: echo "Custom CDN content is synchronizing" fi cd "$GITHUB_WORKSPACE" - eval "aws s3 sync \"$SOURCE_PATH\" \"$S3_URI\" \ - --delete \ - $VALID_SYNC_PARAMS" + + aws_cmd=(aws s3 sync "$SOURCE_PATH" "$S3_URI" --delete) + read -ra params <<< "${VALID_SYNC_PARAMS//\"/}" + aws_cmd+=("${params[@]}") + "${aws_cmd[@]}" echo "Content is synchronized to $S3_URI." >> $GITHUB_STEP_SUMMARY - name: Invalidate cache @@ -228,7 +246,7 @@ jobs: echo "Start CDN Cache invalidation."
INVALIDATION_ID=$(aws cloudfront create-invalidation \ --distribution-id "$DISTRIBUTION_ID" \ - --paths "$TARGET_PATH" \ + --paths "${TARGET_PATH%/}/*" \ --query 'Invalidation.Id' \ --output text) aws cloudfront wait invalidation-completed \ @@ -238,5 +256,5 @@ jobs: if [[ "$DOMAIN" == "standard" ]]; then echo "The updates to https://cdn.$CDN_DOMAIN$TARGET_PATH are now available." >> $GITHUB_STEP_SUMMARY else - echo "The updates to the https://$TARGET_PATH.$CDN_DOMAIN site are now available." >> $GITHUB_STEP_SUMMARY + echo "The updates to the https://${TARGET_PATH#/}.$CDN_DOMAIN site are now available." >> $GITHUB_STEP_SUMMARY fi diff --git a/README.md b/README.md index 79b3d74..1a20369 100644 --- a/README.md +++ b/README.md @@ -275,25 +275,48 @@ There are multiple static HTML repositories ([future-of-libraries](https://githu There are a number of inputs to the shared workflow, some optional and some required. Here's a summary of the inputs. -- `AWS_REGION` (*string*, **optional**, default = `us-east-1`): the region where the S3 bucket lives -- `DOMAIN` (*string*, **optional**, default = `standard`): the default value of `standard` refers to the standard CDN. If the content in question is associated with the custom domain CDN, then the caller workflow must pass the value `custom` instead of relying on the default. -- `ENVIRONMENT` (*string*, **required**): one of `dev`, `stage`, or `prod` -- `GHA_ROLE` (*string*, **required**): the OIDC role (managed by the [mitlib-tf-workloads-libraries-website](https://github.com/MITLibraries/mitlib-tf-workloads-libraries-website) repository) -- **[deprecated]** `S3URI` (*string*, **optional**, **no default**): the full S3 URI (including the path) where the files should be uploaded. This was the old way of handling the target for the content synchronization.
The new method for handling the sources & target are handled with `SOURCE_PATH` and `TARGET_PATH` detailed below -- `SOURCE_PATH` (*string*, **optional**, default = `.`): this is the relative path in the caller repository to the content that should be synced to the S3 bucket. The default value of `.` references the root of the repository. The combination of `SOURCE_PATH` and `TARGET_PATH` (see below) fully replace the `S3URI` input -- `SYNC_PARAMS` (*string*, **optional**, **no default**): this is a string that is appended to the `aws s3 sync` command. If nothing is passed from the caller workflow, the value for this in the workflow is `""`. This is intended to be used for adding additional `--exclude` arguments for any other files/folders in the web content repo that shouldn't be published to the S3 bucket for the site. +- `AWS_REGION` (**optional**, *string*, default = `us-east-1`): the region where the S3 bucket lives. +- `DOMAIN` (**optional**, *string*, default = `standard`): the default value of `standard` refers to the standard CDN. If the content in question is associated with the custom domain CDN, then the caller workflow must pass the value `custom` instead of relying on the default. +- `ENVIRONMENT` (**required**, *string*): one of `dev`, `stage`, or `prod`. +- `GHA_ROLE` (**required**, *string*): the OIDC role name (managed by the [mitlib-tf-workloads-libraries-website](https://github.com/MITLibraries/mitlib-tf-workloads-libraries-website) repository). +- **[deprecated]** `S3URI` (**optional**, *string*, **no default**): the full S3 URI (including the path) where the files should be uploaded. This was the old way of handling the target for the content synchronization. The new method for handling the sources & target are handled with `SOURCE_PATH` and `TARGET_PATH` detailed below. +- `SOURCE_PATH` (**optional**, *string*, default = `.`): this is the relative path in the caller repository to the content that should be synced to the S3 bucket. 
The default value of `.` references the root of the repository. The combination of `SOURCE_PATH` and `TARGET_PATH` (see below) fully replace the `S3URI` input. The value passed to this shared workflow must start with `.` (but not `..`) to ensure that the path is in the repository. +- `SYNC_PARAMS` (**optional**, *string*, **no default**): this is a string that is appended to the `aws s3 sync` command. If nothing is passed from the caller workflow, the value for this in the workflow is `""`. This is intended to be used for adding additional `--exclude` (or `--include`) arguments for any other files/folders in the web content repo that shouldn't (or should) be published to the S3 bucket for the site. Only `--include` and `--exclude` arguments are accepted; any other `--` option fails validation. - The typical use for the web developer is to exclude additional top level folders (e.g., `--exclude "docs/*"`) or exclude the top level README (`--exclude "README.md"`). - It **can** be used to exclude everything except for one top level folder (e.g., `--exclude "*" --include "use_only_this_folder/*"`) - for more details on the additional parameters that can be used for `SYNC_PARAMS` see - [AWS CLI s3 reference](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html) - [AWS CLI s3 sync reference](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/sync.html) - - The fixed behavior of this workflow is to ignore the `.gitignore` file, the `.git` directory, and the `.github` directory -- `TARGET_PATH` (*string*, **optional**, default = `/`): this is the prefix in the S3 bucket where the caller repository content should be synchronized. The combination of `SOURCE_PATH` (see above) and `TARGET_PATH` fully replace the `S3URI` input + - The workflow is already configured to ignore the `.gitignore` file, the `.git` directory, and the `.github` directory +- `TARGET_PATH` (**optional**, *string*, default = `/`): this is the prefix in the S3 bucket where the caller repository content should be synchronized.
The combination of `SOURCE_PATH` (see above) and `TARGET_PATH` fully replace the `S3URI` input. To make life easy for the web developers, the [mitlib-tf-workloads-libraries-website](https://github.com/MITLibraries/mitlib-tf-workloads-libraries-website) repository generates the correct caller workflow for the custom domain sites and stores it as a Terraform output in TfCloud. This can be copy/pasted into the repository containing the content to be published to the CDN. **NOTE**: The `S3URI` input is deprecated (replaced by `SOURCE_PATH` and `TARGET_PATH`) and will be removed once all the legacy caller workflows are updated. The default values for `SOURCE_PATH` and `TARGET_PATH` match the behavior of the `S3URI` method. +### How the workflow works + +#### Validation + +The first step runs through basic validation of the inputs to ensure that they are formatted correctly and exits the job quickly if any validation step fails. In addition to validation, this step sets two additional environment variables. + +1. It captures whether the caller workflow is a "legacy" call (i.e., passes the `S3URI` input) or a "new" call (i.e., does NOT pass the `S3URI` input) and sets a boolean in the environment to track this. +1. It builds the correct `--include` and `--exclude` command line parameters by combining the default excludes with any additional `SYNC_PARAMS` that are passed in from the caller workflow.
+ +The environment setting is broken out into two different steps (with the AWS Credentials step in the middle) because some of the environment variables are required before connecting to AWS and some of the environment variables cannot be set until information can be read from the correct AWS Account. + +#### Synchronization + +With all the environment set and the connection to AWS established, the synchronization is straightforward, except for the complication needed to prevent any malicious command injection. The `aws s3 sync ...` command is loaded into an array along with the constructed `VALID_SYNC_PARAMS` from the validation step. Then, the combined array is executed as a command (thanks to GitHub Copilot for suggesting this technique). + +#### Cache Invalidation + +Any time that content in the S3 bucket backing the CDN is updated, the currently cached content in CloudFront must be invalidated so that the cache can serve the updated files. So, the final step in the workflow runs the `aws cloudfront create-invalidation ...` command for the appropriate path and then waits for the invalidation to complete before exiting the workflow. + ## Automated Lambda@Edge Deployments There are multiple Lambda@Edge functions in our CloudFront distributions. The Lambda update & deployment as well as the CloudFront re-deployment (via Terraform) are centralized here to make it easier to add additional Lambda functions in the future. See [cf-lambda-deploy.yml](./.github/workflows/cf-lambda-shared-deploy.yml) for the actual workflow. See [Lambda@Edge CloudFront Deployment Model](https://mitlibraries.atlassian.net/l/cp/SP3QNj1s) for an overview of the deployment process.