fix(agent): validate AWS credentials before Docker build (#33)

scoropeza · krokoko · web-flow · commit b7554810f8eb · 2026-04-15T04:27:55.000Z
Move credential resolution (explicit env vars, AWS CLI/SSO, ~/.aws
mount) before the Docker image build so expired sessions and missing
credentials fail immediately with actionable guidance.

- Fix unbound variable crash with empty array under set -u
- Remove dead MOUNT_AWS_DIR variable
- Expand error message to list all three credential methods
- Update local testing docs with credential resolution details,
  troubleshooting table, and Starlight admonitions

Co-authored-by: Alain Krok <alkrok@amazon.com>
diff --git a/agent/run.sh b/agent/run.sh
@@ -111,6 +111,56 @@ if [[ -z "${AWS_REGION:-}" ]]; then
     exit 1
 fi
 
+# ---------------------------------------------------------------------------
+# Resolve AWS credentials (before Docker build to fail fast)
+# ---------------------------------------------------------------------------
+# Store resolved credentials in variables so they can be applied to DOCKER_ARGS
+# after the image is built. This avoids a lengthy Docker build only to discover
+# that AWS credentials are missing or expired.
+AWS_CRED_MODE=""
+RESOLVED_KEY=""
+RESOLVED_SECRET=""
+RESOLVED_TOKEN=""
+
+if [[ -n "${AWS_ACCESS_KEY_ID:-}" ]]; then
+    AWS_CRED_MODE="explicit"
+    RESOLVED_KEY="${AWS_ACCESS_KEY_ID}"
+    RESOLVED_SECRET="${AWS_SECRET_ACCESS_KEY}"
+    RESOLVED_TOKEN="${AWS_SESSION_TOKEN:-}"
+    echo "  AWS:       using explicit credentials (AWS_ACCESS_KEY_ID)"
+elif command -v aws &>/dev/null; then
+    # Resolve credentials from the AWS CLI (handles SSO, profiles, credential files).
+    # This avoids the need to mount ~/.aws and replicate the full credential chain
+    # inside the container — SSO tokens in particular don't resolve well there.
+    echo "  AWS:       resolving credentials via AWS CLI${AWS_PROFILE:+ (profile '${AWS_PROFILE}')}..."
+    EXPORT_CMD=(aws configure export-credentials --format process)
+    [[ -n "${AWS_PROFILE:-}" ]] && EXPORT_CMD+=(--profile "${AWS_PROFILE}")
+
+    CREDS_JSON=$("${EXPORT_CMD[@]}" 2>/dev/null) || {
+        echo "ERROR: Failed to resolve AWS credentials via AWS CLI." >&2
+        echo "  Possible fixes:" >&2
+        echo "    - Run 'aws sso login${AWS_PROFILE:+ --profile ${AWS_PROFILE}}' if using SSO" >&2
+        echo "    - Run 'aws configure${AWS_PROFILE:+ --profile ${AWS_PROFILE}}' to set up a profile" >&2
+        echo "    - Export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY directly" >&2
+        exit 1
+    }
+
+    AWS_CRED_MODE="resolved"
+    RESOLVED_KEY=$(echo "$CREDS_JSON" | python3 -c "import sys,json; c=json.load(sys.stdin); print(c['AccessKeyId'])")
+    RESOLVED_SECRET=$(echo "$CREDS_JSON" | python3 -c "import sys,json; c=json.load(sys.stdin); print(c['SecretAccessKey'])")
+    RESOLVED_TOKEN=$(echo "$CREDS_JSON" | python3 -c "import sys,json; c=json.load(sys.stdin); print(c.get('SessionToken',''))")
+    echo "  AWS:       resolved temporary credentials (AccessKeyId: ${RESOLVED_KEY:0:8}...)"
+elif [[ -d "${HOME}/.aws" ]]; then
+    AWS_CRED_MODE="mount"
+    if [[ -n "${AWS_PROFILE:-}" ]]; then
+        echo "  AWS:       mounting ~/.aws with profile '${AWS_PROFILE}' (SSO may not work)"
+    else
+        echo "  AWS:       mounting ~/.aws (using default profile)"
+    fi
+else
+    echo "WARNING: No AWS credentials detected. Set AWS_ACCESS_KEY_ID or AWS_PROFILE, or ensure ~/.aws exists." >&2
+fi
+
 # ---------------------------------------------------------------------------
 # Build
 # ---------------------------------------------------------------------------
@@ -161,49 +211,17 @@ if [[ "$MODE" == "server" ]]; then
     DOCKER_ARGS+=(-p 8080:8080)
 fi
 
-# AWS credentials: prefer explicit env vars, then resolve from profile/SSO
-if [[ -n "${AWS_ACCESS_KEY_ID:-}" ]]; then
-    DOCKER_ARGS+=(
-        -e "AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}"
-        -e "AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}"
-    )
-    [[ -n "${AWS_SESSION_TOKEN:-}" ]] && DOCKER_ARGS+=(-e "AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN}")
-    echo "  AWS:       using explicit credentials (AWS_ACCESS_KEY_ID)"
-elif command -v aws &>/dev/null; then
-    # Resolve credentials from the AWS CLI (handles SSO, profiles, credential files).
-    # This avoids the need to mount ~/.aws and replicate the full credential chain
-    # inside the container — SSO tokens in particular don't resolve well there.
-    PROFILE_ARG=()
-    [[ -n "${AWS_PROFILE:-}" ]] && PROFILE_ARG=(--profile "${AWS_PROFILE}")
-
-    echo "  AWS:       resolving credentials via AWS CLI${AWS_PROFILE:+ (profile '${AWS_PROFILE}')}..."
-    CREDS_JSON=$(aws configure export-credentials "${PROFILE_ARG[@]}" --format process 2>/dev/null) || {
-        echo "ERROR: Failed to resolve AWS credentials. Make sure you are logged in:" >&2
-        echo "  aws sso login${AWS_PROFILE:+ --profile ${AWS_PROFILE}}" >&2
-        exit 1
-    }
-
-    RESOLVED_KEY=$(echo "$CREDS_JSON" | python3 -c "import sys,json; c=json.load(sys.stdin); print(c['AccessKeyId'])")
-    RESOLVED_SECRET=$(echo "$CREDS_JSON" | python3 -c "import sys,json; c=json.load(sys.stdin); print(c['SecretAccessKey'])")
-    RESOLVED_TOKEN=$(echo "$CREDS_JSON" | python3 -c "import sys,json; c=json.load(sys.stdin); print(c.get('SessionToken',''))")
-
+# Apply previously resolved AWS credentials to DOCKER_ARGS
+if [[ "$AWS_CRED_MODE" == "explicit" || "$AWS_CRED_MODE" == "resolved" ]]; then
     DOCKER_ARGS+=(
         -e "AWS_ACCESS_KEY_ID=${RESOLVED_KEY}"
         -e "AWS_SECRET_ACCESS_KEY=${RESOLVED_SECRET}"
     )
     [[ -n "${RESOLVED_TOKEN}" ]] && DOCKER_ARGS+=(-e "AWS_SESSION_TOKEN=${RESOLVED_TOKEN}")
-    echo "  AWS:       resolved temporary credentials (AccessKeyId: ${RESOLVED_KEY:0:8}...)"
-elif [[ -d "${HOME}/.aws" ]]; then
+elif [[ "$AWS_CRED_MODE" == "mount" ]]; then
     # Fallback: mount ~/.aws directly (works for static credential files, not SSO)
     DOCKER_ARGS+=(-v "${HOME}/.aws:/home/agent/.aws:ro")
-    if [[ -n "${AWS_PROFILE:-}" ]]; then
-        DOCKER_ARGS+=(-e "AWS_PROFILE=${AWS_PROFILE}")
-        echo "  AWS:       mounting ~/.aws with profile '${AWS_PROFILE}' (SSO may not work)"
-    else
-        echo "  AWS:       mounting ~/.aws (using default profile)"
-    fi
-else
-    echo "WARNING: No AWS credentials detected. Set AWS_ACCESS_KEY_ID or AWS_PROFILE, or ensure ~/.aws exists." >&2
+    [[ -n "${AWS_PROFILE:-}" ]] && DOCKER_ARGS+=(-e "AWS_PROFILE=${AWS_PROFILE}")
 fi
 
 echo ""
diff --git a/docs/guides/DEVELOPER_GUIDE.md b/docs/guides/DEVELOPER_GUIDE.md
@@ -256,6 +256,10 @@ cd ..
 
 Before deploying to AWS, you can build and run the agent Docker container locally. The `agent/run.sh` script handles building the image, resolving AWS credentials, and applying AgentCore-matching resource constraints (2 vCPU, 8 GB RAM) so the local environment closely mirrors production.
 
+:::tip
+The script validates AWS credentials **before** starting the Docker build, so problems like an expired SSO session surface immediately — not after a lengthy image build.
+:::
+
 #### Prerequisites
 
 The `owner/repo` you pass to `run.sh` must match an onboarded Blueprint and be a repository your `GITHUB_TOKEN` can **push to and open PRs on** (same rules as **Repository preparation** at the start of this guide). If you have not changed the Blueprint, fork `awslabs/agent-plugins`, set **`repo`** to your fork, and use a PAT scoped to that fork—then pass the same **`owner/repo`** here.
@@ -267,7 +271,29 @@ export GITHUB_TOKEN="ghp_..."     # Fine-grained PAT (see agent/README.md for re
 export AWS_REGION="us-east-1"     # Region where Bedrock models are enabled
 ```
 
-For AWS credentials, either export `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` directly, or have the AWS CLI configured (the script will resolve credentials from your active profile or SSO session automatically).
+#### AWS credential resolution
+
+The script resolves AWS credentials in priority order:
+
+1. **Explicit environment variables** — The script selects this method whenever `AWS_ACCESS_KEY_ID` is set, so always export `AWS_SECRET_ACCESS_KEY` alongside it; both are passed directly to the container. Include `AWS_SESSION_TOKEN` when using temporary credentials (e.g. from `aws sts assume-role`).
+
+   ```bash
+   export AWS_ACCESS_KEY_ID="AKIA..."
+   export AWS_SECRET_ACCESS_KEY="..."
+   export AWS_SESSION_TOKEN="..."   # required for temporary credentials
+   ```
+
+2. **AWS CLI resolution** — If the CLI is installed, the script runs `aws configure export-credentials` to resolve credentials from your active profile or SSO session. Set `AWS_PROFILE` to target a specific profile.
+
+   ```bash
+   export AWS_PROFILE="my-dev-profile"   # optional — defaults to the CLI default profile
+   ```
+
+3. **`~/.aws` directory mount** — If neither of the above is available but `~/.aws` exists, the directory is bind-mounted read-only into the container. This works for static credential files but **not for SSO tokens**, which don't resolve well inside the container.
+
+:::caution
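+If none of these credential sources is available (no credential environment variables, no AWS CLI, and no `~/.aws` directory), the script prints a warning and continues without AWS credentials. The container will start but any AWS API call (Bedrock, DynamoDB, etc.) will fail at runtime. Make sure at least one credential source is configured before running a real task. Note that if the AWS CLI is installed but cannot resolve credentials (for example, an expired SSO session), the script exits with an error instead of continuing.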
+:::
 
 #### Running a task locally
 
@@ -303,6 +329,8 @@ curl -X POST http://localhost:8080/invocations \
   -d '{"input":{"prompt":"Fix the login bug","repo_url":"owner/repo"}}'
 ```
 
+In server mode, `repo_url`, `prompt`, and other task parameters can be sent via the `/invocations` JSON payload instead of environment variables.
+
 #### Monitoring a running container
 
 The container runs with a fixed name (`bgagent-run`). In a second terminal:
@@ -320,12 +348,22 @@ docker exec -it bgagent-run bash                  # shell into the container
 |---|---|---|
 | `ANTHROPIC_MODEL` | `us.anthropic.claude-sonnet-4-6` | Bedrock model ID |
 | `MAX_TURNS` | `100` | Max agent turns before stopping |
+| `MAX_BUDGET_USD` | | Cost ceiling for local batch runs (USD). Not used in production — see below |
 | `DRY_RUN` | | Set to `1` to validate config and print prompt without running the agent |
 
 **Cost budget** is not configured here for production tasks: set **`max_budget_usd`** when creating a task (REST API, CLI `--max-budget`, or per-repo Blueprint). The orchestrator passes it in the runtime invocation payload. The optional env var `MAX_BUDGET_USD` applies only to **local batch** runs; see `agent/README.md`.
 
 For the full list of environment variables and GitHub PAT permissions, see `agent/README.md`.
 
+#### Troubleshooting
+
+| Symptom | Cause | Fix |
+|---|---|---|
+| `ERROR: Failed to resolve AWS credentials via AWS CLI` | SSO session expired or profile misconfigured | Run `aws sso login --profile <your-profile>` if using SSO, or `aws configure` to set up a profile, or export `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY` directly |
+| `ERROR: GITHUB_TOKEN is not set` | Missing PAT | Export `GITHUB_TOKEN` (see `agent/README.md` for required scopes) |
+| `WARNING: No AWS credentials detected` | No env vars, no AWS CLI, no `~/.aws` directory | Configure one of the three credential methods above |
+| `WARNING: Image exceeds AgentCore 2 GB limit!` | Agent image too large for production | Reduce dependencies or use multi-stage Docker build |
+
 ### Deployment
 
 Once your agent works locally, you can deploy it on AWS. A **full** `mise run //cdk:deploy` of this stack has been observed at **~572 seconds (~9.5 minutes)** total (CDK-reported *Total time*); expect variation by Region, account state, and whether container layers are already cached.
diff --git a/docs/src/content/docs/developer-guide/Installation.md b/docs/src/content/docs/developer-guide/Installation.md