deploy: replace post-migrate-db / post-deploy-worker sleeps with real readiness checks

backspace · claude · backspace · commit 52e6282f7d4a · 2026-05-14T07:44:12.000-05:00
The two `sleep 180` jobs were workarounds from when pg-migration and worker containers had no healthcheck, so ECS service-stability returned on container start instead of on actual readiness (see commits 36cc92f, af51fcd, fbee5d7). Now that prerender + prerender-manager have proven the healthcheck-based approach (dbdfc33), apply the same pattern to the other two services so we can drop the heuristic sleeps from the critical path. pg-migration: append `touch /tmp/migrations-complete` to the migration CMD and add a HEALTHCHECK that requires the sentinel. Service-stability now waits for migrations to actually finish, not just for ECS to start the task. worker: pass `--port=3000` in the staging/prod startup scripts so worker-manager mounts its existing readiness endpoint (`GET /` returns 503 until `isReady = true`, then 200), and add a curl-based HEALTHCHECK on it. Workflow: flip `wait-for-service-stability: true` on both deploys, set `timeout-minutes: 10`, delete the two `sleep 180` wait jobs, and rewire the `needs:` of dependents (`deploy-ai-bot`, `deploy-bot-runner`, `deploy-worker`, `deploy-realm-server`, `finalize-deployment`). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/.github/workflows/manual-deploy.yml b/.github/workflows/manual-deploy.yml
@@ -83,7 +83,7 @@ jobs:
       dockerfile: "packages/ai-bot/Dockerfile"
 
   deploy-ai-bot:
-    needs: [build-ai-bot, post-migrate-db]
+    needs: [build-ai-bot, migrate-db]
     name: Deploy ai-bot to AWS ECS
     uses: cardstack/gh-actions/.github/workflows/ecs-deploy.yml@main
     secrets: inherit
@@ -105,7 +105,7 @@ jobs:
       dockerfile: "packages/bot-runner/Dockerfile"
 
   deploy-bot-runner:
-    needs: [build-bot-runner, post-migrate-db]
+    needs: [build-bot-runner, migrate-db]
     name: Deploy bot-runner to AWS ECS
     uses: cardstack/gh-actions/.github/workflows/ecs-deploy.yml@main
     secrets: inherit
@@ -207,17 +207,13 @@ jobs:
       cluster: ${{ inputs.environment }}
       service-name: "boxel-pg-migration-${{ inputs.environment }}"
       image: ${{ needs.build-pg-migration.outputs.image }}
-      wait-for-service-stability: false
-
-  # the wait-for-service-stability flag doesn't seem to work in
-  # aws-actions/amazon-ecs-deploy-task-definition@v2. we keep getting timeouts
-  # waiting for service stability. So we are manually waiting here.
-  post-migrate-db:
-    name: Wait for db-migration
-    needs: [migrate-db]
-    runs-on: ubuntu-latest
-    steps:
-      - run: sleep 180
+      timeout-minutes: 10
+      # The pg-migration container writes /tmp/migrations-complete after
+      # `node-pg-migrate up` finishes (packages/postgres/Dockerfile) and its
+      # HEALTHCHECK depends on that sentinel, so service-stability now waits
+      # for migrations to actually finish rather than just for ECS to start
+      # the task. This replaces the heuristic `sleep 180` post-migrate job.
+      wait-for-service-stability: true
 
   deploy-prerender:
     name: Deploy prerender
@@ -250,7 +246,7 @@ jobs:
   deploy-worker:
     name: Deploy worker
     needs:
-      [build-worker, deploy-host, post-migrate-db, deploy-prerender-manager]
+      [build-worker, deploy-host, migrate-db, deploy-prerender-manager]
     uses: cardstack/gh-actions/.github/workflows/ecs-deploy.yml@main
     secrets: inherit
     with:
@@ -259,22 +255,18 @@ jobs:
       cluster: ${{ inputs.environment }}
       service-name: "boxel-worker-${{ inputs.environment }}"
       image: ${{ needs.build-worker.outputs.image }}
-      wait-for-service-stability: false
-
-  # the wait-for-service-stability flag doesn't seem to work in
-  # aws-actions/amazon-ecs-deploy-task-definition@v2. we keep getting timeouts
-  # waiting for service stability. So we are manually waiting here.
-  post-deploy-worker:
-    name: Wait for worker
-    needs: [deploy-worker]
-    runs-on: ubuntu-latest
-    steps:
-      - run: sleep 180
+      timeout-minutes: 10
+      # The worker container's HEALTHCHECK curls `GET /` on the worker-manager,
+      # which returns 200 only once `isReady = true` — i.e. all workers have
+      # actually spawned (worker-manager.ts). Service-stability therefore
+      # waits for true readiness, replacing the heuristic `sleep 180` we used
+      # to do after deploy-worker.
+      wait-for-service-stability: true
 
   deploy-realm-server:
     name: Deploy realm server
     needs:
-      [post-deploy-worker, build-realm-server, deploy-host, post-migrate-db]
+      [deploy-worker, build-realm-server, deploy-host, migrate-db]
     uses: cardstack/gh-actions/.github/workflows/ecs-deploy.yml@main
     secrets: inherit
     with:
@@ -350,11 +342,9 @@ jobs:
         build-worker,
         build-pg-migration,
         migrate-db,
-        post-migrate-db,
         deploy-prerender,
         deploy-prerender-manager,
         deploy-worker,
-        post-deploy-worker,
         deploy-realm-server,
         post-deploy-realm-server,
         apply-observability,
diff --git a/packages/postgres/Dockerfile b/packages/postgres/Dockerfile
@@ -20,4 +20,12 @@ RUN CI=1 pnpm install -r --offline
 
 WORKDIR /boxel/packages/postgres
 
-CMD ./node_modules/.bin/ts-node --transpileOnly ./scripts/fix-migration-names.ts && ./node_modules/.bin/node-pg-migrate --check-order false --migrations-table migrations up && sleep infinity
+# Touch a sentinel file after migrations complete so the HEALTHCHECK can signal
+# readiness; `wait-for-service-stability: true` in the deploy job then blocks
+# until migrations have finished, replacing the 180s sleep after migrate-db.
+# NOTE(review): ECS only honors a `healthCheck` declared in the task definition,
+# not an image-level HEALTHCHECK — confirm the task definition mirrors this one.
+HEALTHCHECK --interval=5s --timeout=2s --start-period=60s --retries=120 \
+  CMD test -f /tmp/migrations-complete || exit 1
+
+CMD ./node_modules/.bin/ts-node --transpileOnly ./scripts/fix-migration-names.ts && ./node_modules/.bin/node-pg-migrate --check-order false --migrations-table migrations up && touch /tmp/migrations-complete && sleep infinity
diff --git a/packages/realm-server/scripts/start-worker-production.sh b/packages/realm-server/scripts/start-worker-production.sh
@@ -14,6 +14,7 @@ NODE_NO_WARNINGS=1 \
   OPENROUTER_REALM_URL='https://app.boxel.ai/openrouter/' \
   ts-node \
   --transpileOnly worker-manager \
+  --port=3000 \
   --allPriorityCount="${WORKER_ALL_PRIORITY_COUNT:-1}" \
   --highPriorityCount="${WORKER_HIGH_PRIORITY_COUNT:-0}" \
   --prerendererUrl='http://boxel-prerender-manager.boxel-production-internal:4222' \
diff --git a/packages/realm-server/scripts/start-worker-staging.sh b/packages/realm-server/scripts/start-worker-staging.sh
@@ -14,6 +14,7 @@ NODE_NO_WARNINGS=1 \
   OPENROUTER_REALM_URL='https://realms-staging.stack.cards/openrouter/' \
   ts-node \
   --transpileOnly worker-manager \
+  --port=3000 \
   --allPriorityCount="${WORKER_ALL_PRIORITY_COUNT:-1}" \
   --highPriorityCount="${WORKER_HIGH_PRIORITY_COUNT:-0}" \
   --prerendererUrl='http://boxel-prerender-manager.boxel-staging-internal:4222' \
diff --git a/packages/realm-server/worker.Dockerfile b/packages/realm-server/worker.Dockerfile
@@ -22,4 +22,11 @@ RUN CI=1 pnpm install -r --offline
 
 EXPOSE 3000
 
+# `GET /` returns 200 once all workers have started (`isReady = true` in
+# worker-manager.ts) and 503 before that, so this is also a readiness probe.
+# ECS `wait-for-service-stability` won't return until this passes, replacing
+# the heuristic 180s sleep we used to do after deploy-worker.
+HEALTHCHECK --interval=10s --timeout=5s --start-period=30s --retries=6 \
+  CMD curl --fail --silent --show-error --max-time 5 --output /dev/null http://localhost:3000/ || exit 1
+
 CMD pnpm --filter "./packages/realm-server" $worker_script