fix(k8s): cap buildLogCache size to bound memory on failure bursts (bug-bash #2) #463
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # instant.dev/api — CI | |
| # | |
| # Multi-repo note: go.mod uses `replace instant.dev/proto => ../proto`. The | |
| # "Checkout proto sibling" step clones the proto repo next to this checkout. | |
| # If your GitHub repo is not named `proto`, set the `PROTO_REPO` repository | |
| # variable (e.g. `myorg/instant-proto`) or fork/rename to match | |
| # `${{ github.repository_owner }}/proto`. | |
name: CI

on:
  push:
    branches:
      - master
    # CI-minute savings (2026-05-21): commits that only touch the paths
    # below (docs and repo housekeeping) do not trigger CI.
    paths-ignore:
      - "**.md"
      - "docs/**"
      - "CLAUDE.md"
      - ".gitignore"
      - "LICENSE"
      - "BUGBASH-*/**"
  pull_request:
    branches:
      - master
    # Same docs-only filter as the push trigger above.
    paths-ignore:
      - "**.md"
      - "docs/**"
      - "CLAUDE.md"
      - ".gitignore"
      - "LICENSE"
      - "BUGBASH-*/**"
  schedule:
    # Weekly — reserved for optional scheduled jobs (see e2e job).
    - cron: "0 6 * * 1"
  workflow_dispatch:

concurrency:
  # CI-minute savings (2026-05-21): a new commit cancels the in-flight CI
  # run for the same branch/PR. The group key includes github.ref, so
  # different PRs/branches still run in parallel.
  group: ci-${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
| jobs: | |
# Stale-green guard. A PR can show a green CI run that was executed BEFORE a
# breaking commit landed on the base branch — merging it would ship a broken
# master. This job FAILS if the PR branch does not contain origin/<base> as
# an ancestor, forcing an "Update branch" before the PR can merge.
up-to-date-with-base:
  runs-on: ubuntu-latest
  if: github.event_name == 'pull_request'
  steps:
    - uses: actions/checkout@v6
      with:
        # Full history so the ancestry walk below can reach the base tip.
        fetch-depth: 0
    - name: Fail if PR branch is behind its base branch
      env:
        # Passed through the environment instead of being interpolated
        # into the script text, so ref names are never parsed as shell.
        BASE: ${{ github.event.pull_request.base.ref }}
        # On pull_request events the default checkout (HEAD) is GitHub's
        # synthetic merge commit of the PR head into the base, which
        # contains the base tip by construction. Testing HEAD would pass
        # even for a stale branch, so the PR head commit is tested instead.
        PR_HEAD: ${{ github.event.pull_request.head.sha }}
      run: |
        # Refresh the remote-tracking ref explicitly. No --depth here: the
        # clone already has full history and a depth-limited fetch would
        # mark it shallow.
        git fetch origin "+refs/heads/${BASE}:refs/remotes/origin/${BASE}"
        if git merge-base --is-ancestor "origin/${BASE}" "${PR_HEAD}"; then
          echo "PR branch contains origin/${BASE} — up to date."
        else
          echo "::error::PR branch is behind origin/${BASE}. Update the branch (merge/rebase ${BASE}) and re-run CI so it validates against current base."
          exit 1
        fi
# Main gate: builds, vets and runs the hermetic Go test suite against
# Postgres, Redis and NATS running on the runner.
build-and-test:
  runs-on: ubuntu-latest
  services:
    postgres:
      image: postgres:16-alpine
      env:
        POSTGRES_USER: postgres
        POSTGRES_PASSWORD: postgres
        POSTGRES_DB: instant_dev_test
      ports:
        - "5432:5432"
      options: >-
        --health-cmd pg_isready
        --health-interval 10s
        --health-timeout 5s
        --health-retries 5
    redis:
      image: redis:7-alpine
      ports:
        - "6379:6379"
      options: >-
        --health-cmd "redis-cli ping"
        --health-interval 10s
        --health-timeout 5s
        --health-retries 5
  env:
    TEST_DATABASE_URL: postgres://postgres:postgres@localhost:5432/instant_dev_test?sslmode=disable
    TEST_REDIS_URL: redis://localhost:6379/15
    # db-provider admin target. internal/providers/db/local.go CREATEs a
    # customer database per /db/new; in tests it connects to
    # TEST_POSTGRES_CUSTOMERS_URL. testhelpers defaults this to an
    # unreachable localhost:5434, so without this every postgres-
    # provisioning test (TestDBNew_*, TestBulkTwin_*) 503'd. Points at an
    # instant_customers DB created on the same service container below —
    # exactly as deploy.yml's proven-green gate does.
    TEST_POSTGRES_CUSTOMERS_URL: postgres://postgres:postgres@localhost:5432/instant_customers?sslmode=disable
  steps:
    - uses: actions/checkout@v6
    - name: Checkout proto sibling (for go.mod replace ../proto)
      uses: actions/checkout@v6
      with:
        repository: ${{ vars.PROTO_REPO || format('{0}/proto', github.repository_owner) }}
        token: ${{ secrets.REPO_ACCESS_TOKEN || secrets.GITHUB_TOKEN }}
        path: _proto_ci
    - name: Place ../proto for Go replace directive
      run: mv _proto_ci ../proto
    - name: Checkout common sibling (for go.mod replace ../common)
      uses: actions/checkout@v6
      with:
        repository: ${{ vars.COMMON_REPO || format('{0}/common', github.repository_owner) }}
        token: ${{ secrets.REPO_ACCESS_TOKEN || secrets.GITHUB_TOKEN }}
        path: _common_ci
    - name: Place ../common for Go replace directive
      run: mv _common_ci ../common
    - uses: actions/setup-go@v6
      with:
        go-version: "1.25"
    - name: Apply DB migrations to the test database
      # Mirrors deploy.yml's proven-green gate. Before this step CI ran
      # tests against a BARE Postgres whose schema came ONLY from
      # testhelpers.runMigrations — a hand-maintained mirror. This step
      # applies the REAL migration files (the same files `make test-db-up`
      # applies), then creates instant_customers — the db provider's local
      # backend (internal/providers/db/local.go) CREATEs a customer
      # database per /db/new and connects to TEST_POSTGRES_CUSTOMERS_URL
      # for it. Without this DB every postgres provision (TestDBNew_*,
      # TestBulkTwin_*) 503'd.
      env:
        PGPASSWORD: postgres
      run: |
        # Iterate with a glob rather than parsing `ls`: pathname expansion
        # already yields the names sorted, and an unmatched glob reaches
        # psql as a missing file and fails the step instead of silently
        # applying nothing.
        for f in internal/db/migrations/*.sql; do
          echo "→ applying $(basename "$f")"
          # ON_ERROR_STOP=1 makes psql exit non-zero on the first failing
          # statement. Without it psql reports success even when a
          # migration's SQL errors, and the tests would run against a
          # partially migrated schema.
          psql -v ON_ERROR_STOP=1 -h localhost -U postgres -d instant_dev_test -f "$f" >/dev/null
        done
        echo "all migrations applied to instant_dev_test"
        psql -h localhost -U postgres -d postgres -c "CREATE DATABASE instant_customers" >/dev/null
        echo "created instant_customers (db-provider admin target)"
    - run: go build ./...
    - run: go vet ./...
    - name: Start NATS with monitoring (queue provider health-checks :8222)
      # internal/providers/queue/local.go Provision() health-checks
      # http://<NATSHost>:8222/healthz then returns nats://<host>:4222.
      # TestQueue_* build a handler with an empty NATSHost, which defaults to
      # "localhost" (queueprovider.New("")), so they need a real NATS
      # reachable on localhost:8222. GitHub service containers can't pass the
      # `-m` monitoring flag, so we run nats-server here instead. NATS-DOWN
      # tests use the reserved non-resolvable host `nats.test`, so a live NATS
      # on localhost does not collide with their 503 expectations.
      run: |
        docker run -d --name nats -p 4222:4222 -p 8222:8222 nats:2.10-alpine -m 8222
        # Poll the monitoring endpoint once per second, up to 15 attempts.
        for i in $(seq 1 15); do
          curl -sf http://localhost:8222/healthz >/dev/null && { echo "NATS healthy after ${i}s"; break; }
          echo "waiting for NATS monitoring endpoint (${i}/15)"; sleep 1
        done
        # Final probe: fail the step if NATS never became healthy.
        curl -sf http://localhost:8222/healthz >/dev/null || { echo "::error::NATS monitoring never came up"; exit 1; }
    # The gate. This MUST stay equal to deploy.yml's proven-green
    # invocation (`go test ./... -short -count=1 -p 1`) PLUS `-race`:
    #   - `-p 1` is load-bearing: every package shares the single
    #     instant_dev_test DB + redis/15. Default parallelism runs ~25
    #     package binaries at once and they corrupt each other's DB/redis
    #     state mid-test. `-p 1` serialises package execution.
    #   - `-short` matches deploy.yml so the two gates run the identical
    #     hermetic suite (tests that genuinely need a live k8s/provisioner
    #     stack are tagged `e2e` and excluded from `./...` anyway).
    #   - `-race` is the extra rigor CI adds over deploy.yml — it caught
    #     the BillingHandler.ensureRazorpayFns data race.
    - run: go test ./... -short -race -count=1 -p 1
# E2E needs a live Kubernetes stack (see repo CLAUDE.md). The job is gated to
# schedule / manual dispatch only, so push and PR runs of CI stay fast.
e2e:
  if: >-
    github.event_name == 'schedule' ||
    github.event_name == 'workflow_dispatch'
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v6

    # Sibling checkouts: each repo is cloned inside the workspace and then
    # moved one level up to ../proto and ../common.
    - name: Checkout proto sibling
      uses: actions/checkout@v6
      with:
        repository: ${{ vars.PROTO_REPO || format('{0}/proto', github.repository_owner) }}
        token: ${{ secrets.REPO_ACCESS_TOKEN || secrets.GITHUB_TOKEN }}
        path: _proto_ci
    - run: mv _proto_ci ../proto

    - name: Checkout common sibling
      uses: actions/checkout@v6
      with:
        repository: ${{ vars.COMMON_REPO || format('{0}/common', github.repository_owner) }}
        token: ${{ secrets.REPO_ACCESS_TOKEN || secrets.GITHUB_TOKEN }}
        path: _common_ci
    - run: mv _common_ci ../common

    - uses: actions/setup-go@v6
      with:
        go-version: "1.25"

    # Placeholder: prints instructions only; no tests are executed yet.
    - name: E2E placeholder (wire to k8s / secrets)
      run: |
        echo "Configure services, secrets, and port-forwards, then run e.g.:"
        echo " go test ./e2e/... -tags e2e -count=1 -timeout 180s"
        echo "See CLAUDE.md (Full-stack E2E) for required env vars."
# Cross-repo Layer-1 auth-contract gate. The api owns the CORS allowlist
# and the /auth/exchange + /auth/email/start endpoints — an api-side
# change that drops access-control-allow-credentials would not trigger
# the instanode-web CI on its own, so the browser-level regression
# (2026-05-29 → 2026-05-30) could ship despite green api unit tests.
#
# This job fires a repository_dispatch on instanode-web; instanode-web's
# .github/workflows/auth-contract-e2e.yml listens for the matching
# `auth-contract-e2e-from-api` type and runs the Chromium smoke against
# the same prod targets. The dispatch result will not gate this PR
# mechanically (cross-repo status checks aren't wired here yet — see
# follow-up issue), but it surfaces the failure in the instanode-web
# Actions tab so anyone reviewing the api PR can click through.
#
# Auth: REPO_ACCESS_TOKEN must have `repo` scope on instanode-web. If the
# secret is missing the step soft-skips (warn, don't fail) so the api CI
# stays green during initial rollout — flip the soft-skip to `exit 1`
# once the secret is provisioned on all relevant environments.
dispatch-auth-contract-e2e:
  name: Trigger instanode-web auth-contract smoke
  runs-on: ubuntu-latest
  needs: build-and-test
  if: github.event_name == 'pull_request' || (github.event_name == 'push' && github.ref == 'refs/heads/master')
  steps:
    - name: Fire repository_dispatch on instanode-web
      env:
        DISPATCH_TOKEN: ${{ secrets.REPO_ACCESS_TOKEN }}
        # SECURITY: avoid interpolating untrusted github.event.* fields
        # into the shell. Only stable repo-controlled identifiers are
        # exposed and the payload is constructed via printf with
        # parameter expansion (no string concatenation of attacker
        # input).
        SHA: ${{ github.sha }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        TRIGGER: ${{ github.event_name }}
      run: |
        set -euo pipefail
        if [ -z "${DISPATCH_TOKEN:-}" ]; then
          # The backticks are escaped: inside double quotes an unescaped
          # `...` is command substitution, which would try to execute a
          # `repo` command and drop the word from the message.
          echo "::warning::REPO_ACCESS_TOKEN not set; skipping cross-repo auth-contract dispatch. " \
            "Provision the secret on the api repo (with \`repo\` scope on instanode-web) to enable Layer-1 gate."
          exit 0
        fi
        # PR_NUMBER may be empty on push events; default to "main".
        # Defense-in-depth: enforce numeric PR number even though
        # github.event.pull_request.number is an integer assigned by
        # GitHub, never user-controlled. The reject pattern matches ANY
        # non-digit character, so the whole value is checked, not just its
        # first character.
        pr="${PR_NUMBER:-main}"
        case "$pr" in
          main) ;;
          *[!0-9]*) echo "::error::unexpected PR_NUMBER value: $pr"; exit 1 ;;
        esac
        # SHA is a 40-char hex from github.sha — repo-controlled. Validate
        # that it is non-empty and consists solely of lowercase hex digits
        # to keep the JSON payload trivially-injection-proof.
        case "$SHA" in
          ''|*[!0-9a-f]*) echo "::error::unexpected SHA shape: $SHA"; exit 1 ;;
        esac
        # TRIGGER is github.event_name — a GitHub-controlled enum
        # (push|pull_request|schedule|workflow_dispatch|...). Allowlist
        # the values this job is reachable from.
        case "$TRIGGER" in
          push|pull_request) ;;
          *) echo "::error::unexpected TRIGGER: $TRIGGER"; exit 1 ;;
        esac
        payload=$(printf '{"event_type":"auth-contract-e2e-from-api","client_payload":{"api_sha":"%s","api_pr":"%s","trigger":"%s","api_url":"https://api.instanode.dev","web_origin":"https://instanode.dev"}}' \
          "$SHA" "$pr" "$TRIGGER")
        echo "Dispatching to InstaNode-dev/instanode-web: $payload"
        # `|| true`: under `set -e` a curl transport failure (DNS, timeout)
        # would otherwise abort the step here and fail the job, defeating
        # the soft-fail policy below. curl still emits the -w status
        # ("000" when no response was received), so http_code stays set.
        http_code=$(curl -sS -o /tmp/dispatch.out -w '%{http_code}' \
          -X POST \
          -H "Accept: application/vnd.github+json" \
          -H "Authorization: Bearer ${DISPATCH_TOKEN}" \
          -H "X-GitHub-Api-Version: 2022-11-28" \
          https://api.github.com/repos/InstaNode-dev/instanode-web/dispatches \
          -d "$payload") || true
        echo "dispatch response: HTTP $http_code"
        cat /tmp/dispatch.out || true
        # GitHub returns 204 on success. Treat anything else as a soft
        # failure during the rollout window — log and pass so a transient
        # cross-repo hiccup doesn't red the api PR. Tighten to `exit 1`
        # once we have a week of clean runs.
        if [ "$http_code" != "204" ]; then
          echo "::warning::cross-repo dispatch returned $http_code (expected 204). Not failing the api PR yet."
        fi