Agenta-AI
diff --git a/.github/workflows/41-railway-setup.yml b/.github/workflows/41-railway-setup.yml
Lines changed: 36 additions & 1 deletion
diff --git a/.github/workflows/43-railway-deploy.yml b/.github/workflows/43-railway-deploy.yml
Lines changed: 51 additions & 7 deletions
diff --git a/.github/workflows/44-railway-tests.yml b/.github/workflows/44-railway-tests.yml
Lines changed: 48 additions & 3 deletions
diff --git a/api/pyproject.toml b/api/pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/api/uv.lock b/api/uv.lock
Lines changed: 3 additions & 3 deletions
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/clients/python/uv.lock b/clients/python/uv.lock
Lines changed: 1 addition & 1 deletion
diff --git a/hosting/kubernetes/helm/Chart.yaml b/hosting/kubernetes/helm/Chart.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/hosting/railway/oss/scripts/bootstrap.sh b/hosting/railway/oss/scripts/bootstrap.sh
Lines changed: 13 additions & 1 deletion
@@ -73,10 +73,45 @@ jobs:
           chmod +x hosting/railway/oss/scripts/*.sh
           # shellcheck source=/dev/null
           source hosting/railway/oss/scripts/preview-resolve-env.sh
-          hosting/railway/oss/scripts/bootstrap.sh
+
+          # Persist the full bootstrap output so the "Upload setup log" step can
+          # publish it as an artifact, regardless of live-log truncation.
+          log_file="${GITHUB_WORKSPACE:-$PWD}/railway-setup-${PR_NUMBER:-unknown}.log"
+
+          set +e
+          hosting/railway/oss/scripts/bootstrap.sh 2>&1 | tee "$log_file"
+          setup_status=${PIPESTATUS[0]}
+          set -e
+
           echo "project_name=${RAILWAY_PROJECT_NAME}" >> "$GITHUB_OUTPUT"
           echo "environment_name=${RAILWAY_ENVIRONMENT_NAME}" >> "$GITHUB_OUTPUT"
 
+          if [ "$setup_status" -ne 0 ]; then
+            {
+              echo "### Railway Preview Setup — Failed"
+              echo
+              echo "<details><summary>Setup log (last 100 lines)</summary>"
+              echo
+              echo '```'
+              tail -n 100 "$log_file" 2>/dev/null
+              echo '```'
+              echo "</details>"
+            } >> "$GITHUB_STEP_SUMMARY"
+            exit "$setup_status"
+          fi
+
+      - name: Upload setup log
+        if: always()
+        # Diagnostics only: a failed/duplicate upload must never fail the job.
+        continue-on-error: true
+        uses: actions/upload-artifact@v4
+        with:
+          # Fall back to "unknown" when no PR number is supplied, mirroring the
+          # ${PR_NUMBER:-unknown} fallback used for the log file name, so the
+          # artifact name never ends in a dangling "-".
+          name: railway-setup-log-${{ inputs.pr_number || 'unknown' }}
+          path: railway-setup-*.log
+          if-no-files-found: ignore
+          overwrite: true
+          retention-days: 7
+
       - name: Summary
         run: |
           {
 
@@ -112,11 +112,10 @@ jobs:
           # shellcheck source=/dev/null
           source hosting/railway/oss/scripts/preview-resolve-env.sh
 
-          log_file="$(mktemp)"
-          cleanup() {
-            rm -f "$log_file"
-          }
-          trap cleanup EXIT
+          # Keep the log in the workspace so the "Upload deploy log" step can
+          # publish it as an artifact. GitHub's live log can truncate streamed
+          # output, so we always persist a full copy.
+          log_file="${GITHUB_WORKSPACE:-$PWD}/railway-deploy-${PR_NUMBER:-unknown}.log"
 
           project="$RAILWAY_PROJECT_NAME"
           environment_name="$RAILWAY_ENVIRONMENT_NAME"
@@ -177,13 +176,58 @@ jobs:
           echo "environment_name=${environment_name}" >> "$GITHUB_OUTPUT"
           echo "railway_logs_url=${railway_logs_url}" >> "$GITHUB_OUTPUT"
 
-          trap - EXIT
-          cleanup
+          # Best-effort diagnostics; never let these change the step outcome.
+          set +e
+          # On failure, pull the tail of the key services' Railway logs into
+          # this job so the root cause (e.g. a Postgres crash-loop) is visible
+          # here instead of only in the Railway dashboard.
+          if [ "$deploy_failed" = "true" ]; then
+            # Tee into the persisted log so the uploaded artifact and the
+            # step-summary tail include the Railway service logs too, not just
+            # the (possibly truncated) live Actions log.
+            dump_railway_logs 2>&1 | tee -a "$log_file"
+          fi
+
+          status_label="Deployed"
+          [ "$deploy_failed" = "true" ] && status_label="Failed"
+          {
+            echo "### Railway Preview Deploy"
+            echo
+            echo "| Item | Value |"
+            echo "| --- | --- |"
+            echo "| PR | \`${PR_NUMBER}\` |"
+            echo "| Image tag | \`${IMAGE_TAG}\` |"
+            echo "| Status | ${status_label} |"
+            [ -n "$url" ] && echo "| Preview URL | ${url} |"
+            [ -n "$railway_logs_url" ] && echo "| Railway logs | [Open logs](${railway_logs_url}) |"
+            if [ "$deploy_failed" = "true" ]; then
+              echo
+              echo "<details><summary>Deploy log (last 100 lines)</summary>"
+              echo
+              echo '```'
+              tail -n 100 "$log_file" 2>/dev/null
+              echo '```'
+              echo "</details>"
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"
+          set -e
 
           if [ "$deploy_failed" = "true" ]; then
             exit 1
           fi
 
+      - name: Upload deploy log
+        if: always()
+        # Diagnostics only: a failed/duplicate upload must never fail the job.
+        continue-on-error: true
+        uses: actions/upload-artifact@v4
+        with:
+          # Fall back to "unknown" when no PR number is supplied, mirroring the
+          # ${PR_NUMBER:-unknown} fallback used for the log file name, so the
+          # artifact name never ends in a dangling "-".
+          name: railway-deploy-log-${{ inputs.pr_number || 'unknown' }}
+          path: railway-deploy-*.log
+          if-no-files-found: ignore
+          overwrite: true
+          retention-days: 7
+
       - name: Post preview URL as PR comment
         if: inputs.pr_number != '' && steps.deploy.outputs.preview_url != ''
         uses: actions/github-script@v7
 
@@ -235,6 +235,11 @@ jobs:
     needs: prepare
     if: needs.prepare.outputs.needs_deployment == 'true'
     runs-on: ubuntu-latest
+    # Backstop only: the real hangs are bounded per-step (wait_for caps at ~10m,
+    # the browser install retries with a 180s/attempt cap). This guards against
+    # an unexpected hang without cutting off a legitimately slow run (deploy
+    # readiness wait + cold browser install can take ~20m).
+    timeout-minutes: 30
     steps:
       - name: Wait for deployed web and API
         env:
@@ -281,7 +286,7 @@ jobs:
         if: steps.auth_bootstrap.outputs.enabled == 'true'
         uses: actions/setup-node@v4
         with:
-          node-version: "24"
+          node-version: "22"
 
       - name: Install pnpm
         if: steps.auth_bootstrap.outputs.enabled == 'true'
@@ -294,10 +299,50 @@ jobs:
         working-directory: web
         run: pnpm install --no-frozen-lockfile --filter agenta-web-tests...
 
+      # Cache the downloaded browsers. On a cache hit `playwright install` is a
+      # no-op, which avoids the chromium download entirely — and that download
+      # is what stalls: the debug trace showed the 170 MiB transfer hitting 100%
+      # in ~2s, then the install hanging (no progress) until killed. apt deps
+      # were never the problem (they finished in ~10s).
+      - name: Cache Playwright browsers
+        id: pw-cache
+        if: steps.auth_bootstrap.outputs.enabled == 'true'
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/ms-playwright
+          key: playwright-${{ runner.os }}-${{ hashFiles('web/pnpm-lock.yaml') }}
+          restore-keys: |
+            playwright-${{ runner.os }}-
+
+      # OS libraries for chromium. This is the fast, reliable part (~10s); kept
+      # as its own step so a browser-download stall can't be confused with it.
+      - name: Install Playwright system dependencies
+        if: steps.auth_bootstrap.outputs.enabled == 'true'
+        working-directory: web/tests
+        run: pnpm exec playwright install-deps chromium
+
+      # Browser binaries. Root cause of the original ~6h hang: Playwright 1.59's
+      # zip extraction deadlocks on Node 24 (reproduced locally: Node 22/23
+      # extract in ~16s, Node 24 hangs indefinitely). The job is pinned to Node
+      # 22 (LTS) above, which fixes it. The cache makes this a no-op on a hit,
+      # and the retry + per-attempt timeout guard against any transient stall.
       - name: Install Playwright browser
         if: steps.auth_bootstrap.outputs.enabled == 'true'
         working-directory: web/tests
-        run: pnpm exec playwright install --with-deps chromium
+        run: |
+          # Try the browser install up to max_attempts times, capping each
+          # attempt at 180s. `-k 10` escalates to SIGKILL 10s after the SIGTERM
+          # so an install that ignores SIGTERM cannot outlive its cap.
+          max_attempts=3
+          for attempt in $(seq 1 "$max_attempts"); do
+            echo "::group::playwright install chromium (attempt ${attempt}/${max_attempts})"
+            if timeout -k 10 180 pnpm exec playwright install chromium; then
+              echo "::endgroup::"
+              echo "browser install succeeded on attempt ${attempt}"
+              exit 0
+            fi
+            echo "::endgroup::"
+            # Only announce and wait for a retry when another attempt remains;
+            # the final failure falls straight through to the error below.
+            if [ "$attempt" -lt "$max_attempts" ]; then
+              echo "attempt ${attempt} stalled or failed; retrying after 5s..."
+              sleep 5
+            else
+              echo "attempt ${attempt} stalled or failed"
+            fi
+          done
+          echo "playwright browser install failed after ${max_attempts} attempts" >&2
+          exit 1
 
       - name: Bootstrap auth with global setup
         if: steps.auth_bootstrap.outputs.enabled == 'true'
@@ -587,7 +632,7 @@ jobs:
       - name: Setup Node.js
         uses: actions/setup-node@v4
         with:
-          node-version: "24"
+          node-version: "22"
 
       - name: Install pnpm
         uses: pnpm/action-setup@v4
 
@@ -1,6 +1,6 @@
 [project]
 name = "api"
-version = "0.100.9"
+version = "0.101.0"
 description = "Agenta API"
 requires-python = ">=3.11,<3.14"
 authors = [
 
@@ -1,6 +1,6 @@
 [project]
 name = "agenta-client"
-version = "0.100.9"
+version = "0.101.0"
 description = "Fern-generated Python client for the Agenta API."
 requires-python = ">=3.11,<3.14"
 authors = [
 
@@ -2,8 +2,8 @@ apiVersion: v2
 name: agenta
 description: A Helm chart for deploying Agenta (OSS or EE) on Kubernetes
 type: application
-version: 0.100.9
-appVersion: "v0.100.9"
+version: 0.101.0
+appVersion: "v0.101.0"
 keywords:
   - agenta
   - llm
 
@@ -7,6 +7,8 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." && pwd)"
 # shellcheck source=lib.sh
 source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
 
+install_error_trap
+
 PROJECT_NAME="${RAILWAY_PROJECT_NAME:-agenta-oss-railway}"
 ENV_NAME="${RAILWAY_ENVIRONMENT_NAME:-staging}"
 SOURCE_COMPOSE_FILE="${RAILWAY_SOURCE_COMPOSE_FILE:-$(railway_source_compose_file "$ROOT_DIR")}"
@@ -37,9 +39,19 @@ require_railway_auth() {
 
     # Verify the token actually works. A revoked or invalid token will cause
     # every subsequent call to fail with a confusing "Unauthorized" error.
+    # Distinguish a genuine auth failure from rate-limiting / transient network
+    # errors (where the token is fine) so the log points at the real cause.
     local whoami_output
     whoami_output="$(railway_call whoami 2>&1)" || {
-        printf "Railway authentication failed. The token appears to be invalid or revoked.\n" >&2
+        if printf "%s" "$whoami_output" | grep -qiE "rate.?limit"; then
+            printf "Railway auth check could not complete: the API is rate-limiting requests (retries exhausted).\n" >&2
+            printf "This is throttling, not a bad token. Re-run once the rate-limit window clears.\n" >&2
+        elif printf "%s" "$whoami_output" | grep -qiE "timed out|error sending request|failed to fetch|connection (reset|refused|closed)|temporarily unavailable|service unavailable|bad gateway|gateway time-?out"; then
+            printf "Railway auth check could not complete: transient network error reaching the Railway API.\n" >&2
+            printf "The token is likely fine; this is usually temporary. Re-run.\n" >&2
+        else
+            printf "Railway authentication failed. The token appears to be invalid or revoked.\n" >&2
+        fi
         printf "Output: %s\n" "$whoami_output" >&2
         exit 1
     }