From e30f90eb9867dab57e641cccbd2b4681edf26395 Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Mon, 27 Apr 2026 14:26:20 +0200 Subject: [PATCH 1/8] feat(ci): add experimental bug-fix agent with live devcontainer access --- .../workflows/_bug-fix-agent-experimental.yml | 324 ++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 .github/workflows/_bug-fix-agent-experimental.yml diff --git a/.github/workflows/_bug-fix-agent-experimental.yml b/.github/workflows/_bug-fix-agent-experimental.yml new file mode 100644 index 000000000..cb1fc2105 --- /dev/null +++ b/.github/workflows/_bug-fix-agent-experimental.yml @@ -0,0 +1,324 @@ +name: Bug Fix Agent (Experimental — Visual) + +# EXPERIMENT: tier-2 bug fixing with a live ToolHive Studio running inside the +# project's devcontainer. The agent can drive the running app via `docker exec` +# (xdotool, screenshots) when a unit-test reproduction is impractical. PR flow +# is identical to the production agent — base = main, head = fix/auto-. +# +# Trigger: +# gh workflow run _bug-fix-agent-experimental.yml \ +# -r experiment/bug-fix-visual -f issue-number=663 + +on: + workflow_dispatch: + inputs: + issue-number: + description: 'Issue number to fix' + required: true + type: number + +concurrency: + group: bug-fix-experimental-${{ inputs.issue-number }} + cancel-in-progress: true + +jobs: + fix: + name: Bug Fix (Visual POC) + runs-on: ubuntu-latest + timeout-minutes: 90 + permissions: + contents: write + pull-requests: write + issues: write + id-token: write + env: + # Required by .devcontainer/devcontainer.json runArgs port mapping. + # In CI we don't need the host-side noVNC tunnel; just pick a free port + # so the mapping resolves to something valid. 
+ NOVNC_HOST_PORT: 6080 + + steps: + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v3 + with: + app-id: ${{ secrets.TOOLHIVE_STUDIO_CI_APP_ID }} + private-key: ${{ secrets.TOOLHIVE_STUDIO_CI_APP_KEY }} + + - name: Checkout main + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + ref: refs/heads/main + token: ${{ steps.app-token.outputs.token }} + + - name: Fetch issue body + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + run: | + gh issue view ${{ inputs.issue-number }} --json title,body,labels \ + --template '# {{.title}}{{"\n\n"}}{{.body}}' > issue-body.md + + - name: Build & start devcontainer (cached via gha) + uses: devcontainers/ci@v0.3 + with: + imageName: toolhive-studio-devcontainer + cacheFrom: type=gha + push: never + runCmd: | + echo "Devcontainer up. Workspace: $(pwd)" + ls -la + node --version + pnpm --version + + - name: Find container ID + id: container + run: | + CONTAINER=$(docker ps \ + --filter "label=devcontainer.local_folder=$GITHUB_WORKSPACE" \ + --format '{{.ID}}' | head -1) + if [ -z "$CONTAINER" ]; then + echo "::error::No devcontainer found for $GITHUB_WORKSPACE" + docker ps + exit 1 + fi + echo "id=$CONTAINER" >> $GITHUB_OUTPUT + echo "::notice::Devcontainer ID: $CONTAINER" + + - name: Launch app inside devcontainer (background) + run: | + # Detached exec; the entrypoint backgrounds Xvfb/fluxbox/x11vnc/dbus/keyring, + # then runs `pnpm start` under a PTY wrapper (script -qfc) that survives + # the lack of a TTY in CI. + docker exec -d -u node ${{ steps.container.outputs.id }} \ + bash -c 'cd /workspaces/toolhive-studio && \ + nohup bash scripts/devcontainer-entrypoint.sh \ + > /tmp/entrypoint.log 2>&1 &' + + - name: Wait for app readiness + timeout-minutes: 8 + run: | + CONTAINER=${{ steps.container.outputs.id }} + for i in $(seq 1 90); do + # Three signals: noVNC HTTP up, electron running, thv serve running. 
+ # Use [t]hv / [e]lectron bracket trick so pgrep doesn't self-match + # the shell's argv (see devcontainer-dev skill, "pgrep self-match"). + if docker exec "$CONTAINER" bash -c ' + curl -fsS http://localhost:6080/ >/dev/null 2>&1 \ + && pgrep -f "[e]lectron/dist/electron" >/dev/null \ + && pgrep -f "[t]hv serve" >/dev/null + '; then + echo "::notice::App ready (attempt $i)" + exit 0 + fi + echo "[$i/90] not ready yet..." + sleep 5 + done + echo "::error::App did not become ready" + echo '--- entrypoint.log (tail) ---' + docker exec "$CONTAINER" tail -200 /tmp/entrypoint.log || true + echo '--- xvfb.log (tail) ---' + docker exec "$CONTAINER" tail -50 /tmp/xvfb.log || true + exit 1 + + - name: 'Phase 1 — Analyze & Repro (Opus, with live app access)' + id: phase1 + continue-on-error: true + uses: anthropics/claude-code-action@567fe954a4527e81f132d87d1bdbcc94f7737434 # v1 + env: + DEVCONTAINER_ID: ${{ steps.container.outputs.id }} + with: + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + prompt: | + Read .claude/skills/bug-fix-tdd/SKILL.md and follow it. + Read issue-body.md for the bug report. + + EXPERIMENTAL — live app available: + A running ToolHive Studio is available inside a devcontainer. + The container ID is in the environment variable DEVCONTAINER_ID. + DISPLAY=:99 inside the container. The workspace is bind-mounted at + /workspaces/toolhive-studio, so source edits on the runner propagate + to the container via Vite HMR (renderer changes only — main-process + edits would need an Electron restart). 
+ + Useful commands: + docker exec "$DEVCONTAINER_ID" bash -c 'DISPLAY=:99 xwininfo -root -tree' + docker exec "$DEVCONTAINER_ID" bash -c 'DISPLAY=:99 xdotool search --name ToolHive' + docker exec "$DEVCONTAINER_ID" bash -c 'DISPLAY=:99 xdotool mousemove X Y click 1' + docker exec "$DEVCONTAINER_ID" bash -c 'DISPLAY=:99 xdotool key Tab' + docker exec "$DEVCONTAINER_ID" bash -c 'DISPLAY=:99 import -window root /tmp/shot.png' + docker cp "$DEVCONTAINER_ID":/tmp/shot.png /tmp/shot.png + # then Read /tmp/shot.png to view it + + WHEN TO USE THE LIVE APP: + Default to a unit-test reproduction. Reach for the live app only when + the bug genuinely requires it: cross-route side effects, async backend + lifecycle (`thv` workload events), real IPC. Pixel-driving is fragile + and slow — even when you use the live app for repro, your eventual + FAILING TEST should be a unit test that captures the underlying + invariant (e.g. mock the IPC event and assert the router behavior). + The unit test is what gates Phase 2. + + Your task (Phase 1): + 1. Analyze the bug report. + 2. Reproduce the bug (in the live app and/or as a failing unit test). + 3. Write a unit test that captures the bug — must FAIL. + 4. Run: pnpm run test:nonInteractive -- + 5. Verify the failure is for the RIGHT reason. + 6. Write bug-analysis.md per the skill format. + Include a "Visual repro:" section if you used the live app. + + Do NOT modify source files in this phase. Tests + bug-analysis.md only. + claude_args: >- + --model opus + --max-turns 75 + --allowedTools "Read,Grep,Glob,Edit,Write,Bash(pnpm run test:nonInteractive *),Bash(cat *),Bash(ls *),Bash(docker exec *),Bash(docker cp *),Bash(docker ps *)" + + - name: 'Hard gate — Verify test fails' + id: gate + if: steps.phase1.outcome == 'success' + run: | + if [ ! 
-f bug-analysis.md ]; then + echo "::warning::bug-analysis.md not found" + echo "test_fails=false" >> $GITHUB_OUTPUT + exit 0 + fi + TEST_FILE=$(grep "^Test file:" bug-analysis.md | sed 's/^Test file: //' || true) + if [ -z "$TEST_FILE" ]; then + echo "::warning::No 'Test file:' line in bug-analysis.md" + echo "test_fails=false" >> $GITHUB_OUTPUT + exit 0 + fi + echo "test_file=$TEST_FILE" >> $GITHUB_OUTPUT + if pnpm run test:nonInteractive -- "$TEST_FILE" 2>&1; then + echo "::warning::Test passed (bug not reproduced)" + echo "test_fails=false" >> $GITHUB_OUTPUT + else + echo "::notice::Test fails as expected" + echo "test_fails=true" >> $GITHUB_OUTPUT + fi + + - name: 'Phase 2 — TDD Fix (Sonnet)' + id: phase2 + if: steps.gate.outputs.test_fails == 'true' + continue-on-error: true + uses: anthropics/claude-code-action@567fe954a4527e81f132d87d1bdbcc94f7737434 # v1 + env: + DEVCONTAINER_ID: ${{ steps.container.outputs.id }} + with: + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + prompt: | + Read .claude/skills/bug-fix-tdd/SKILL.md and bug-analysis.md. + + The live app from Phase 1 is still running in the devcontainer + (ID: $DEVCONTAINER_ID). You may use it for sanity-checking your fix, + but the gate is still: pnpm run test:nonInteractive passes. + + Your task (Phase 2 — Fix): + 1. Read the failing test and understand what it expects. + 2. Apply the MINIMUM fix to make the test pass. + 3. Run: pnpm run test:nonInteractive -- ${{ steps.gate.outputs.test_file }} + 4. Run the full suite: pnpm run test:nonInteractive + 5. Run pnpm run lint and pnpm run type-check. + 6. If any check fails, adjust (max 5 attempts). + 7. Write pr-body.md (include 'Fixes #${{ inputs.issue-number }}') + and fix-title.txt. + + Do NOT run git, gh, or modify .env files. + Do NOT over-engineer. 
+ claude_args: >- + --model sonnet + --max-turns 150 + --allowedTools "Read,Grep,Glob,Edit,Write,Bash(pnpm run test:nonInteractive *),Bash(pnpm run lint),Bash(pnpm run type-check),Bash(cat *),Bash(ls *),Bash(docker exec *),Bash(docker cp *)" + + - name: 'Hard gate — Verify all checks pass' + id: verify + if: steps.phase2.outcome == 'success' + run: | + if pnpm run test:nonInteractive && pnpm run lint && pnpm run type-check; then + echo "::notice::All checks pass" + echo "result=success" >> $GITHUB_OUTPUT + else + echo "::warning::Verification failed" + echo "result=failure" >> $GITHUB_OUTPUT + fi + + - name: Create branch and commit + if: steps.verify.outputs.result == 'success' + id: push + run: | + ISSUE_NUM="${{ inputs.issue-number }}" + BRANCH="fix/auto-${ISSUE_NUM}-experimental" + TITLE=$(cat fix-title.txt 2>/dev/null || echo "fix: auto-fix for #${ISSUE_NUM}") + + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git checkout -B "$BRANCH" + git add -A -- '*.ts' '*.tsx' '**/*.ts' '**/*.tsx' + + if git diff --cached --quiet; then + echo "::warning::No source changes to commit" + echo "has_changes=false" >> $GITHUB_OUTPUT + else + git commit -m "$TITLE" + git push -u origin "$BRANCH" + echo "has_changes=true" >> $GITHUB_OUTPUT + echo "branch=$BRANCH" >> $GITHUB_OUTPUT + fi + + - name: Create Pull Request + if: steps.verify.outputs.result == 'success' && steps.push.outputs.has_changes == 'true' + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + run: | + BRANCH="${{ steps.push.outputs.branch }}" + TITLE=$(cat fix-title.txt 2>/dev/null || echo "fix: auto-fix for #${{ inputs.issue-number }}") + + if [ ! -f pr-body.md ]; then + echo "Automated fix for #${{ inputs.issue-number }} (experimental visual flow)." 
> pr-body.md + fi + + # Append a marker so reviewers can tell this came from the experiment + printf '\n\n---\n_Generated by the experimental visual bug-fix flow (`%s`)._\n' \ + "${{ github.workflow }}" >> pr-body.md + + gh label create auto-fix-experimental \ + --description "Experimental visual bug-fix agent" \ + --color FBCA04 2>/dev/null || true + + gh pr create \ + --title "$TITLE" \ + --body-file pr-body.md \ + --base main \ + --head "$BRANCH" \ + --label "auto-fix-experimental" + + - name: Comment on issue (failure path) + if: always() && steps.verify.outputs.result != 'success' && steps.phase1.outcome != 'skipped' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + if [ -f bug-analysis.md ]; then + ANALYSIS=$(cat bug-analysis.md) + else + ANALYSIS="The experimental agent could not analyze this bug." + fi + printf '## Bug Fix Agent (Experimental — Visual) — Analysis\n\n%s\n\n---\n_This is from the experimental visual flow. The production fixer may behave differently._\n' \ + "$ANALYSIS" > /tmp/comment-body.md + gh issue comment "${{ inputs.issue-number }}" --body-file /tmp/comment-body.md + + - name: Diagnostics on failure + if: failure() || steps.verify.outputs.result != 'success' + run: | + CONTAINER=${{ steps.container.outputs.id }} + if [ -n "$CONTAINER" ]; then + echo '=== Container processes ===' + docker exec "$CONTAINER" ps auxf || true + echo '=== entrypoint.log ===' + docker exec "$CONTAINER" tail -300 /tmp/entrypoint.log || true + echo '=== xvfb.log ===' + docker exec "$CONTAINER" tail -100 /tmp/xvfb.log || true + echo '=== fluxbox.log ===' + docker exec "$CONTAINER" tail -100 /tmp/fluxbox.log || true + echo '=== keyring.log ===' + docker exec "$CONTAINER" tail -50 /tmp/keyring.log || true + fi From ac816b624f07a5cccd030eeb1a63c4b064d868a2 Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Mon, 27 Apr 2026 14:34:11 +0200 Subject: [PATCH 2/8] ci(experiment): switch to push trigger on experiment branch --- 
.../workflows/_bug-fix-agent-experimental.yml | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/.github/workflows/_bug-fix-agent-experimental.yml b/.github/workflows/_bug-fix-agent-experimental.yml index cb1fc2105..756f854c5 100644 --- a/.github/workflows/_bug-fix-agent-experimental.yml +++ b/.github/workflows/_bug-fix-agent-experimental.yml @@ -5,20 +5,23 @@ name: Bug Fix Agent (Experimental — Visual) # (xdotool, screenshots) when a unit-test reproduction is impractical. PR flow # is identical to the production agent — base = main, head = fix/auto-. # -# Trigger: -# gh workflow run _bug-fix-agent-experimental.yml \ -# -r experiment/bug-fix-visual -f issue-number=663 +# Trigger: push a commit to this branch. +# git commit --allow-empty -m "trigger experiment" && git push +# To push without firing a run, prefix the commit message with [skip ci]. +# +# The branch filter pins this workflow to its dedicated experiment branch, so +# it cannot fire from main or any other branch even if the file ends up there. +# +# This workflow operates on a single fixed issue (set in env.ISSUE_NUMBER +# below). To target a different issue, edit the env var and push. on: - workflow_dispatch: - inputs: - issue-number: - description: 'Issue number to fix' - required: true - type: number + push: + branches: + - experiment/bug-fix-visual concurrency: - group: bug-fix-experimental-${{ inputs.issue-number }} + group: bug-fix-experimental-663 cancel-in-progress: true jobs: @@ -32,6 +35,9 @@ jobs: issues: write id-token: write env: + # Issue this experiment is targeting. Hardcoded so the workflow can + # be a `push:`-triggered file with no inputs. Change + push to retarget. + ISSUE_NUMBER: '663' # Required by .devcontainer/devcontainer.json runArgs port mapping. # In CI we don't need the host-side noVNC tunnel; just pick a free port # so the mapping resolves to something valid. 
@@ -55,7 +61,7 @@ jobs: env: GH_TOKEN: ${{ steps.app-token.outputs.token }} run: | - gh issue view ${{ inputs.issue-number }} --json title,body,labels \ + gh issue view "$ISSUE_NUMBER" --json title,body,labels \ --template '# {{.title}}{{"\n\n"}}{{.body}}' > issue-body.md - name: Build & start devcontainer (cached via gha) @@ -220,7 +226,7 @@ jobs: 4. Run the full suite: pnpm run test:nonInteractive 5. Run pnpm run lint and pnpm run type-check. 6. If any check fails, adjust (max 5 attempts). - 7. Write pr-body.md (include 'Fixes #${{ inputs.issue-number }}') + 7. Write pr-body.md (include 'Fixes #${{ env.ISSUE_NUMBER }}') and fix-title.txt. Do NOT run git, gh, or modify .env files. @@ -246,9 +252,8 @@ jobs: if: steps.verify.outputs.result == 'success' id: push run: | - ISSUE_NUM="${{ inputs.issue-number }}" - BRANCH="fix/auto-${ISSUE_NUM}-experimental" - TITLE=$(cat fix-title.txt 2>/dev/null || echo "fix: auto-fix for #${ISSUE_NUM}") + BRANCH="fix/auto-${ISSUE_NUMBER}-experimental" + TITLE=$(cat fix-title.txt 2>/dev/null || echo "fix: auto-fix for #${ISSUE_NUMBER}") git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" @@ -271,10 +276,10 @@ jobs: GH_TOKEN: ${{ steps.app-token.outputs.token }} run: | BRANCH="${{ steps.push.outputs.branch }}" - TITLE=$(cat fix-title.txt 2>/dev/null || echo "fix: auto-fix for #${{ inputs.issue-number }}") + TITLE=$(cat fix-title.txt 2>/dev/null || echo "fix: auto-fix for #${ISSUE_NUMBER}") if [ ! -f pr-body.md ]; then - echo "Automated fix for #${{ inputs.issue-number }} (experimental visual flow)." > pr-body.md + echo "Automated fix for #${ISSUE_NUMBER} (experimental visual flow)." > pr-body.md fi # Append a marker so reviewers can tell this came from the experiment @@ -304,7 +309,7 @@ jobs: fi printf '## Bug Fix Agent (Experimental — Visual) — Analysis\n\n%s\n\n---\n_This is from the experimental visual flow. 
The production fixer may behave differently._\n' \ "$ANALYSIS" > /tmp/comment-body.md - gh issue comment "${{ inputs.issue-number }}" --body-file /tmp/comment-body.md + gh issue comment "$ISSUE_NUMBER" --body-file /tmp/comment-body.md - name: Diagnostics on failure if: failure() || steps.verify.outputs.result != 'success' From 40b65274eb69d861fa5d6d6bc32efd76df0fc62b Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Mon, 27 Apr 2026 14:58:10 +0200 Subject: [PATCH 3/8] ci(experiment): trigger via PR-label wrapper; drop issue comment --- .../workflows/_bug-fix-agent-experimental.yml | 32 ++++--------------- .../workflows/experiment-bug-fix-trigger.yml | 32 +++++++++++++++++++ 2 files changed, 39 insertions(+), 25 deletions(-) create mode 100644 .github/workflows/experiment-bug-fix-trigger.yml diff --git a/.github/workflows/_bug-fix-agent-experimental.yml b/.github/workflows/_bug-fix-agent-experimental.yml index 756f854c5..5fe8e0b89 100644 --- a/.github/workflows/_bug-fix-agent-experimental.yml +++ b/.github/workflows/_bug-fix-agent-experimental.yml @@ -5,20 +5,16 @@ name: Bug Fix Agent (Experimental — Visual) # (xdotool, screenshots) when a unit-test reproduction is impractical. PR flow # is identical to the production agent — base = main, head = fix/auto-. # -# Trigger: push a commit to this branch. -# git commit --allow-empty -m "trigger experiment" && git push -# To push without firing a run, prefix the commit message with [skip ci]. -# -# The branch filter pins this workflow to its dedicated experiment branch, so -# it cannot fire from main or any other branch even if the file ends up there. +# This workflow is `workflow_call`-only (mirrors the production agent's shape, +# so the underlying `claude-code-action` only ever sees an event type it +# accepts). It is invoked from `experiment-bug-fix-trigger.yml`, which fires +# on `pull_request: labeled` against the experiment PR. # # This workflow operates on a single fixed issue (set in env.ISSUE_NUMBER # below). 
To target a different issue, edit the env var and push. on: - push: - branches: - - experiment/bug-fix-visual + workflow_call: concurrency: group: bug-fix-experimental-663 @@ -35,8 +31,8 @@ jobs: issues: write id-token: write env: - # Issue this experiment is targeting. Hardcoded so the workflow can - # be a `push:`-triggered file with no inputs. Change + push to retarget. + # Issue this experiment is targeting. Hardcoded for this POC; change + + # push + re-label to retarget another issue. ISSUE_NUMBER: '663' # Required by .devcontainer/devcontainer.json runArgs port mapping. # In CI we don't need the host-side noVNC tunnel; just pick a free port @@ -297,20 +293,6 @@ jobs: --head "$BRANCH" \ --label "auto-fix-experimental" - - name: Comment on issue (failure path) - if: always() && steps.verify.outputs.result != 'success' && steps.phase1.outcome != 'skipped' - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - if [ -f bug-analysis.md ]; then - ANALYSIS=$(cat bug-analysis.md) - else - ANALYSIS="The experimental agent could not analyze this bug." - fi - printf '## Bug Fix Agent (Experimental — Visual) — Analysis\n\n%s\n\n---\n_This is from the experimental visual flow. The production fixer may behave differently._\n' \ - "$ANALYSIS" > /tmp/comment-body.md - gh issue comment "$ISSUE_NUMBER" --body-file /tmp/comment-body.md - - name: Diagnostics on failure if: failure() || steps.verify.outputs.result != 'success' run: | diff --git a/.github/workflows/experiment-bug-fix-trigger.yml b/.github/workflows/experiment-bug-fix-trigger.yml new file mode 100644 index 000000000..323cfac23 --- /dev/null +++ b/.github/workflows/experiment-bug-fix-trigger.yml @@ -0,0 +1,32 @@ +name: Experiment Bug-Fix Trigger + +# Branch-only trigger for the experimental visual bug-fix agent. +# Fires when label `run-experiment-bug-fix` is added to the experiment PR +# (head ref `experiment/bug-fix-visual`). 
For same-repo PRs, GitHub uses the +# workflow file from the head ref, so this workflow does not need to exist +# on `main` to be eligible to run. +# +# To fire a run: +# gh pr edit 2120 --add-label run-experiment-bug-fix +# To fire again, remove the label and re-add it: +# gh pr edit 2120 --remove-label run-experiment-bug-fix \ +# && gh pr edit 2120 --add-label run-experiment-bug-fix + +on: + pull_request: + types: [labeled] + +permissions: + contents: write + pull-requests: write + issues: write + id-token: write + +jobs: + run-experiment: + name: Run Experimental Bug-Fix Agent + if: >- + github.event.label.name == 'run-experiment-bug-fix' + && github.head_ref == 'experiment/bug-fix-visual' + uses: ./.github/workflows/_bug-fix-agent-experimental.yml + secrets: inherit From 1e7e64e811e35d83a74cca9e0fb8c7531bd4c0b3 Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Tue, 28 Apr 2026 10:58:00 +0200 Subject: [PATCH 4/8] ci(experiment): pivot to agent-free devcontainer-in-CI proof --- .../workflows/_bug-fix-agent-experimental.yml | 311 ------------------ .../workflows/experiment-bug-fix-trigger.yml | 32 -- .../experiment-devcontainer-proof.yml | 210 ++++++++++++ 3 files changed, 210 insertions(+), 343 deletions(-) delete mode 100644 .github/workflows/_bug-fix-agent-experimental.yml delete mode 100644 .github/workflows/experiment-bug-fix-trigger.yml create mode 100644 .github/workflows/experiment-devcontainer-proof.yml diff --git a/.github/workflows/_bug-fix-agent-experimental.yml b/.github/workflows/_bug-fix-agent-experimental.yml deleted file mode 100644 index 5fe8e0b89..000000000 --- a/.github/workflows/_bug-fix-agent-experimental.yml +++ /dev/null @@ -1,311 +0,0 @@ -name: Bug Fix Agent (Experimental — Visual) - -# EXPERIMENT: tier-2 bug fixing with a live ToolHive Studio running inside the -# project's devcontainer. The agent can drive the running app via `docker exec` -# (xdotool, screenshots) when a unit-test reproduction is impractical. 
PR flow -# is identical to the production agent — base = main, head = fix/auto-. -# -# This workflow is `workflow_call`-only (mirrors the production agent's shape, -# so the underlying `claude-code-action` only ever sees an event type it -# accepts). It is invoked from `experiment-bug-fix-trigger.yml`, which fires -# on `pull_request: labeled` against the experiment PR. -# -# This workflow operates on a single fixed issue (set in env.ISSUE_NUMBER -# below). To target a different issue, edit the env var and push. - -on: - workflow_call: - -concurrency: - group: bug-fix-experimental-663 - cancel-in-progress: true - -jobs: - fix: - name: Bug Fix (Visual POC) - runs-on: ubuntu-latest - timeout-minutes: 90 - permissions: - contents: write - pull-requests: write - issues: write - id-token: write - env: - # Issue this experiment is targeting. Hardcoded for this POC; change + - # push + re-label to retarget another issue. - ISSUE_NUMBER: '663' - # Required by .devcontainer/devcontainer.json runArgs port mapping. - # In CI we don't need the host-side noVNC tunnel; just pick a free port - # so the mapping resolves to something valid. 
- NOVNC_HOST_PORT: 6080 - - steps: - - name: Generate GitHub App token - id: app-token - uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v3 - with: - app-id: ${{ secrets.TOOLHIVE_STUDIO_CI_APP_ID }} - private-key: ${{ secrets.TOOLHIVE_STUDIO_CI_APP_KEY }} - - - name: Checkout main - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - with: - ref: refs/heads/main - token: ${{ steps.app-token.outputs.token }} - - - name: Fetch issue body - env: - GH_TOKEN: ${{ steps.app-token.outputs.token }} - run: | - gh issue view "$ISSUE_NUMBER" --json title,body,labels \ - --template '# {{.title}}{{"\n\n"}}{{.body}}' > issue-body.md - - - name: Build & start devcontainer (cached via gha) - uses: devcontainers/ci@v0.3 - with: - imageName: toolhive-studio-devcontainer - cacheFrom: type=gha - push: never - runCmd: | - echo "Devcontainer up. Workspace: $(pwd)" - ls -la - node --version - pnpm --version - - - name: Find container ID - id: container - run: | - CONTAINER=$(docker ps \ - --filter "label=devcontainer.local_folder=$GITHUB_WORKSPACE" \ - --format '{{.ID}}' | head -1) - if [ -z "$CONTAINER" ]; then - echo "::error::No devcontainer found for $GITHUB_WORKSPACE" - docker ps - exit 1 - fi - echo "id=$CONTAINER" >> $GITHUB_OUTPUT - echo "::notice::Devcontainer ID: $CONTAINER" - - - name: Launch app inside devcontainer (background) - run: | - # Detached exec; the entrypoint backgrounds Xvfb/fluxbox/x11vnc/dbus/keyring, - # then runs `pnpm start` under a PTY wrapper (script -qfc) that survives - # the lack of a TTY in CI. - docker exec -d -u node ${{ steps.container.outputs.id }} \ - bash -c 'cd /workspaces/toolhive-studio && \ - nohup bash scripts/devcontainer-entrypoint.sh \ - > /tmp/entrypoint.log 2>&1 &' - - - name: Wait for app readiness - timeout-minutes: 8 - run: | - CONTAINER=${{ steps.container.outputs.id }} - for i in $(seq 1 90); do - # Three signals: noVNC HTTP up, electron running, thv serve running. 
- # Use [t]hv / [e]lectron bracket trick so pgrep doesn't self-match - # the shell's argv (see devcontainer-dev skill, "pgrep self-match"). - if docker exec "$CONTAINER" bash -c ' - curl -fsS http://localhost:6080/ >/dev/null 2>&1 \ - && pgrep -f "[e]lectron/dist/electron" >/dev/null \ - && pgrep -f "[t]hv serve" >/dev/null - '; then - echo "::notice::App ready (attempt $i)" - exit 0 - fi - echo "[$i/90] not ready yet..." - sleep 5 - done - echo "::error::App did not become ready" - echo '--- entrypoint.log (tail) ---' - docker exec "$CONTAINER" tail -200 /tmp/entrypoint.log || true - echo '--- xvfb.log (tail) ---' - docker exec "$CONTAINER" tail -50 /tmp/xvfb.log || true - exit 1 - - - name: 'Phase 1 — Analyze & Repro (Opus, with live app access)' - id: phase1 - continue-on-error: true - uses: anthropics/claude-code-action@567fe954a4527e81f132d87d1bdbcc94f7737434 # v1 - env: - DEVCONTAINER_ID: ${{ steps.container.outputs.id }} - with: - anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} - prompt: | - Read .claude/skills/bug-fix-tdd/SKILL.md and follow it. - Read issue-body.md for the bug report. - - EXPERIMENTAL — live app available: - A running ToolHive Studio is available inside a devcontainer. - The container ID is in the environment variable DEVCONTAINER_ID. - DISPLAY=:99 inside the container. The workspace is bind-mounted at - /workspaces/toolhive-studio, so source edits on the runner propagate - to the container via Vite HMR (renderer changes only — main-process - edits would need an Electron restart). 
- - Useful commands: - docker exec "$DEVCONTAINER_ID" bash -c 'DISPLAY=:99 xwininfo -root -tree' - docker exec "$DEVCONTAINER_ID" bash -c 'DISPLAY=:99 xdotool search --name ToolHive' - docker exec "$DEVCONTAINER_ID" bash -c 'DISPLAY=:99 xdotool mousemove X Y click 1' - docker exec "$DEVCONTAINER_ID" bash -c 'DISPLAY=:99 xdotool key Tab' - docker exec "$DEVCONTAINER_ID" bash -c 'DISPLAY=:99 import -window root /tmp/shot.png' - docker cp "$DEVCONTAINER_ID":/tmp/shot.png /tmp/shot.png - # then Read /tmp/shot.png to view it - - WHEN TO USE THE LIVE APP: - Default to a unit-test reproduction. Reach for the live app only when - the bug genuinely requires it: cross-route side effects, async backend - lifecycle (`thv` workload events), real IPC. Pixel-driving is fragile - and slow — even when you use the live app for repro, your eventual - FAILING TEST should be a unit test that captures the underlying - invariant (e.g. mock the IPC event and assert the router behavior). - The unit test is what gates Phase 2. - - Your task (Phase 1): - 1. Analyze the bug report. - 2. Reproduce the bug (in the live app and/or as a failing unit test). - 3. Write a unit test that captures the bug — must FAIL. - 4. Run: pnpm run test:nonInteractive -- - 5. Verify the failure is for the RIGHT reason. - 6. Write bug-analysis.md per the skill format. - Include a "Visual repro:" section if you used the live app. - - Do NOT modify source files in this phase. Tests + bug-analysis.md only. - claude_args: >- - --model opus - --max-turns 75 - --allowedTools "Read,Grep,Glob,Edit,Write,Bash(pnpm run test:nonInteractive *),Bash(cat *),Bash(ls *),Bash(docker exec *),Bash(docker cp *),Bash(docker ps *)" - - - name: 'Hard gate — Verify test fails' - id: gate - if: steps.phase1.outcome == 'success' - run: | - if [ ! 
-f bug-analysis.md ]; then - echo "::warning::bug-analysis.md not found" - echo "test_fails=false" >> $GITHUB_OUTPUT - exit 0 - fi - TEST_FILE=$(grep "^Test file:" bug-analysis.md | sed 's/^Test file: //' || true) - if [ -z "$TEST_FILE" ]; then - echo "::warning::No 'Test file:' line in bug-analysis.md" - echo "test_fails=false" >> $GITHUB_OUTPUT - exit 0 - fi - echo "test_file=$TEST_FILE" >> $GITHUB_OUTPUT - if pnpm run test:nonInteractive -- "$TEST_FILE" 2>&1; then - echo "::warning::Test passed (bug not reproduced)" - echo "test_fails=false" >> $GITHUB_OUTPUT - else - echo "::notice::Test fails as expected" - echo "test_fails=true" >> $GITHUB_OUTPUT - fi - - - name: 'Phase 2 — TDD Fix (Sonnet)' - id: phase2 - if: steps.gate.outputs.test_fails == 'true' - continue-on-error: true - uses: anthropics/claude-code-action@567fe954a4527e81f132d87d1bdbcc94f7737434 # v1 - env: - DEVCONTAINER_ID: ${{ steps.container.outputs.id }} - with: - anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} - prompt: | - Read .claude/skills/bug-fix-tdd/SKILL.md and bug-analysis.md. - - The live app from Phase 1 is still running in the devcontainer - (ID: $DEVCONTAINER_ID). You may use it for sanity-checking your fix, - but the gate is still: pnpm run test:nonInteractive passes. - - Your task (Phase 2 — Fix): - 1. Read the failing test and understand what it expects. - 2. Apply the MINIMUM fix to make the test pass. - 3. Run: pnpm run test:nonInteractive -- ${{ steps.gate.outputs.test_file }} - 4. Run the full suite: pnpm run test:nonInteractive - 5. Run pnpm run lint and pnpm run type-check. - 6. If any check fails, adjust (max 5 attempts). - 7. Write pr-body.md (include 'Fixes #${{ env.ISSUE_NUMBER }}') - and fix-title.txt. - - Do NOT run git, gh, or modify .env files. - Do NOT over-engineer. 
- claude_args: >- - --model sonnet - --max-turns 150 - --allowedTools "Read,Grep,Glob,Edit,Write,Bash(pnpm run test:nonInteractive *),Bash(pnpm run lint),Bash(pnpm run type-check),Bash(cat *),Bash(ls *),Bash(docker exec *),Bash(docker cp *)" - - - name: 'Hard gate — Verify all checks pass' - id: verify - if: steps.phase2.outcome == 'success' - run: | - if pnpm run test:nonInteractive && pnpm run lint && pnpm run type-check; then - echo "::notice::All checks pass" - echo "result=success" >> $GITHUB_OUTPUT - else - echo "::warning::Verification failed" - echo "result=failure" >> $GITHUB_OUTPUT - fi - - - name: Create branch and commit - if: steps.verify.outputs.result == 'success' - id: push - run: | - BRANCH="fix/auto-${ISSUE_NUMBER}-experimental" - TITLE=$(cat fix-title.txt 2>/dev/null || echo "fix: auto-fix for #${ISSUE_NUMBER}") - - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - git checkout -B "$BRANCH" - git add -A -- '*.ts' '*.tsx' '**/*.ts' '**/*.tsx' - - if git diff --cached --quiet; then - echo "::warning::No source changes to commit" - echo "has_changes=false" >> $GITHUB_OUTPUT - else - git commit -m "$TITLE" - git push -u origin "$BRANCH" - echo "has_changes=true" >> $GITHUB_OUTPUT - echo "branch=$BRANCH" >> $GITHUB_OUTPUT - fi - - - name: Create Pull Request - if: steps.verify.outputs.result == 'success' && steps.push.outputs.has_changes == 'true' - env: - GH_TOKEN: ${{ steps.app-token.outputs.token }} - run: | - BRANCH="${{ steps.push.outputs.branch }}" - TITLE=$(cat fix-title.txt 2>/dev/null || echo "fix: auto-fix for #${ISSUE_NUMBER}") - - if [ ! -f pr-body.md ]; then - echo "Automated fix for #${ISSUE_NUMBER} (experimental visual flow)." 
> pr-body.md - fi - - # Append a marker so reviewers can tell this came from the experiment - printf '\n\n---\n_Generated by the experimental visual bug-fix flow (`%s`)._\n' \ - "${{ github.workflow }}" >> pr-body.md - - gh label create auto-fix-experimental \ - --description "Experimental visual bug-fix agent" \ - --color FBCA04 2>/dev/null || true - - gh pr create \ - --title "$TITLE" \ - --body-file pr-body.md \ - --base main \ - --head "$BRANCH" \ - --label "auto-fix-experimental" - - - name: Diagnostics on failure - if: failure() || steps.verify.outputs.result != 'success' - run: | - CONTAINER=${{ steps.container.outputs.id }} - if [ -n "$CONTAINER" ]; then - echo '=== Container processes ===' - docker exec "$CONTAINER" ps auxf || true - echo '=== entrypoint.log ===' - docker exec "$CONTAINER" tail -300 /tmp/entrypoint.log || true - echo '=== xvfb.log ===' - docker exec "$CONTAINER" tail -100 /tmp/xvfb.log || true - echo '=== fluxbox.log ===' - docker exec "$CONTAINER" tail -100 /tmp/fluxbox.log || true - echo '=== keyring.log ===' - docker exec "$CONTAINER" tail -50 /tmp/keyring.log || true - fi diff --git a/.github/workflows/experiment-bug-fix-trigger.yml b/.github/workflows/experiment-bug-fix-trigger.yml deleted file mode 100644 index 323cfac23..000000000 --- a/.github/workflows/experiment-bug-fix-trigger.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Experiment Bug-Fix Trigger - -# Branch-only trigger for the experimental visual bug-fix agent. -# Fires when label `run-experiment-bug-fix` is added to the experiment PR -# (head ref `experiment/bug-fix-visual`). For same-repo PRs, GitHub uses the -# workflow file from the head ref, so this workflow does not need to exist -# on `main` to be eligible to run. 
-# -# To fire a run: -# gh pr edit 2120 --add-label run-experiment-bug-fix -# To fire again, remove the label and re-add it: -# gh pr edit 2120 --remove-label run-experiment-bug-fix \ -# && gh pr edit 2120 --add-label run-experiment-bug-fix - -on: - pull_request: - types: [labeled] - -permissions: - contents: write - pull-requests: write - issues: write - id-token: write - -jobs: - run-experiment: - name: Run Experimental Bug-Fix Agent - if: >- - github.event.label.name == 'run-experiment-bug-fix' - && github.head_ref == 'experiment/bug-fix-visual' - uses: ./.github/workflows/_bug-fix-agent-experimental.yml - secrets: inherit diff --git a/.github/workflows/experiment-devcontainer-proof.yml b/.github/workflows/experiment-devcontainer-proof.yml new file mode 100644 index 000000000..81f76a740 --- /dev/null +++ b/.github/workflows/experiment-devcontainer-proof.yml @@ -0,0 +1,210 @@ +name: Devcontainer-in-CI Proof + +# Proof-of-concept: boot the project's devcontainer (Xvfb + fluxbox + Electron +# + thv) inside a GitHub Actions runner, drive the live app via docker exec +# (xdotool, screenshots), and run a workspace smoke check (type-check) inside +# the container. No agent involved — this isolates the genuinely novel piece +# (devcontainer-in-CI) from the AI-agent piece, which has a separate +# constraint (`claude-code-action` enforces workflow_file == default branch). +# +# Trigger: push to this branch (no claude-code-action means no validation). +# git commit --allow-empty -m "trigger" && git push +# To skip a run, use [skip ci] in the commit message. +# +# Once this proof is reliable, integrating it into the production +# `_bug-fix-agent.yml` is small: add the devcontainer steps before the +# claude-code-action step, expand `--allowedTools`, append the live-app +# paragraph to the prompt. 
+ +on: + push: + branches: + - experiment/bug-fix-visual + +permissions: + contents: read + +concurrency: + group: devcontainer-proof-${{ github.ref }} + cancel-in-progress: true + +jobs: + proof: + name: Devcontainer Proof + runs-on: ubuntu-latest + timeout-minutes: 45 + env: + # Required by .devcontainer/devcontainer.json runArgs port mapping. + # Anything valid; we don't expose the port off-runner. + NOVNC_HOST_PORT: 6080 + + steps: + - name: Mark t0 (workflow start) + id: t0 + run: echo "epoch=$(date +%s)" >> $GITHUB_OUTPUT + + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - name: Build & start devcontainer (cached via gha) + uses: devcontainers/ci@v0.3 + with: + imageName: toolhive-studio-devcontainer + cacheFrom: type=gha + push: never + runCmd: | + echo "Devcontainer up. Workspace: $(pwd)" + ls -la + node --version + pnpm --version + + - name: Mark t1 (devcontainer up); record cold/warm + id: t1 + run: | + NOW=$(date +%s) + ELAPSED=$(( NOW - ${{ steps.t0.outputs.epoch }} )) + echo "epoch=$NOW" >> $GITHUB_OUTPUT + echo "elapsed=$ELAPSED" >> $GITHUB_OUTPUT + echo "::notice::Devcontainer up after ${ELAPSED}s" + + - name: Find container ID + id: container + run: | + C=$(docker ps \ + --filter "label=devcontainer.local_folder=$GITHUB_WORKSPACE" \ + --format '{{.ID}}' | head -1) + if [ -z "$C" ]; then + echo "::error::No devcontainer found for $GITHUB_WORKSPACE" + docker ps + exit 1 + fi + echo "id=$C" >> $GITHUB_OUTPUT + echo "::notice::Devcontainer ID: $C" + + - name: Launch app inside devcontainer (background) + run: | + docker exec -d -u node ${{ steps.container.outputs.id }} \ + bash -c 'cd /workspaces/toolhive-studio && \ + nohup bash scripts/devcontainer-entrypoint.sh \ + > /tmp/entrypoint.log 2>&1 &' + + - name: Wait for app readiness + id: ready + timeout-minutes: 12 # must exceed the 120 x 5s poll loop below so its own error path can run + run: | + C=${{ steps.container.outputs.id }} + T_START=$(date +%s) + for i in $(seq 1 120); do + # See devcontainer-dev skill — bracket trick
avoids pgrep self-match. + if docker exec "$C" bash -c ' + curl -fsS http://localhost:6080/ >/dev/null 2>&1 \ + && pgrep -f "[e]lectron/dist/electron" >/dev/null \ + && pgrep -f "[t]hv serve" >/dev/null + '; then + NOW=$(date +%s) + ELAPSED=$(( NOW - T_START )) + SINCE_T1=$(( NOW - ${{ steps.t1.outputs.epoch }} )) + echo "::notice::App ready in ${ELAPSED}s after launch (${SINCE_T1}s after devcontainer up)" + echo "elapsed=$ELAPSED" >> $GITHUB_OUTPUT + echo "since_t1=$SINCE_T1" >> $GITHUB_OUTPUT + exit 0 + fi + echo "[$i/120] not ready yet..." + sleep 5 + done + echo "::error::App did not become ready within 10 minutes" + docker exec "$C" tail -200 /tmp/entrypoint.log || true + exit 1 + + - name: Screenshot 1 — initial paint + run: | + C=${{ steps.container.outputs.id }} + docker exec "$C" bash -c 'DISPLAY=:99 import -window root /tmp/shot1.png' + docker cp "$C:/tmp/shot1.png" ./shot1.png + ls -la ./shot1.png + + - name: Inspect window tree (sanity check) + run: | + C=${{ steps.container.outputs.id }} + echo '--- xwininfo -root -tree ---' + docker exec "$C" bash -c 'DISPLAY=:99 xwininfo -root -tree' || true + echo '--- xdotool search --name ToolHive ---' + docker exec "$C" bash -c 'DISPLAY=:99 xdotool search --name ToolHive' || true + + - name: Drive xdotool — focus app and send Tab + run: | + C=${{ steps.container.outputs.id }} + docker exec "$C" bash -c 'DISPLAY=:99 xdotool search --name ToolHive windowactivate' || true + sleep 2 + docker exec "$C" bash -c 'DISPLAY=:99 xdotool key Tab' || true + sleep 1 + + - name: Screenshot 2 — after input + run: | + C=${{ steps.container.outputs.id }} + docker exec "$C" bash -c 'DISPLAY=:99 import -window root /tmp/shot2.png' + docker cp "$C:/tmp/shot2.png" ./shot2.png + ls -la ./shot2.png + + - name: Workspace smoke check — type-check inside container + id: type_check + run: | + C=${{ steps.container.outputs.id }} + T_START=$(date +%s) + if docker exec -u node "$C" bash -c \ + 'cd /workspaces/toolhive-studio && pnpm run 
type-check'; then + NOW=$(date +%s) + echo "elapsed=$(( NOW - T_START ))" >> $GITHUB_OUTPUT + echo "result=pass" >> $GITHUB_OUTPUT + else + NOW=$(date +%s) + echo "elapsed=$(( NOW - T_START ))" >> $GITHUB_OUTPUT + echo "result=fail" >> $GITHUB_OUTPUT + echo "::warning::type-check failed" + fi + + - name: Upload screenshots + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 + with: + name: screenshots + path: | + shot1.png + shot2.png + if-no-files-found: warn + retention-days: 7 + + - name: Job summary + if: always() + run: | + { + echo "## Devcontainer-in-CI Proof" + echo + echo "| Stage | Elapsed |" + echo "|---|---|" + echo "| Checkout + devcontainer build/start | ${{ steps.t1.outputs.elapsed || 'n/a' }}s |" + echo "| App readiness (after launch) | ${{ steps.ready.outputs.elapsed || 'n/a' }}s |" + echo "| App readiness (since devcontainer) | ${{ steps.ready.outputs.since_t1 || 'n/a' }}s |" + echo "| Type-check inside container | ${{ steps.type_check.outputs.elapsed || 'n/a' }}s (${{ steps.type_check.outputs.result || 'n/a' }}) |" + echo + echo "Cache backend: \`type=gha\`. Compare cold (first run) and warm (subsequent) timings to evaluate cache effectiveness." + echo + echo "Screenshots uploaded as the **screenshots** artifact (\`shot1.png\` initial, \`shot2.png\` after Tab keystroke)." 
+ } >> $GITHUB_STEP_SUMMARY + + - name: Diagnostics on failure + if: failure() + run: | + C=${{ steps.container.outputs.id }} + if [ -n "$C" ]; then + echo '=== Container processes ===' + docker exec "$C" ps auxf || true + echo '=== entrypoint.log ===' + docker exec "$C" tail -300 /tmp/entrypoint.log || true + echo '=== xvfb.log ===' + docker exec "$C" tail -100 /tmp/xvfb.log || true + echo '=== fluxbox.log ===' + docker exec "$C" tail -100 /tmp/fluxbox.log || true + echo '=== keyring.log ===' + docker exec "$C" tail -50 /tmp/keyring.log || true + fi From 90c81beccd1e0b56b73a72c63b45e67b5b59118a Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Tue, 28 Apr 2026 11:25:12 +0200 Subject: [PATCH 5/8] ci(experiment): add diagnostics to screenshot step; run as node --- .../experiment-devcontainer-proof.yml | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/.github/workflows/experiment-devcontainer-proof.yml b/.github/workflows/experiment-devcontainer-proof.yml index 81f76a740..0f08d0cbb 100644 --- a/.github/workflows/experiment-devcontainer-proof.yml +++ b/.github/workflows/experiment-devcontainer-proof.yml @@ -116,33 +116,49 @@ jobs: docker exec "$C" tail -200 /tmp/entrypoint.log || true exit 1 - - name: Screenshot 1 — initial paint + - name: Display + tooling sanity check run: | C=${{ steps.container.outputs.id }} - docker exec "$C" bash -c 'DISPLAY=:99 import -window root /tmp/shot1.png' - docker cp "$C:/tmp/shot1.png" ./shot1.png - ls -la ./shot1.png + echo '--- which import / xdotool / xwininfo ---' + docker exec "$C" bash -c 'which import xdotool xwininfo xdpyinfo 2>&1 || true' + echo '--- xdpyinfo (X server reachable on :99?) ---' + docker exec -u node "$C" bash -c 'DISPLAY=:99 xdpyinfo 2>&1 | head -10' || echo "xdpyinfo failed exit=$?" 
+ echo '--- xwininfo -root -tree ---' + docker exec -u node "$C" bash -c 'DISPLAY=:99 xwininfo -root -tree 2>&1' || true + echo '--- xdotool search --name ToolHive ---' + docker exec -u node "$C" bash -c 'DISPLAY=:99 xdotool search --name ToolHive 2>&1' || true - - name: Inspect window tree (sanity check) + - name: Screenshot 1 — initial paint run: | + set -x C=${{ steps.container.outputs.id }} - echo '--- xwininfo -root -tree ---' - docker exec "$C" bash -c 'DISPLAY=:99 xwininfo -root -tree' || true - echo '--- xdotool search --name ToolHive ---' - docker exec "$C" bash -c 'DISPLAY=:99 xdotool search --name ToolHive' || true + # Run as `node` to match the X server owner. Capture stderr explicitly + # so we can see *why* import fails if it does, instead of inferring. + docker exec -u node "$C" bash -c ' + set -x + DISPLAY=:99 import -window root /tmp/shot1.png 2>&1 + echo "import exit=$?" + ls -la /tmp/shot1.png 2>&1 + ' + docker cp "$C:/tmp/shot1.png" ./shot1.png + ls -la ./shot1.png - name: Drive xdotool — focus app and send Tab run: | C=${{ steps.container.outputs.id }} - docker exec "$C" bash -c 'DISPLAY=:99 xdotool search --name ToolHive windowactivate' || true + docker exec -u node "$C" bash -c 'DISPLAY=:99 xdotool search --name ToolHive windowactivate' || true sleep 2 - docker exec "$C" bash -c 'DISPLAY=:99 xdotool key Tab' || true + docker exec -u node "$C" bash -c 'DISPLAY=:99 xdotool key Tab' || true sleep 1 - name: Screenshot 2 — after input run: | C=${{ steps.container.outputs.id }} - docker exec "$C" bash -c 'DISPLAY=:99 import -window root /tmp/shot2.png' + docker exec -u node "$C" bash -c ' + DISPLAY=:99 import -window root /tmp/shot2.png 2>&1 + echo "import exit=$?" 
+ ls -la /tmp/shot2.png 2>&1 + ' docker cp "$C:/tmp/shot2.png" ./shot2.png ls -la ./shot2.png From a5363f543653899f9d27d764839148f915aaf8db Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Tue, 28 Apr 2026 13:01:41 +0200 Subject: [PATCH 6/8] feat(devcontainer): add screenshot + steal helpers for tmpfs files --- .../experiment-devcontainer-proof.yml | 36 ++----------- scripts/devcontainer-screenshot.sh | 52 +++++++++++++++++++ scripts/devcontainer-steal.sh | 28 ++++++++++ 3 files changed, 84 insertions(+), 32 deletions(-) create mode 100755 scripts/devcontainer-screenshot.sh create mode 100755 scripts/devcontainer-steal.sh diff --git a/.github/workflows/experiment-devcontainer-proof.yml b/.github/workflows/experiment-devcontainer-proof.yml index 0f08d0cbb..471f22062 100644 --- a/.github/workflows/experiment-devcontainer-proof.yml +++ b/.github/workflows/experiment-devcontainer-proof.yml @@ -116,32 +116,10 @@ jobs: docker exec "$C" tail -200 /tmp/entrypoint.log || true exit 1 - - name: Display + tooling sanity check - run: | - C=${{ steps.container.outputs.id }} - echo '--- which import / xdotool / xwininfo ---' - docker exec "$C" bash -c 'which import xdotool xwininfo xdpyinfo 2>&1 || true' - echo '--- xdpyinfo (X server reachable on :99?) ---' - docker exec -u node "$C" bash -c 'DISPLAY=:99 xdpyinfo 2>&1 | head -10' || echo "xdpyinfo failed exit=$?" - echo '--- xwininfo -root -tree ---' - docker exec -u node "$C" bash -c 'DISPLAY=:99 xwininfo -root -tree 2>&1' || true - echo '--- xdotool search --name ToolHive ---' - docker exec -u node "$C" bash -c 'DISPLAY=:99 xdotool search --name ToolHive 2>&1' || true - - name: Screenshot 1 — initial paint run: | - set -x - C=${{ steps.container.outputs.id }} - # Run as `node` to match the X server owner. Capture stderr explicitly - # so we can see *why* import fails if it does, instead of inferring. - docker exec -u node "$C" bash -c ' - set -x - DISPLAY=:99 import -window root /tmp/shot1.png 2>&1 - echo "import exit=$?" 
- ls -la /tmp/shot1.png 2>&1 - ' - docker cp "$C:/tmp/shot1.png" ./shot1.png - ls -la ./shot1.png + scripts/devcontainer-screenshot.sh shot1.png + ls -la shot1.png - name: Drive xdotool — focus app and send Tab run: | @@ -153,14 +131,8 @@ jobs: - name: Screenshot 2 — after input run: | - C=${{ steps.container.outputs.id }} - docker exec -u node "$C" bash -c ' - DISPLAY=:99 import -window root /tmp/shot2.png 2>&1 - echo "import exit=$?" - ls -la /tmp/shot2.png 2>&1 - ' - docker cp "$C:/tmp/shot2.png" ./shot2.png - ls -la ./shot2.png + scripts/devcontainer-screenshot.sh shot2.png + ls -la shot2.png - name: Workspace smoke check — type-check inside container id: type_check diff --git a/scripts/devcontainer-screenshot.sh b/scripts/devcontainer-screenshot.sh new file mode 100755 index 000000000..50e478586 --- /dev/null +++ b/scripts/devcontainer-screenshot.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Capture the running ToolHive Studio screen via X11 (Xvfb on :99) inside +# the project's devcontainer and place the PNG at the given host path. +# +# Default host path: ./toolhive-shot.png +# +# Internally: +# 1. Locate the devcontainer attached to the current workspace (or +# $GITHUB_WORKSPACE in CI). +# 2. Run `import -window root` inside the container to capture the root +# window of Xvfb on display :99. +# 3. Stream the file back to the host via devcontainer-steal.sh +# (necessary because /tmp inside the container is tmpfs — see that +# script for full context). +# 4. Clean up the intermediate file inside the container. 
+# +# Usage: +# devcontainer-screenshot.sh [host-path] +# +# Prints the absolute host path on stdout for ergonomic chaining: +# SHOT=$(scripts/devcontainer-screenshot.sh) +# +set -euo pipefail + +HOST_PATH="${1:-./toolhive-shot.png}" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +WORKSPACE="${GITHUB_WORKSPACE:-$(pwd)}" + +CONTAINER=$(docker ps \ + --filter "label=devcontainer.local_folder=$WORKSPACE" \ + --format '{{.ID}}' | head -1) + +if [ -z "$CONTAINER" ]; then + echo "devcontainer-screenshot.sh: no devcontainer found for $WORKSPACE" >&2 + echo " (is the devcontainer running? try: pnpm devContainer:dev)" >&2 + exit 1 +fi + +CONTAINER_TMP="/tmp/.devcontainer-shot-$$.png" + +docker exec -u node "$CONTAINER" bash -c \ + "DISPLAY=:99 import -window root '$CONTAINER_TMP'" + +"$SCRIPT_DIR/devcontainer-steal.sh" "$CONTAINER" "$CONTAINER_TMP" "$HOST_PATH" + +docker exec -u node "$CONTAINER" rm -f "$CONTAINER_TMP" || true + +# Resolve to absolute path for downstream consumers. +case "$HOST_PATH" in + /*) echo "$HOST_PATH" ;; + *) echo "$(cd "$(dirname "$HOST_PATH")" && pwd)/$(basename "$HOST_PATH")" ;; +esac diff --git a/scripts/devcontainer-steal.sh b/scripts/devcontainer-steal.sh new file mode 100755 index 000000000..ab3af7b3e --- /dev/null +++ b/scripts/devcontainer-steal.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Stream a file out of a running devcontainer to the host filesystem. +# +# Why this exists: `/tmp` (and possibly other paths) inside the +# ToolHive Studio devcontainer are mounted as tmpfs. Docker's `docker cp` +# cannot read from tmpfs mounts — it only traverses the container's overlay +# layers — so `docker cp $C:/tmp/foo .` returns "Could not find the file" +# even when `ls /tmp/foo` inside the container confirms it exists. This +# script bypasses `docker cp` by streaming the file via `docker exec cat`, +# which works regardless of mount type. 
+# +# Usage: +# devcontainer-steal.sh <container> <container-path> <host-path> +# +# Example: +# devcontainer-steal.sh "$C" /tmp/foo.bin ./foo.bin +# +# Tip: the screenshot helper (devcontainer-screenshot.sh) calls this +# internally, so most callers don't need to invoke it directly. Reach for +# this script when extracting non-screenshot artifacts (logs, generated +# files, etc.) from /tmp or any other tmpfs-backed path. +set -euo pipefail + +CONTAINER="${1:?usage: devcontainer-steal.sh <container> <container-path> <host-path>}" +SRC="${2:?missing container path}" +DEST="${3:?missing host path}" + +docker exec -u node "$CONTAINER" cat "$SRC" > "$DEST" From c645045ca5c585fd154afbe883edecccbb638146 Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Tue, 28 Apr 2026 15:11:45 +0200 Subject: [PATCH 7/8] ci(experiment): tighten readiness gate; fix gha cache; sync skill --- .claude/skills/devcontainer-dev/SKILL.md | 40 ++++++++++++++++++- .codex/skills/devcontainer-dev/SKILL.md | 40 ++++++++++++++++++- .cursor/skills/devcontainer-dev/SKILL.md | 40 ++++++++++++++++++- .../experiment-devcontainer-proof.yml | 23 +++++++++-- 4 files changed, 134 insertions(+), 9 deletions(-) diff --git a/.claude/skills/devcontainer-dev/SKILL.md b/.claude/skills/devcontainer-dev/SKILL.md index 67e93072c..1a7db53be 100644 --- a/.claude/skills/devcontainer-dev/SKILL.md +++ b/.claude/skills/devcontainer-dev/SKILL.md @@ -72,6 +72,22 @@ The readiness banner is what you care about. It gates on **three** signals simul Only once all three are true does the banner fire and the host's browser auto-open. +> **Process running ≠ UI rendered.** The three gates above tell you the +> backing processes are alive, not that Electron has actually painted. +> A devcontainer-in-CI proof captured a blank screen because all three +> gates passed while the window was still mid-first-paint. For +> agent-grade readiness (e.g.
before driving the app via `xdotool` or +> taking a screenshot to feed to a vision model), add this gate: +> +> ```bash +> docker exec -u node "$CONTAINER" bash -c \ +> 'DISPLAY=:99 xdotool search --class ToolHive >/dev/null 2>&1' +> ``` +> +> `xdotool search --class` only succeeds once the main window is mapped +> on Xvfb. A short settling sleep (~2s) after that first match is cheap +> insurance against catching a partially rendered frame. + --- ## Per-worktree isolation @@ -146,13 +162,33 @@ All commands run via `docker exec` against the container with `DISPLAY=:99` set ### See the screen (screenshots) +Use the project's helper script — it auto-finds the container, captures the +root window, and streams the PNG out to the host. Prints the absolute host +path on stdout so it composes: + ```bash -# Take a PNG of the whole virtual framebuffer +SHOT=$(scripts/devcontainer-screenshot.sh) +# or with an explicit path: +scripts/devcontainer-screenshot.sh /tmp/shot.png +``` + +**Why a helper script and not just `docker cp`?** `/tmp` (and possibly +other paths) inside the devcontainer is mounted as `tmpfs`. Docker's +`docker cp` cannot read from tmpfs mounts — it only traverses the +container's overlay layers — so the obvious one-liner + +```bash +# DON'T — silently broken: import succeeds, ls confirms the file, +# but docker cp says "Could not find the file in container". docker exec "$CONTAINER" bash -c 'DISPLAY=:99 import -window root /tmp/shot.png' -# Copy it to the host for viewing / feeding to a vision model docker cp "$CONTAINER:/tmp/shot.png" /tmp/shot.png ``` +**fails everywhere** (CI and local). The helper bypasses `docker cp` by +streaming the file via `docker exec cat` (see `scripts/devcontainer-steal.sh` +for the generic version — use that one to extract any tmpfs file, e.g. logs +or generated artifacts, not just screenshots). + `import` is from ImageMagick. For a specific window only, use `xwininfo` to get the WID then `import -window <WID>`.
### See the window tree (what's there, where, which is focused) diff --git a/.codex/skills/devcontainer-dev/SKILL.md b/.codex/skills/devcontainer-dev/SKILL.md index 67e93072c..1a7db53be 100644 --- a/.codex/skills/devcontainer-dev/SKILL.md +++ b/.codex/skills/devcontainer-dev/SKILL.md @@ -72,6 +72,22 @@ The readiness banner is what you care about. It gates on **three** signals simul Only once all three are true does the banner fire and the host's browser auto-open. +> **Process running ≠ UI rendered.** The three gates above tell you the +> backing processes are alive, not that Electron has actually painted. +> A devcontainer-in-CI proof captured a blank screen because all three +> gates passed while the window was still mid-first-paint. For +> agent-grade readiness (e.g. before driving the app via `xdotool` or +> taking a screenshot to feed to a vision model), add this gate: +> +> ```bash +> docker exec -u node "$CONTAINER" bash -c \ +> 'DISPLAY=:99 xdotool search --class ToolHive >/dev/null 2>&1' +> ``` +> +> `xdotool search --class` only succeeds once the main window is mapped +> on Xvfb. A short settling sleep (~2s) after that first match is cheap +> insurance against catching a partially rendered frame. + --- ## Per-worktree isolation @@ -146,13 +162,33 @@ All commands run via `docker exec` against the container with `DISPLAY=:99` set ### See the screen (screenshots) +Use the project's helper script — it auto-finds the container, captures the +root window, and streams the PNG out to the host. Prints the absolute host +path on stdout so it composes: + ```bash -# Take a PNG of the whole virtual framebuffer +SHOT=$(scripts/devcontainer-screenshot.sh) +# or with an explicit path: +scripts/devcontainer-screenshot.sh /tmp/shot.png +``` + +**Why a helper script and not just `docker cp`?** `/tmp` (and possibly +other paths) inside the devcontainer is mounted as `tmpfs`. 
Docker's +`docker cp` cannot read from tmpfs mounts — it only traverses the +container's overlay layers — so the obvious one-liner + +```bash +# DON'T — silently broken: import succeeds, ls confirms the file, +# but docker cp says "Could not find the file in container". docker exec "$CONTAINER" bash -c 'DISPLAY=:99 import -window root /tmp/shot.png' -# Copy it to the host for viewing / feeding to a vision model docker cp "$CONTAINER:/tmp/shot.png" /tmp/shot.png ``` +**fails everywhere** (CI and local). The helper bypasses `docker cp` by +streaming the file via `docker exec cat` (see `scripts/devcontainer-steal.sh` +for the generic version — use that one to extract any tmpfs file, e.g. logs +or generated artifacts, not just screenshots). + `import` is from ImageMagick. For a specific window only, use `xwininfo` to get the WID then `import -window <WID>`. ### See the window tree (what's there, where, which is focused) diff --git a/.cursor/skills/devcontainer-dev/SKILL.md b/.cursor/skills/devcontainer-dev/SKILL.md index 67e93072c..1a7db53be 100644 --- a/.cursor/skills/devcontainer-dev/SKILL.md +++ b/.cursor/skills/devcontainer-dev/SKILL.md @@ -72,6 +72,22 @@ The readiness banner is what you care about. It gates on **three** signals simul Only once all three are true does the banner fire and the host's browser auto-open. +> **Process running ≠ UI rendered.** The three gates above tell you the +> backing processes are alive, not that Electron has actually painted. +> A devcontainer-in-CI proof captured a blank screen because all three +> gates passed while the window was still mid-first-paint. For +> agent-grade readiness (e.g. before driving the app via `xdotool` or +> taking a screenshot to feed to a vision model), add this gate: +> +> ```bash +> docker exec -u node "$CONTAINER" bash -c \ +> 'DISPLAY=:99 xdotool search --class ToolHive >/dev/null 2>&1' +> ``` +> +> `xdotool search --class` only succeeds once the main window is mapped +> on Xvfb.
A short settling sleep (~2s) after that first match is cheap +> insurance against catching a partially rendered frame. + --- ## Per-worktree isolation @@ -146,13 +162,33 @@ All commands run via `docker exec` against the container with `DISPLAY=:99` set ### See the screen (screenshots) +Use the project's helper script — it auto-finds the container, captures the +root window, and streams the PNG out to the host. Prints the absolute host +path on stdout so it composes: + ```bash -# Take a PNG of the whole virtual framebuffer +SHOT=$(scripts/devcontainer-screenshot.sh) +# or with an explicit path: +scripts/devcontainer-screenshot.sh /tmp/shot.png +``` + +**Why a helper script and not just `docker cp`?** `/tmp` (and possibly +other paths) inside the devcontainer is mounted as `tmpfs`. Docker's +`docker cp` cannot read from tmpfs mounts — it only traverses the +container's overlay layers — so the obvious one-liner + +```bash +# DON'T — silently broken: import succeeds, ls confirms the file, +# but docker cp says "Could not find the file in container". docker exec "$CONTAINER" bash -c 'DISPLAY=:99 import -window root /tmp/shot.png' -# Copy it to the host for viewing / feeding to a vision model docker cp "$CONTAINER:/tmp/shot.png" /tmp/shot.png ``` +**fails everywhere** (CI and local). The helper bypasses `docker cp` by +streaming the file via `docker exec cat` (see `scripts/devcontainer-steal.sh` +for the generic version — use that one to extract any tmpfs file, e.g. logs +or generated artifacts, not just screenshots). + `import` is from ImageMagick. For a specific window only, use `xwininfo` to get the WID then `import -window <WID>`.
### See the window tree (what's there, where, which is focused) diff --git a/.github/workflows/experiment-devcontainer-proof.yml b/.github/workflows/experiment-devcontainer-proof.yml index 471f22062..ec2ff0a60 100644 --- a/.github/workflows/experiment-devcontainer-proof.yml +++ b/.github/workflows/experiment-devcontainer-proof.yml @@ -46,6 +46,12 @@ jobs: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - name: Set up Docker Buildx + # Required so BuildKit's `gha` cache backend is available below. + # Without this, `cacheFrom: type=gha` fails with + # "unknown cache importer: gha" and the build goes fully cold. + uses: docker/setup-buildx-action@v3 + - name: Build & start devcontainer (cached via gha) uses: devcontainers/ci@v0.3 with: @@ -95,12 +101,23 @@ jobs: C=${{ steps.container.outputs.id }} T_START=$(date +%s) for i in $(seq 1 120); do - # See devcontainer-dev skill — bracket trick avoids pgrep self-match. - if docker exec "$C" bash -c ' + # Process gates AND a window-mapped check via xdotool. The previous + # version only checked process presence, which is not the same as + # "UI has rendered" — the first proof run captured a blank screen + # because Electron was alive but had not painted yet. The + # `xdotool search --class ToolHive` call only succeeds once the + # app's main window is mapped on Xvfb. Bracket trick on pgrep + # avoids self-match (see devcontainer-dev skill). + if docker exec -u node "$C" bash -c ' curl -fsS http://localhost:6080/ >/dev/null 2>&1 \ && pgrep -f "[e]lectron/dist/electron" >/dev/null \ - && pgrep -f "[t]hv serve" >/dev/null + && pgrep -f "[t]hv serve" >/dev/null \ + && DISPLAY=:99 xdotool search --class ToolHive >/dev/null 2>&1 '; then + # Settling pause — the window may be mid-first-paint when + # xdotool first finds it. Cheap insurance against a partially + # rendered screenshot. 
+ sleep 2 NOW=$(date +%s) ELAPSED=$(( NOW - T_START )) SINCE_T1=$(( NOW - ${{ steps.t1.outputs.epoch }} )) From 8ef323d1b361087cb5d249119eff0460bfe9bf83 Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Tue, 28 Apr 2026 17:31:03 +0200 Subject: [PATCH 8/8] ci(experiment): cache devcontainer node_modules across runs --- .../experiment-devcontainer-proof.yml | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/.github/workflows/experiment-devcontainer-proof.yml b/.github/workflows/experiment-devcontainer-proof.yml index ec2ff0a60..ddc482f7d 100644 --- a/.github/workflows/experiment-devcontainer-proof.yml +++ b/.github/workflows/experiment-devcontainer-proof.yml @@ -52,6 +52,35 @@ jobs: # "unknown cache importer: gha" and the build goes fully cold. uses: docker/setup-buildx-action@v3 + - name: Cache devcontainer node_modules + # Image-layer caching alone doesn't help us, because the + # devcontainer's `node_modules` lives in a named Docker volume that + # is created fresh per run (volumes don't survive across CI jobs). + # postCreateCommand then runs `pnpm install` from scratch on every + # run, which dominates the build step's wall time. We cache the + # contents of that volume here and restore them into the volume + # before devcontainers/ci starts the container, so postCreateCommand + # sees an already-populated node_modules and pnpm install is a no-op + # (lockfile-equal). + id: nm-cache + uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 + with: + path: nm-cache + key: dc-nm-${{ runner.os }}-${{ hashFiles('pnpm-lock.yaml') }} + + - name: Pre-populate devcontainer node_modules volume + if: steps.nm-cache.outputs.cache-hit == 'true' + run: | + # The volume name follows devcontainer.json's + # source=toolhive-node-modules-${localWorkspaceFolderBasename}. + # In CI the workspace basename is always `toolhive-studio`. 
+ VOLUME=toolhive-node-modules-toolhive-studio + docker volume create "$VOLUME" >/dev/null + docker run --rm \ + -v "$PWD/nm-cache:/src:ro" \ + -v "$VOLUME:/dst" \ + alpine:3 sh -c 'cp -a /src/. /dst/' + - name: Build & start devcontainer (cached via gha) uses: devcontainers/ci@v0.3 with: @@ -168,6 +197,21 @@ jobs: echo "::warning::type-check failed" fi + - name: Dump devcontainer node_modules for cache + # Runs only on cache miss. Populates nm-cache/ with the volume + # contents so actions/cache's post-step picks it up for next time. + # Multiple containers can mount the same named volume simultaneously, + # so this doesn't disturb the running devcontainer. + if: always() && steps.nm-cache.outputs.cache-hit != 'true' + run: | + VOLUME=toolhive-node-modules-toolhive-studio + rm -rf nm-cache && mkdir -p nm-cache + docker run --rm \ + -v "$VOLUME:/src:ro" \ + -v "$PWD/nm-cache:/dst" \ + alpine:3 sh -c 'cp -a /src/. /dst/' + echo "::notice::node_modules dumped: $(du -sh nm-cache | cut -f1)" + - name: Upload screenshots if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7