# Source: skills/.github/workflows/run-coder-eval.yml at main · UiPath/skills · GitHub
# Workflow display name.
name: Run Coder Eval

# GH-hosted runner for coder-eval against the skills task tree. Use cases:
# single task ad-hoc, a folder, or (with explicit confirmation) the full
# suite that the VM cron runs nightly. Long-term replacement for the
# coder-eval-runner VM.
#
# Tasks are split by the `windows` tag (see tests/README.md#tag-taxonomy):
# windows-tagged tasks run on `windows-latest` (Helm/Studio is Windows-only,
# tempdir driver), everything else runs on `ubuntu-latest` (Docker driver
# + experiments/nightly.yaml). Both jobs receive the resolved file list
# from the `partition` job — never the raw glob.

# Manual trigger only; every knob of a run is a workflow_dispatch input.
on:
  workflow_dispatch:
    inputs:
      # Which task files to run (resolved relative to tests/).
      task_globs:
        type: string
        required: true
        description: "REQUIRED. Space-separated globs under tests/. E.g. tasks/uipath-agents/**/*.yaml"
      # `coder-eval -j` for the Linux job (host-level concurrency). The default
      # is 4 because ubuntu-latest has 4 vCPU: at j=20 the host is
      # oversubscribed ~5:1, agents miss the 1200s turn timeout and report
      # ERROR (false negatives, including a bindings test) rather than failing
      # on logic. A j=20 run reproducing this:
      # https://github.com/UiPath/skills/actions/runs/26651052875
      parallelism:
        type: string
        required: false
        default: "4"
        description: "Parallel tasks for the Linux job (-j). Keep ≤4 on ubuntu-latest (4 vCPU)."
      # Must be ticked before a run matching more than 50 tasks is allowed.
      confirm_large_run:
        type: boolean
        default: false
        description: "Allow >50 tasks (cost guardrail)."
      # Blank means "use the pinned version".
      coder_eval_version:
        type: string
        default: ""
        description: "coder_eval version to test (blank = pinned)."
      # Lets the GHCR agent image differ from the wheel version, so an ad-hoc /
      # unreleased image (e.g. a codex-baked `sha-<commit>` build) can be
      # tested while the host CLI installs a matching dev wheel through
      # coder_eval_version.
      coder_eval_image_tag:
        type: string
        default: ""
        description: "Agent image tag (blank = coder_eval_version)."
      cli_version:
        type: string
        default: alpha
        description: "@uipath/cli version."
      agent:
        type: choice
        options: [claude, codex]
        default: claude
        description: "Agent (codex = gpt-5.4)."

# One concurrency group per ref; a newly dispatched run does not cancel the
# run that is already in progress.
concurrency:
  cancel-in-progress: false
  group: "run-coder-eval-${{ github.ref }}"

jobs:
  # Resolves the input globs into concrete file lists (one per OS) and the
  # coder_eval version / image tag, and publishes them as job outputs.
  partition:
    name: "Partition globs by `windows` tag"
    runs-on: ubuntu-latest
    outputs:
      # From the `split` step.
      linux_globs: ${{ steps.split.outputs.linux_globs }}
      linux_count: ${{ steps.split.outputs.linux_count }}
      windows_globs: ${{ steps.split.outputs.windows_globs }}
      windows_count: ${{ steps.split.outputs.windows_count }}
      # From the `ceref` step.
      coder_eval_version: ${{ steps.ceref.outputs.version }}
      coder_eval_image_tag: ${{ steps.ceref.outputs.image_tag }}
    steps:
      - uses: actions/checkout@v4

      # Single source of truth for the coder_eval version: the pin in
      # tests/.coder-eval-version. A non-blank coder_eval_version input overrides
      # it for ad-hoc runs. Resolved once here and passed to both run jobs as the
      # wheel version (host CLI) + agent image tag (sandbox).
      #
      # Step outputs:
      #   version   - the input if non-blank, otherwise the pin file contents
      #               with all whitespace removed.
      #   image_tag - the coder_eval_image_tag input if non-blank, otherwise
      #               the same value as `version`.
      - name: Resolve coder_eval version (pinned; input overrides)
        id: ceref
        env:
          CE_VERSION_INPUT: ${{ inputs.coder_eval_version }}
          CE_IMAGE_TAG_INPUT: ${{ inputs.coder_eval_image_tag }}
        run: |
          if [ -n "$CE_VERSION_INPUT" ]; then
            version="$CE_VERSION_INPUT"
          else
            version="$(tr -d '[:space:]' < tests/.coder-eval-version)"
          fi
          # Fail fast on a blank pin file: otherwise the empty value only
          # surfaces later as an opaque `coder-eval==` install / image-pull error.
          if [ -z "$version" ]; then
            echo "::error::coder_eval version resolved to an empty string (check tests/.coder-eval-version)"
            exit 1
          fi
          echo "version=$version" >> "$GITHUB_OUTPUT"
          # Agent image tag defaults to the wheel version; a non-blank input
          # decouples them (ad-hoc / unreleased image vs. dev-wheel host CLI).
          echo "image_tag=${CE_IMAGE_TAG_INPUT:-$version}" >> "$GITHUB_OUTPUT"

      # Expands the task_globs input (relative to tests/) into concrete task
      # files, sorts each file into the Windows or Linux list by its `tags:`
      # line, and refuses runs of more than 50 tasks unless confirm_large_run
      # is ticked.
      #
      # Step outputs: linux_globs / windows_globs (space-separated file paths)
      # and linux_count / windows_count.
      - name: Resolve globs, split by tag, enforce large-run gate
        id: split
        env:
          INPUT_GLOBS: ${{ inputs.task_globs }}
          CONFIRMED:   ${{ inputs.confirm_large_run }}
        run: |
          set -euo pipefail
          if [ -z "$INPUT_GLOBS" ]; then
            echo "::error::task_globs is required"
            exit 1
          fi
          # globstar: `**` recurses into subdirectories; nullglob: a pattern
          # that matches nothing expands to nothing instead of itself.
          shopt -s globstar nullglob
          cd tests
          LINUX=()
          WINDOWS=()
          # Files already classified, so overlapping globs do not list (and
          # count) the same task twice.
          declare -A SEEN=()
          # Unquoted on purpose: split the input on whitespace, then glob-expand.
          for pat in $INPUT_GLOBS; do
            for f in $pat; do
              [ -f "$f" ] || continue
              [ -z "${SEEN[$f]:-}" ] || continue
              SEEN[$f]=1
              # Tag is the literal token `windows` on the `tags:` line.
              # `\b` is not enough to isolate it: `-` is a non-word character,
              # so `\bwindows\b` also matches `windows-foo`. Require that the
              # characters on either side are not alphanumeric, `_` or `-`.
              if grep -qE '^tags:(.*[^[:alnum:]_-])?windows([^[:alnum:]_-]|$)' "$f"; then
                WINDOWS+=("$f")
              else
                LINUX+=("$f")
              fi
            done
          done
          LCOUNT=${#LINUX[@]}
          WCOUNT=${#WINDOWS[@]}
          TOTAL=$((LCOUNT + WCOUNT))
          echo "Matched $TOTAL task file(s): $LCOUNT linux + $WCOUNT windows"
          if [ "$TOTAL" -eq 0 ]; then
            echo "::error::No task files matched the provided globs"
            exit 1
          fi
          if [ "$TOTAL" -gt 50 ] && [ "$CONFIRMED" != "true" ]; then
            echo "::error::Glob matches $TOTAL tasks (> 50). Tick confirm_large_run to authorize."
            exit 1
          fi
          # Space-separated; downstream jobs word-split on consumption.
          # `:-` keeps `set -u` from tripping on an empty array.
          echo "linux_globs=${LINUX[*]:-}"     >> "$GITHUB_OUTPUT"
          echo "windows_globs=${WINDOWS[*]:-}" >> "$GITHUB_OUTPUT"
          echo "linux_count=$LCOUNT"            >> "$GITHUB_OUTPUT"
          echo "windows_count=$WCOUNT"          >> "$GITHUB_OUTPUT"

  run-linux:
    name: "Run coder-eval (Linux)"
    needs: partition
    # Skipped when the partition job found no non-windows task files.
    if: ${{ needs.partition.outputs.linux_count != '0' }}
    # TODO: move to `ubuntu-latest-16-cores` (16 vCPU / 64 GB RAM) once the
    # UiPath/skills repo is allowlisted for that runner group. Standard
    # ubuntu-latest (4 vCPU) copes with ad-hoc single-task / small-folder runs
    # at the default `parallelism` (4); the larger tier is what allows raising
    # -j so the full nightly sweep finishes in ~30-45min instead of multi-hour.
    runs-on: ubuntu-latest
    # The full skills suite on the VM (~700 tasks, j=1) takes 3-4h; 330 min
    # leaves room for worst-case stalls (one slow task pinning a slot,
    # network hiccups, etc.).
    timeout-minutes: 330
    # Turn off uip CLI version-sync: a task's `uip login status` re-pins
    # ~/.uipath/config.json to a stale line and, through nightly.yaml's shared
    # ~/.uipath mount, downgrades the CLI for later tasks. Mirrors daily.sh.
    env:
      UIPATH_CLI_DISABLE_VERSION_SYNC: '1'
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - uses: astral-sh/setup-uv@v4

      # Host coder-eval CLI = pinned wheel (coder_eval feed + ml-packages for
      # deps). No source checkout.
      # codex is NOT installed on the host here: under the docker driver the
      # agent runs inside skills-image, so the codex SDK belongs in that image
      # (separate TODO), not on the host CLI.
      #
      # The version reaches the script through an env var (as in the Windows
      # job) instead of being pasted into the script text, so an overridden
      # coder_eval_version input cannot alter the command line.
      - name: Install coder-eval
        env:
          UV_EXTRA_INDEX_URL: "https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@pkgs.dev.azure.com/uipath/ML%20Platform/_packaging/coder_eval/pypi/simple/ https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@uipath.pkgs.visualstudio.com/_packaging/ml-packages/pypi/simple/"
          CE_VERSION: ${{ needs.partition.outputs.coder_eval_version }}
        run: uv pip install --system "coder-eval==${CE_VERSION}"

      # Pull the agent image from GHCR (never built from source); the skills
      # image extends it below. Tag defaults to the wheel version but can be
      # overridden (coder_eval_image_tag) for ad-hoc / unreleased images.
      # GH_PAT needs read:packages.
      #
      # Token, actor and image reference are handed over as env vars rather
      # than interpolated into the script, so shell metacharacters in any of
      # them cannot change the commands that run.
      - name: Pull coder-eval-agent image
        env:
          GHCR_TOKEN:  ${{ secrets.GH_PAT }}
          GHCR_USER:   ${{ github.actor }}
          AGENT_IMAGE: ghcr.io/uipath/coder-eval-agent:${{ needs.partition.outputs.coder_eval_image_tag }}
        run: |
          echo "$GHCR_TOKEN" | docker login ghcr.io -u "$GHCR_USER" --password-stdin
          docker pull "$AGENT_IMAGE"

      # Build the skills extension image (`@uipath/cli` + `@uipath/admin-tool`
      # on top of coder-eval-agent). Matches smoke-skills.yml + daily.sh.
      #
      # `cli_version` is a free-text dispatch input, so it (and the token and
      # image reference) is passed through env vars and expanded quoted by the
      # shell instead of being pasted into the script text.
      # NOTE(review): build args can be read back from image metadata; if
      # tests/docker/Dockerfile allows it, consider a BuildKit `--secret` for
      # the npm token — confirm against the Dockerfile.
      - name: Build skills Docker image
        env:
          AGENT_IMAGE:    ghcr.io/uipath/coder-eval-agent:${{ needs.partition.outputs.coder_eval_image_tag }}
          NPM_AUTH_TOKEN: ${{ secrets.GH_PAT }}
          CLI_VERSION:    ${{ inputs.cli_version }}
        run: |
          docker build \
            --build-arg CODER_EVAL_IMAGE="$AGENT_IMAGE" \
            --build-arg NPM_AUTH_TOKEN="$NPM_AUTH_TOKEN" \
            --build-arg CLI_VERSION="$CLI_VERSION" \
            -t skills-image:latest \
            -f tests/docker/Dockerfile \
            .

      # Bot-user auth over ROPC, using the script vendored in this repo (the
      # same flow the VM cron uses). `uip flow debug` needs a real user `sub`
      # claim, which s2s/client_credentials tokens do not carry. The script
      # writes ~/.uipath/.auth, which experiments/nightly.yaml mounts into
      # each eval container.
      - name: Mint UiPath bot token (ROPC)
        run: bash .github/scripts/refresh-auth.sh
        env:
          UIPATH_URL: "https://alpha.uipath.com"
          UIPATH_ORGANIZATION_NAME: ${{ secrets.UIPATH_ORG_NAME }}
          UIPATH_ORGANIZATION_ID: ${{ secrets.UIPATH_ORG_ID }}
          UIPATH_TENANT_NAME: ${{ secrets.UIPATH_TENANT_NAME }}
          UIPATH_TENANT_ID: ${{ secrets.UIPATH_TENANT_ID }}
          CLIENT_ID: ${{ secrets.UIPATH_ROPC_CLIENT_ID }}
          CLIENT_SECRET: ${{ secrets.UIPATH_ROPC_CLIENT_SECRET }}
          CE_USERNAME: ${{ secrets.UIPATH_BOT_USERNAME }}
          CE_PASSWORD: ${{ secrets.UIPATH_BOT_PASSWORD }}

      # TASK_GLOBS is the list of *file paths* already resolved by the
      # partition job (not globs). It is word-split on purpose — do not quote.
      - name: Run coder-eval
        id: eval
        working-directory: tests
        env:
          SKILLS_REPO_PATH: ${{ github.workspace }}
          # Model backend credentials.
          API_BACKEND: bedrock
          AWS_BEARER_TOKEN_BEDROCK: ${{ secrets.AWS_BEARER_TOKEN_BEDROCK }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
          BEDROCK_MODEL: ${{ secrets.BEDROCK_MODEL }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          CODEX_API_KEY: ${{ secrets.CODEX_API_KEY }}
          CODEX_BASE_URL: ${{ secrets.CODEX_BASE_URL }}
          # Process keys.
          TRACES_SMOKE_PROCESS_KEY: ${{ secrets.TRACES_SMOKE_PROCESS_KEY }}
          E2E_PROCESS_KEY: ${{ secrets.E2E_PROCESS_KEY }}
          E2E_LONG_PROCESS_KEY: ${{ secrets.E2E_LONG_PROCESS_KEY }}
          # What to run and how.
          TASK_GLOBS: ${{ needs.partition.outputs.linux_globs }}
          TASK_COUNT: ${{ needs.partition.outputs.linux_count }}
          TASK_PARALLELISM: ${{ inputs.parallelism }}
          AGENT: ${{ inputs.agent }}
        run: |
          # The `parallelism` input default (4) only exists on
          # workflow_dispatch; fall back to 4 if the value arrives empty
          # (e.g. a future non-dispatch trigger).
          jobs="${TASK_PARALLELISM:-4}"
          # claude (default) uses the claude-code/sonnet config from the
          # experiment YAML; codex switches to gpt-5.4 and authenticates with
          # CODEX_API_KEY/CODEX_BASE_URL. The driver stays the same (docker).
          agent_args=()
          case "$AGENT" in
            codex) agent_args=(--type codex --model gpt-5.4) ;;
          esac
          echo "Running: coder-eval run $TASK_GLOBS -e experiments/nightly.yaml ${agent_args[*]} -j $jobs ($TASK_COUNT task files)"
          coder-eval run $TASK_GLOBS \
            -e experiments/nightly.yaml \
            "${agent_args[@]}" \
            -j "$jobs" -v \
            --run-dir /tmp/runs

      # Best effort: hand /tmp/runs back to the runner user so later steps can
      # read it. This step never fails the job.
      - name: Fix permissions on /tmp/runs
        if: ${{ always() }}
        run: |
          if sudo chown -R "$(id -u):$(id -g)" /tmp/runs; then
            sudo chmod -R 755 /tmp/runs 2>/dev/null || true
          fi

      # Strip heavy task scratch dirs before upload to keep artifact size
      # manageable. Matches smoke-skills.yml.
      - name: Clean up artifacts (remove .venv and node_modules)
        if: always()
        run: |
          # -prune stops find from walking into a directory that is about to
          # be deleted: no wasted traversal of large node_modules trees and no
          # "No such file or directory" noise once `rm` has run. Nested matches
          # disappear together with their parent. Best effort (`|| true`).
          find /tmp/runs -type d \( -name .venv -o -name node_modules \) -path "*/artifacts/*" \
            -prune -exec rm -rf {} + || true

      # Publishes the experiment-, variant- and task-level reports plus task
      # artifacts, even when an earlier step failed.
      - name: Upload eval report
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: coder-eval-linux-${{ github.run_id }}
          retention-days: 14
          if-no-files-found: warn
          # The task-level patterns use ** so they also match the
          # replicate-index segment coder_eval adds to per-task run dirs
          # (/tmp/runs/<ts>/<variant>/<task>/<NN>/).
          path: |
            /tmp/runs/*/experiment.html
            /tmp/runs/*/experiment.md
            /tmp/runs/*/experiment.json
            /tmp/runs/*/experiment.log
            /tmp/runs/*/*/variant.html
            /tmp/runs/*/*/variant.md
            /tmp/runs/*/*/variant.json
            /tmp/runs/**/task.html
            /tmp/runs/**/task.json
            /tmp/runs/**/task.log
            /tmp/runs/**/artifacts

      # Prints one `<task_id>: <final_status>` line per task.json in the newest
      # run directory, then a pass count, to the log and the step summary. The
      # step succeeds only if at least one task was found and all are SUCCESS.
      - name: Plaintext summary + verdict
        if: ${{ always() }}
        run: |
          set -uo pipefail
          # Newest directory under /tmp/runs (empty if there is none).
          newest_run=$(ls -td /tmp/runs/*/ 2>/dev/null | head -n 1 || true)
          if [ -z "$newest_run" ]; then
            echo "FAIL — no run directory produced" | tee -a "$GITHUB_STEP_SUMMARY"
            exit 1
          fi
          python - "$newest_run" <<'PY' | tee -a "$GITHUB_STEP_SUMMARY"
          import json
          import sys
          from pathlib import Path
          # Load every result first so a malformed file aborts before any output.
          results = [
              json.loads(path.read_text(encoding='utf-8'))
              for path in sorted(Path(sys.argv[1]).rglob('task.json'))
          ]
          for result in results:
              print(f"{result.get('task_id','?')}: {result.get('final_status','?')}")
          passed = len([r for r in results if r.get('final_status') == 'SUCCESS'])
          print(f"\n{passed}/{len(results)} PASS (linux)")
          # Zero results counts as failure.
          all_passed = bool(results) and passed == len(results)
          sys.exit(0 if all_passed else 1)
          PY

  run-windows:
    name: "Run coder-eval (Windows)"
    needs: partition
    # Skipped when the partition job found no windows-tagged task files.
    if: ${{ needs.partition.outputs.windows_count != '0' }}
    runs-on: windows-latest
    # Windows RPA tasks start Helm on every `uip rpa` call (30–60s each) and
    # run at j=1 (Helm state leaks between concurrent tasks). The budget
    # mirrors smoke-rpa-skills.yml plus headroom for ad-hoc / e2e runs.
    timeout-minutes: 240
    # Turn off uip CLI version-sync: tempdir tasks share one host ~/.uipath /
    # global uip, so a `uip login status` re-pin downgrades later tasks.
    # Mirrors daily.sh.
    env:
      UIPATH_CLI_DISABLE_VERSION_SYNC: '1'
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - uses: astral-sh/setup-uv@v4

      - uses: actions/setup-node@v4
        with:
          node-version: "20"

      - uses: actions/setup-dotnet@v4
        with:
          dotnet-version: "8.0.x"

      # Host coder-eval CLI comes from the pinned wheel (coder_eval feed, with
      # ml-packages for deps). Windows tasks use the tempdir driver, so there
      # is no agent image.
      - name: Install coder-eval
        shell: bash
        env:
          UV_EXTRA_INDEX_URL: "https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@pkgs.dev.azure.com/uipath/ML%20Platform/_packaging/coder_eval/pypi/simple/ https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@uipath.pkgs.visualstudio.com/_packaging/ml-packages/pypi/simple/"
          AGENT: ${{ inputs.agent }}
          CE_VERSION: ${{ needs.partition.outputs.coder_eval_version }}
        run: |
          uv pip install --system "coder-eval==${CE_VERSION}"
          # With the tempdir driver the agent runs on the host, so the codex
          # SDK installed here is the one the codex agent actually uses.
          # openai-codex 0.1.0b3 (verified against coder-eval 0.6.2) pulls a
          # pre-release cli-bin that spans both internal feeds, hence
          # pre-releases are allowed and the best match across indexes is taken.
          case "$AGENT" in
            codex)
              uv pip install --system --prerelease=allow --index-strategy unsafe-best-match \
                "openai-codex==0.1.0b3"
              ;;
          esac

      # Registers the UiPath-Internal NuGet source that the Helm packages are
      # restored from. The feed password is passed as an env var and expanded
      # quoted by the shell rather than pasted into the script text, so quotes,
      # `$` or backticks inside the secret cannot break or alter the command.
      - name: Configure NuGet feed for Helm packages
        shell: bash
        env:
          NUGET_FEED_PASSWORD: ${{ secrets.UV_INDEX_UIPATH_PASSWORD }}
        run: |
          dotnet nuget add source \
            "https://uipath.pkgs.visualstudio.com/Public.Feeds/_packaging/UiPath-Internal/nuget/v3/index.json" \
            --name UiPath-Internal \
            --username az \
            --password "$NUGET_FEED_PASSWORD" \
            --store-password-in-clear-text

      # Two-feed install: `@uipath/cli` comes from the internal GitHub
      # Packages feed (so the `cli_version` input — e.g. `alpha`, pinned
      # prereleases — resolves the same way as the Linux Docker build).
      # `@uipath/rpa-tool` + `@uipath/rpa-legacy-tool` come from public npm
      # because the internal feed carries a divergent `1.0.0-alpha.*`
      # rpa-tool line under the same scope; pulling those from the public
      # feed (with the `^0.9` pin) keeps Helm on the known-good 0.9.x
      # line. See smoke-rpa-skills.yml for the full rationale.
      - name: Install uip CLI + RPA tools (GH Packages cli, public-npm tools)
        shell: bash
        env:
          # Requested @uipath/cli version or dist-tag (dispatch input).
          CLI_VERSION:    ${{ inputs.cli_version }}
          # Token written into the scratch npmrc for npm.pkg.github.com.
          NPM_AUTH_TOKEN: ${{ secrets.GH_PAT }}
        run: |
          set -e
          # Work from a fresh temp dir; print the active default registry so
          # it shows up in the log.
          install_dir="$(mktemp -d)"
          cd "$install_dir"
          npm config get registry

          # Step 1: install the CLI from GH Packages. Use a scratch
          # NPM_CONFIG_USERCONFIG so the @uipath→npm.pkg.github.com mapping
          # and auth token don't leak into other steps. Mirrors the
          # tests/docker/Dockerfile config (line 14-17).
          tmp_npmrc="$(mktemp)"
          cat > "$tmp_npmrc" <<EOF
          @uipath:registry=https://npm.pkg.github.com/
          //npm.pkg.github.com/:_authToken=${NPM_AUTH_TOKEN}
          EOF
          NPM_CONFIG_USERCONFIG="$tmp_npmrc" npm install -g "@uipath/cli@${CLI_VERSION}"
          # Remove the token-bearing npmrc as soon as the install is done.
          rm -f "$tmp_npmrc"
          uip --version

          # Step 2: install the RPA tools from public npm. CLI-flag scope
          # override beats any lingering global config, so this stays on
          # the public registry regardless of runner image defaults.
          # NOTE(review): the step comment describes a `^0.9` pin that keeps
          # Helm on the 0.9.x line, but the range below is `>=0.9`, which has
          # no upper bound and would also accept a 1.x release — confirm
          # which of the two is intended.
          npm install -g \
            --@uipath:registry=https://registry.npmjs.org/ \
            "@uipath/rpa-tool@>=0.9" \
            @uipath/rpa-legacy-tool@latest
          # Diagnostics only: list what was installed (the grep is allowed to
          # find nothing) and what `uip` reports as available tools.
          echo "--- Installed @uipath/* packages ---"
          npm ls -g --depth=0 2>/dev/null | grep '@uipath/' || true
          echo "--- uip tools list ---"
          uip tools list --output json

      # Pre-authenticate uip as a real licensed Studio user through Studio's
      # e2e helper. Helm's HelmLicenseSkuFeatureSourceService gate requires
      # this; client_credentials principals do not clear it. The full
      # rationale is in smoke-rpa-skills.yml.
      - name: Install Puppeteer (auth helper dep)
        shell: bash
        run: npm install --no-save puppeteer

      # Runs the login helper up to three times, five seconds apart, then
      # prints the resulting login status.
      - name: Authenticate uip as licensed Studio user
        id: auth
        shell: bash
        env:
          AUTHORITY: "https://alpha.uipath.com"
          EMAIL: ${{ secrets.UIPATH_EMAIL }}
          PASSWORD: ${{ secrets.UIPATH_PASSWORD }}
          TENANT: ${{ secrets.UIPATH_TENANT }}
          ORG: ${{ secrets.UIPATH_ORG }}
          AUTH_DEBUG_DIR: ${{ github.workspace }}/auth-debug
        run: |
          set -euo pipefail
          mkdir -p "$AUTH_DEBUG_DIR"
          max_attempts=3
          attempt=1
          while true; do
            echo "--- Auth attempt $attempt ---"
            if node .github/scripts/uipath-oauth-login.mjs; then
              break
            fi
            # Out of attempts: fail the step.
            if [ "$attempt" -ge "$max_attempts" ]; then
              echo "::error::Auth failed after 3 attempts"
              exit 1
            fi
            sleep 5
            attempt=$((attempt + 1))
          done
          uip login status --output json

      # Uploads whatever is in the auth-debug directory, unless the auth step
      # was skipped. An empty directory is not an error.
      - name: Upload auth-debug artifacts
        if: ${{ always() && steps.auth.outcome != 'skipped' }}
        uses: actions/upload-artifact@v4
        with:
          name: auth-debug-windows-${{ github.run_id }}
          retention-days: 7
          if-no-files-found: ignore
          path: ${{ github.workspace }}/auth-debug

      # Makes one `uip rpa` call from a scratch directory before the tests,
      # then force-kills Helm. Both commands are best effort.
      - name: Pre-warm Helm (download NuGet package before tests)
        shell: bash
        run: |
          warmup_dir=/tmp/helm-warmup
          if mkdir -p "$warmup_dir"; then
            cd "$warmup_dir"
          fi
          uip rpa list-instances --output json 2>&1 || true
          taskkill //F //IM UiPath.Studio.Helm.exe 2>/dev/null || true

      # Run tasks one at a time on Windows: Helm state leaks between tasks
      # (stale Studio session, locked NuGet cache), so the loop kills Helm
      # between tasks. j=1 by construction.
      #
      # Uses experiments/smoke-windows.yaml (the only existing Windows
      # tempdir experiment); per-task `run_limits` in the YAML extend the
      # smoke budget where needed (e.g. legacy/refactor_with_retry_scope_e2e
      # sets max_turns: 50).
      #
      # Each task gets up to three attempts; only a Bedrock content-filter
      # ERROR triggers a retry. The step exits non-zero if any task ended in a
      # real failure or exhausted its retries.
      - name: Run coder-eval (Windows)
        env:
          SKILLS_REPO_PATH:         ${{ github.workspace }}
          API_BACKEND:              bedrock
          AWS_BEARER_TOKEN_BEDROCK: ${{ secrets.AWS_BEARER_TOKEN_BEDROCK }}
          AWS_REGION:               ${{ secrets.AWS_REGION }}
          BEDROCK_MODEL:            ${{ secrets.BEDROCK_MODEL }}
          ANTHROPIC_API_KEY:        ${{ secrets.ANTHROPIC_API_KEY }}
          # Partition-resolved list of windows-tagged task file paths
          # (space-separated); word-split by the loop below.
          TASK_GLOBS:               ${{ needs.partition.outputs.windows_globs }}
          AGENT:                    ${{ inputs.agent }}
          CODEX_API_KEY:            ${{ secrets.CODEX_API_KEY }}
          CODEX_BASE_URL:           ${{ secrets.CODEX_BASE_URL }}
          # Redirect the temp root off the default %TEMP%. On the GH-hosted
          # Windows runner %TEMP% resolves to its 8.3 short form
          # (C:\Users\RUNNER~1\AppData\Local\Temp). coder-eval creates each
          # agent sandbox under %TEMP%, so the sandbox path inherits the
          # `RUNNER~1` tilde segment — and Claude Code's permission layer
          # rejects writes to paths containing that 8.3 short-name pattern as
          # "suspicious", blocking the agent's own file edits until the 1200s
          # turn watchdog hard-kills it (false ERROR). `C:\cetmp` is ≤8 chars
          # with no spaces, so Windows generates no `~1` alias for it.
          TMP:  C:\cetmp
          TEMP: C:\cetmp
        working-directory: tests
        id: eval
        shell: bash
        run: |
          shopt -s globstar nullglob
          # Create the tilde-free temp root pinned via TMP/TEMP above.
          mkdir -p /c/cetmp
          # Agent selection. claude (default) takes the experiment YAML config;
          # codex overrides to gpt-5.4 (auth via CODEX_API_KEY/CODEX_BASE_URL).
          # Driver is unchanged (tempdir here).
          agent_flags=""
          if [ "$AGENT" = "codex" ]; then
            agent_flags="--type codex --model gpt-5.4"
          fi
          # Sticky failure flag: set to 1 once any task fails for good; the
          # remaining tasks still run.
          overall_exit=0
          for task in $TASK_GLOBS; do
            echo "--- Killing leftover Helm/Studio processes ---"
            taskkill //F //IM UiPath.Studio.Helm.exe 2>/dev/null || true
            echo "--- Running: $task ---"
            for attempt in 1 2 3; do
              echo "--- $task attempt $attempt ---"
              # $agent_flags is left unquoted so it splits into separate args
              # (or vanishes when empty).
              if coder-eval run "$task" \
                  -e experiments/smoke-windows.yaml $agent_flags -j 1 -v; then
                break
              fi
              # Only retry on Bedrock content-filter ERRORs. Real task
              # failures are real signal — don't burn retries on them.
              # The newest task.json under runs/ is taken as this attempt's
              # result.
              # NOTE(review): if the failed attempt wrote no task.json, this
              # picks up the file from an earlier attempt or task — confirm
              # that cannot happen or compare against the pre-run state.
              latest_json="$(ls -t runs/*/default/*/00/task.json 2>/dev/null | head -n 1 || true)"
              # The inline Python exits 0 when final_status is ERROR and
              # "content filter" appears (case-insensitively) in
              # error_details/error_message; otherwise it exits 2.
              if [ -n "$latest_json" ] && LATEST_JSON="$latest_json" python -c "import json,os,sys; d=json.load(open(os.environ['LATEST_JSON'])); det=d.get('error_details') or {}; msg=json.dumps(det)+(d.get('error_message') or ''); sys.exit(0 if d.get('final_status')=='ERROR' and 'content filter' in msg.lower() else 2)"; then
                # NOTE(review): on attempt 3 this still says "retrying" even
                # though no attempt is left; the task is recorded as failed
                # just below.
                echo "::warning::$task: content-filter error on attempt $attempt — retrying"
                if [ "$attempt" = "3" ]; then
                  overall_exit=1
                fi
                continue
              fi
              # Any other failure is final for this task: no retry.
              overall_exit=1
              break
            done
          done
          exit $overall_exit

      # Publishes the experiment-, variant- and task-level reports plus task
      # artifacts from tests/runs, even when an earlier step failed.
      - name: Upload eval report
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: coder-eval-windows-${{ github.run_id }}
          retention-days: 14
          if-no-files-found: warn
          path: |
            tests/runs/*/experiment.html
            tests/runs/*/experiment.md
            tests/runs/*/experiment.json
            tests/runs/*/experiment.log
            tests/runs/*/*/variant.html
            tests/runs/*/*/variant.md
            tests/runs/*/*/variant.json
            tests/runs/**/task.html
            tests/runs/**/task.json
            tests/runs/**/task.log
            tests/runs/**/artifacts

      # Prints one `<task_id>: <final_status>` line per task, then a pass
      # count, to the log and the step summary. The step succeeds only if at
      # least one task was found and every task's final result is SUCCESS.
      #
      # The Windows eval step invokes coder-eval once per task (and once more
      # per retry), so results are spread over several run directories under
      # tests/runs. Looking only at the newest directory would report and
      # judge just the last task; this step therefore scans all of them and
      # keeps the most recent result per task.
      - name: Plaintext summary + verdict
        if: always()
        shell: bash
        run: |
          set -uo pipefail
          if ! ls -d tests/runs/*/ >/dev/null 2>&1; then
            echo "FAIL — no run directory produced" | tee -a "$GITHUB_STEP_SUMMARY"
            exit 1
          fi
          python - tests/runs <<'PY' | tee -a "$GITHUB_STEP_SUMMARY"
          import json, pathlib, sys
          # Oldest first, so a later attempt overwrites an earlier one.
          paths = sorted(pathlib.Path(sys.argv[1]).rglob('task.json'), key=lambda p: (p.stat().st_mtime, str(p)))
          latest = {}
          for p in paths:
              r = json.loads(p.read_text(encoding='utf-8'))
              # Assumes task_id identifies a task across attempts — TODO
              # confirm. Results without one are kept separately, keyed by
              # their directory.
              latest[str(r.get('task_id') or p.parent.as_posix())] = r
          rows = [latest[k] for k in sorted(latest)]
          for r in rows:
              print(f"{r.get('task_id','?')}: {r.get('final_status','?')}")
          ok = sum(1 for r in rows if r.get('final_status') == 'SUCCESS')
          print(f"\n{ok}/{len(rows)} PASS (windows)")
          sys.exit(0 if rows and ok == len(rows) else 1)
          PY