-
Notifications
You must be signed in to change notification settings - Fork 37
575 lines (540 loc) · 25.6 KB
/
Copy pathrun-coder-eval.yml
File metadata and controls
575 lines (540 loc) · 25.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
name: Run Coder Eval

# Runs coder-eval on GitHub-hosted runners against the skills task tree.
# Supported uses: a single ad-hoc task, a folder of tasks, or (only with
# explicit confirmation) the full suite that the VM cron runs nightly.
# Intended as the long-term replacement for the coder-eval-runner VM.
#
# Tasks are split on the `windows` tag (see tests/README.md#tag-taxonomy):
#   - windows-tagged tasks -> `windows-latest` (Helm/Studio is Windows-only,
#     tempdir driver)
#   - everything else      -> `ubuntu-latest` (Docker driver +
#     experiments/nightly.yaml)
# Both run jobs consume the resolved file list emitted by the `partition`
# job — never the raw glob.
on:
  workflow_dispatch:
    inputs:
      task_globs:
        description: 'REQUIRED. Space-separated globs under tests/. E.g. tasks/uipath-agents/**/*.yaml'
        type: string
        required: true
      # Host-level concurrency for the Linux job's `coder-eval -j`.
      # Default 4: ubuntu-latest has 4 vCPU, and j=20 oversubscribes ~5:1 —
      # agents then miss the 1200s turn timeout and ERROR (false negatives,
      # including a bindings test) instead of failing on logic.
      # Repro on a j=20 run:
      # https://github.com/UiPath/skills/actions/runs/26651052875
      parallelism:
        description: 'Parallel tasks for the Linux job (-j). Keep ≤4 on ubuntu-latest (4 vCPU).'
        type: string
        required: false
        default: '4'
      confirm_large_run:
        description: 'Allow >50 tasks (cost guardrail).'
        type: boolean
        default: false
      coder_eval_version:
        description: 'coder_eval version to test (blank = pinned).'
        type: string
        default: ''
      # Decouples the GHCR agent image from the wheel version, so an
      # ad-hoc / unreleased image (e.g. a codex-baked `sha-<commit>` build)
      # can be tested while the host CLI installs a matching dev wheel via
      # coder_eval_version.
      coder_eval_image_tag:
        description: 'Agent image tag (blank = coder_eval_version).'
        type: string
        default: ''
      cli_version:
        description: '@uipath/cli version.'
        type: string
        default: alpha
      agent:
        description: 'Agent (codex = gpt-5.4).'
        type: choice
        options:
          - claude
          - codex
        default: claude
# One concurrency group per ref. cancel-in-progress is off, so a newly
# dispatched run waits for the in-flight run on the same ref instead of
# cancelling it.
concurrency:
  group: run-coder-eval-${{ github.ref }}
  cancel-in-progress: false

# Least privilege for the automatic GITHUB_TOKEN: the jobs below only need
# it for actions/checkout. Package/registry access goes through the GH_PAT
# secret, not this token.
# NOTE(review): widen here if a step is later added that needs the
# GITHUB_TOKEN for anything beyond reading repository contents.
permissions:
  contents: read
jobs:
partition:
  runs-on: ubuntu-latest
  name: Partition globs by `windows` tag
  # Outputs consumed by run-linux / run-windows: the resolved task file
  # lists (space-separated), their sizes, and the resolved coder_eval wheel
  # version + agent image tag.
  outputs:
    linux_globs: ${{ steps.split.outputs.linux_globs }}
    windows_globs: ${{ steps.split.outputs.windows_globs }}
    linux_count: ${{ steps.split.outputs.linux_count }}
    windows_count: ${{ steps.split.outputs.windows_count }}
    coder_eval_version: ${{ steps.ceref.outputs.version }}
    coder_eval_image_tag: ${{ steps.ceref.outputs.image_tag }}
  steps:
    - uses: actions/checkout@v4
    # Single source of truth for the coder_eval version: the pin in
    # tests/.coder-eval-version. A non-blank coder_eval_version input
    # overrides it for ad-hoc runs. Resolved once here and passed to both
    # run jobs as the wheel version (host CLI) + agent image tag (sandbox).
    - name: Resolve coder_eval version (pinned; input overrides)
      id: ceref
      env:
        CE_VERSION_INPUT: ${{ inputs.coder_eval_version }}
        CE_IMAGE_TAG_INPUT: ${{ inputs.coder_eval_image_tag }}
      run: |
        if [ -n "$CE_VERSION_INPUT" ]; then
          version="$CE_VERSION_INPUT"
        else
          version="$(tr -d '[:space:]' < tests/.coder-eval-version)"
        fi
        # Fail here with a clear message rather than later as an opaque
        # `coder-eval==` install error / empty image tag.
        if [ -z "$version" ]; then
          echo "::error::coder_eval version is empty: tests/.coder-eval-version is blank and no coder_eval_version input was given"
          exit 1
        fi
        echo "version=$version" >> "$GITHUB_OUTPUT"
        # Agent image tag defaults to the wheel version; a non-blank input
        # decouples them (ad-hoc / unreleased image vs. dev-wheel host CLI).
        echo "image_tag=${CE_IMAGE_TAG_INPUT:-$version}" >> "$GITHUB_OUTPUT"
    - name: Resolve globs, split by tag, enforce large-run gate
      id: split
      env:
        INPUT_GLOBS: ${{ inputs.task_globs }}
        CONFIRMED: ${{ inputs.confirm_large_run }}
      run: |
        set -euo pipefail
        if [ -z "$INPUT_GLOBS" ]; then
          echo "::error::task_globs is required"
          exit 1
        fi
        shopt -s globstar nullglob
        cd tests
        LINUX=()
        WINDOWS=()
        # Files already classified; overlapping globs must not add a task
        # twice (it would be double-counted against the gate and run twice).
        declare -A SEEN=()
        # Word-splitting/globbing of the input is intentional: each
        # whitespace-separated pattern is expanded relative to tests/.
        # Paths containing spaces are therefore not supported.
        for pat in $INPUT_GLOBS; do
          for f in $pat; do
            [ -f "$f" ] || continue
            [ -z "${SEEN[$f]:-}" ] || continue
            SEEN[$f]=1
            # Tag is the literal token `windows` on the `tags:` line.
            # `\b` is not enough here: `-` and `:` are non-word characters,
            # so `\bwindows\b` would also match `windows-foo` or
            # `feature:windows`. Require that the token is not adjacent to
            # any tag-name character (alnum, `_`, `-`, `:`) on either side.
            if grep -qE '^tags:(.*[^[:alnum:]_:-])?windows([^[:alnum:]_:-].*)?$' "$f"; then
              WINDOWS+=("$f")
            else
              LINUX+=("$f")
            fi
          done
        done
        LCOUNT=${#LINUX[@]}
        WCOUNT=${#WINDOWS[@]}
        TOTAL=$((LCOUNT + WCOUNT))
        echo "Matched $TOTAL task file(s): $LCOUNT linux + $WCOUNT windows"
        if [ "$TOTAL" -eq 0 ]; then
          echo "::error::No task files matched the provided globs"
          exit 1
        fi
        if [ "$TOTAL" -gt 50 ] && [ "$CONFIRMED" != "true" ]; then
          echo "::error::Glob matches $TOTAL tasks (> 50). Tick confirm_large_run to authorize."
          exit 1
        fi
        # Space-separated; downstream jobs word-split on consumption.
        # `:-` keeps an empty array from tripping `set -u`.
        echo "linux_globs=${LINUX[*]:-}" >> "$GITHUB_OUTPUT"
        echo "windows_globs=${WINDOWS[*]:-}" >> "$GITHUB_OUTPUT"
        echo "linux_count=$LCOUNT" >> "$GITHUB_OUTPUT"
        echo "windows_count=$WCOUNT" >> "$GITHUB_OUTPUT"
run-linux:
  needs: partition
  # Skipped entirely when the partition found no non-windows task files.
  if: needs.partition.outputs.linux_count != '0'
  # TODO: switch to `ubuntu-latest-16-cores` (16 vCPU / 64 GB RAM) once
  # the UiPath/skills repo is allowlisted for that runner group. Standard
  # ubuntu-latest (4 vCPU) handles ad-hoc single-task / small-folder runs at
  # the default `parallelism` (4); the larger tier is needed to raise -j for
  # the full nightly sweep to finish in ~30-45min instead of multi-hour.
  runs-on: ubuntu-latest
  # Full skills suite on the VM (~700 tasks, j=1) runs 3-4h. 330 min
  # covers worst-case stalls (one slow task pinning a slot, network
  # hiccups, etc.).
  timeout-minutes: 330
  name: Run coder-eval (Linux)
  # Disable uip CLI version-sync: a task's `uip login status` re-pins
  # ~/.uipath/config.json to a stale line and, via nightly.yaml's shared
  # ~/.uipath mount, downgrades the CLI for later tasks. Mirrors daily.sh.
  env:
    UIPATH_CLI_DISABLE_VERSION_SYNC: "1"
  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        python-version: '3.13'
    - uses: astral-sh/setup-uv@v4
    # Host coder-eval CLI = pinned wheel (coder_eval feed + ml-packages for
    # deps). No source checkout.
    # codex is NOT installed on the host here: under the docker driver the
    # agent runs inside skills-image, so the codex SDK belongs in that image
    # (separate TODO), not on the host CLI.
    # The version is user-overridable (workflow input), so it is handed to
    # the shell through `env:` rather than expanded into the script text.
    - name: Install coder-eval
      env:
        UV_EXTRA_INDEX_URL: "https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@pkgs.dev.azure.com/uipath/ML%20Platform/_packaging/coder_eval/pypi/simple/ https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@uipath.pkgs.visualstudio.com/_packaging/ml-packages/pypi/simple/"
        CE_VERSION: ${{ needs.partition.outputs.coder_eval_version }}
      run: uv pip install --system "coder-eval==${CE_VERSION}"
    # Pull the agent image from GHCR (never built from source); the skills
    # image extends it below. Tag defaults to the wheel version but can be
    # overridden (coder_eval_image_tag) for ad-hoc / unreleased images.
    # GH_PAT needs read:packages.
    # Secret, actor and tag go through `env:` so none of them is spliced
    # into the script source before bash parses it.
    - name: Pull coder-eval-agent image
      env:
        GH_PAT: ${{ secrets.GH_PAT }}
        GH_ACTOR: ${{ github.actor }}
        CE_IMAGE_TAG: ${{ needs.partition.outputs.coder_eval_image_tag }}
      run: |
        echo "$GH_PAT" | docker login ghcr.io -u "$GH_ACTOR" --password-stdin
        docker pull "ghcr.io/uipath/coder-eval-agent:${CE_IMAGE_TAG}"
    # Build the skills extension image (`@uipath/cli` + `@uipath/admin-tool`
    # on top of coder-eval-agent). Matches smoke-skills.yml + daily.sh.
    - name: Build skills Docker image
      env:
        CE_IMAGE_TAG: ${{ needs.partition.outputs.coder_eval_image_tag }}
        NPM_AUTH_TOKEN: ${{ secrets.GH_PAT }}
        CLI_VERSION: ${{ inputs.cli_version }}
      run: |
        docker build \
          --build-arg CODER_EVAL_IMAGE="ghcr.io/uipath/coder-eval-agent:${CE_IMAGE_TAG}" \
          --build-arg NPM_AUTH_TOKEN="${NPM_AUTH_TOKEN}" \
          --build-arg CLI_VERSION="${CLI_VERSION}" \
          -t skills-image:latest \
          -f tests/docker/Dockerfile \
          .
    # ROPC bot-user auth via this repo's vendored script (mirrors the VM
    # cron's ROPC flow). Required because `uip flow debug` needs a real user
    # `sub` claim — s2s/client_credentials tokens lack one. Writes
    # ~/.uipath/.auth which experiments/nightly.yaml mounts into each eval
    # container.
    - name: Mint UiPath bot token (ROPC)
      env:
        UIPATH_URL: https://alpha.uipath.com
        UIPATH_ORGANIZATION_NAME: ${{ secrets.UIPATH_ORG_NAME }}
        UIPATH_ORGANIZATION_ID: ${{ secrets.UIPATH_ORG_ID }}
        UIPATH_TENANT_NAME: ${{ secrets.UIPATH_TENANT_NAME }}
        UIPATH_TENANT_ID: ${{ secrets.UIPATH_TENANT_ID }}
        CLIENT_ID: ${{ secrets.UIPATH_ROPC_CLIENT_ID }}
        CLIENT_SECRET: ${{ secrets.UIPATH_ROPC_CLIENT_SECRET }}
        CE_USERNAME: ${{ secrets.UIPATH_BOT_USERNAME }}
        CE_PASSWORD: ${{ secrets.UIPATH_BOT_PASSWORD }}
      run: bash .github/scripts/refresh-auth.sh
    # TASK_GLOBS here is the partition-resolved list of *file paths* (not
    # globs). Word-splitting is intentional — do not quote.
    - name: Run coder-eval
      env:
        SKILLS_REPO_PATH: ${{ github.workspace }}
        API_BACKEND: bedrock
        AWS_BEARER_TOKEN_BEDROCK: ${{ secrets.AWS_BEARER_TOKEN_BEDROCK }}
        AWS_REGION: ${{ secrets.AWS_REGION }}
        BEDROCK_MODEL: ${{ secrets.BEDROCK_MODEL }}
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        TRACES_SMOKE_PROCESS_KEY: ${{ secrets.TRACES_SMOKE_PROCESS_KEY }}
        E2E_PROCESS_KEY: ${{ secrets.E2E_PROCESS_KEY }}
        E2E_LONG_PROCESS_KEY: ${{ secrets.E2E_LONG_PROCESS_KEY }}
        TASK_GLOBS: ${{ needs.partition.outputs.linux_globs }}
        TASK_COUNT: ${{ needs.partition.outputs.linux_count }}
        TASK_PARALLELISM: ${{ inputs.parallelism }}
        AGENT: ${{ inputs.agent }}
        CODEX_API_KEY: ${{ secrets.CODEX_API_KEY }}
        CODEX_BASE_URL: ${{ secrets.CODEX_BASE_URL }}
      working-directory: tests
      id: eval
      run: |
        # Guard the empty case (e.g. a future non-dispatch trigger): the
        # `parallelism` input default (4) only applies on workflow_dispatch.
        j="${TASK_PARALLELISM:-4}"
        # Agent selection. claude (default) takes the experiment YAML's
        # claude-code/sonnet config; codex overrides to gpt-5.4 and
        # authenticates via CODEX_API_KEY/CODEX_BASE_URL. Driver is
        # unchanged (docker here).
        agent_flags=""
        if [ "$AGENT" = "codex" ]; then
          agent_flags="--type codex --model gpt-5.4"
        fi
        echo "Running: coder-eval run $TASK_GLOBS -e experiments/nightly.yaml $agent_flags -j $j ($TASK_COUNT task files)"
        coder-eval run $TASK_GLOBS \
          -e experiments/nightly.yaml \
          $agent_flags \
          -j "$j" -v \
          --run-dir /tmp/runs
    # Best-effort: make the run tree readable by the runner user before the
    # clean-up / upload steps touch it; failures are ignored.
    - name: Fix permissions on /tmp/runs
      if: always()
      run: sudo chown -R $(id -u):$(id -g) /tmp/runs && sudo chmod -R 755 /tmp/runs 2>/dev/null || true
    # Strip heavy task scratch dirs before upload to keep artifact size
    # manageable. Matches smoke-skills.yml.
    - name: Clean up artifacts (remove .venv and node_modules)
      if: always()
      run: |
        find /tmp/runs -type d \( -name .venv -o -name node_modules \) -path "*/artifacts/*" -exec rm -rf {} + || true
    - name: Upload eval report
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: coder-eval-linux-${{ github.run_id }}
        # ** so the pattern matches the replicate-index segment that
        # coder_eval adds to per-task run dirs
        # (/tmp/runs/<ts>/<variant>/<task>/<NN>/).
        path: |
          /tmp/runs/*/experiment.html
          /tmp/runs/*/experiment.md
          /tmp/runs/*/experiment.json
          /tmp/runs/*/experiment.log
          /tmp/runs/*/*/variant.html
          /tmp/runs/*/*/variant.md
          /tmp/runs/*/*/variant.json
          /tmp/runs/**/task.html
          /tmp/runs/**/task.json
          /tmp/runs/**/task.log
          /tmp/runs/**/artifacts
        if-no-files-found: warn
        retention-days: 14
    # Prints one `<task_id>: <final_status>` line per task.json in the
    # newest run directory, mirrors it into the step summary, and fails the
    # step unless at least one task ran and all of them are SUCCESS.
    - name: Plaintext summary + verdict
      if: always()
      run: |
        set -uo pipefail
        run_dir=$(ls -td /tmp/runs/*/ 2>/dev/null | head -n 1 || true)
        if [ -z "$run_dir" ]; then
          echo "FAIL — no run directory produced" | tee -a "$GITHUB_STEP_SUMMARY"
          exit 1
        fi
        python - "$run_dir" <<'PY' | tee -a "$GITHUB_STEP_SUMMARY"
        import json, pathlib, sys
        rows = [json.loads(p.read_text(encoding='utf-8')) for p in sorted(pathlib.Path(sys.argv[1]).rglob('task.json'))]
        for r in rows:
            print(f"{r.get('task_id','?')}: {r.get('final_status','?')}")
        ok = sum(1 for r in rows if r.get('final_status') == 'SUCCESS')
        print(f"\n{ok}/{len(rows)} PASS (linux)")
        sys.exit(0 if rows and ok == len(rows) else 1)
        PY
run-windows:
  needs: partition
  # Skipped entirely when the partition found no windows-tagged task files.
  if: needs.partition.outputs.windows_count != '0'
  runs-on: windows-latest
  # Windows RPA tasks spin up Helm on each `uip rpa` call (30–60s each)
  # and run j=1 (Helm state leaks between concurrent tasks). Budget
  # mirrors smoke-rpa-skills.yml plus headroom for ad-hoc / e2e runs.
  timeout-minutes: 240
  name: Run coder-eval (Windows)
  # Disable uip CLI version-sync: tempdir tasks share one host ~/.uipath /
  # global uip, so a `uip login status` re-pin downgrades later tasks.
  # Mirrors daily.sh.
  env:
    UIPATH_CLI_DISABLE_VERSION_SYNC: "1"
  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        python-version: '3.13'
    - uses: astral-sh/setup-uv@v4
    - uses: actions/setup-node@v4
      with:
        node-version: '20'
    - uses: actions/setup-dotnet@v4
      with:
        dotnet-version: '8.0.x'
    # Host coder-eval CLI = pinned wheel (coder_eval feed + ml-packages for
    # deps). Windows tasks run under the tempdir driver -- no agent image.
    - name: Install coder-eval
      shell: bash
      env:
        UV_EXTRA_INDEX_URL: "https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@pkgs.dev.azure.com/uipath/ML%20Platform/_packaging/coder_eval/pypi/simple/ https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@uipath.pkgs.visualstudio.com/_packaging/ml-packages/pypi/simple/"
        CE_VERSION: ${{ needs.partition.outputs.coder_eval_version }}
        AGENT: ${{ inputs.agent }}
      run: |
        uv pip install --system "coder-eval==${CE_VERSION}"
        # Windows runs the tempdir driver (agent on host), so the codex SDK
        # installed here is what the codex agent actually uses. openai-codex
        # 0.1.0b3 (verified against coder-eval 0.6.2) pulls a pre-release
        # cli-bin that spans both internal feeds, so allow pre-releases and
        # best-match across indexes.
        if [ "$AGENT" = "codex" ]; then
          uv pip install --system --prerelease=allow --index-strategy unsafe-best-match \
            "openai-codex==0.1.0b3"
        fi
    # The feed password reaches the shell through `env:` instead of being
    # expanded into the script text.
    - name: Configure NuGet feed for Helm packages
      shell: bash
      env:
        NUGET_FEED_PASSWORD: ${{ secrets.UV_INDEX_UIPATH_PASSWORD }}
      run: |
        dotnet nuget add source \
          "https://uipath.pkgs.visualstudio.com/Public.Feeds/_packaging/UiPath-Internal/nuget/v3/index.json" \
          --name UiPath-Internal \
          --username az \
          --password "$NUGET_FEED_PASSWORD" \
          --store-password-in-clear-text
    # Two-feed install: `@uipath/cli` comes from the internal GitHub
    # Packages feed (so the `cli_version` input — e.g. `alpha`, pinned
    # prereleases — resolves the same way as the Linux Docker build).
    # `@uipath/rpa-tool` + `@uipath/rpa-legacy-tool` come from public npm
    # because the internal feed carries a divergent `1.0.0-alpha.*`
    # rpa-tool line under the same scope; pulling those from the public
    # feed (with the `^0.9` pin) keeps Helm on the known-good 0.9.x
    # line. See smoke-rpa-skills.yml for the full rationale.
    # NOTE(review): the install below actually requests `>=0.9`, which is
    # wider than the `^0.9` pin described above (it also admits 0.10+/1.x
    # stable releases). Confirm which one is intended.
    - name: Install uip CLI + RPA tools (GH Packages cli, public-npm tools)
      shell: bash
      env:
        CLI_VERSION: ${{ inputs.cli_version }}
        NPM_AUTH_TOKEN: ${{ secrets.GH_PAT }}
      run: |
        set -e
        install_dir="$(mktemp -d)"
        cd "$install_dir"
        npm config get registry
        # Step 1: install the CLI from GH Packages. Use a scratch
        # NPM_CONFIG_USERCONFIG so the @uipath→npm.pkg.github.com mapping
        # and auth token don't leak into other steps. Mirrors the
        # tests/docker/Dockerfile config (line 14-17).
        tmp_npmrc="$(mktemp)"
        cat > "$tmp_npmrc" <<EOF
        @uipath:registry=https://npm.pkg.github.com/
        //npm.pkg.github.com/:_authToken=${NPM_AUTH_TOKEN}
        EOF
        NPM_CONFIG_USERCONFIG="$tmp_npmrc" npm install -g "@uipath/cli@${CLI_VERSION}"
        rm -f "$tmp_npmrc"
        uip --version
        # Step 2: install the RPA tools from public npm. CLI-flag scope
        # override beats any lingering global config, so this stays on
        # the public registry regardless of runner image defaults.
        npm install -g \
          --@uipath:registry=https://registry.npmjs.org/ \
          "@uipath/rpa-tool@>=0.9" \
          @uipath/rpa-legacy-tool@latest
        echo "--- Installed @uipath/* packages ---"
        npm ls -g --depth=0 2>/dev/null | grep '@uipath/' || true
        echo "--- uip tools list ---"
        uip tools list --output json
    # Pre-auth uip as a real licensed Studio user via Studio's e2e helper.
    # Required for Helm's HelmLicenseSkuFeatureSourceService gate, which
    # client_credentials principals don't clear. See smoke-rpa-skills.yml
    # for the full rationale.
    - name: Install Puppeteer (auth helper dep)
      shell: bash
      run: npm install --no-save puppeteer
    # Up to three login attempts, 5s apart; the step fails after the third.
    - name: Authenticate uip as licensed Studio user
      id: auth
      shell: bash
      env:
        AUTHORITY: https://alpha.uipath.com
        EMAIL: ${{ secrets.UIPATH_EMAIL }}
        PASSWORD: ${{ secrets.UIPATH_PASSWORD }}
        TENANT: ${{ secrets.UIPATH_TENANT }}
        ORG: ${{ secrets.UIPATH_ORG }}
        AUTH_DEBUG_DIR: ${{ github.workspace }}/auth-debug
      run: |
        set -euo pipefail
        mkdir -p "$AUTH_DEBUG_DIR"
        for attempt in 1 2 3; do
          echo "--- Auth attempt $attempt ---"
          if node .github/scripts/uipath-oauth-login.mjs; then
            break
          fi
          if [ "$attempt" = "3" ]; then
            echo "::error::Auth failed after 3 attempts"
            exit 1
          fi
          sleep 5
        done
        uip login status --output json
    - name: Upload auth-debug artifacts
      if: always() && steps.auth.outcome != 'skipped'
      uses: actions/upload-artifact@v4
      with:
        name: auth-debug-windows-${{ github.run_id }}
        path: ${{ github.workspace }}/auth-debug
        if-no-files-found: ignore
        retention-days: 7
    # Best-effort warm-up: both commands are allowed to fail.
    - name: Pre-warm Helm (download NuGet package before tests)
      shell: bash
      run: |
        mkdir -p /tmp/helm-warmup && cd /tmp/helm-warmup
        uip rpa list-instances --output json 2>&1 || true
        taskkill //F //IM UiPath.Studio.Helm.exe 2>/dev/null || true
    # Run tasks one at a time on Windows: Helm state leaks between tasks
    # (stale Studio session, locked NuGet cache), so the loop kills Helm
    # between tasks. j=1 by construction.
    #
    # Uses experiments/smoke-windows.yaml (the only existing Windows
    # tempdir experiment); per-task `run_limits` in the YAML extend the
    # smoke budget where needed (e.g. legacy/refactor_with_retry_scope_e2e
    # sets max_turns: 50).
    - name: Run coder-eval (Windows)
      env:
        SKILLS_REPO_PATH: ${{ github.workspace }}
        API_BACKEND: bedrock
        AWS_BEARER_TOKEN_BEDROCK: ${{ secrets.AWS_BEARER_TOKEN_BEDROCK }}
        AWS_REGION: ${{ secrets.AWS_REGION }}
        BEDROCK_MODEL: ${{ secrets.BEDROCK_MODEL }}
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        TASK_GLOBS: ${{ needs.partition.outputs.windows_globs }}
        AGENT: ${{ inputs.agent }}
        CODEX_API_KEY: ${{ secrets.CODEX_API_KEY }}
        CODEX_BASE_URL: ${{ secrets.CODEX_BASE_URL }}
        # Redirect the temp root off the default %TEMP%. On the GH-hosted
        # Windows runner %TEMP% resolves to its 8.3 short form
        # (C:\Users\RUNNER~1\AppData\Local\Temp). coder-eval creates each
        # agent sandbox under %TEMP%, so the sandbox path inherits the
        # `RUNNER~1` tilde segment — and Claude Code's permission layer
        # rejects writes to paths containing that 8.3 short-name pattern as
        # "suspicious", blocking the agent's own file edits until the 1200s
        # turn watchdog hard-kills it (false ERROR). `C:\cetmp` is ≤8 chars
        # with no spaces, so Windows generates no `~1` alias for it.
        TMP: C:\cetmp
        TEMP: C:\cetmp
      working-directory: tests
      id: eval
      shell: bash
      run: |
        shopt -s globstar nullglob
        # Create the tilde-free temp root pinned via TMP/TEMP above.
        mkdir -p /c/cetmp
        # Agent selection. claude (default) takes the experiment YAML config;
        # codex overrides to gpt-5.4 (auth via CODEX_API_KEY/CODEX_BASE_URL).
        # Driver is unchanged (tempdir here).
        agent_flags=""
        if [ "$AGENT" = "codex" ]; then
          agent_flags="--type codex --model gpt-5.4"
        fi
        overall_exit=0
        # TASK_GLOBS is the partition-resolved, space-separated file list;
        # word-splitting is intentional.
        for task in $TASK_GLOBS; do
          echo "--- Killing leftover Helm/Studio processes ---"
          taskkill //F //IM UiPath.Studio.Helm.exe 2>/dev/null || true
          echo "--- Running: $task ---"
          for attempt in 1 2 3; do
            echo "--- $task attempt $attempt ---"
            if coder-eval run "$task" \
              -e experiments/smoke-windows.yaml $agent_flags -j 1 -v; then
              break
            fi
            # Only retry on Bedrock content-filter ERRORs. Real task
            # failures are real signal — don't burn retries on them.
            latest_json="$(ls -t runs/*/default/*/00/task.json 2>/dev/null | head -n 1 || true)"
            if [ -n "$latest_json" ] && LATEST_JSON="$latest_json" python -c "import json,os,sys; d=json.load(open(os.environ['LATEST_JSON'])); det=d.get('error_details') or {}; msg=json.dumps(det)+(d.get('error_message') or ''); sys.exit(0 if d.get('final_status')=='ERROR' and 'content filter' in msg.lower() else 2)"; then
              # Out of attempts: record the failure and say so, instead of
              # announcing a retry that will never happen.
              if [ "$attempt" = "3" ]; then
                echo "::error::$task: content-filter error persisted after 3 attempts"
                overall_exit=1
                break
              fi
              echo "::warning::$task: content-filter error on attempt $attempt — retrying"
              continue
            fi
            overall_exit=1
            break
          done
        done
        exit $overall_exit
    - name: Upload eval report
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: coder-eval-windows-${{ github.run_id }}
        path: |
          tests/runs/*/experiment.html
          tests/runs/*/experiment.md
          tests/runs/*/experiment.json
          tests/runs/*/experiment.log
          tests/runs/*/*/variant.html
          tests/runs/*/*/variant.md
          tests/runs/*/*/variant.json
          tests/runs/**/task.html
          tests/runs/**/task.json
          tests/runs/**/task.log
          tests/runs/**/artifacts
        if-no-files-found: warn
        retention-days: 14
    # Prints one `<task_id>: <final_status>` line per task, mirrors it into
    # the step summary, and fails the step unless at least one task ran and
    # all of them are SUCCESS.
    - name: Plaintext summary + verdict
      if: always()
      shell: bash
      run: |
        set -uo pipefail
        # The eval loop above invokes `coder-eval run` once per task (and
        # once more per retry), and each invocation appears to write its
        # own tests/runs/<ts>/ directory. Reading only the newest directory
        # would report just the last invocation, so aggregate over all of
        # them.
        if ! ls -d tests/runs/*/ >/dev/null 2>&1; then
          echo "FAIL — no run directory produced" | tee -a "$GITHUB_STEP_SUMMARY"
          exit 1
        fi
        python - tests/runs <<'PY' | tee -a "$GITHUB_STEP_SUMMARY"
        import json, pathlib, sys
        root = pathlib.Path(sys.argv[1])
        # Oldest first, so a later (retried) result for the same
        # <variant>/<task>/<NN> slot replaces the earlier attempt.
        latest = {}
        for p in sorted(root.rglob('task.json'), key=lambda q: q.stat().st_mtime):
            r = json.loads(p.read_text(encoding='utf-8'))
            # Drop the leading <ts> segment so attempts from different
            # invocations map to the same key.
            latest[(p.relative_to(root).parts[1:], r.get('task_id'))] = r
        rows = [latest[k] for k in sorted(latest, key=str)]
        for r in rows:
            print(f"{r.get('task_id','?')}: {r.get('final_status','?')}")
        ok = sum(1 for r in rows if r.get('final_status') == 'SUCCESS')
        print(f"\n{ok}/{len(rows)} PASS (windows)")
        sys.exit(0 if rows and ok == len(rows) else 1)
        PY