# .github/workflows/bench.yml (chatek/pi_agent_rust, main branch)
# Benchmark CI workflow
# Runs on pushes to main and PRs to detect performance regressions
name: Benchmarks

# Triggers: pushes to main, pull requests targeting main, and manual runs.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  # Allow manual triggering
  workflow_dispatch:

# Workflow-wide environment, visible to every step of every job.
env:
  # Always emit coloured cargo output.
  CARGO_TERM_COLOR: always
  # Turn every rustc warning into a hard error.
  RUSTFLAGS: "-D warnings"

jobs:
  # Builds the release binary, runs the benchmark suites, enforces the size
  # and perf budgets, and uploads the raw results as an artifact.
  benchmark:
    runs-on: ubuntu-latest
    # Unless a step overrides it, `run` scripts execute with bash inside
    # pi_agent_rust/, the directory the checkout step below populates.
    defaults:
      run:
        working-directory: pi_agent_rust
        shell: bash
    steps:
      # Runs from / because the default working directory (pi_agent_rust) does
      # not exist until the checkout step has run.
      - name: Free disk space
        working-directory: /
        run: |
          set -euxo pipefail
          df -h /
          # Preinstalled toolchains and SDKs this workflow never uses.
          reclaim=(
            /usr/share/dotnet
            /usr/local/lib/android
            /opt/ghc
            /opt/hostedtoolcache/CodeQL
            /opt/hostedtoolcache/go
            /opt/hostedtoolcache/Python
            /opt/hostedtoolcache/Ruby
            /opt/hostedtoolcache/Java_Temurin-Hotspot_jdk
            /usr/local/share/powershell
            /usr/share/swift
            /usr/local/graalvm
            /usr/local/.ghcup
            /usr/local/julia*
          )
          sudo rm -rf "${reclaim[@]}"
          # Best-effort cleanups: a failure here must not fail the job.
          sudo docker image prune --all --force || true
          sudo apt-get clean || true
          df -h /

      # Check the repository out into ./pi_agent_rust, which is the job's
      # default working directory for all `run` steps.
      - name: Checkout pi_agent_rust
        uses: actions/checkout@v4
        with:
          path: pi_agent_rust

      - name: Install system deps (bc, fd, rg, xcb) [linux]
        run: |
          set -euxo pipefail
          sudo apt-get update
          sudo apt-get install -y \
            bc fd-find ripgrep \
            libxcb1-dev libxcb-render0-dev libxcb-shape0-dev libxcb-xfixes0-dev
          # Expose the fdfind executable under the name `fd` as well.
          sudo ln -sf "$(command -v fdfind)" /usr/local/bin/fd

      # Install the nightly Rust toolchain together with the clippy component.
      - name: Install Rust nightly
        uses: dtolnay/rust-toolchain@nightly
        with:
          components: clippy

      # Cache cargo's registry and git checkouts plus the build output
      # directory. The primary key is the runner OS plus a hash of every
      # Cargo.lock in the workspace; restore-keys allow falling back to an
      # older cache that shares one of the listed prefixes.
      - name: Cache cargo registry
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            pi_agent_rust/target
          key: ${{ runner.os }}-cargo-bench-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-cargo-bench-
            ${{ runner.os }}-cargo-

      - name: Standardize benchmark environment
        run: |
          set -euxo pipefail

          # Best-effort: tune the OS for benchmarking. Some settings cannot be
          # applied inside containers, so a failure only raises a warning.
          sudo scripts/bench_env_setup.sh apply || echo "::warning::Could not fully apply bench env (expected in containers)"

          # Record the environment fingerprint next to the other perf artifacts.
          mkdir -p target/perf
          scripts/bench_env_setup.sh fingerprint | tee target/perf/bench_env_fingerprint.json || true

          # Publish the validation report, with ANSI colour codes stripped, as
          # a fenced block in the job summary.
          {
            echo "## Benchmark Environment"
            echo '```'
            scripts/bench_env_setup.sh validate 2>&1 | sed 's/\x1b\[[0-9;]*m//g' || true
            echo '```'
          } >> "$GITHUB_STEP_SUMMARY"

      # Produce the release build; later steps read target/release/pi.
      - name: Build release binary
        run: cargo build --release

      # Fail the job when the release binary exceeds the size budget.
      - name: Check binary size
        run: |
          set -euo pipefail
          BUDGET_MB=20
          BUDGET_BYTES=$((BUDGET_MB * 1024 * 1024))
          SIZE_BYTES=$(stat --printf="%s" target/release/pi)
          # Display value only. The budget check below compares exact byte
          # counts, because a two-decimal MB figure is truncated and would let
          # a binary slightly over the budget pass. printf also restores the
          # leading zero that bc omits for values below 1.
          SIZE_MB=$(printf '%.2f' "$(echo "scale=4; $SIZE_BYTES / 1024 / 1024" | bc)")
          {
            echo "## Binary Size"
            echo "- Size: ${SIZE_MB}MB"
            echo "- Budget: ${BUDGET_MB}MB"
          } >> "$GITHUB_STEP_SUMMARY"
          if [ "$SIZE_BYTES" -gt "$BUDGET_BYTES" ]; then
            echo "::error::Binary size ${SIZE_MB}MB exceeds ${BUDGET_MB}MB budget"
            exit 1
          fi
          echo "- Status: ✅ Within budget" >> "$GITHUB_STEP_SUMMARY"

      # Run the `tools` bench target; --noplot is passed to the bench binary.
      - name: Run micro-benchmarks (tools)
        run: cargo bench --bench tools -- --noplot

      # Run the `extensions` bench target; --noplot is passed to the bench binary.
      - name: Run micro-benchmarks (extensions)
        run: cargo bench --bench extensions -- --noplot

      # Run the `system` bench target with PI_BENCH_BINARY set to the release
      # binary built earlier.
      # NOTE(review): the path is relative; presumably the bench resolves it
      # against the working directory (pi_agent_rust) - confirm.
      - name: Run system benchmarks
        run: cargo bench --bench system -- --noplot
        env:
          PI_BENCH_BINARY: target/release/pi

      # Run the `tui_perf` bench target and save its results under the
      # baseline name "pr".
      - name: Run TUI performance benchmarks (PERF-8/9)
        run: cargo bench --bench tui_perf -- --noplot --save-baseline pr

      - name: Generate PiJS workload perf data (JSONL)
        run: |
          set -euxo pipefail
          PERF_PROFILE=perf
          out_dir="target/perf/${PERF_PROFILE}"
          out_file="${out_dir}/pijs_workload_${PERF_PROFILE}.jsonl"
          mkdir -p "$out_dir"
          # One workload run per tool-call count, 1 then 10; the output of
          # both runs lands, in that order, in a single freshly written file.
          for tool_calls in 1 10; do
            PI_BENCH_BUILD_PROFILE="${PERF_PROFILE}" cargo run --profile "${PERF_PROFILE}" --bin pijs_workload -- --iterations 2000 --tool-calls "$tool_calls"
          done > "$out_file"

      # Render the mean estimate of selected Criterion benchmark groups into
      # the job summary.
      - name: Generate benchmark summary
        run: |
          set -euo pipefail

          # Print one "- <benchmark>: <mean>ns" bullet for every benchmark of
          # the Criterion group $1 that has a new/estimates.json file. Prints
          # nothing when the group directory is absent. A benchmark whose
          # estimate cannot be read is reported as "N/A".
          summarize_group() {
            local group_dir="target/criterion/$1"
            local bench_dir estimates name mean
            [ -d "$group_dir" ] || return 0
            for bench_dir in "$group_dir"/*/; do
              estimates="${bench_dir}new/estimates.json"
              [ -f "$estimates" ] || continue
              name=$(basename "$bench_dir")
              # -e makes jq exit non-zero when the field is null or missing, so
              # the unit suffix is only attached to a real number.
              if mean=$(jq -er '.mean.point_estimate' "$estimates" 2>/dev/null); then
                echo "- $name: ${mean}ns"
              else
                echo "- $name: N/A"
              fi
            done
          }

          {
            echo "## Benchmark Results"
            echo ""
            echo "### Startup Time"
            summarize_group startup
            echo ""
            echo "### Truncation"
            summarize_group truncation
            echo ""
            echo "### TUI Rendering (PERF-8)"
            for group in build_conversation_content view viewport_operations markdown_rendering; do
              if [ -d "target/criterion/$group" ]; then
                echo "**$group**"
                summarize_group "$group"
              fi
            done
          } >> "$GITHUB_STEP_SUMMARY"

      # Run the perf_budgets integration test target; --nocapture keeps the
      # test's output visible in the job log.
      - name: Perf budget gate
        run: cargo test --test perf_budgets -- --nocapture

      # Runs even when an earlier step failed (always()); `|| true` keeps a
      # failing restore from failing the job.
      - name: Restore benchmark environment
        if: always()
        run: sudo scripts/bench_env_setup.sh restore || true

      # Upload the Criterion output and target/perf data even when the job
      # failed. The artifact name carries the commit SHA and the artifact is
      # kept for 30 days.
      - name: Upload benchmark results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results-${{ github.sha }}
          path: |
            pi_agent_rust/target/criterion/
            pi_agent_rust/target/perf/
          retention-days: 30

  # Compare with main branch baseline (only on PRs)
  # Runs only for pull_request events and only after the benchmark job has
  # succeeded. No default working directory is set here; each step that needs
  # one of the two checkouts selects it explicitly.
  compare:
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    needs: benchmark
    defaults:
      run:
        shell: bash
    steps:
      # Check out the main branch into ./main-branch to produce the baseline.
      - uses: actions/checkout@v4
        with:
          ref: main
          path: main-branch

      # Check out the ref that triggered this run (no explicit `ref`) into
      # ./pr-branch.
      - uses: actions/checkout@v4
        with:
          path: pr-branch

      - name: Install system deps (xcb) [linux]
        run: |
          set -euxo pipefail
          sudo apt-get update
          # XCB development packages.
          sudo apt-get install -y \
            libxcb1-dev \
            libxcb-render0-dev \
            libxcb-shape0-dev \
            libxcb-xfixes0-dev

      # Install the nightly Rust toolchain (no extra components in this job).
      - name: Install Rust nightly
        uses: dtolnay/rust-toolchain@nightly

      # Cache only cargo's registry and git checkouts; neither checkout's
      # target directory is cached. There are no restore-keys, so only an
      # exact key match (runner OS + Cargo.lock hash) restores a cache.
      - name: Cache cargo registry
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
          key: ${{ runner.os }}-cargo-compare-${{ hashFiles('**/Cargo.lock') }}

      # Install critcmp, which the comparison step below invokes.
      - name: Install critcmp
        run: cargo install critcmp

      - name: Build and benchmark main
        working-directory: main-branch
        run: |
          cargo build --release
          # Best-effort: save each bench target's results as baseline "main".
          # stderr is discarded and a failing bench does not fail the step.
          for bench in tools tui_perf; do
            cargo bench --bench "$bench" -- --noplot --save-baseline main 2>/dev/null || true
          done

      - name: Copy main baseline to PR target dir
        run: |
          set -euxo pipefail
          baseline_src=main-branch/target/criterion
          baseline_dst=pr-branch/target/criterion
          # Nothing to copy when the main benchmarks produced no output.
          if [ -d "$baseline_src" ]; then
            mkdir -p "$baseline_dst"
            cp -R "$baseline_src/." "$baseline_dst/"
          fi

      # Benchmark the PR checkout and save the results as the named Criterion
      # baseline "pr", next to the "main" baseline copied in by the previous
      # step. The comparison step needs both names to exist; running with
      # `--baseline main` alone never creates a baseline called "pr".
      # Best-effort: stderr is discarded and a failing bench does not fail the
      # step.
      - name: Build and benchmark PR
        working-directory: pr-branch
        run: |
          cargo build --release
          cargo bench --bench tools -- --noplot --save-baseline pr 2>/dev/null || echo "Baseline comparison skipped"
          cargo bench --bench tui_perf -- --noplot --save-baseline pr 2>/dev/null || echo "TUI baseline comparison skipped"

      # Regression gate. For every benchmark that has both a "main" and a "pr"
      # baseline, compare the mean point estimates that Criterion stores in
      # estimates.json and fail when the PR is slower than main by more than
      # PERF_REGRESSION_THRESHOLD percent (repository variable, default 20).
      # critcmp is only used to render a readable table: its --threshold
      # option filters which rows are shown and does not report regressions
      # through the exit status.
      - name: Compare TUI performance (fail on >20% regression)
        working-directory: pr-branch
        env:
          PERF_REGRESSION_THRESHOLD: ${{ vars.PERF_REGRESSION_THRESHOLD || '20' }}
        run: |
          set -euo pipefail

          threshold="${PERF_REGRESSION_THRESHOLD}"
          if ! [[ "$threshold" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
            echo "::error::PERF_REGRESSION_THRESHOLD must be a non-negative number, got '${threshold}'"
            exit 1
          fi

          compared=0
          regressions=0
          details=""
          while IFS= read -r -d '' pr_estimates; do
            bench_dir="${pr_estimates%/pr/estimates.json}"
            main_estimates="${bench_dir}/main/estimates.json"
            # Benchmarks that exist on only one side cannot be compared.
            [ -f "$main_estimates" ] || continue
            main_mean=$(jq -er '.mean.point_estimate' "$main_estimates" 2>/dev/null) || continue
            pr_mean=$(jq -er '.mean.point_estimate' "$pr_estimates" 2>/dev/null) || continue
            compared=$((compared + 1))
            # awk exits 0 (shell "true") only when the PR mean exceeds the
            # main mean by more than the threshold percentage.
            if awk -v m="$main_mean" -v p="$pr_mean" -v t="$threshold" \
                'BEGIN { exit !(m > 0 && p > m * (1 + t / 100)) }'; then
              regressions=$((regressions + 1))
              change=$(awk -v m="$main_mean" -v p="$pr_mean" 'BEGIN { printf "%+.1f", (p - m) / m * 100 }')
              details+="- ${bench_dir#target/criterion/}: ${change}% (main ${main_mean}ns, pr ${pr_mean}ns)"$'\n'
            fi
          done < <(find target/criterion -type f -path '*/pr/estimates.json' -print0 2>/dev/null | sort -z)

          status=0
          {
            echo "## TUI Performance Comparison"
            echo ""
            if [ "$compared" -eq 0 ]; then
              # The bench steps are best-effort, so missing data is reported
              # but does not fail the job.
              echo "⚠️ No benchmark has both a main and a pr baseline; regression check skipped"
            elif [ "$regressions" -gt 0 ]; then
              echo "❌ TUI performance regression detected (>${threshold}%)"
              echo ""
              printf '%s' "$details"
            else
              echo "✅ No TUI performance regressions detected (threshold: ${threshold}%)"
            fi
            if [ "$compared" -gt 0 ]; then
              # Informational table only; its exit status is ignored.
              echo ""
              echo '```'
              critcmp main pr 2>/dev/null || true
              echo '```'
            fi
          } >> "$GITHUB_STEP_SUMMARY"

          if [ "$compared" -eq 0 ]; then
            echo "::warning::No benchmark has both a main and a pr baseline; regression check skipped"
          elif [ "$regressions" -gt 0 ]; then
            echo "::error::${regressions} of ${compared} benchmarks regressed by more than ${threshold}%"
            status=1
          fi
          exit "$status"