From 0623c1f019781e8f821e91e04c5700f02f40261c Mon Sep 17 00:00:00 2001 From: Jack Champagne Date: Wed, 20 May 2026 03:57:04 -0400 Subject: [PATCH 1/4] benchmark: alloc profile testitem (Ipopt + MadNLP, bilinear N=51) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires the new HarmoniqsBenchmarks.jl (HBJ) analyzer (`benchmark_memory!` + `report_alloc_profile`) into the benchmark suite as a fourth testitem. Produces JLD2 allocation-profile artifacts under `benchmark/results/allocs/` (already covered by the existing workflow's `benchmark/results/` upload) and prints a top-K breakdown by type / leaf call site / frame for each solver. The testitem uses the same bilinear N=51 problem as the existing Ipopt-vs-MadNLP timing testitem so allocation hotspots line up with the timing numbers. `sample_rate = 0.01` keeps the trace tractable — `Profile.Allocs` slows the solve roughly linearly in the number of allocations, and the bilinear solve produces millions of allocs; full-rate sampling on N=10 hung >15 min in earlier experiments (cf. closed DTO#71). The 1/sample_rate extrapolation applied by `report_alloc_profile` scales the sampled counts back up to estimated totals. Bumps HBJ pin from 5401542c (v0.2.0 prep) to c38418cb (post-#12, analyzer + Piccolo-aligned CI) since the analyzer's exports (`top_alloc_types`, `report_alloc_profile`, …) didn't exist at the v0.2.0 prep commit. Other consumers of the bench env (timing, scaling) already work against c38418cb — no behavioral change for them. Stacked on `benchmarks/directtrajopt-initial-v2` (PR #75) so reviewers see only this testitem's diff. Will retarget to main when #75 lands. 
--- benchmark/Project.toml | 6 ++- benchmark/alloc_profile.jl | 86 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 benchmark/alloc_profile.jl diff --git a/benchmark/Project.toml b/benchmark/Project.toml index 2255f41..f65629f 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -20,4 +20,8 @@ DirectTrajOpt = {path = ".."} # results are reproducible. Bump this SHA (and the local Manifest) when HBJ # ships a new feature we want to use. Drop in favor of [compat] once HBJ # registers in General. -HarmoniqsBenchmarks = {url = "https://github.com/harmoniqs/HarmoniqsBenchmarks.jl", rev = "5401542c477c0f2da6d66028c513e8a278f4875f"} +# +# Bumped from 5401542c (v0.2.0 prep) to c38418cb (post-#12) to pick up the +# alloc profile analyzer (`top_alloc_types`, `report_alloc_profile`, …) +# used by `benchmark/alloc_profile.jl`. +HarmoniqsBenchmarks = {url = "https://github.com/harmoniqs/HarmoniqsBenchmarks.jl", rev = "c38418cb7f932f2ff9a9c6c6eacf9a11ff1018c1"} diff --git a/benchmark/alloc_profile.jl b/benchmark/alloc_profile.jl new file mode 100644 index 0000000..92d8e37 --- /dev/null +++ b/benchmark/alloc_profile.jl @@ -0,0 +1,86 @@ +using TestItems + +@testitem "Alloc profile: bilinear N=51 (Ipopt + MadNLP)" begin + using HarmoniqsBenchmarks, DirectTrajOpt, NamedTrajectories + using SparseArrays, ExponentialAction, Random, Dates + import MadNLP + + include("$(joinpath(@__DIR__, "problem_utils.jl"))") + + runner = get(ENV, "BENCHMARK_RUNNER", "local") + + # `Profile.Allocs` slows the solve linearly in number of allocations and + # the bilinear problem allocates millions per solve — `sample_rate = 1.0` + # is intractable for a full Ipopt/MadNLP run (>15 min on N=10 in early + # experiments). `0.01` keeps the trace tractable while still giving + # statistically useful per-frame breakdowns. The 1/sample_rate scaling + # applied by `report_alloc_profile` extrapolates back to total bytes. 
+ sample_rate = 0.01 + + # JIT warmup so first-call compile of Ipopt/MadNLP extensions, KKT/AD + # codegen, and the Profile.Allocs machinery itself doesn't dominate the + # sampled trace. Discard the warmup results. + let warmup_prob = make_bilinear_problem(; N = 11, seed = 0) + DirectTrajOpt.solve!( + warmup_prob; + options = IpoptOptions(max_iter = 2, print_level = 0), + ) + end + let warmup_prob = make_bilinear_problem(; N = 11, seed = 0) + DirectTrajOpt.solve!( + warmup_prob; + options = MadNLPOptions(max_iter = 2, print_level = 6), + ) + end + + results_dir = joinpath(@__DIR__, "results", "allocs") + pdims = problem_dims(make_bilinear_problem(; N = 51, seed = 42)) + + # Ipopt + let prob = make_bilinear_problem(; N = 51, seed = 42) + profile = benchmark_memory!( + () -> DirectTrajOpt.solve!( + prob; + options = IpoptOptions(max_iter = 200, print_level = 0), + ); + package = "DirectTrajOpt", + solver = "Ipopt", + benchmark_name = "alloc_bilinear_N51_ipopt", + N = 51, + state_dim = pdims.state_dim, + control_dim = pdims.control_dim, + sample_rate = sample_rate, + warmup = false, # we did our own warmup above + runner = runner, + ) + path = save_alloc_profile(results_dir, profile.benchmark_name, profile) + println("\n=== Alloc profile: Ipopt (bilinear N=51, sample_rate=$sample_rate) ===") + println(" samples=$(profile.total_count) total≈$(profile.total_bytes) B") + println(" saved $path") + report_alloc_profile(profile; k_types = 10, k_leaves = 15, k_frames = 15) + end + + # MadNLP + let prob = make_bilinear_problem(; N = 51, seed = 42) + profile = benchmark_memory!( + () -> DirectTrajOpt.solve!( + prob; + options = MadNLPOptions(max_iter = 200, print_level = 6), + ); + package = "DirectTrajOpt", + solver = "MadNLP", + benchmark_name = "alloc_bilinear_N51_madnlp", + N = 51, + state_dim = pdims.state_dim, + control_dim = pdims.control_dim, + sample_rate = sample_rate, + warmup = false, + runner = runner, + ) + path = save_alloc_profile(results_dir, 
profile.benchmark_name, profile) + println("\n=== Alloc profile: MadNLP (bilinear N=51, sample_rate=$sample_rate) ===") + println(" samples=$(profile.total_count) total≈$(profile.total_bytes) B") + println(" saved $path") + report_alloc_profile(profile; k_types = 10, k_leaves = 15, k_frames = 15) + end +end From a0361c396c91e364506c21ef54dd0d2b5749f17d Mon Sep 17 00:00:00 2001 From: Jack Champagne Date: Wed, 20 May 2026 11:22:06 -0400 Subject: [PATCH 2/4] benchmark(alloc): drop max_iter to 30, raise workflow timeout to 90 min --- .github/workflows/benchmark.yml | 2 +- benchmark/alloc_profile.jl | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 803ae0a..c256a8f 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -17,7 +17,7 @@ jobs: benchmark: name: Benchmark suite runs-on: ubuntu-latest - timeout-minutes: 60 + timeout-minutes: 90 permissions: actions: write contents: read diff --git a/benchmark/alloc_profile.jl b/benchmark/alloc_profile.jl index 92d8e37..812521a 100644 --- a/benchmark/alloc_profile.jl +++ b/benchmark/alloc_profile.jl @@ -9,12 +9,15 @@ using TestItems runner = get(ENV, "BENCHMARK_RUNNER", "local") - # `Profile.Allocs` slows the solve linearly in number of allocations and - # the bilinear problem allocates millions per solve — `sample_rate = 1.0` - # is intractable for a full Ipopt/MadNLP run (>15 min on N=10 in early - # experiments). `0.01` keeps the trace tractable while still giving - # statistically useful per-frame breakdowns. The 1/sample_rate scaling - # applied by `report_alloc_profile` extrapolates back to total bytes. + # `Profile.Allocs` slows the solve dramatically — `sample_rate = 1.0` is + # intractable for a full Ipopt/MadNLP run (>15 min on N=10 in early + # experiments), and even `0.01` runs MadNLP at ~3000× slowdown vs the + # un-profiled solve. 
`0.01` keeps the trace tractable while still giving + # statistically useful per-frame breakdowns; combined with `max_iter = 30` + # (representative per-iter allocation pattern — convergence isn't the + # goal) the testitem completes well inside the workflow timeout. The + # `1 / sample_rate` scaling applied by `report_alloc_profile` extrapolates + # back to total bytes. sample_rate = 0.01 # JIT warmup so first-call compile of Ipopt/MadNLP extensions, KKT/AD @@ -41,7 +44,7 @@ using TestItems profile = benchmark_memory!( () -> DirectTrajOpt.solve!( prob; - options = IpoptOptions(max_iter = 200, print_level = 0), + options = IpoptOptions(max_iter = 30, print_level = 0), ); package = "DirectTrajOpt", solver = "Ipopt", @@ -65,7 +68,7 @@ using TestItems profile = benchmark_memory!( () -> DirectTrajOpt.solve!( prob; - options = MadNLPOptions(max_iter = 200, print_level = 6), + options = MadNLPOptions(max_iter = 30, print_level = 6), ); package = "DirectTrajOpt", solver = "MadNLP", From d79ccf196eab6b4af008e0330539b68d31df744b Mon Sep 17 00:00:00 2001 From: Jack Champagne Date: Wed, 20 May 2026 13:42:55 -0400 Subject: [PATCH 3/4] ci(benchmark): raise workflow timeout 90 -> 180 min --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c256a8f..ac8469d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -17,7 +17,7 @@ jobs: benchmark: name: Benchmark suite runs-on: ubuntu-latest - timeout-minutes: 90 + timeout-minutes: 180 permissions: actions: write contents: read From 5daa0be6addcda76b71b046a07edf7be66f41b5a Mon Sep 17 00:00:00 2001 From: Jack Champagne Date: Thu, 21 May 2026 23:10:32 -0400 Subject: [PATCH 4/4] ci: split alloc-profile testitem into its own workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profile.Allocs adds ~30-40min per solve regardless of max_iter — 
local timing on a fast workstation showed 47m for the testitem alone, ~74m on GH Actions runners. That makes it impractical to gate every PR on the alloc profile (push to 180min timeout would burn ~115min of CI per benchmark/src/ change). Local hard data: bilinear N=51, max_iter=30, sample_rate=0.01: Ipopt section ~22min local / ~42min CI, MadNLP section ~25min local / ~32min CI. Solution: split into a dedicated alloc-profile workflow with a paths filter (benchmark/alloc_profile.jl, benchmark/problem_utils.jl, benchmark/Project.toml, .github/workflows/alloc-profile.yml). Main benchmark workflow filters the alloc-profile testitem out and reverts to timeout-minutes: 60 (now ~32min wall time again). The alloc-profile workflow gets its own 90min budget, uploads artifacts under benchmark/results/allocs/. TestItemRunner filter mirrors the same approach already used in test/runtests.jl to keep main-package CI from picking up benchmark testitems. --- .github/workflows/alloc-profile.yml | 57 +++++++++++++++++++++++++++++ .github/workflows/benchmark.yml | 9 +++-- 2 files changed, 63 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/alloc-profile.yml diff --git a/.github/workflows/alloc-profile.yml b/.github/workflows/alloc-profile.yml new file mode 100644 index 0000000..b7690a2 --- /dev/null +++ b/.github/workflows/alloc-profile.yml @@ -0,0 +1,57 @@ +name: Alloc Profile +on: + push: + tags: ['v*'] + pull_request: + paths: + - 'benchmark/alloc_profile.jl' + - 'benchmark/problem_utils.jl' + - 'benchmark/Project.toml' + - '.github/workflows/alloc-profile.yml' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} + +jobs: + alloc-profile: + name: Alloc profile (Ipopt + MadNLP) + runs-on: ubuntu-latest + # Profile.Allocs has high per-allocation overhead that doesn't scale down + # with sample_rate — each Ipopt/MadNLP solve under sampling takes ~30-40 + # min on GH 
Actions runners even at max_iter=30. Two solves + startup = + # ~75 min observed in practice; 90 min gives a comfortable cushion. + timeout-minutes: 90 + permissions: + actions: write + contents: read + steps: + - uses: actions/checkout@v6 + + - uses: julia-actions/setup-julia@v2 + with: + version: '1.11' + arch: x64 + + - uses: julia-actions/cache@v2 + + - name: Instantiate benchmark environment + run: julia --project=benchmark -e 'using Pkg; Pkg.instantiate()' + + - name: Run alloc profile + env: + BENCHMARK_RUNNER: github-actions + run: | + julia --project=benchmark -t auto -e ' + using TestItemRunner + TestItemRunner.run_tests("benchmark/"; filter = ti -> occursin("alloc_profile", ti.filename)) + ' + + - name: Upload alloc-profile artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: alloc-profile-${{ github.event.pull_request.number || github.ref_name }}-${{ github.sha }} + path: benchmark/results/allocs/ + retention-days: 90 diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index ac8469d..529fd5e 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -17,7 +17,7 @@ jobs: benchmark: name: Benchmark suite runs-on: ubuntu-latest - timeout-minutes: 180 + timeout-minutes: 60 permissions: actions: write contents: read @@ -34,13 +34,16 @@ jobs: - name: Instantiate benchmark environment run: julia --project=benchmark -e 'using Pkg; Pkg.instantiate()' - - name: Run benchmarks + - name: Run benchmarks (excluding alloc profile) env: BENCHMARK_RUNNER: github-actions run: | julia --project=benchmark -t auto -e ' using TestItemRunner - TestItemRunner.run_tests("benchmark/") + # Alloc profile testitem runs in `.github/workflows/alloc-profile.yml` + # because Profile.Allocs adds ~30-40min per solve regardless of + # max_iter, making it impractical to gate every PR on it. 
+ TestItemRunner.run_tests("benchmark/"; filter = ti -> !occursin("alloc_profile", ti.filename)) ' - name: Upload benchmark artifacts