diff --git a/.github/workflows/alloc-profile.yml b/.github/workflows/alloc-profile.yml
new file mode 100644
index 0000000..b7690a2
--- /dev/null
+++ b/.github/workflows/alloc-profile.yml
@@ -0,0 +1,66 @@
+name: Alloc Profile
+on:
+  push:
+    tags: ['v*']
+  pull_request:
+    paths:
+      - 'benchmark/alloc_profile.jl'
+      - 'benchmark/problem_utils.jl'
+      - 'benchmark/Project.toml'
+      - '.github/workflows/alloc-profile.yml'
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+
+jobs:
+  alloc-profile:
+    name: Alloc profile (Ipopt + MadNLP)
+    runs-on: ubuntu-latest
+    # Profile.Allocs has high per-allocation overhead that doesn't scale down
+    # with sample_rate — each Ipopt/MadNLP solve under sampling takes ~30-40
+    # min on GH Actions runners even at max_iter=30. Two solves + startup =
+    # ~75 min observed in practice; 90 min gives a comfortable cushion.
+    timeout-minutes: 90
+    permissions:
+      actions: write # NOTE(review): presumably so julia-actions/cache can prune stale caches — confirm
+      contents: read
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: '1.11'
+          arch: x64
+
+      - uses: julia-actions/cache@v2
+
+      - name: Instantiate benchmark environment
+        run: julia --project=benchmark -e 'using Pkg; Pkg.instantiate()'
+
+      - name: Run alloc profile
+        env:
+          BENCHMARK_RUNNER: github-actions
+        run: |
+          julia --project=benchmark -t auto -e '
+            using TestItemRunner
+            TestItemRunner.run_tests("benchmark/"; filter = ti -> occursin("alloc_profile", ti.filename))
+          '
+
+      # upload-artifact rejects names containing "/", and github.ref_name can
+      # contain one (e.g. a workflow_dispatch run on a `feature/x` branch).
+      - name: Sanitize artifact label
+        id: label
+        if: always()
+        env:
+          RAW_LABEL: ${{ github.event.pull_request.number || github.ref_name }}
+        run: echo "value=${RAW_LABEL//\//-}" >> "$GITHUB_OUTPUT"
+
+      - name: Upload alloc-profile artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: alloc-profile-${{ steps.label.outputs.value }}-${{ github.sha }}
+          path: benchmark/results/allocs/
+          retention-days: 90
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 803ae0a..529fd5e 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -34,13 +34,16 @@ jobs:
       - name: Instantiate benchmark environment
         run: julia --project=benchmark -e 'using Pkg; Pkg.instantiate()'
 
-      - name: Run benchmarks
+      - name: Run benchmarks (excluding alloc profile)
         env:
           BENCHMARK_RUNNER: github-actions
         run: |
           julia --project=benchmark -t auto -e '
             using TestItemRunner
-            TestItemRunner.run_tests("benchmark/")
+            # Alloc profile testitem runs in `.github/workflows/alloc-profile.yml`
+            # because Profile.Allocs adds ~30-40 min per solve even at
+            # max_iter=30, making it impractical to gate every PR on it.
+            TestItemRunner.run_tests("benchmark/"; filter = ti -> !occursin("alloc_profile", ti.filename))
           '
 
       - name: Upload benchmark artifacts
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index 2255f41..f65629f 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -20,4 +20,8 @@ DirectTrajOpt = {path = ".."}
 # results are reproducible. Bump this SHA (and the local Manifest) when HBJ
 # ships a new feature we want to use. Drop in favor of [compat] once HBJ
 # registers in General.
-HarmoniqsBenchmarks = {url = "https://github.com/harmoniqs/HarmoniqsBenchmarks.jl", rev = "5401542c477c0f2da6d66028c513e8a278f4875f"}
+#
+# Bumped from 5401542c (v0.2.0 prep) to c38418cb (post-#12) to pick up the
+# alloc profile analyzer (`top_alloc_types`, `report_alloc_profile`, …)
+# used by `benchmark/alloc_profile.jl`.
+HarmoniqsBenchmarks = {url = "https://github.com/harmoniqs/HarmoniqsBenchmarks.jl", rev = "c38418cb7f932f2ff9a9c6c6eacf9a11ff1018c1"}
diff --git a/benchmark/alloc_profile.jl b/benchmark/alloc_profile.jl
new file mode 100644
index 0000000..812521a
--- /dev/null
+++ b/benchmark/alloc_profile.jl
@@ -0,0 +1,89 @@
+using TestItems
+
+@testitem "Alloc profile: bilinear N=51 (Ipopt + MadNLP)" begin
+    using HarmoniqsBenchmarks, DirectTrajOpt, NamedTrajectories
+    using SparseArrays, ExponentialAction, Random, Dates
+    import MadNLP
+
+    include(joinpath(@__DIR__, "problem_utils.jl"))
+
+    runner = get(ENV, "BENCHMARK_RUNNER", "local")
+
+    # `Profile.Allocs` slows the solve dramatically — `sample_rate = 1.0` is
+    # intractable for a full Ipopt/MadNLP run (>15 min on N=10 in early
+    # experiments), and even `0.01` runs MadNLP at ~3000× slowdown vs the
+    # un-profiled solve. `0.01` keeps the trace tractable while still giving
+    # statistically useful per-frame breakdowns; combined with `max_iter = 30`
+    # (representative per-iter allocation pattern — convergence isn't the
+    # goal) the testitem completes inside the workflow timeout. The
+    # `1 / sample_rate` scaling applied by `report_alloc_profile` extrapolates
+    # back to total bytes.
+    sample_rate = 0.01
+
+    # JIT warmup so first-call compile of Ipopt/MadNLP extensions, KKT/AD
+    # codegen, and the Profile.Allocs machinery itself doesn't dominate the
+    # sampled trace. Discard the warmup results.
+    let warmup_prob = make_bilinear_problem(; N = 11, seed = 0)
+        DirectTrajOpt.solve!(
+            warmup_prob;
+            options = IpoptOptions(max_iter = 2, print_level = 0),
+        )
+    end
+    let warmup_prob = make_bilinear_problem(; N = 11, seed = 0)
+        DirectTrajOpt.solve!(
+            warmup_prob;
+            options = MadNLPOptions(max_iter = 2, print_level = 6), # NOTE(review): 6 is presumably MadNLP's quietest log level — confirm
+        )
+    end
+
+    results_dir = joinpath(@__DIR__, "results", "allocs")
+    pdims = problem_dims(make_bilinear_problem(; N = 51, seed = 42)) # built only to read its dimensions
+
+    # Ipopt
+    let prob = make_bilinear_problem(; N = 51, seed = 42)
+        profile = benchmark_memory!(
+            () -> DirectTrajOpt.solve!(
+                prob;
+                options = IpoptOptions(max_iter = 30, print_level = 0),
+            );
+            package = "DirectTrajOpt",
+            solver = "Ipopt",
+            benchmark_name = "alloc_bilinear_N51_ipopt",
+            N = 51,
+            state_dim = pdims.state_dim,
+            control_dim = pdims.control_dim,
+            sample_rate = sample_rate,
+            warmup = false, # we did our own warmup above
+            runner = runner,
+        )
+        path = save_alloc_profile(results_dir, profile.benchmark_name, profile)
+        println("\n=== Alloc profile: Ipopt (bilinear N=51, sample_rate=$sample_rate) ===")
+        println(" samples=$(profile.total_count) total≈$(profile.total_bytes) B")
+        println(" saved $path")
+        report_alloc_profile(profile; k_types = 10, k_leaves = 15, k_frames = 15)
+    end
+
+    # MadNLP
+    let prob = make_bilinear_problem(; N = 51, seed = 42)
+        profile = benchmark_memory!(
+            () -> DirectTrajOpt.solve!(
+                prob;
+                options = MadNLPOptions(max_iter = 30, print_level = 6),
+            );
+            package = "DirectTrajOpt",
+            solver = "MadNLP",
+            benchmark_name = "alloc_bilinear_N51_madnlp",
+            N = 51,
+            state_dim = pdims.state_dim,
+            control_dim = pdims.control_dim,
+            sample_rate = sample_rate,
+            warmup = false,
+            runner = runner,
+        )
+        path = save_alloc_profile(results_dir, profile.benchmark_name, profile)
+        println("\n=== Alloc profile: MadNLP (bilinear N=51, sample_rate=$sample_rate) ===")
+        println(" samples=$(profile.total_count) total≈$(profile.total_bytes) B")
+        println(" saved $path")
+        report_alloc_profile(profile; k_types = 10, k_leaves = 15, k_frames = 15)
+    end
+end