From 0dc263a70754e703de4810cfecfbddcd821850b8 Mon Sep 17 00:00:00 2001
From: Jack Champagne <jackchampagne.r@gmail.com>
Date: Wed, 15 Apr 2026 01:00:39 -0400
Subject: [PATCH 01/13] Add benchmark environment with evaluator and solver
 @testitems
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sets up benchmark/ with Project.toml, .gitignore, and three @testitem
benchmarks: evaluator micro-benchmarks, Ipopt vs MadNLP comparison, and
memory scaling sweep (N × state_dim grid).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmark/.gitignore    |   1 +
 benchmark/Project.toml  |  16 +++++
 benchmark/benchmarks.jl | 155 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 172 insertions(+)
 create mode 100644 benchmark/.gitignore
 create mode 100644 benchmark/Project.toml
 create mode 100644 benchmark/benchmarks.jl

diff --git a/benchmark/.gitignore b/benchmark/.gitignore
new file mode 100644
index 0000000..fbca225
--- /dev/null
+++ b/benchmark/.gitignore
@@ -0,0 +1 @@
+results/
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
new file mode 100644
index 0000000..9782442
--- /dev/null
+++ b/benchmark/Project.toml
@@ -0,0 +1,16 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+DirectTrajOpt = "c823fa1f-8872-4af5-b810-2b9b72bbbf56"
+ExponentialAction = "e24c0720-ea99-47e8-929e-571b494574d3"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+HarmoniqsBenchmarks = "f45d0b76-2d23-4568-9599-481e0da131db"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6"
+MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
+NamedTrajectories = "538bc3a1-5ab9-4fc3-b776-35ca1e893e08"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
+TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
new file mode 100644
index 0000000..a7ecc6f
--- /dev/null
+++ b/benchmark/benchmarks.jl
@@ -0,0 +1,155 @@
+using TestItems
+
+@testitem "Evaluator micro-benchmarks: bilinear N=51" begin
+    using HarmoniqsBenchmarks, BenchmarkTools, DirectTrajOpt, NamedTrajectories
+    using SparseArrays, ExponentialAction, MathOptInterface, Random, Dates, Printf
+    const MOI = MathOptInterface
+
+    Random.seed!(42)
+    N = 51; Δt = 0.1; u_bound = 0.1; ω = 0.1
+    Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
+    Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
+    Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
+    G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
+
+    traj = NamedTrajectory(
+        (x=2rand(4,N).-1, u=u_bound*(2rand(2,N).-1), du=randn(2,N), ddu=randn(2,N), Δt=fill(Δt,N));
+        controls=(:ddu,:Δt), timestep=:Δt, bounds=(u=u_bound, Δt=(0.01,0.5)),
+        initial=(x=[1.0,0.0,0.0,0.0], u=zeros(2)), final=(u=zeros(2),),
+        goal=(x=[0.0,1.0,0.0,0.0],),
+    )
+    integrators = [BilinearIntegrator(G,:x,:u,traj), DerivativeIntegrator(:u,:du,traj), DerivativeIntegrator(:du,:ddu,traj)]
+    J = QuadraticRegularizer(:u, traj, 1.0) + QuadraticRegularizer(:du, traj, 1.0)
+    prob = DirectTrajOptProblem(traj, J, integrators)
+
+    evaluator, Z_vec = build_evaluator(prob)
+    dims = evaluator_dims(evaluator)
+
+    g = zeros(dims.n_constraints)
+    grad = zeros(dims.n_variables)
+    H = zeros(dims.n_hessian_entries)
+    Jac = zeros(dims.n_jacobian_entries)
+    sigma = 1.0
+    mu = ones(dims.n_constraints)
+
+    benchmarks = Dict{Symbol,EvalBenchmark}(
+        :eval_objective => trial_to_eval_benchmark(@benchmark(MOI.eval_objective($evaluator, $Z_vec))),
+        :eval_gradient => trial_to_eval_benchmark(@benchmark(MOI.eval_objective_gradient($evaluator, $grad, $Z_vec))),
+        :eval_constraint => trial_to_eval_benchmark(@benchmark(MOI.eval_constraint($evaluator, $g, $Z_vec))),
+        :eval_jacobian => trial_to_eval_benchmark(@benchmark(MOI.eval_constraint_jacobian($evaluator, $Jac, $Z_vec))),
+        :eval_hessian_lagrangian => trial_to_eval_benchmark(@benchmark(MOI.eval_hessian_lagrangian($evaluator, $H, $Z_vec, $sigma, $mu))),
+    )
+
+    result = MicroBenchmarkResult(  # provenance record persisted alongside the timings
+        package="DirectTrajOpt", package_version=string(pkgversion(DirectTrajOpt)),  # read from the loaded package so it cannot go stale
+        commit=(try String(strip(read(`git -C $(@__DIR__) rev-parse --short HEAD`, String))) catch; "unknown" end),  # -C: query this file's repo regardless of cwd
+        benchmark_name="evaluator_micro_bilinear_N51", N=N, state_dim=4, control_dim=2,
+        eval_benchmarks=benchmarks, julia_version=string(VERSION),
+        timestamp=Dates.now(), runner=get(ENV, "BENCHMARK_RUNNER", "local"), n_threads=Threads.nthreads(),
+    )
+
+    println("\n=== Evaluator Micro-benchmarks (bilinear N=$N) ===")
+    for (name, eb) in sort(collect(result.eval_benchmarks), by=first)
+        @printf("  %-25s  median: %8.1f ns  allocs: %d  memory: %d bytes\n", name, eb.median_ns, eb.allocs, eb.memory_bytes)
+    end
+
+    results_dir = joinpath(@__DIR__, "results")
+    save_micro_results(results_dir, result.benchmark_name, result)
+    println("  Saved to $results_dir/")
+end
+
+@testitem "Ipopt vs MadNLP: bilinear N=51" begin
+    using HarmoniqsBenchmarks, DirectTrajOpt, NamedTrajectories
+    using SparseArrays, ExponentialAction, Random, Dates
+    import MadNLP
+
+    const MadNLPSolverExt = @something(Base.get_extension(DirectTrajOpt, :MadNLPSolverExt), error("DirectTrajOpt extension MadNLPSolverExt is not loaded"))  # public API; assumes the extension belongs to DirectTrajOpt
+
+    function make_bilinear_problem(; seed=42)
+        Random.seed!(seed)
+        N = 51; Δt = 0.1; u_bound = 0.1; ω = 0.1
+        Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
+        Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
+        Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
+        G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
+
+        traj = NamedTrajectory(
+            (x=2rand(4,N).-1, u=u_bound*(2rand(2,N).-1), du=randn(2,N), ddu=randn(2,N), Δt=fill(Δt,N));
+            controls=(:ddu,:Δt), timestep=:Δt, bounds=(u=u_bound, Δt=(0.01,0.5)),
+            initial=(x=[1.0,0.0,0.0,0.0], u=zeros(2)), final=(u=zeros(2),),
+            goal=(x=[0.0,1.0,0.0,0.0],),
+        )
+        integrators = [BilinearIntegrator(G,:x,:u,traj), DerivativeIntegrator(:u,:du,traj), DerivativeIntegrator(:du,:ddu,traj)]
+        J = QuadraticRegularizer(:u, traj, 1.0) + QuadraticRegularizer(:du, traj, 1.0)
+        return DirectTrajOptProblem(traj, J, integrators)
+    end
+
+    prob_ipopt = make_bilinear_problem()
+    result_ipopt = benchmark_solve!(prob_ipopt, IpoptOptions(max_iter=200, print_level=0); benchmark_name="bilinear_N51_ipopt")
+
+    prob_madnlp = make_bilinear_problem()
+    result_madnlp = benchmark_solve!(prob_madnlp, MadNLPSolverExt.MadNLPOptions(max_iter=200, print_level=1); benchmark_name="bilinear_N51_madnlp")
+
+    println("\n=== Ipopt vs MadNLP: bilinear N=51 ===")
+    println("  Ipopt:  $(round(result_ipopt.wall_time_s, digits=3))s, $(result_ipopt.total_allocations_bytes ÷ 1024) KB alloc")
+    println("  MadNLP: $(round(result_madnlp.wall_time_s, digits=3))s, $(result_madnlp.total_allocations_bytes ÷ 1024) KB alloc")
+
+    results_dir = joinpath(@__DIR__, "results")
+    save_results(results_dir, "ipopt_vs_madnlp_N51", [result_ipopt, result_madnlp])
+end
+
+@testitem "Memory scaling: N and state_dim sweep" begin
+    using HarmoniqsBenchmarks, DirectTrajOpt, NamedTrajectories
+    using SparseArrays, ExponentialAction, Random, Dates, Printf
+    import MadNLP
+
+    const MadNLPSolverExt = @something(Base.get_extension(DirectTrajOpt, :MadNLPSolverExt), error("DirectTrajOpt extension MadNLPSolverExt is not loaded"))  # public API; assumes the extension belongs to DirectTrajOpt
+
+    function make_scaled_problem(; N, state_dim, n_controls=2, seed=42)
+        Random.seed!(seed)
+        G_drift = sparse(randn(state_dim, state_dim))
+        G_drives = [sparse(randn(state_dim, state_dim)) for _ in 1:n_controls]
+        G(u) = G_drift + sum(u[i] * G_drives[i] for i in 1:n_controls)
+
+        x_init = zeros(state_dim); x_init[1] = 1.0
+        x_goal = zeros(state_dim); x_goal[min(2,state_dim)] = 1.0
+
+        traj = NamedTrajectory(
+            (x=randn(state_dim,N), u=0.1*randn(n_controls,N), du=randn(n_controls,N), Δt=fill(0.1,N));
+            controls=(:du,:Δt), timestep=:Δt, bounds=(u=1.0, Δt=(0.01,0.5)),
+            initial=(x=x_init, u=zeros(n_controls)), final=(u=zeros(n_controls),),
+            goal=(x=x_goal,),
+        )
+        integrators = [BilinearIntegrator(G,:x,:u,traj), DerivativeIntegrator(:u,:du,traj)]
+        J = QuadraticRegularizer(:u, traj, 1.0)
+        return DirectTrajOptProblem(traj, J, integrators)
+    end
+
+    N_values = [25, 51, 101]
+    dim_values = [4, 8, 16]
+    results = BenchmarkResult[]
+
+    println("\n=== Memory Scaling Study ===")
+    @printf("  %5s | %5s | %12s | %12s | %12s | %12s\n", "N", "dim", "Ipopt (s)", "Ipopt (KB)", "MadNLP (s)", "MadNLP (KB)")
+    @printf("  %5s-+-%5s-+-%12s-+-%12s-+-%12s-+-%12s\n", "-"^5, "-"^5, "-"^12, "-"^12, "-"^12, "-"^12)
+
+    for N in N_values
+        for dim in dim_values
+            prob = make_scaled_problem(; N=N, state_dim=dim)
+            r_ipopt = benchmark_solve!(prob, IpoptOptions(max_iter=50, print_level=0); benchmark_name="scaling_N$(N)_d$(dim)_ipopt")
+            push!(results, r_ipopt)
+
+            prob = make_scaled_problem(; N=N, state_dim=dim)
+            r_madnlp = benchmark_solve!(prob, MadNLPSolverExt.MadNLPOptions(max_iter=50, print_level=1); benchmark_name="scaling_N$(N)_d$(dim)_madnlp")
+            push!(results, r_madnlp)
+
+            @printf("  %5d | %5d | %12.3f | %12d | %12.3f | %12d\n",
+                N, dim, r_ipopt.wall_time_s, r_ipopt.total_allocations_bytes ÷ 1024,
+                r_madnlp.wall_time_s, r_madnlp.total_allocations_bytes ÷ 1024)
+        end
+    end
+
+    results_dir = joinpath(@__DIR__, "results")
+    save_results(results_dir, "memory_scaling", results)
+    println("\n  Saved $(length(results)) results to $results_dir/")
+end

From 354cabc0f3ad4e0a50befebedce7b1536a77db63 Mon Sep 17 00:00:00 2001
From: Jack Champagne <jackchampagne.r@gmail.com>
Date: Wed, 15 Apr 2026 16:35:31 -0400
Subject: [PATCH 02/13] docs: add benchmarking spec and implementation plan

Specs for:
- Overall HarmoniqsBenchmarks.jl architecture
- Altissimo GPU benchmarks (3-way: Ipopt CPU / MadNLP-GPU / Altissimo-GPU)
- Implementation plan for DirectTrajOpt + HarmoniqsBenchmarks tasks
---
 .../2026-04-15-benchmarking-infrastructure.md | 1620 +++++++++++++++++
 ...6-04-15-altissimo-gpu-benchmarks-design.md |  198 ++
 .../specs/2026-04-15-benchmarking-design.md   |  383 ++++
 3 files changed, 2201 insertions(+)
 create mode 100644 docs/superpowers/plans/2026-04-15-benchmarking-infrastructure.md
 create mode 100644 docs/superpowers/specs/2026-04-15-altissimo-gpu-benchmarks-design.md
 create mode 100644 docs/superpowers/specs/2026-04-15-benchmarking-design.md

diff --git a/docs/superpowers/plans/2026-04-15-benchmarking-infrastructure.md b/docs/superpowers/plans/2026-04-15-benchmarking-infrastructure.md
new file mode 100644
index 0000000..99dee3f
--- /dev/null
+++ b/docs/superpowers/plans/2026-04-15-benchmarking-infrastructure.md
@@ -0,0 +1,1620 @@
+# HarmoniqsBenchmarks.jl + DirectTrajOpt Benchmark Suite — Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Create a shared benchmarking package (`HarmoniqsBenchmarks.jl`) and wire up the first benchmark suite in DirectTrajOpt.jl comparing Ipopt vs MadNLP, with micro-benchmarks, full-solve benchmarks, and memory scaling studies.
+
+**Architecture:** HarmoniqsBenchmarks.jl provides schema types, a profiling harness, and JLD2 storage/comparison. DirectTrajOpt.jl's `benchmark/` directory contains `@testitem`-based benchmarks that use the shared harness. Both Ipopt and MadNLP benchmarks use the same shared `Evaluator` (in `src/solvers/evaluator.jl`), so micro-benchmarks are solver-agnostic while macro-benchmarks compare the two solver backends.
+
+**Tech Stack:** Julia 1.10+ (matching the `[compat]` entry in Task 1), BenchmarkTools.jl, JLD2.jl, TestItems/TestItemRunner, MathOptInterface
+
+**Spec:** `docs/superpowers/specs/2026-04-15-benchmarking-design.md`
+
+---
+
+## File Structure
+
+### New repo: `HarmoniqsBenchmarks.jl` (at `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/`)
+
+| File | Responsibility |
+|------|---------------|
+| `Project.toml` | Package metadata + deps (BenchmarkTools, JLD2, Dates, DirectTrajOpt, MathOptInterface, NamedTrajectories) |
+| `src/HarmoniqsBenchmarks.jl` | Module definition + exports |
+| `src/schema.jl` | `BenchmarkResult`, `MicroBenchmarkResult`, `EvalBenchmark` structs |
+| `src/harness.jl` | `build_evaluator`, `benchmark_solve!`, GC/allocation capture |
+| `src/storage.jl` | `save_results`, `save_micro_results`, `load_results`, `load_micro_results` |
+| `src/report.jl` | `compare_results` — diff tables + regression flagging |
+| `test/runtests.jl` | Tests for all of the above |
+
+### Modified repo: `DirectTrajOpt.jl` (benchmark directory)
+
+| File | Responsibility |
+|------|---------------|
+| `benchmark/Project.toml` | Benchmark env deps (HarmoniqsBenchmarks, BenchmarkTools, TestItems, MadNLP) |
+| `benchmark/benchmarks.jl` | `@testitem` definitions: micro, macro, scaling |
+| `benchmark/.gitignore` | Ignore `results/` directory |
+
+---
+
+## Task 1: Create HarmoniqsBenchmarks.jl Project Skeleton
+
+**Files:**
+- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/Project.toml`
+- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/HarmoniqsBenchmarks.jl`
+
+- [ ] **Step 1: Initialize the package directory**
+
+```bash
+mkdir -p /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src
+mkdir -p /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+git init
+```
+
+- [ ] **Step 2: Create Project.toml**
+
+```toml
+name = "HarmoniqsBenchmarks"
+uuid = "GENERATE_UUID"
+version = "0.1.0"
+authors = ["harmoniqs contributors"]
+
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+DirectTrajOpt = "c823fa1f-8872-4af5-b810-2b9b72bbbf56"
+JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
+MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
+NamedTrajectories = "538bc3a1-5ab9-4fc3-b776-35ca1e893e08"
+
+[compat]
+BenchmarkTools = "1.6"
+Dates = "1.10, 1.11, 1.12"
+DirectTrajOpt = "0.8"
+JLD2 = "0.5"
+MathOptInterface = "1.49"
+NamedTrajectories = "0.8"
+julia = "1.10, 1.11, 1.12"
+```
+
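+Generate the UUID with `using UUIDs; uuid4()` only if the package has no UUID yet. DirectTrajOpt's `benchmark/Project.toml` already pins `HarmoniqsBenchmarks = "f45d0b76-2d23-4568-9599-481e0da131db"`, and the `uuid` here must match that entry or the benchmark environment will not resolve.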
+
+- [ ] **Step 3: Create module stub**
+
+```julia
+# src/HarmoniqsBenchmarks.jl
+module HarmoniqsBenchmarks
+
+end
+```
+
+- [ ] **Step 4: Dev-install dependencies and verify the package loads**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e '
+    using Pkg
+    Pkg.develop(path="../DirectTrajOpt.jl")
+    Pkg.develop(path="../NamedTrajectories.jl")
+    Pkg.instantiate()
+    using HarmoniqsBenchmarks
+    println("Package loads OK")
+'
+```
+
+Expected: "Package loads OK"
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add Project.toml src/HarmoniqsBenchmarks.jl
+git commit -m "feat: initialize HarmoniqsBenchmarks.jl package skeleton"
+```
+
+---
+
+## Task 2: Implement Schema Types
+
+**Files:**
+- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/schema.jl`
+- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/HarmoniqsBenchmarks.jl`
+- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test/runtests.jl`
+
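+- [ ] **Step 1: Write tests for schema types** (first declare `Test` as a test-only dependency in `Project.toml` via `[extras]` and `[targets] test = ["Test"]`; otherwise `Pkg.test()` fails on `using Test` before it ever reaches `EvalBenchmark`)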
+
+```julia
+# test/runtests.jl
+using Test
+using HarmoniqsBenchmarks
+using Dates
+
+@testset "HarmoniqsBenchmarks" begin
+
+@testset "Schema" begin
+    @testset "EvalBenchmark construction" begin
+        eb = EvalBenchmark(
+            times_ns = [100.0, 110.0, 105.0],
+            gctimes_ns = [0.0, 0.0, 5.0],
+            memory_bytes = 1024,
+            allocs = 3,
+        )
+        @test eb.median_ns == 105.0
+        @test eb.min_ns == 100.0
+        @test 104.0 < eb.mean_ns < 106.0
+    end
+
+    @testset "BenchmarkResult construction" begin
+        r = BenchmarkResult(
+            package = "DirectTrajOpt",
+            package_version = "0.8.10",
+            commit = "abc1234",
+            benchmark_name = "test_bench",
+            N = 51,
+            state_dim = 4,
+            control_dim = 2,
+            n_constraints = 200,
+            n_variables = 765,
+            wall_time_s = 1.5,
+            iterations = 42,
+            objective_value = 0.001,
+            constraint_violation = 1e-8,
+            solver_status = :Optimal,
+            solver = "ipopt",
+            total_allocations_bytes = 1_000_000,
+            total_allocs_count = 500,
+            gc_time_ns = 10_000,
+            gc_count = 2,
+            gc_full_count = 0,
+            solver_options = Dict{Symbol,Any}(:tol => 1e-8, :max_iter => 1000),
+            julia_version = string(VERSION),
+            timestamp = now(),
+            runner = "local",
+            n_threads = 1,
+        )
+        @test r.package == "DirectTrajOpt"
+        @test r.solver_status == :Optimal
+    end
+
+    @testset "MicroBenchmarkResult construction" begin
+        eb = EvalBenchmark(
+            times_ns = [100.0],
+            gctimes_ns = [0.0],
+            memory_bytes = 0,
+            allocs = 0,
+        )
+        mr = MicroBenchmarkResult(
+            package = "DirectTrajOpt",
+            package_version = "0.8.10",
+            commit = "abc1234",
+            benchmark_name = "micro_test",
+            N = 51,
+            state_dim = 4,
+            control_dim = 2,
+            eval_benchmarks = Dict{Symbol,EvalBenchmark}(
+                :eval_objective => eb,
+            ),
+            julia_version = string(VERSION),
+            timestamp = now(),
+            runner = "local",
+            n_threads = 1,
+        )
+        @test mr.eval_benchmarks[:eval_objective].min_ns == 100.0
+    end
+end
+
+end # HarmoniqsBenchmarks testset
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+Expected: FAIL — `EvalBenchmark` not defined
+
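+- [ ] **Step 3: Implement schema types** (add the `Statistics` stdlib to `[deps]` first, e.g. `Pkg.add("Statistics")`; `schema.jl` imports `median` and `mean` from it, and a package cannot load a dependency it does not declare)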
+
+```julia
+# src/schema.jl
+using Dates
+using Statistics: median, mean
+
+struct EvalBenchmark
+    times_ns::Vector{Float64}
+    gctimes_ns::Vector{Float64}
+    memory_bytes::Int
+    allocs::Int
+    # Derived stats (computed at construction)
+    median_ns::Float64
+    min_ns::Float64
+    mean_ns::Float64
+end
+
+function EvalBenchmark(;
+    times_ns::Vector{Float64},
+    gctimes_ns::Vector{Float64},
+    memory_bytes::Int,
+    allocs::Int,
+)
+    return EvalBenchmark(
+        times_ns,
+        gctimes_ns,
+        memory_bytes,
+        allocs,
+        median(times_ns),
+        minimum(times_ns),
+        mean(times_ns),
+    )
+end
+
+struct BenchmarkResult
+    # Identity
+    package::String
+    package_version::String
+    commit::String
+    benchmark_name::String
+    # Problem dimensions
+    N::Int
+    state_dim::Int
+    control_dim::Int
+    n_constraints::Int
+    n_variables::Int
+    # Solve metrics
+    wall_time_s::Float64
+    iterations::Int
+    objective_value::Float64
+    constraint_violation::Float64
+    solver_status::Symbol
+    solver::String
+    # Memory & allocations
+    total_allocations_bytes::Int
+    total_allocs_count::Int
+    gc_time_ns::Int
+    gc_count::Int
+    gc_full_count::Int
+    # Solver options snapshot
+    solver_options::Dict{Symbol,Any}
+    # Metadata
+    julia_version::String
+    timestamp::DateTime
+    runner::String
+    n_threads::Int
+end
+
+struct MicroBenchmarkResult
+    package::String
+    package_version::String
+    commit::String
+    benchmark_name::String
+    N::Int
+    state_dim::Int
+    control_dim::Int
+    eval_benchmarks::Dict{Symbol,EvalBenchmark}
+    julia_version::String
+    timestamp::DateTime
+    runner::String
+    n_threads::Int
+end
+```
+
+- [ ] **Step 4: Update module to include schema and export types**
+
+```julia
+# src/HarmoniqsBenchmarks.jl
+module HarmoniqsBenchmarks
+
+export EvalBenchmark, BenchmarkResult, MicroBenchmarkResult
+
+include("schema.jl")
+
+end
+```
+
+- [ ] **Step 5: Run tests to verify they pass**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+Expected: All tests PASS
+
+- [ ] **Step 6: Commit**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+git add src/schema.jl src/HarmoniqsBenchmarks.jl test/runtests.jl
+git commit -m "feat: add BenchmarkResult, MicroBenchmarkResult, EvalBenchmark schema types"
+```
+
+---
+
+## Task 3: Implement JLD2 Storage
+
+**Files:**
+- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/storage.jl`
+- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/HarmoniqsBenchmarks.jl`
+- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test/runtests.jl`
+
+- [ ] **Step 1: Add storage tests**
+
+Append to `test/runtests.jl`, inside the top-level `@testset "HarmoniqsBenchmarks"`:
+
+```julia
+@testset "Storage" begin
+    mktempdir() do dir
+        r = BenchmarkResult(
+            package = "DirectTrajOpt",
+            package_version = "0.8.10",
+            commit = "abc1234",
+            benchmark_name = "storage_test",
+            N = 51, state_dim = 4, control_dim = 2,
+            n_constraints = 200, n_variables = 765,
+            wall_time_s = 1.5, iterations = 42,
+            objective_value = 0.001, constraint_violation = 1e-8,
+            solver_status = :Optimal, solver = "ipopt",
+            total_allocations_bytes = 1_000_000, total_allocs_count = 500,
+            gc_time_ns = 10_000, gc_count = 2, gc_full_count = 0,
+            solver_options = Dict{Symbol,Any}(:tol => 1e-8),
+            julia_version = string(VERSION),
+            timestamp = now(), runner = "local", n_threads = 1,
+        )
+
+        path = save_results(dir, "test_bench", [r])
+        @test isfile(path)
+        @test endswith(path, ".jld2")
+
+        loaded = load_results(path)
+        @test length(loaded) == 1
+        @test loaded[1].package == "DirectTrajOpt"
+        @test loaded[1].wall_time_s == 1.5
+        @test loaded[1].solver_options[:tol] == 1e-8
+    end
+
+    mktempdir() do dir
+        eb = EvalBenchmark(
+            times_ns = [100.0, 110.0],
+            gctimes_ns = [0.0, 0.0],
+            memory_bytes = 512, allocs = 1,
+        )
+        mr = MicroBenchmarkResult(
+            package = "DirectTrajOpt",
+            package_version = "0.8.10",
+            commit = "abc1234",
+            benchmark_name = "micro_storage_test",
+            N = 51, state_dim = 4, control_dim = 2,
+            eval_benchmarks = Dict(:eval_objective => eb),
+            julia_version = string(VERSION),
+            timestamp = now(), runner = "local", n_threads = 1,
+        )
+
+        path = save_micro_results(dir, "micro_test", mr)
+        @test isfile(path)
+
+        loaded = load_micro_results(path)
+        @test loaded.benchmark_name == "micro_storage_test"
+        @test loaded.eval_benchmarks[:eval_objective].min_ns == 100.0
+    end
+end
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+Expected: FAIL — `save_results` not defined
+
+- [ ] **Step 3: Implement storage functions**
+
+```julia
+# src/storage.jl
+using JLD2
+
+"""
+    save_results(dir, name, results::Vector{BenchmarkResult}) -> String
+
+Save benchmark results to a JLD2 file in `dir`. Returns the file path.
+"""
+function save_results(dir::String, name::String, results::Vector{BenchmarkResult})
+    mkpath(dir)
+    commit = isempty(results) ? "unknown" : results[1].commit
+    filename = "$(name)_$(commit).jld2"
+    path = joinpath(dir, filename)
+    JLD2.jldsave(path; results=results)
+    return path
+end
+
+"""
+    load_results(path) -> Vector{BenchmarkResult}
+
+Load benchmark results from a JLD2 file.
+"""
+function load_results(path::String)
+    return JLD2.load(path, "results")
+end
+
+"""
+    save_micro_results(dir, name, result::MicroBenchmarkResult) -> String
+
+Save micro-benchmark results to a JLD2 file in `dir`. Returns the file path.
+"""
+function save_micro_results(dir::String, name::String, result::MicroBenchmarkResult)
+    mkpath(dir)
+    filename = "$(name)_$(result.commit).jld2"
+    path = joinpath(dir, filename)
+    JLD2.jldsave(path; result=result)
+    return path
+end
+
+"""
+    load_micro_results(path) -> MicroBenchmarkResult
+
+Load micro-benchmark results from a JLD2 file.
+"""
+function load_micro_results(path::String)
+    return JLD2.load(path, "result")
+end
+```
+
+- [ ] **Step 4: Update module**
+
+```julia
+# src/HarmoniqsBenchmarks.jl
+module HarmoniqsBenchmarks
+
+export EvalBenchmark, BenchmarkResult, MicroBenchmarkResult
+export save_results, load_results, save_micro_results, load_micro_results
+
+include("schema.jl")
+include("storage.jl")
+
+end
+```
+
+- [ ] **Step 5: Run tests to verify they pass**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+Expected: All tests PASS
+
+- [ ] **Step 6: Commit**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+git add src/storage.jl src/HarmoniqsBenchmarks.jl test/runtests.jl
+git commit -m "feat: add JLD2 save/load for BenchmarkResult and MicroBenchmarkResult"
+```
+
+---
+
+## Task 4: Implement build_evaluator Harness
+
+**Files:**
+- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/harness.jl`
+- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/HarmoniqsBenchmarks.jl`
+- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test/runtests.jl`
+
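+- [ ] **Step 1: Add test for build_evaluator** (also add `SparseArrays` and `ExponentialAction` to the test target in `Project.toml`; the snippet below loads both and neither is in `[deps]`)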
+
+Append to `test/runtests.jl`, inside top-level testset:
+
+```julia
+@testset "Harness" begin
+    using DirectTrajOpt
+    using NamedTrajectories
+    using SparseArrays
+    using ExponentialAction
+    using MathOptInterface
+    MOI = MathOptInterface  # plain alias: a @testset body is local scope, where `const` is a syntax error
+
+    # Build a simple bilinear problem (same as DirectTrajOpt test_utils.jl)
+    N = 10; Δt = 0.1; u_bound = 0.1; ω = 0.1
+    Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
+    Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
+    Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
+    G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
+
+    traj = NamedTrajectory(
+        (
+            x = 2rand(4, N) .- 1,
+            u = u_bound * (2rand(2, N) .- 1),
+            du = randn(2, N),
+            ddu = randn(2, N),
+            Δt = fill(Δt, N),
+        );
+        controls = (:ddu, :Δt),
+        timestep = :Δt,
+        bounds = (u = u_bound, Δt = (0.01, 0.5)),
+        initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
+        final = (u = zeros(2),),
+        goal = (x = [0.0, 1.0, 0.0, 0.0],),
+    )
+
+    integrators = [
+        BilinearIntegrator(G, :x, :u, traj),
+        DerivativeIntegrator(:u, :du, traj),
+        DerivativeIntegrator(:du, :ddu, traj),
+    ]
+
+    J = QuadraticRegularizer(:u, traj, 1.0)
+    prob = DirectTrajOptProblem(traj, J, integrators)
+
+    @testset "build_evaluator returns evaluator and Z vector" begin
+        evaluator, Z_vec = build_evaluator(prob)
+        @test evaluator isa MOI.AbstractNLPEvaluator
+        @test length(Z_vec) == traj.dim * traj.N + traj.global_dim
+
+        # Verify eval functions are callable
+        obj = MOI.eval_objective(evaluator, Z_vec)
+        @test obj isa Float64
+        @test isfinite(obj)
+    end
+
+    @testset "evaluator_dims returns correct sizes" begin
+        evaluator, Z_vec = build_evaluator(prob)
+        dims = evaluator_dims(evaluator)
+        @test dims.n_constraints == evaluator.n_constraints
+        @test dims.n_variables == length(Z_vec)
+        @test dims.n_jacobian_entries == length(evaluator.jacobian_structure)
+        @test dims.n_hessian_entries == length(evaluator.hessian_structure)
+    end
+end
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+Expected: FAIL — `build_evaluator` not defined
+
+- [ ] **Step 3: Implement build_evaluator and evaluator_dims**
+
+```julia
+# src/harness.jl
+using DirectTrajOpt
+using NamedTrajectories
+using MathOptInterface
+const MOI = MathOptInterface
+
+"""
+    build_evaluator(prob::DirectTrajOptProblem; eval_hessian=true) -> (evaluator, Z_vec)
+
+Extract a MOI evaluator and the initial decision variable vector from a
+DirectTrajOptProblem. Used for micro-benchmarking individual eval functions.
+
+Returns:
+- `evaluator`: An `MOI.AbstractNLPEvaluator` ready for `MOI.eval_*` calls
+- `Z_vec`: The flat decision variable vector `[trajectory_data; global_data]`
+"""
+function build_evaluator(prob::DirectTrajOpt.Problems.DirectTrajOptProblem; eval_hessian::Bool=true)
+    evaluator = DirectTrajOpt.Solvers.Evaluator(prob; eval_hessian=eval_hessian, verbose=false)
+    traj = prob.trajectory
+    Z_vec = vcat(collect(traj.datavec), collect(traj.global_data))
+    return evaluator, Z_vec
+end
+
+"""
+    evaluator_dims(evaluator) -> NamedTuple
+
+Return key dimensions of the evaluator for buffer pre-allocation.
+"""
+function evaluator_dims(evaluator::DirectTrajOpt.Solvers.Evaluator)
+    return (
+        n_constraints = evaluator.n_constraints,
+        n_variables = evaluator.trajectory.dim * evaluator.trajectory.N + evaluator.trajectory.global_dim,
+        n_jacobian_entries = length(evaluator.jacobian_structure),
+        n_hessian_entries = length(evaluator.hessian_structure),
+    )
+end
+```
+
+- [ ] **Step 4: Update module**
+
+```julia
+# src/HarmoniqsBenchmarks.jl
+module HarmoniqsBenchmarks
+
+export EvalBenchmark, BenchmarkResult, MicroBenchmarkResult
+export save_results, load_results, save_micro_results, load_micro_results
+export build_evaluator, evaluator_dims
+
+include("schema.jl")
+include("storage.jl")
+include("harness.jl")
+
+end
+```
+
+- [ ] **Step 5: Run tests to verify they pass**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+Expected: All tests PASS
+
+- [ ] **Step 6: Commit**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+git add src/harness.jl src/HarmoniqsBenchmarks.jl test/runtests.jl
+git commit -m "feat: add build_evaluator and evaluator_dims harness functions"
+```
+
+---
+
+## Task 5: Implement benchmark_solve! Harness
+
+**Files:**
+- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/harness.jl`
+- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test/runtests.jl`
+
+- [ ] **Step 1: Add test for benchmark_solve!**
+
+Append inside the `@testset "Harness"` block in `test/runtests.jl`:
+
+```julia
+@testset "benchmark_solve! captures metrics" begin
+    # Build a fresh problem: solve! mutates the problem in place, so it must not be shared
+    traj2 = NamedTrajectory(
+        (
+            x = 2rand(4, N) .- 1,
+            u = u_bound * (2rand(2, N) .- 1),
+            du = randn(2, N),
+            ddu = randn(2, N),
+            Δt = fill(Δt, N),
+        );
+        controls = (:ddu, :Δt),
+        timestep = :Δt,
+        bounds = (u = u_bound, Δt = (0.01, 0.5)),
+        initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
+        final = (u = zeros(2),),
+        goal = (x = [0.0, 1.0, 0.0, 0.0],),
+    )
+    integrators2 = [
+        BilinearIntegrator(G, :x, :u, traj2),
+        DerivativeIntegrator(:u, :du, traj2),
+        DerivativeIntegrator(:du, :ddu, traj2),
+    ]
+    J2 = QuadraticRegularizer(:u, traj2, 1.0)
+    prob2 = DirectTrajOptProblem(traj2, J2, integrators2)
+
+    result = benchmark_solve!(
+        prob2, IpoptOptions(max_iter=10, print_level=0);
+        benchmark_name = "test_solve",
+    )
+
+    @test result isa BenchmarkResult
+    @test result.package == "DirectTrajOpt"
+    @test result.solver == "ipopt"
+    @test result.wall_time_s > 0.0
+    @test result.iterations >= -1  # -1 = sentinel: benchmark_solve! does not extract iterations yet
+    @test result.total_allocations_bytes >= 0
+    @test result.gc_count >= 0
+    @test result.N == N
+    @test result.state_dim == 4
+    @test haskey(result.solver_options, :max_iter)
+    @test result.solver_options[:max_iter] == 10
+end
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+Expected: FAIL — `benchmark_solve!` not defined
+
+- [ ] **Step 3: Implement benchmark_solve!**
+
+Append to `src/harness.jl`:
+
+```julia
+using Dates
+
+"""
+    benchmark_solve!(prob, options; benchmark_name="unnamed", runner="local", verbose=false, kwargs...) -> BenchmarkResult
+
+Run `solve!(prob; options, verbose, kwargs...)` once (mutating `prob`) and record wall time, allocated bytes, GC stats, and a snapshot of every field of `options`.
+"""
+function benchmark_solve!(
+    prob::DirectTrajOpt.Problems.DirectTrajOptProblem,
+    options::DirectTrajOpt.Solvers.AbstractSolverOptions;
+    benchmark_name::String = "unnamed",
+    runner::String = "local",
+    verbose::Bool = false,
+    kwargs...,
+)
+    traj = prob.trajectory
+
+    # Capture problem dimensions before solve
+    n_vars = traj.dim * traj.N + traj.global_dim
+    state_dim = _infer_state_dim(prob)
+    control_dim = _infer_control_dim(prob)
+    n_constraints_total = _count_constraints(prob, options)
+
+    # Snapshot solver options
+    opts_snapshot = Dict{Symbol,Any}()
+    for name in fieldnames(typeof(options))
+        opts_snapshot[name] = getfield(options, name)
+    end
+
+    # GC baseline
+    GC.gc()
+    gc_before = Base.gc_num()
+
+    # Timed solve
+    timed = @timed solve!(prob; options=options, verbose=verbose, kwargs...)
+
+    gc_after = Base.gc_num()
+
+    # GC pause / full-sweep counts are deltas of the gc_num snapshots taken around the solve.
+    # GC time comes from `timed.gctime` (seconds) and is converted to ns in the result below.
+    gc_count_delta = gc_after.pause - gc_before.pause
+    gc_full_delta = gc_after.full_sweep - gc_before.full_sweep
+
+    # Provenance: package version (looked up via Pkg.dependencies()) and current git commit
+    pkg_version = _get_package_version("DirectTrajOpt")
+    commit = _get_git_commit()
+
+    return BenchmarkResult(
+        package = "DirectTrajOpt",
+        package_version = pkg_version,
+        commit = commit,
+        benchmark_name = benchmark_name,
+        N = traj.N,
+        state_dim = state_dim,
+        control_dim = control_dim,
+        n_constraints = n_constraints_total,
+        n_variables = n_vars,
+        wall_time_s = timed.time,
+        iterations = -1,  # TODO: extract from solver output when available
+        objective_value = NaN,  # TODO: extract from solver
+        constraint_violation = NaN,
+        solver_status = :Unknown,
+        solver = _solver_name(options),
+        total_allocations_bytes = timed.bytes,
+        total_allocs_count = -1,  # TODO: not captured yet; -1 is a sentinel (only bytes are recorded)
+        gc_time_ns = round(Int, timed.gctime * 1e9),
+        gc_count = gc_count_delta,
+        gc_full_count = gc_full_delta,
+        solver_options = opts_snapshot,
+        julia_version = string(VERSION),
+        timestamp = now(),
+        runner = runner,
+        n_threads = Threads.nthreads(),
+    )
+end
+
+# --- helpers ---
+
+function _solver_name(options::DirectTrajOpt.Solvers.AbstractSolverOptions)
+    name = string(typeof(options).name.name)
+    if occursin("Ipopt", name)
+        return "ipopt"
+    elseif occursin("MadNLP", name)
+        return "madnlp"
+    else
+        return lowercase(name)
+    end
+end
+
+function _infer_state_dim(prob)
+    traj = prob.trajectory
+    # Heuristic: return the dimension of the first conventionally named state component found
+    for name in [:x, :ψ̃, :Ũ⃗, :ρ̃]
+        if haskey(traj.dims, name)
+            return traj.dims[name]
+        end
+    end
+    # Fallback: first entry of `traj.dims` as-is (NOTE: not filtered, so it may be a control)
+    return first(values(traj.dims))
+end
+
+function _infer_control_dim(prob)
+    traj = prob.trajectory
+    total = 0
+    for name in traj.control_names
+        if name != traj.timestep_name
+            total += traj.dims[name]
+        end
+    end
+    return total
+end
+
+function _count_constraints(prob, options)
+    n_dynamics = sum(integrator.dim for integrator in prob.integrators; init=0)
+    n_nonlinear = sum(
+        c.dim for c in prob.constraints
+        if c isa DirectTrajOpt.Constraints.AbstractNonlinearConstraint;
+        init=0
+    )
+    return n_dynamics * (prob.trajectory.N - 1) + n_nonlinear
+end
+
+function _get_package_version(pkg_name::String)
+    try
+        deps = Pkg.dependencies()
+        for (_, info) in deps
+            if info.name == pkg_name
+                return string(info.version)
+            end
+        end
+    catch
+    end
+    return "unknown"
+end
+
+function _get_git_commit()
+    try
+        return strip(read(`git rev-parse --short HEAD`, String))
+    catch
+        return "unknown"
+    end
+end
+```
+
+- [ ] **Step 4: Add `Pkg` import to harness.jl**
+
+Add at the top of `src/harness.jl`:
+
+```julia
+import Pkg
+```
+
+- [ ] **Step 5: Update module exports**
+
+In `src/HarmoniqsBenchmarks.jl`, add to exports:
+
+```julia
+export benchmark_solve!
+```
+
+- [ ] **Step 6: Run tests to verify they pass**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+Expected: All tests PASS
+
+- [ ] **Step 7: Commit**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+git add src/harness.jl src/HarmoniqsBenchmarks.jl test/runtests.jl
+git commit -m "feat: add benchmark_solve! harness with GC stats and options snapshot"
+```
+
+---
+
+## Task 6: Implement BenchmarkTools→EvalBenchmark Conversion
+
+**Files:**
+- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/harness.jl`
+- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test/runtests.jl`
+
+- [ ] **Step 1: Add test for trial_to_eval_benchmark**
+
+Append inside `@testset "Harness"`:
+
+```julia
+@testset "trial_to_eval_benchmark extracts data from BenchmarkTools.Trial" begin
+    using BenchmarkTools
+    trial = @benchmark 1 + 1
+    eb = trial_to_eval_benchmark(trial)
+    @test eb isa EvalBenchmark
+    @test length(eb.times_ns) > 0
+    @test eb.min_ns > 0.0
+    @test eb.memory_bytes >= 0
+    @test eb.allocs >= 0
+end
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+Expected: FAIL — `trial_to_eval_benchmark` not defined
+
+- [ ] **Step 3: Implement trial_to_eval_benchmark**
+
+Append to `src/harness.jl`:
+
+```julia
+using BenchmarkTools
+
+"""
+    trial_to_eval_benchmark(trial::BenchmarkTools.Trial) -> EvalBenchmark
+
+Convert a BenchmarkTools.Trial to an EvalBenchmark, extracting raw timing data.
+"""
+function trial_to_eval_benchmark(trial::BenchmarkTools.Trial)
+    return EvalBenchmark(
+        times_ns = Float64.(trial.times),
+        gctimes_ns = Float64.(trial.gctimes),
+        memory_bytes = trial.memory,
+        allocs = trial.allocs,
+    )
+end
+```
+
+- [ ] **Step 4: Export the function**
+
+Add `trial_to_eval_benchmark` to exports in `src/HarmoniqsBenchmarks.jl`.
+
+- [ ] **Step 5: Run tests to verify they pass**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+Expected: All tests PASS
+
+- [ ] **Step 6: Commit**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+git add src/harness.jl src/HarmoniqsBenchmarks.jl test/runtests.jl
+git commit -m "feat: add trial_to_eval_benchmark for BenchmarkTools integration"
+```
+
+---
+
+## Task 7: Implement compare_results Reporter
+
+**Files:**
+- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/report.jl`
+- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/HarmoniqsBenchmarks.jl`
+- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test/runtests.jl`
+
+- [ ] **Step 1: Add test for compare_results**
+
+Append to `test/runtests.jl`, inside top-level testset:
+
+```julia
+@testset "Report" begin
+    @testset "compare_results detects regressions" begin
+        baseline = BenchmarkResult(
+            package="DirectTrajOpt", package_version="0.8.9",
+            commit="aaa1111", benchmark_name="test",
+            N=51, state_dim=4, control_dim=2,
+            n_constraints=200, n_variables=765,
+            wall_time_s=1.0, iterations=50,
+            objective_value=0.001, constraint_violation=1e-8,
+            solver_status=:Optimal, solver="ipopt",
+            total_allocations_bytes=1_000_000, total_allocs_count=500,
+            gc_time_ns=10_000, gc_count=2, gc_full_count=0,
+            solver_options=Dict{Symbol,Any}(),
+            julia_version=string(VERSION), timestamp=now(),
+            runner="local", n_threads=1,
+        )
+
+        # 20% regression in wall time
+        current = BenchmarkResult(
+            package="DirectTrajOpt", package_version="0.8.10",
+            commit="bbb2222", benchmark_name="test",
+            N=51, state_dim=4, control_dim=2,
+            n_constraints=200, n_variables=765,
+            wall_time_s=1.2, iterations=50,
+            objective_value=0.001, constraint_violation=1e-8,
+            solver_status=:Optimal, solver="ipopt",
+            total_allocations_bytes=900_000, total_allocs_count=450,
+            gc_time_ns=10_000, gc_count=2, gc_full_count=0,
+            solver_options=Dict{Symbol,Any}(),
+            julia_version=string(VERSION), timestamp=now(),
+            runner="local", n_threads=1,
+        )
+
+        comparison = compare_results([baseline], [current])
+        @test length(comparison) == 1
+        row = comparison[1]
+        @test row.benchmark_name == "test"
+        @test row.wall_time_pct_change > 15.0  # 20% regression
+        @test row.alloc_bytes_pct_change < 0.0  # 10% improvement
+        @test row.has_regression == true         # wall time regressed >10%
+    end
+end
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+Expected: FAIL — `compare_results` not defined
+
+- [ ] **Step 3: Implement compare_results**
+
+```julia
+# src/report.jl
+
+struct ComparisonRow
+    benchmark_name::String
+    solver::String
+    N::Int
+    state_dim::Int
+    # Wall time
+    baseline_wall_s::Float64
+    current_wall_s::Float64
+    wall_time_pct_change::Float64
+    # Allocations
+    baseline_alloc_bytes::Int
+    current_alloc_bytes::Int
+    alloc_bytes_pct_change::Float64
+    # Regression flag
+    has_regression::Bool
+end
+
+"""
+    compare_results(baseline, current; regression_threshold=10.0) -> Vector{ComparisonRow}
+
+Compare two sets of BenchmarkResults by matching on `benchmark_name`.
+Returns comparison rows with percent changes and regression flags.
+
+A regression is flagged when wall_time or allocations increase by more than
+`regression_threshold` percent.
+"""
+function compare_results(
+    baseline::Vector{BenchmarkResult},
+    current::Vector{BenchmarkResult};
+    regression_threshold::Float64 = 10.0,
+)
+    baseline_by_name = Dict(r.benchmark_name => r for r in baseline)
+    rows = ComparisonRow[]
+
+    for r in current
+        b = get(baseline_by_name, r.benchmark_name, nothing)
+        isnothing(b) && continue
+
+        wall_pct = _pct_change(b.wall_time_s, r.wall_time_s)
+        alloc_pct = _pct_change(Float64(b.total_allocations_bytes), Float64(r.total_allocations_bytes))
+        has_regression = wall_pct > regression_threshold || alloc_pct > regression_threshold
+
+        push!(rows, ComparisonRow(
+            r.benchmark_name, r.solver, r.N, r.state_dim,
+            b.wall_time_s, r.wall_time_s, wall_pct,
+            b.total_allocations_bytes, r.total_allocations_bytes, alloc_pct,
+            has_regression,
+        ))
+    end
+
+    return rows
+end
+
+function _pct_change(old::Float64, new::Float64)
+    old == 0.0 && return new == 0.0 ? 0.0 : 100.0
+    return (new - old) / abs(old) * 100.0
+end
+```
+
+- [ ] **Step 4: Update module**
+
+Add exports to `src/HarmoniqsBenchmarks.jl`:
+
+```julia
+export compare_results, ComparisonRow
+```
+
+And add the include:
+
+```julia
+include("report.jl")
+```
+
+- [ ] **Step 5: Run tests to verify they pass**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+Expected: All tests PASS
+
+- [ ] **Step 6: Commit**
+
+```bash
+cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
+git add src/report.jl src/HarmoniqsBenchmarks.jl test/runtests.jl
+git commit -m "feat: add compare_results reporter with regression detection"
+```
+
+---
+
+## Task 8: Set Up DirectTrajOpt.jl Benchmark Environment
+
+**Files:**
+- Create: `/home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/Project.toml`
+- Create: `/home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/.gitignore`
+- Create: `/home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/benchmarks.jl`
+
+- [ ] **Step 1: Create benchmark directory**
+
+```bash
+mkdir -p /home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/results
+```
+
+- [ ] **Step 2: Create .gitignore**
+
+```
+# benchmark/.gitignore
+results/
+```
+
+- [ ] **Step 3: Create benchmark/Project.toml**
+
+```toml
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+DirectTrajOpt = "c823fa1f-8872-4af5-b810-2b9b72bbbf56"
+ExponentialAction = "e24c0720-ea99-47e8-929e-571b494574d3"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+HarmoniqsBenchmarks = "INSERT_UUID"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6"
+MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
+NamedTrajectories = "538bc3a1-5ab9-4fc3-b776-35ca1e893e08"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
+TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
+```
+
+Replace `INSERT_UUID` with the UUID generated in Task 1.
+
+- [ ] **Step 4: Instantiate the benchmark environment**
+
+```bash
+cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
+julia --project=benchmark -e '
+    using Pkg
+    Pkg.develop(path=".")
+    Pkg.develop(path="../HarmoniqsBenchmarks.jl")
+    Pkg.develop(path="../NamedTrajectories.jl")
+    Pkg.instantiate()
+    using HarmoniqsBenchmarks
+    println("Benchmark env OK")
+'
+```
+
+Expected: "Benchmark env OK"
+
+- [ ] **Step 5: Create benchmarks.jl stub**
+
+```julia
+# benchmark/benchmarks.jl
+using TestItems
+```
+
+- [ ] **Step 6: Commit**
+
+```bash
+cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
+git add benchmark/Project.toml benchmark/.gitignore benchmark/benchmarks.jl
+git commit -m "feat: add benchmark/ environment for HarmoniqsBenchmarks integration"
+```
+
+---
+
+## Task 9: Write Evaluator Micro-benchmarks
+
+**Files:**
+- Modify: `/home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/benchmarks.jl`
+
+- [ ] **Step 1: Write the micro-benchmark @testitem**
+
+```julia
+# benchmark/benchmarks.jl
+using TestItems
+
+@testitem "Evaluator micro-benchmarks: bilinear N=51" begin
+    using HarmoniqsBenchmarks
+    using BenchmarkTools
+    using DirectTrajOpt
+    using NamedTrajectories
+    using SparseArrays
+    using ExponentialAction
+    using MathOptInterface
+    const MOI = MathOptInterface
+    using Dates, Printf, Random
+
+    # Build a deterministic bilinear problem
+    Random.seed!(42)
+    N = 51; Δt = 0.1; u_bound = 0.1; ω = 0.1
+    Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
+    Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
+    Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
+    G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
+
+    traj = NamedTrajectory(
+        (
+            x = 2rand(4, N) .- 1,
+            u = u_bound * (2rand(2, N) .- 1),
+            du = randn(2, N),
+            ddu = randn(2, N),
+            Δt = fill(Δt, N),
+        );
+        controls = (:ddu, :Δt),
+        timestep = :Δt,
+        bounds = (u = u_bound, Δt = (0.01, 0.5)),
+        initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
+        final = (u = zeros(2),),
+        goal = (x = [0.0, 1.0, 0.0, 0.0],),
+    )
+
+    integrators = [
+        BilinearIntegrator(G, :x, :u, traj),
+        DerivativeIntegrator(:u, :du, traj),
+        DerivativeIntegrator(:du, :ddu, traj),
+    ]
+    J = QuadraticRegularizer(:u, traj, 1.0) + QuadraticRegularizer(:du, traj, 1.0)
+    prob = DirectTrajOptProblem(traj, J, integrators)
+
+    evaluator, Z_vec = build_evaluator(prob)
+    dims = evaluator_dims(evaluator)
+
+    # Pre-allocate buffers
+    g = zeros(dims.n_constraints)
+    grad = zeros(dims.n_variables)
+    H = zeros(dims.n_hessian_entries)
+    Jac = zeros(dims.n_jacobian_entries)
+    sigma = 1.0
+    mu = ones(dims.n_constraints)
+
+    # Run benchmarks
+    benchmarks = Dict{Symbol,EvalBenchmark}(
+        :eval_objective => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_objective($evaluator, $Z_vec))
+        ),
+        :eval_gradient => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_objective_gradient($evaluator, $grad, $Z_vec))
+        ),
+        :eval_constraint => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_constraint($evaluator, $g, $Z_vec))
+        ),
+        :eval_jacobian => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_constraint_jacobian($evaluator, $Jac, $Z_vec))
+        ),
+        :eval_hessian_lagrangian => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_hessian_lagrangian($evaluator, $H, $Z_vec, $sigma, $mu))
+        ),
+    )
+
+    result = MicroBenchmarkResult(
+        package = "DirectTrajOpt",
+        package_version = "0.8.10",
+        commit = try strip(read(`git rev-parse --short HEAD`, String)) catch; "unknown" end,
+        benchmark_name = "evaluator_micro_bilinear_N51",
+        N = N, state_dim = 4, control_dim = 2,
+        eval_benchmarks = benchmarks,
+        julia_version = string(VERSION),
+        timestamp = now(),
+        runner = get(ENV, "BENCHMARK_RUNNER", "local"),
+        n_threads = Threads.nthreads(),
+    )
+
+    # Print summary
+    println("\n=== Evaluator Micro-benchmarks (bilinear N=$N) ===")
+    for (name, eb) in sort(collect(result.eval_benchmarks), by=first)
+        # `@printf` comes from the Printf stdlib loaded above (there is no `Base.Printf`)
+        @printf("  %-25s  median: %8.1f ns  allocs: %d  memory: %d bytes\n",
+            name, eb.median_ns, eb.allocs, eb.memory_bytes)
+    end
+
+    # Save
+    results_dir = joinpath(@__DIR__, "results")
+    save_micro_results(results_dir, result.benchmark_name, result)
+    println("  Saved to $results_dir/")
+end
+```
+
+- [ ] **Step 2: Run the micro-benchmark to verify it works**
+
+```bash
+cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
+julia --project=benchmark -e '
+    using TestItemRunner
+    @run_package_tests(filter=ti -> occursin("micro", ti.name), benchmark)
+'
+```
+
+Expected: Benchmark runs, prints timing table, saves JLD2 to `benchmark/results/`
+
+- [ ] **Step 3: Verify the JLD2 output is loadable**
+
+```bash
+cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
+julia --project=benchmark -e '
+    using HarmoniqsBenchmarks
+    files = filter(f -> endswith(f, ".jld2"), readdir("benchmark/results", join=true))
+    @assert length(files) >= 1 "Expected at least one JLD2 file"
+    result = load_micro_results(files[1])
+    println("Loaded: $(result.benchmark_name)")
+    println("Functions benchmarked: $(keys(result.eval_benchmarks))")
+'
+```
+
+Expected: Loads successfully, shows function names
+
+- [ ] **Step 4: Commit**
+
+```bash
+cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
+git add benchmark/benchmarks.jl
+git commit -m "feat: add evaluator micro-benchmarks with BenchmarkTools"
+```
+
+---
+
+## Task 10: Write Ipopt vs MadNLP Macro-benchmarks
+
+**Files:**
+- Modify: `/home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/benchmarks.jl`
+
+- [ ] **Step 1: Append the macro-benchmark @testitem**
+
+Append to `benchmark/benchmarks.jl`:
+
+```julia
+@testitem "Ipopt vs MadNLP: bilinear N=51" begin
+    using HarmoniqsBenchmarks
+    using DirectTrajOpt
+    using NamedTrajectories
+    using SparseArrays
+    using ExponentialAction
+    import MadNLP
+    using Dates, Random
+
+    # Resolve MadNLPOptions from the extension
+    const MadNLPSolverExt = [
+        mod for mod in reverse(Base.loaded_modules_order)
+        if Symbol(mod) == :MadNLPSolverExt
+    ][1]
+
+    function make_bilinear_problem(; seed=42)
+        Random.seed!(seed)
+        N = 51; Δt = 0.1; u_bound = 0.1; ω = 0.1
+        Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
+        Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
+        Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
+        G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
+
+        traj = NamedTrajectory(
+            (
+                x = 2rand(4, N) .- 1,
+                u = u_bound * (2rand(2, N) .- 1),
+                du = randn(2, N),
+                ddu = randn(2, N),
+                Δt = fill(Δt, N),
+            );
+            controls = (:ddu, :Δt),
+            timestep = :Δt,
+            bounds = (u = u_bound, Δt = (0.01, 0.5)),
+            initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
+            final = (u = zeros(2),),
+            goal = (x = [0.0, 1.0, 0.0, 0.0],),
+        )
+
+        integrators = [
+            BilinearIntegrator(G, :x, :u, traj),
+            DerivativeIntegrator(:u, :du, traj),
+            DerivativeIntegrator(:du, :ddu, traj),
+        ]
+        J = QuadraticRegularizer(:u, traj, 1.0) + QuadraticRegularizer(:du, traj, 1.0)
+        return DirectTrajOptProblem(traj, J, integrators)
+    end
+
+    # Ipopt solve
+    prob_ipopt = make_bilinear_problem()
+    result_ipopt = benchmark_solve!(
+        prob_ipopt,
+        IpoptOptions(max_iter=200, print_level=0);
+        benchmark_name = "bilinear_N51_ipopt",
+    )
+
+    # MadNLP solve (fresh problem, same seed, so both solvers start from the same initial guess)
+    prob_madnlp = make_bilinear_problem()
+    result_madnlp = benchmark_solve!(
+        prob_madnlp,
+        MadNLPSolverExt.MadNLPOptions(max_iter=200, print_level=1);
+        benchmark_name = "bilinear_N51_madnlp",
+    )
+
+    # Print comparison
+    println("\n=== Ipopt vs MadNLP: bilinear N=51 ===")
+    println("  Ipopt:  $(round(result_ipopt.wall_time_s, digits=3))s, $(result_ipopt.total_allocations_bytes ÷ 1024) KB alloc")
+    println("  MadNLP: $(round(result_madnlp.wall_time_s, digits=3))s, $(result_madnlp.total_allocations_bytes ÷ 1024) KB alloc")
+
+    # Save
+    results_dir = joinpath(@__DIR__, "results")
+    save_results(results_dir, "ipopt_vs_madnlp_N51", [result_ipopt, result_madnlp])
+end
+```
+
+- [ ] **Step 2: Run the macro-benchmark**
+
+```bash
+cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
+julia --project=benchmark -e '
+    using TestItemRunner
+    @run_package_tests(filter=ti -> occursin("Ipopt vs MadNLP", ti.name), benchmark)
+'
+```
+
+Expected: Both solvers run, prints wall time and allocation comparison
+
+- [ ] **Step 3: Commit**
+
+```bash
+cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
+git add benchmark/benchmarks.jl
+git commit -m "feat: add Ipopt vs MadNLP macro-benchmark"
+```
+
+---
+
+## Task 11: Write Memory Scaling Study
+
+**Files:**
+- Modify: `/home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/benchmarks.jl`
+
+- [ ] **Step 1: Append the scaling study @testitem**
+
+Append to `benchmark/benchmarks.jl`:
+
+```julia
+@testitem "Memory scaling: N and state_dim sweep" begin
+    using HarmoniqsBenchmarks
+    using DirectTrajOpt
+    using NamedTrajectories
+    using SparseArrays
+    using ExponentialAction
+    import MadNLP
+    using Dates, Printf, Random
+
+    const MadNLPSolverExt = [
+        mod for mod in reverse(Base.loaded_modules_order)
+        if Symbol(mod) == :MadNLPSolverExt
+    ][1]
+
+    function make_scaled_problem(; N, state_dim, n_controls=2, seed=42)
+        Random.seed!(seed)
+
+        # Build random bilinear system at given state dimension
+        G_drift = sparse(randn(state_dim, state_dim))
+        G_drives = [sparse(randn(state_dim, state_dim)) for _ in 1:n_controls]
+        G(u) = G_drift + sum(u[i] * G_drives[i] for i in 1:n_controls)
+
+        x_init = zeros(state_dim); x_init[1] = 1.0
+        x_goal = zeros(state_dim); x_goal[2] = 1.0
+
+        traj = NamedTrajectory(
+            (
+                x = randn(state_dim, N),
+                u = 0.1 * randn(n_controls, N),
+                du = randn(n_controls, N),
+                Δt = fill(0.1, N),
+            );
+            controls = (:du, :Δt),
+            timestep = :Δt,
+            bounds = (u = 1.0, Δt = (0.01, 0.5)),
+            initial = (x = x_init, u = zeros(n_controls)),
+            final = (u = zeros(n_controls),),
+            goal = (x = x_goal,),
+        )
+
+        integrators = [
+            BilinearIntegrator(G, :x, :u, traj),
+            DerivativeIntegrator(:u, :du, traj),
+        ]
+        J = QuadraticRegularizer(:u, traj, 1.0)
+        return DirectTrajOptProblem(traj, J, integrators)
+    end
+
+    N_values = [25, 51, 101]
+    dim_values = [4, 8, 16]
+    results = BenchmarkResult[]
+
+    println("\n=== Memory Scaling Study ===")
+    @printf("  %5s | %5s | %12s | %12s | %12s | %12s\n",
+        "N", "dim", "Ipopt (s)", "Ipopt (KB)", "MadNLP (s)", "MadNLP (KB)")
+    @printf("  %5s-+-%5s-+-%12s-+-%12s-+-%12s-+-%12s\n",
+        "-"^5, "-"^5, "-"^12, "-"^12, "-"^12, "-"^12)
+
+    for N in N_values
+        for dim in dim_values
+            # Ipopt
+            prob = make_scaled_problem(; N=N, state_dim=dim)
+            r_ipopt = benchmark_solve!(
+                prob, IpoptOptions(max_iter=50, print_level=0);
+                benchmark_name = "scaling_N$(N)_d$(dim)_ipopt",
+            )
+            push!(results, r_ipopt)
+
+            # MadNLP (rebuilt from the same seed so both solvers see the same problem)
+            prob = make_scaled_problem(; N=N, state_dim=dim)
+            r_madnlp = benchmark_solve!(
+                prob, MadNLPSolverExt.MadNLPOptions(max_iter=50, print_level=1);
+                benchmark_name = "scaling_N$(N)_d$(dim)_madnlp",
+            )
+            push!(results, r_madnlp)
+
+            @printf("  %5d | %5d | %12.3f | %12d | %12.3f | %12d\n",
+                N, dim,
+                r_ipopt.wall_time_s, r_ipopt.total_allocations_bytes ÷ 1024,
+                r_madnlp.wall_time_s, r_madnlp.total_allocations_bytes ÷ 1024)
+        end
+    end
+
+    # Save all results
+    results_dir = joinpath(@__DIR__, "results")
+    save_results(results_dir, "memory_scaling", results)
+    println("\n  Saved $(length(results)) results to $results_dir/")
+end
+```
+
+- [ ] **Step 2: Run the scaling study**
+
+```bash
+cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
+julia --project=benchmark -e '
+    using TestItemRunner
+    @run_package_tests(filter=ti -> occursin("Memory scaling", ti.name), benchmark)
+'
+```
+
+Expected: Table printed with wall times and allocations for each (N, dim) combination
+
+- [ ] **Step 3: Commit**
+
+```bash
+cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
+git add benchmark/benchmarks.jl
+git commit -m "feat: add memory scaling study benchmark (N x state_dim sweep)"
+```
+
+---
+
+## Verification Checklist
+
+After all tasks are complete:
+
+- [ ] `cd HarmoniqsBenchmarks.jl && julia --project=. -e 'using Pkg; Pkg.test()'` — all tests pass
+- [ ] `cd DirectTrajOpt.jl && julia --project=benchmark -e 'using TestItemRunner; @run_package_tests(benchmark)'` — all three benchmark @testitems run
+- [ ] `ls DirectTrajOpt.jl/benchmark/results/` — contains `.jld2` files for each benchmark
+- [ ] Load and compare results:
+  ```julia
+  using HarmoniqsBenchmarks
+  results = load_results("benchmark/results/ipopt_vs_madnlp_N51_<sha>.jld2")
+  println("Ipopt: $(results[1].wall_time_s)s, MadNLP: $(results[2].wall_time_s)s")
+  ```
+
+---
+
+## Follow-up Plans (Not in Scope)
+
+- **Piccolissimo benchmark suite** — migrate existing `benchmark/complex_vs_real_ode.jl` and `constraint_comparison.jl` to use HarmoniqsBenchmarks schema
+- **Demo-repo problem generators** — clone bosonic-demo, nv-center-demo, atoms-demo, ions, fluxonium-demo, gkp-stanford and extract system Hamiltonians
+- **CI workflows** — `.github/workflows/benchmark.yml` for DirectTrajOpt and other packages
+- **Allocation profiling spike** — parallel worktree experiments with Profile.Allocs, AllocCheck.jl, --track-allocation
+- **Aggregator repo** — `harmoniqs-benchmarks` with cross-package comparison tables
diff --git a/docs/superpowers/specs/2026-04-15-altissimo-gpu-benchmarks-design.md b/docs/superpowers/specs/2026-04-15-altissimo-gpu-benchmarks-design.md
new file mode 100644
index 0000000..50b7959
--- /dev/null
+++ b/docs/superpowers/specs/2026-04-15-altissimo-gpu-benchmarks-design.md
@@ -0,0 +1,198 @@
+# Altissimo GPU Benchmark Suite — Design
+
+**Date:** 2026-04-15
+**Status:** Design (follow-up to HarmoniqsBenchmarks.jl core plan)
+**Depends on:** HarmoniqsBenchmarks.jl (schema, harness, storage)
+**Reference:** `gpu_benchmark.py` (Colab notebook from Raghav, T4 results)
+
+## Context
+
+Altissimo.jl is a GPU-accelerated augmented Lagrangian optimizer for quantum trajectory optimization. It uses matrix-free JVP/VJP callbacks, making it GPU-compatible where Ipopt (which requires sparse Jacobians/Hessians) is CPU-only. Raghav demonstrated 4.5x GPU speedup at 1024 state dim on a T4. This benchmark suite formalizes those measurements and tracks them across versions.
+
+Three benchmark categories (Benchmarks 1–3) match the existing Colab notebook structure; Benchmark 0 below adds a three-way solver comparison on top of them:
+
+1. **Ipopt vs Altissimo (CPU)** — real quantum gate optimization
+2. **Altissimo CPU vs GPU scaling** — structured optimization at increasing state dim
+3. **cuDensityMat vs cuSPARSE** — Liouvillian operator action for open-system trajectory optimization
+
+## Benchmark 0: Three-Way Solver Comparison (Ipopt vs MadNLP-GPU vs Altissimo-GPU)
+
+The harmoniqs org maintains a MadNLP.jl fork with `MadNLPGPU` (in `lib/MadNLPGPU/`), which uses CUDSS for GPU-accelerated sparse KKT system solves. This enables a three-way comparison at increasing problem sizes:
+
+| Solver | Method | Linear Algebra | GPU? |
+|--------|--------|---------------|------|
+| Ipopt | Interior-point | MUMPS/Pardiso (sparse, CPU) | No |
+| MadNLP + MadNLPGPU | Interior-point | CUDSS (sparse, GPU) | Yes |
+| Altissimo | Augmented Lagrangian | Matrix-free JVP/VJP (GPU) | Yes |
+
+**Hypothesis:** At small state dims (sd < 256), Ipopt wins due to mature sparse factorization. At medium dims (256 ≤ sd < 1024), MadNLP-GPU may win due to GPU-accelerated CUDSS. At large dims (sd ≥ 1024), Altissimo wins due to matrix-free scaling (no sparse assembly).
+
+**Problem:** Same quantum-control-structured problem as Benchmark 2 below, swept across sd ∈ {64, 128, 256, 512, 1024, 2048}. For MadNLP-GPU, the problem requires Jacobian/Hessian sparsity (MOI interface), so it uses the same evaluator as Ipopt but with GPU-side linear solves.
+
+**Dependencies:**
+- `MadNLPGPU` from `harmoniqs/MadNLP.jl` (lib/MadNLPGPU)
+- `CUDA.jl` + `CUDSS.jl` for GPU linear algebra
+- DirectTrajOpt MadNLP extension for evaluator hookup
+
+**Metrics:** Wall time, iterations, convergence quality, total allocations, GPU memory usage, speedup vs Ipopt baseline.
+
+**Note:** MadNLP-GPU requires the KKT system to fit in GPU memory. For very large problems, the sparse Jacobian/Hessian may exceed VRAM, which is exactly where Altissimo's matrix-free approach has the advantage.
+
+---
+
+## Benchmark 1: Ipopt vs Altissimo (CPU) — Quantum Gate Optimization
+
+Directly comparable: same X gate problem, same initial conditions, both on CPU.
+
+**Problem setup** (from Colab Part 2):
+- System: 1 qubit, H_drift = 0.5 σ_z, drives = [σ_x, σ_y], bounds = [1.0, 1.0]
+- Gate: X gate, T=10.0, N=100
+- Integrator: HermitianExponentialIntegrator
+- Template: SmoothPulseProblem(Q=100.0, R=1e-2, ddu_bound=1.0, Δt_bounds=(0.05, 0.15))
+- Deep copy for identical initial conditions
+
+**Metrics:**
+- Wall time (s)
+- Fidelity (infidelity = 1 - fidelity)
+- Total allocations (bytes)
+- GC time
+
+**Altissimo configuration** (reference values):
+```julia
+AltissimoOptions(
+    search_direction = :LBFGS,
+    lbfgs_memory = 50,
+    line_search = :StrongWolfe,
+    ls_max_evals = 100,
+    max_outer_iter = 20,
+    max_inner_iter = 500,
+    inner_tol = 1e-8,
+    ρ_init = 100.0,
+    ρ_max = 1e8,
+    polish = true,
+    polish_stall_min_iters = 10,
+    polish_δ_w = 1e-6,
+    polish_δ_c = 1e-8,
+)
+```
+
+**Integration with HarmoniqsBenchmarks:** Both produce `BenchmarkResult` with `solver="ipopt"` / `solver="altissimo"`. The `solver_options` field captures the full AltissimoOptions snapshot.
+
+## Benchmark 2: Altissimo CPU vs GPU Scaling
+
+The core scaling benchmark. Uses a quantum-control-structured problem (NOT a real quantum system) to isolate solver scaling behavior from physics complexity.
+
+**Problem structure** (from Colab Part 3):
+- Decision vector: z = [x_1; ...; x_N; u_1; ...; u_{N-1}]
+- Dynamics: x_{k+1} = Φ(u_k) x_k, where Φ(u) = A + Σⱼ uⱼ Cⱼ
+- A is orthogonal (norm-preserving, like unitary evolution)
+- Coupling scaled: ‖Cⱼ‖_spectral ≈ 0.4 independent of state_dim (σ_c = 0.2/√sd)
+- Target generated by forward simulation with known controls → guaranteed feasible
+- Objective: ½|x_N - x_target|² + (α/2) Σ|u_k|²
+- All callbacks GPU-native: cuBLAS matvec, broadcast, dot (no scalar indexing)
+
+**Sweep configurations** (from Colab):
+
+| state_dim | n_drives | N  | n_vars   | n_eq     |
+|-----------|----------|----|----------|----------|
+| 512       | 2        | 20 | 10,278   | 10,240   |
+| 1024      | 2        | 20 | 20,518   | 20,480   |
+| 2048      | 2        | 20 | 40,998   | 40,960   |
+| 4096      | 2        | 20 | 81,958   | 81,920   |
+
+**Metrics per (state_dim, device) pair:**
+- Wall time (s) — after JIT warmup
+- Objective value at convergence
+- Constraint violation ‖c‖
+- Converged (bool)
+- GPU speedup = CPU_time / GPU_time
+
+**Key implementation details:**
+- JIT warmup run before timed run
+- `CUDA.synchronize()` before and after timed run for accurate GPU timing
+- `build_callbacks()` returns obj!, grad!, hvp!, eq!, eq_jvp!, eq_vjp!
+- Optimizer: `Altissimo.LBFGS` with `Altissimo.StrongWolfe` line search
+- `initialize_z!` does forward propagation with u=0 for feasible init
+
+**Schema extension:** Add to `BenchmarkResult`:
+- `device::String` — "cpu" or "gpu"
+- `gpu_name::String` — e.g. "Tesla T4", "A100" (from `CUDA.name(CUDA.device())`)
+- `gpu_memory_bytes::Int` — VRAM (from `CUDA.totalmem`)
+
+OR: encode these in `solver_options` dict to avoid schema changes:
+```julia
+solver_options[:device] = "gpu"
+solver_options[:gpu_name] = CUDA.name(CUDA.device())
+solver_options[:gpu_memory_bytes] = CUDA.totalmem(CUDA.device())
+```
+
+Recommended: use `solver_options` dict to avoid breaking the schema for CPU-only packages.
+
+## Benchmark 3: cuDensityMat vs cuSPARSE — Liouvillian Operator
+
+This measures the fundamental operation for open-system trajectory optimization: applying a Liouvillian superoperator to a density matrix.
+
+**System:** M coupled cavities with Fock truncation d=3, Hilbert space D = 3^M.
+- Hamiltonian: H(t) = Σᵢ δᵢ(t) aᵢ†aᵢ + Σᵢ Kᵢ aᵢ†aᵢ†aᵢaᵢ + Σ⟨i,j⟩ gᵢⱼ(t)(aᵢ†aⱼ + h.c.)
+- Collapse operators: √κ aᵢ (photon loss)
+- Liouvillian: L = -i(H⊗I - I⊗Hᵀ) + Σₖ (Cₖ⊗Cₖ* - ½(Cₖ†Cₖ⊗I + I⊗Cₖᵀ Cₖ*))
+
+**Sweep:**
+
+| M | D    | ρ elements (D²) | cuDensityMat | cuSPARSE | Dense CPU |
+|---|------|-----------------|-------------|----------|-----------|
+| 2 | 9    | 81              | 0.27 ms     | 0.039 ms | 0.003 ms  |
+| 4 | 81   | 6,561           | 1.22 ms     | 0.048 ms | 31.8 ms   |
+| 6 | 729  | 531,441         | 6.45 ms     | 0.90 ms  | infeasible|
+| 8 | 6561 | 43,046,721      | 620 ms      | infeasible| infeasible|
+
+**Batched evolution** (trajectory optimization workload):
+
+| M | D  | Batch | Batched   | Sequential | Speedup |
+|---|----|-------|-----------|------------|---------|
+| 2 | 9  | 256   | 0.38 ms   | 70.1 ms    | 186x    |
+| 4 | 81 | 256   | 8.05 ms   | 280.7 ms   | 35x     |
+
+**Key insight:** cuSPARSE beats cuDensityMat for M ≤ 6 (tensor-network contraction overhead at small D). cuDensityMat wins at M=8+ where sparse Liouvillian can't be materialized (~50-70 GB). Batched evolution is critical for trajectory optimization (35-186x speedup).
+
+**Integration note:** This benchmark depends on CuQuantum.jl (harmoniqs org). The cuDensityMat portion requires the NVIDIA cuQuantum SDK and should run exclusively on EC2 GPU runners.
+
+## CI Runner Requirements
+
+| Benchmark | Runner | GPU Required |
+|-----------|--------|-------------|
+| Ipopt vs Altissimo (CPU) | `ubuntu-latest` (free) | No |
+| 3-way solver (Ipopt/MadNLP-GPU/Altissimo) | `[self-hosted, gpu]` (EC2) | Yes (T4 minimum, CUDSS for MadNLP) |
+| Altissimo CPU vs GPU scaling | `[self-hosted, gpu]` (EC2) | Yes (T4 minimum) |
+| cuSPARSE / cuDensityMat | `[self-hosted, gpu]` (EC2) | Yes (A100 recommended for M=8) |
+
+## Where Benchmarks Live
+
+- **Benchmark 1** (Ipopt vs Altissimo CPU): In `Piccolissimo.jl/benchmark/` since it uses `SmoothPulseProblem` + `HermitianExponentialIntegrator`
+- **Benchmark 2** (GPU scaling): In `Altissimo.jl/benchmark/` since it's Altissimo-specific with CUDA callbacks
+- **Benchmark 3** (Liouvillian): In `CuQuantum.jl/benchmark/` or `Piccolissimo.jl/benchmark/` (TBD based on where cuDensityMat integration lands)
+
+All use `HarmoniqsBenchmarks.jl` schema for consistent artifact format.
+
+## Adaptation for HarmoniqsBenchmarks Schema
+
+The Colab notebook uses ad-hoc timing (`@elapsed`, `CUDA.@elapsed`). To integrate with HarmoniqsBenchmarks:
+
+**Benchmark 2 adaptation:**
+- Wrap `run_one()` to return a `BenchmarkResult` instead of a NamedTuple
+- Add `solver_options` dict with Altissimo config + device info
+- Replace manual `time()` calls with `@timed` for allocation tracking
+- Save JLD2 artifacts instead of printing tables
+
+**Benchmark 3 adaptation:**
+- Create a `LiouvillianBenchmarkResult` (or use a new `MicroBenchmarkResult` variant)
+- Key fields: M, D, D², nnz(L), method (:cusparse, :cudensitymat, :cpu_dense), time_ms, memory_bytes
+- Batched results include batch_size and sequential/batched comparison
+
+## Implementation Notes
+
+- The `apply_Phi!` / `apply_Phi_t!` pattern from the notebook should be extracted into Altissimo's callback builder, not reimplemented in benchmarks
+- `CUDA.synchronize()` is critical for accurate GPU timing — always call before starting and after stopping the timer
+- JIT warmup run is mandatory — first Julia/CUDA execution compiles kernels
+- Estimate memory before large allocations: compare the estimate against `CUDA.totalmem(CUDA.device())` and skip the configuration if it would exceed 80% of VRAM
+- The coupling scaling fix (σ_c = 0.2/√sd) is essential for well-conditioned problems at large state dim — without it, ‖C‖ ~ 0.1√sd makes convergence erratic
diff --git a/docs/superpowers/specs/2026-04-15-benchmarking-design.md b/docs/superpowers/specs/2026-04-15-benchmarking-design.md
new file mode 100644
index 0000000..bb9f943
--- /dev/null
+++ b/docs/superpowers/specs/2026-04-15-benchmarking-design.md
@@ -0,0 +1,383 @@
+# HarmoniqsBenchmarks.jl — Cross-Package Benchmarking Infrastructure
+
+**Date:** 2026-04-15
+**Status:** Design
+
+## Context
+
+The harmoniqs quantum optimal control stack (DirectTrajOpt, Piccolo, Piccolissimo, Altissimo, Intonato) needs a unified benchmarking system to:
+
+- Compare Ipopt vs MadNLP solver performance on the DirectTrajOpt `feat/madnlp-integration` branch
+- Collect statistically robust histograms of key evaluator functions (eval_hessian_lagrangian, eval_constraint_jacobian, etc.) for regression detection
+- Profile memory usage and allocations in MadNLP and across all packages, understanding how memory scales with knot points (N), state dimension, and control dimension
+- Track allocations in the optimization hot path to drive them toward zero
+- Publish version-tagged JLD2 artifacts so labs and enterprises can evaluate problem-size scaling
+
+This is driven by all three active workstreams needing memory/performance benchmarks (MadNLP integration, Altissimo GPU scaling at 1024 state dim, Intonato convergence tracking).
+
+## Architecture
+
+**Approach:** Shared `HarmoniqsBenchmarks.jl` package + per-package `benchmark/` directories + central aggregator repo.
+
+- `HarmoniqsBenchmarks.jl` — lightweight Julia package (own repo in harmoniqs org) providing schema, profiling harness, problem generators, and reporters
+- Each downstream package (DirectTrajOpt, Piccolo, Piccolissimo, Altissimo, Intonato) has a `benchmark/` directory with `@testitem`-based benchmarks using `HarmoniqsBenchmarks`
+- Central `harmoniqs-benchmarks` repo aggregates artifacts and generates cross-package comparison tables
+- Artifacts are JLD2 files stored in CI (GitHub Actions artifact upload), not a live dashboard
+
+## Schema
+
+### BenchmarkResult
+
+```julia
+struct BenchmarkResult
+    # Identity
+    package::String               # "DirectTrajOpt", "Piccolissimo", etc.
+    package_version::String       # semver tag
+    commit::String                # short SHA
+    benchmark_name::String        # "cz_gate_ipopt", "madnlp_scaling_N101_d16"
+
+    # Problem dimensions
+    N::Int                        # knot points
+    state_dim::Int                # state vector dimension
+    control_dim::Int              # number of controls
+    n_constraints::Int            # total nonlinear constraints
+    n_variables::Int              # total NLP variables
+
+    # Solve metrics
+    wall_time_s::Float64
+    iterations::Int
+    objective_value::Float64
+    constraint_violation::Float64
+    solver_status::Symbol         # :Optimal, :MaxIter, :Infeasible
+    solver::String                # "ipopt", "madnlp", "altissimo"
+
+    # Memory & allocations
+    total_allocations_bytes::Int
+    total_allocs_count::Int       # number of allocation events
+    peak_memory_bytes::Int
+
+    # GC stats
+    gc_time_ns::Int
+    gc_count::Int
+    gc_full_count::Int
+
+    # Solver options snapshot
+    solver_options::Dict{Symbol,Any}
+
+    # Metadata
+    julia_version::String
+    timestamp::DateTime
+    runner::String                # "github-actions", "ec2-gpu", "local"
+    n_threads::Int
+end
+```
+
+### MicroBenchmarkResult
+
+```julia
+struct MicroBenchmarkResult
+    # Identity (same as above)
+    package::String
+    package_version::String
+    commit::String
+    benchmark_name::String
+
+    # Problem dimensions
+    N::Int
+    state_dim::Int
+    control_dim::Int
+
+    # Per-function BenchmarkTools results
+    # Each value is a serialized BenchmarkTools.Trial containing:
+    #   times (ns), gctimes (ns), memory (bytes), allocs (count)
+    eval_benchmarks::Dict{Symbol, Any}
+    # Keys: :eval_objective, :eval_gradient, :eval_constraint,
+    #        :eval_jacobian, :eval_hessian_lagrangian
+
+    # Metadata
+    julia_version::String
+    timestamp::DateTime
+    runner::String
+    n_threads::Int
+end
+```
+
+## Benchmarking Layers
+
+### Layer 1: Micro-benchmarks (Eval Function Histograms)
+
+Use `BenchmarkTools.@benchmark` on individual MOI evaluator methods. This gives statistically robust distributions with proper warmup, plus allocation counts per call.
+
+```julia
+@testitem "Evaluator micro-benchmarks: CZ N=51" begin
+    using HarmoniqsBenchmarks, BenchmarkTools, Piccolissimo, Piccolo
+
+    prob = build_cz_problem(N=51)
+    evaluator, Z_vec = build_evaluator(prob)
+
+    # Pre-allocate output buffers
+    g = zeros(n_constraints(evaluator))
+    grad = zeros(n_variables(evaluator))
+    H = zeros(n_hessian_entries(evaluator))
+    J = zeros(n_jacobian_entries(evaluator))
+    sigma = 1.0
+    mu = ones(n_constraints(evaluator))
+
+    benchmarks = Dict(
+        :eval_objective          => @benchmark(MOI.eval_objective($evaluator, $Z_vec)),
+        :eval_gradient           => @benchmark(MOI.eval_objective_gradient($evaluator, $grad, $Z_vec)),
+        :eval_constraint         => @benchmark(MOI.eval_constraint($evaluator, $g, $Z_vec)),
+        :eval_jacobian           => @benchmark(MOI.eval_constraint_jacobian($evaluator, $J, $Z_vec)),
+        :eval_hessian_lagrangian => @benchmark(MOI.eval_hessian_lagrangian($evaluator, $H, $Z_vec, $sigma, $mu)),
+    )
+
+    save_micro_results("cz_N51_ipopt", benchmarks; prob)
+end
+```
+
+**Regression detection:** Compare median times and allocation counts across versions. A >10% regression in any eval function on the same problem size is flagged for review.
+
+### Layer 2: Macro-benchmarks (Full Solves)
+
+Use `@timed` for wall clock + total allocations on `solve!`. Full optimization is not repeatable in the BenchmarkTools sense (each call modifies the problem), so we capture single-run metrics.
+
+```julia
+@testitem "CZ gate Ipopt vs MadNLP" begin
+    using HarmoniqsBenchmarks, Piccolissimo, Piccolo
+
+    prob = build_cz_problem(N=51)
+    result_ipopt = benchmark_solve!(prob, IpoptOptions())
+
+    prob = build_cz_problem(N=51)  # fresh problem
+    result_madnlp = benchmark_solve!(prob, MadNLPOptions())
+
+    save_results("cz_gate_comparison", [result_ipopt, result_madnlp])
+end
+```
+
+### Layer 3: Scaling Studies
+
+Parameterized sweeps over problem dimensions to characterize memory and time growth.
+
+```julia
+@testitem "MadNLP memory scaling" begin
+    using HarmoniqsBenchmarks, Piccolissimo, Piccolo
+
+    results = BenchmarkResult[]
+    for N in [25, 51, 101, 201, 401]
+        for state_dim in [4, 8, 16, 32, 64]
+            prob = build_bilinear_problem(; N, state_dim, n_controls=2)
+            r = benchmark_solve!(prob, MadNLPOptions())
+            push!(results, r)
+        end
+    end
+    save_results("madnlp_memory_scaling", results)
+end
+```
+
+### Layer 4: Allocation Profiling
+
+Tools for tracking down and eliminating allocations in the optimization hot path.
+
+**Profile.Allocs** — captures per-allocation stack traces during a solve:
+```julia
+@testitem "Allocation profile: CZ solve" begin
+    using HarmoniqsBenchmarks, Profile, Piccolissimo, Piccolo
+
+    prob = build_cz_problem(N=51)
+    Profile.Allocs.clear()
+    Profile.Allocs.@profile sample_rate=1.0 solve!(prob)
+    alloc_results = Profile.Allocs.fetch()
+
+    save_alloc_profile("cz_N51_alloc_profile", alloc_results)
+    # Visualize locally: using PProf; PProf.Allocs.pprof(alloc_results)
+end
+```
+
+**AllocCheck.jl** — compile-time zero-allocation enforcement for evaluator hot paths. Can be added as an optional CI check:
+```julia
+@testitem "Zero-allocation check: evaluator methods" begin
+    using AllocCheck, DirectTrajOpt
+
+    # These should be allocation-free once optimized; check_allocs returns the detected allocation sites
+    @test isempty(check_allocs(MOI.eval_constraint, (Evaluator, Vector{Float64}, Vector{Float64})))
+    @test isempty(check_allocs(MOI.eval_constraint_jacobian, (Evaluator, Vector{Float64}, Vector{Float64})))
+    @test isempty(check_allocs(MOI.eval_hessian_lagrangian, (Evaluator, Vector{Float64}, Vector{Float64}, Float64, Vector{Float64})))
+end
+```
+
+**Per-line tracking** (local development, not CI):
+```bash
+julia --track-allocation=user --project=benchmark benchmark/benchmarks.jl
+# Generates .mem files with per-line allocation counts
+```
+
+**Implementation note:** The best allocation profiling approach for the evaluator hot path is TBD. During implementation, spike all three approaches (`Profile.Allocs`, `AllocCheck.jl`, `--track-allocation`) in parallel worktrees against a representative problem (e.g. CZ N=51) to determine which gives the most actionable results for tracking down and eliminating allocations in the MOI eval methods.
+
+## Problem Generators
+
+Deterministic, parameterized problem constructors for reproducibility.
+
+### DirectTrajOpt level
+- `build_bilinear_problem(; N=51, state_dim=4, n_controls=2, seed=42)` — random Hermitian system matrices, bilinear integrator + quadratic regularizer
+- `build_constrained_problem(; N=51, state_dim=4, n_nonlinear=3, seed=42)` — adds nonlinear knot-point constraints
+
+### Piccolo/Piccolissimo level
+- `build_cz_problem(; N=51, integrator=:hermitian_exp)` — 2-qubit CZ gate, exchange-only system (4-level), matches spin-qubit-demo
+- `build_cnot_problem(; N=101, integrator=:hermitian_exp)` — 2-qubit CNOT with 3 EDSR drives
+- `build_transmon_problem(; levels=3, N=51)` — single-qubit X gate on multi-level transmon
+
+### Altissimo level
+- `build_polish_problem(; N=51, state_dim=4)` — pre-solved Ipopt problem ready for Altissimo refinement
+- `build_gpu_scaling_problem(; state_dim=1024)` — large-state-dim problem for GPU benchmarking
+
+### Intonato level
+- `build_qilc_problem(; N=101, n_paulis=15, J_mismatch=1.3)` — QILC calibration loop with simulated experiment, matches spin-qubit-demo pattern
+
+### Demo-repo-derived problems
+
+The harmoniqs org has several hardware-platform demo repos that provide real-world benchmark problems. During implementation, clone and extract representative problem configurations from:
+
+| Repo | Platform | Typical Dimensions | Key Benchmark |
+|------|----------|-------------------|---------------|
+| `spin-qubit-demo` | Silicon spin qubits | N=51-101, 4-level, 1-3 drives | CZ, CNOT, QILC calibration |
+| `bosonic-demo` | Bosonic cavity QED | Higher Hilbert space dims | Cavity control |
+| `nv-center-demo` | NV centers | Spin-1 + nuclear spins | Dark matter sensing pulses |
+| `atoms-demo` | Neutral atoms | Rydberg levels | Multi-qubit gates |
+| `ions` | Trapped ions | Motional modes + qubits | MS gate, individual addressing |
+| `fluxonium-demo` | Fluxonium qubits | Multi-level transmon-like | Single-qubit gates |
+| `gkp-stanford` | GKP states | Bosonic Fock space | State preparation |
+
+These provide the "enterprise-scale" problem suite that demonstrates what problem sizes each solver can handle. Extract the system Hamiltonians and problem parameters from each demo and wrap them as generators in `HarmoniqsBenchmarks.jl/src/problems/`.
+
+All generators use `Random.seed!(seed)` for determinism.
+
+## Harness Functions
+
+### build_evaluator(prob) -> (evaluator, Z_vec)
+
+Extracts the MOI evaluator and initial decision variable vector from a `DirectTrajOptProblem`. Used for micro-benchmarks so individual eval functions can be called directly.
+
+### benchmark_solve!(prob, options; kwargs...) -> BenchmarkResult
+
+```julia
+function benchmark_solve!(prob, options; kwargs...)
+    GC.gc()
+    gc_before = Base.gc_num()
+
+    timed = @timed solve!(prob; options, kwargs...)
+
+    gc_after = Base.gc_num()
+
+    return BenchmarkResult(
+        # ... populate from prob metadata, timed, gc delta, options snapshot
+    )
+end
+```
+
+### save_results(name, results) / save_micro_results(name, benchmarks)
+
+Write JLD2 to `benchmark/results/<name>_<commit_sha>.jld2`.
+
+### compare_results(baseline_path, current_path) -> ComparisonTable
+
+Load two result sets and produce a diff table with percent changes, flagging regressions.
+
+## CI Workflow
+
+### Per-package: `.github/workflows/benchmark.yml`
+
+```yaml
+name: Benchmarks
+on:
+  push:
+    tags: ['v*']
+  workflow_dispatch:
+    inputs:
+      baseline_tag:
+        description: 'Tag to compare against'
+        required: false
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest   # free for OSS
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: '1.11'
+      - name: Instantiate benchmark env
+        run: julia --project=benchmark -e 'using Pkg; Pkg.instantiate()'
+      - name: Run benchmarks
+        run: |
+          julia --project=benchmark -t auto -e '
+            using TestItemRunner; TestItemRunner.run_tests("benchmark/")
+          '
+      - uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-${{ github.ref_name }}-${{ github.sha }}
+          path: benchmark/results/
+          retention-days: 365
+
+  # GPU/large-scale benchmarks (Altissimo, large N)
+  benchmark-gpu:
+    if: contains(github.repository, 'Altissimo') || github.event_name == 'workflow_dispatch'
+    runs-on: [self-hosted, gpu]   # EC2 runners from CuQuantum.jl setup
+    steps:
+      # same as above but with CUDA-enabled Julia
+```
+
+### Central aggregator: `harmoniqs-benchmarks` repo
+
+Triggered by workflow_dispatch or cron. Downloads latest artifacts from each package repo, generates comparison tables, stores historical archive.
+
+## Package Structure
+
+```
+HarmoniqsBenchmarks.jl/
+  src/
+    HarmoniqsBenchmarks.jl       # module + exports
+    schema.jl                     # BenchmarkResult, MicroBenchmarkResult
+    harness.jl                    # benchmark_solve!, build_evaluator
+    storage.jl                    # save/load JLD2, save_alloc_profile
+    report.jl                     # compare_results, regression detection
+    problems/
+      bilinear.jl                 # DirectTrajOpt-level generators
+      quantum_gates.jl            # Piccolo/Piccolissimo-level generators
+      polish.jl                   # Altissimo-level generators
+      qilc.jl                     # Intonato-level generators
+  Project.toml                    # deps: BenchmarkTools, JLD2, Dates
+  README.md
+
+# Per downstream package:
+DirectTrajOpt.jl/
+  benchmark/
+    Project.toml                  # [deps] HarmoniqsBenchmarks, BenchmarkTools, TestItems, ...
+    benchmarks.jl                 # @testitems: micro, macro, scaling
+    results/                      # .gitignored JLD2 output
+```
+
+## Verification
+
+1. **Unit test the harness:** `benchmark_solve!` returns a valid `BenchmarkResult` with all fields populated
+2. **Run micro-benchmarks locally:** Confirm BenchmarkTools produces histograms for each eval function
+3. **Run scaling sweep:** Verify memory grows as expected with N and state_dim
+4. **CI dry run:** Trigger workflow_dispatch on DirectTrajOpt, confirm artifact upload
+5. **Cross-package comparison:** Run aggregator on two package artifacts, verify comparison table output
+6. **Allocation profiling:** Run Profile.Allocs on a solve, verify PProf flamegraph renders
+
+## Scope
+
+**In scope (this design):**
+- HarmoniqsBenchmarks.jl package creation
+- DirectTrajOpt benchmark suite (Ipopt vs MadNLP, scaling, micro-benchmarks, allocation profiling)
+- Piccolissimo benchmark suite (integrate existing benchmarks + new scaling)
+- CI workflows for DirectTrajOpt and Piccolissimo
+- Aggregator script in harmoniqs-benchmarks repo
+
+**Future work:**
+- Altissimo GPU benchmarks (requires CUDA runner validation)
+- Intonato convergence benchmarks (requires stable Phase 5)
+- Piccolo template benchmarks
+- AllocCheck CI gates (after hot paths are optimized)
+- Automated regression comments on PRs

From 7fe44c572d074368d55e97caee63443b925b0d55 Mon Sep 17 00:00:00 2001
From: Jack Champagne <jackchampagne.r@gmail.com>
Date: Wed, 15 Apr 2026 18:55:37 -0400
Subject: [PATCH 03/13] ci: add benchmark workflow and README, remove stale
 files

- Add .github/workflows/benchmark.yml that runs on PRs touching src/ or benchmark/
- Uses Pkg.add(url=...) to install HarmoniqsBenchmarks (unregistered)
- Uploads JLD2 artifacts for 90 days
- Add benchmark/README.md with run instructions
- Remove empty BenchmarkUtils.jl leftover
- Ignore Manifest.toml (regenerated on each CI run)
---
 .github/workflows/benchmark.yml | 56 +++++++++++++++++++++++++++++++++
 benchmark/.gitignore            |  1 +
 benchmark/BenchmarkUtils.jl     |  1 -
 benchmark/README.md             | 37 ++++++++++++++++++++++
 4 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/benchmark.yml
 delete mode 100644 benchmark/BenchmarkUtils.jl
 create mode 100644 benchmark/README.md

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..ac64298
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,56 @@
+name: Benchmarks
+on:
+  push:
+    tags: ['v*']
+  pull_request:
+    paths:
+      - 'src/**'
+      - 'benchmark/**'
+      - '.github/workflows/benchmark.yml'
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+
+jobs:
+  benchmark:
+    name: Benchmark suite
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    permissions:
+      actions: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: '1.11'
+          arch: x64
+
+      - uses: julia-actions/cache@v2
+
+      - name: Install HarmoniqsBenchmarks (unregistered) and dev-install DirectTrajOpt
+        run: |
+          julia --project=benchmark -e '
+            using Pkg
+            Pkg.develop(path=".")
+            Pkg.add(url="https://github.com/harmoniqs/HarmoniqsBenchmarks.jl")
+            Pkg.instantiate()
+          '
+
+      - name: Run benchmarks
+        run: |
+          julia --project=benchmark -t auto -e '
+            using TestItemRunner
+            TestItemRunner.run_tests("benchmark/")
+          '
+
+      - name: Upload benchmark artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-${{ github.ref_name }}-${{ github.sha }}
+          path: benchmark/results/
+          retention-days: 90
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
index fbca225..ca28c11 100644
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@@ -1 +1,2 @@
 results/
+Manifest.toml
diff --git a/benchmark/BenchmarkUtils.jl b/benchmark/BenchmarkUtils.jl
deleted file mode 100644
index 8b13789..0000000
--- a/benchmark/BenchmarkUtils.jl
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000..534cec0
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,37 @@
+# DirectTrajOpt Benchmarks
+
+Benchmark suite for DirectTrajOpt.jl comparing Ipopt and MadNLP solver performance.
+
+## Running locally
+
+```bash
+# From DirectTrajOpt.jl root
+julia --project=benchmark -e '
+    using Pkg
+    Pkg.add(url="https://github.com/harmoniqs/HarmoniqsBenchmarks.jl")
+    Pkg.instantiate()
+'
+
+julia --project=benchmark -t auto -e '
+    using TestItemRunner
+    TestItemRunner.run_tests("benchmark/")
+'
+```
+
+Artifacts are saved as JLD2 files in `benchmark/results/` (gitignored).
+
+## Benchmark suites
+
+- **Evaluator micro-benchmarks** — `BenchmarkTools.@benchmark` timings for each MOI eval function (objective, gradient, constraint, jacobian, hessian_lagrangian) on bilinear N=51
+- **Ipopt vs MadNLP** — full solve comparison on bilinear N=51
+- **Memory scaling study** — N ∈ {25, 51, 101} × state_dim ∈ {4, 8, 16}
+
+## Schema
+
+Results use `BenchmarkResult` / `MicroBenchmarkResult` from [HarmoniqsBenchmarks.jl](https://github.com/harmoniqs/HarmoniqsBenchmarks.jl).
+
+Load with:
+```julia
+using HarmoniqsBenchmarks
+results = load_results("benchmark/results/ipopt_vs_madnlp_N51_<sha>.jld2")
+```

From 3a5003c061675cb7f882ad5e13d064827c8c53ab Mon Sep 17 00:00:00 2001
From: Jack Champagne <jackchampagne.r@gmail.com>
Date: Wed, 15 Apr 2026 19:01:52 -0400
Subject: [PATCH 04/13] benchmark: use [sources] in Project.toml instead of
 Pkg.add in CI

Uses Julia 1.11+ [sources] section to resolve:
- DirectTrajOpt from local path (parent dir)
- HarmoniqsBenchmarks from public GitHub URL

CI workflow simplified to just Pkg.instantiate (no manual Pkg.add needed).
---
 .github/workflows/benchmark.yml | 10 ++--------
 benchmark/Project.toml          |  8 ++++++--
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index ac64298..e4bdf37 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -31,14 +31,8 @@ jobs:
 
       - uses: julia-actions/cache@v2
 
-      - name: Install HarmoniqsBenchmarks (unregistered) and dev-install DirectTrajOpt
-        run: |
-          julia --project=benchmark -e '
-            using Pkg
-            Pkg.develop(path=".")
-            Pkg.add(url="https://github.com/harmoniqs/HarmoniqsBenchmarks.jl")
-            Pkg.instantiate()
-          '
+      - name: Instantiate benchmark environment
+        run: julia --project=benchmark -e 'using Pkg; Pkg.instantiate()'
 
       - name: Run benchmarks
         run: |
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index 9782442..b219215 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 DirectTrajOpt = "c823fa1f-8872-4af5-b810-2b9b72bbbf56"
 ExponentialAction = "e24c0720-ea99-47e8-929e-571b494574d3"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
@@ -8,9 +9,12 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6"
 MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
 NamedTrajectories = "538bc3a1-5ab9-4fc3-b776-35ca1e893e08"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
 TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
-Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
-Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+
+[sources]
+DirectTrajOpt = {path = ".."}
+HarmoniqsBenchmarks = {url = "https://github.com/harmoniqs/HarmoniqsBenchmarks.jl"}

From 6d9fa524c4681ab13807a2106495be5507b2c673 Mon Sep 17 00:00:00 2001
From: Jack Champagne <jackchampagne.r@gmail.com>
Date: Wed, 15 Apr 2026 20:44:50 -0400
Subject: [PATCH 05/13] ci: sanitize artifact name (PR refs contain / which is
 invalid)

---
 .github/workflows/benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index e4bdf37..1f20d76 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -45,6 +45,6 @@ jobs:
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: benchmark-${{ github.ref_name }}-${{ github.sha }}
+          name: benchmark-${{ github.event.pull_request.number || github.ref_name }}-${{ github.sha }}
           path: benchmark/results/
           retention-days: 90

From 11720e35256255840f5e84254420db17d45846c3 Mon Sep 17 00:00:00 2001
From: Jack Champagne <jackchampagne.r@gmail.com>
Date: Thu, 16 Apr 2026 02:04:35 -0400
Subject: [PATCH 06/13] chore: move specs/plans to separate PR, fix stale
 README
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove docs/superpowers/ (specs and plans) from this PR to keep the
diff focused on benchmarks and MadNLP integration. Fix stale Pkg.add
instruction in benchmark README — deps resolve via [sources] now.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmark/README.md                           |    6 +-
 .../2026-04-15-benchmarking-infrastructure.md | 1620 -----------------
 ...6-04-15-altissimo-gpu-benchmarks-design.md |  198 --
 .../specs/2026-04-15-benchmarking-design.md   |  383 ----
 4 files changed, 1 insertion(+), 2206 deletions(-)
 delete mode 100644 docs/superpowers/plans/2026-04-15-benchmarking-infrastructure.md
 delete mode 100644 docs/superpowers/specs/2026-04-15-altissimo-gpu-benchmarks-design.md
 delete mode 100644 docs/superpowers/specs/2026-04-15-benchmarking-design.md

diff --git a/benchmark/README.md b/benchmark/README.md
index 534cec0..c0737c9 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -6,11 +6,7 @@ Benchmark suite for DirectTrajOpt.jl comparing Ipopt and MadNLP solver performan
 
 ```bash
 # From DirectTrajOpt.jl root
-julia --project=benchmark -e '
-    using Pkg
-    Pkg.add(url="https://github.com/harmoniqs/HarmoniqsBenchmarks.jl")
-    Pkg.instantiate()
-'
+julia --project=benchmark -e 'using Pkg; Pkg.instantiate()'
 
 julia --project=benchmark -t auto -e '
     using TestItemRunner
diff --git a/docs/superpowers/plans/2026-04-15-benchmarking-infrastructure.md b/docs/superpowers/plans/2026-04-15-benchmarking-infrastructure.md
deleted file mode 100644
index 99dee3f..0000000
--- a/docs/superpowers/plans/2026-04-15-benchmarking-infrastructure.md
+++ /dev/null
@@ -1,1620 +0,0 @@
-# HarmoniqsBenchmarks.jl + DirectTrajOpt Benchmark Suite — Implementation Plan
-
-> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
-
-**Goal:** Create a shared benchmarking package (`HarmoniqsBenchmarks.jl`) and wire up the first benchmark suite in DirectTrajOpt.jl comparing Ipopt vs MadNLP, with micro-benchmarks, full-solve benchmarks, and memory scaling studies.
-
-**Architecture:** HarmoniqsBenchmarks.jl provides schema types, a profiling harness, and JLD2 storage/comparison. DirectTrajOpt.jl's `benchmark/` directory contains `@testitem`-based benchmarks that use the shared harness. Both Ipopt and MadNLP benchmarks use the same shared `Evaluator` (in `src/solvers/evaluator.jl`), so micro-benchmarks are solver-agnostic while macro-benchmarks compare the two solver backends.
-
-**Tech Stack:** Julia 1.11+, BenchmarkTools.jl, JLD2.jl, TestItems/TestItemRunner, MathOptInterface
-
-**Spec:** `docs/superpowers/specs/2026-04-15-benchmarking-design.md`
-
----
-
-## File Structure
-
-### New repo: `HarmoniqsBenchmarks.jl` (at `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/`)
-
-| File | Responsibility |
-|------|---------------|
-| `Project.toml` | Package metadata + deps (BenchmarkTools, JLD2, Dates, DirectTrajOpt, MathOptInterface, NamedTrajectories) |
-| `src/HarmoniqsBenchmarks.jl` | Module definition + exports |
-| `src/schema.jl` | `BenchmarkResult`, `MicroBenchmarkResult`, `EvalBenchmark` structs |
-| `src/harness.jl` | `build_evaluator`, `benchmark_solve!`, GC/allocation capture |
-| `src/storage.jl` | `save_results`, `save_micro_results`, `load_results`, `load_micro_results` |
-| `src/report.jl` | `compare_results` — diff tables + regression flagging |
-| `test/runtests.jl` | Tests for all of the above |
-
-### Modified repo: `DirectTrajOpt.jl` (benchmark directory)
-
-| File | Responsibility |
-|------|---------------|
-| `benchmark/Project.toml` | Benchmark env deps (HarmoniqsBenchmarks, BenchmarkTools, TestItems, MadNLP) |
-| `benchmark/benchmarks.jl` | `@testitem` definitions: micro, macro, scaling |
-| `benchmark/.gitignore` | Ignore `results/` directory |
-
----
-
-## Task 1: Create HarmoniqsBenchmarks.jl Project Skeleton
-
-**Files:**
-- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/Project.toml`
-- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/HarmoniqsBenchmarks.jl`
-
-- [ ] **Step 1: Initialize the package directory**
-
-```bash
-mkdir -p /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src
-mkdir -p /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-git init
-```
-
-- [ ] **Step 2: Create Project.toml**
-
-```toml
-name = "HarmoniqsBenchmarks"
-uuid = "GENERATE_UUID"
-version = "0.1.0"
-authors = ["harmoniqs contributors"]
-
-[deps]
-BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
-Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
-DirectTrajOpt = "c823fa1f-8872-4af5-b810-2b9b72bbbf56"
-JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
-MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
-NamedTrajectories = "538bc3a1-5ab9-4fc3-b776-35ca1e893e08"
-
-[compat]
-BenchmarkTools = "1.6"
-Dates = "1.10, 1.11, 1.12"
-DirectTrajOpt = "0.8"
-JLD2 = "0.5"
-MathOptInterface = "1.49"
-NamedTrajectories = "0.8"
-julia = "1.10, 1.11, 1.12"
-```
-
-Generate the UUID with: `using UUIDs; uuid4()`
-
-- [ ] **Step 3: Create module stub**
-
-```julia
-# src/HarmoniqsBenchmarks.jl
-module HarmoniqsBenchmarks
-
-end
-```
-
-- [ ] **Step 4: Dev-install dependencies and verify the package loads**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e '
-    using Pkg
-    Pkg.develop(path="../DirectTrajOpt.jl")
-    Pkg.develop(path="../NamedTrajectories.jl")
-    Pkg.instantiate()
-    using HarmoniqsBenchmarks
-    println("Package loads OK")
-'
-```
-
-Expected: "Package loads OK"
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add Project.toml src/HarmoniqsBenchmarks.jl
-git commit -m "feat: initialize HarmoniqsBenchmarks.jl package skeleton"
-```
-
----
-
-## Task 2: Implement Schema Types
-
-**Files:**
-- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/schema.jl`
-- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/HarmoniqsBenchmarks.jl`
-- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test/runtests.jl`
-
-- [ ] **Step 1: Write tests for schema types**
-
-```julia
-# test/runtests.jl
-using Test
-using HarmoniqsBenchmarks
-using Dates
-
-@testset "HarmoniqsBenchmarks" begin
-
-@testset "Schema" begin
-    @testset "EvalBenchmark construction" begin
-        eb = EvalBenchmark(
-            times_ns = [100.0, 110.0, 105.0],
-            gctimes_ns = [0.0, 0.0, 5.0],
-            memory_bytes = 1024,
-            allocs = 3,
-        )
-        @test eb.median_ns == 105.0
-        @test eb.min_ns == 100.0
-        @test 104.0 < eb.mean_ns < 106.0
-    end
-
-    @testset "BenchmarkResult construction" begin
-        r = BenchmarkResult(
-            package = "DirectTrajOpt",
-            package_version = "0.8.10",
-            commit = "abc1234",
-            benchmark_name = "test_bench",
-            N = 51,
-            state_dim = 4,
-            control_dim = 2,
-            n_constraints = 200,
-            n_variables = 765,
-            wall_time_s = 1.5,
-            iterations = 42,
-            objective_value = 0.001,
-            constraint_violation = 1e-8,
-            solver_status = :Optimal,
-            solver = "ipopt",
-            total_allocations_bytes = 1_000_000,
-            total_allocs_count = 500,
-            gc_time_ns = 10_000,
-            gc_count = 2,
-            gc_full_count = 0,
-            solver_options = Dict{Symbol,Any}(:tol => 1e-8, :max_iter => 1000),
-            julia_version = string(VERSION),
-            timestamp = now(),
-            runner = "local",
-            n_threads = 1,
-        )
-        @test r.package == "DirectTrajOpt"
-        @test r.solver_status == :Optimal
-    end
-
-    @testset "MicroBenchmarkResult construction" begin
-        eb = EvalBenchmark(
-            times_ns = [100.0],
-            gctimes_ns = [0.0],
-            memory_bytes = 0,
-            allocs = 0,
-        )
-        mr = MicroBenchmarkResult(
-            package = "DirectTrajOpt",
-            package_version = "0.8.10",
-            commit = "abc1234",
-            benchmark_name = "micro_test",
-            N = 51,
-            state_dim = 4,
-            control_dim = 2,
-            eval_benchmarks = Dict{Symbol,EvalBenchmark}(
-                :eval_objective => eb,
-            ),
-            julia_version = string(VERSION),
-            timestamp = now(),
-            runner = "local",
-            n_threads = 1,
-        )
-        @test mr.eval_benchmarks[:eval_objective].min_ns == 100.0
-    end
-end
-
-end # HarmoniqsBenchmarks testset
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e 'using Pkg; Pkg.test()'
-```
-
-Expected: FAIL — `EvalBenchmark` not defined
-
-- [ ] **Step 3: Implement schema types**
-
-```julia
-# src/schema.jl
-using Dates
-using Statistics: median, mean
-
-struct EvalBenchmark
-    times_ns::Vector{Float64}
-    gctimes_ns::Vector{Float64}
-    memory_bytes::Int
-    allocs::Int
-    # Derived stats (computed at construction)
-    median_ns::Float64
-    min_ns::Float64
-    mean_ns::Float64
-end
-
-function EvalBenchmark(;
-    times_ns::Vector{Float64},
-    gctimes_ns::Vector{Float64},
-    memory_bytes::Int,
-    allocs::Int,
-)
-    return EvalBenchmark(
-        times_ns,
-        gctimes_ns,
-        memory_bytes,
-        allocs,
-        median(times_ns),
-        minimum(times_ns),
-        mean(times_ns),
-    )
-end
-
-struct BenchmarkResult
-    # Identity
-    package::String
-    package_version::String
-    commit::String
-    benchmark_name::String
-    # Problem dimensions
-    N::Int
-    state_dim::Int
-    control_dim::Int
-    n_constraints::Int
-    n_variables::Int
-    # Solve metrics
-    wall_time_s::Float64
-    iterations::Int
-    objective_value::Float64
-    constraint_violation::Float64
-    solver_status::Symbol
-    solver::String
-    # Memory & allocations
-    total_allocations_bytes::Int
-    total_allocs_count::Int
-    gc_time_ns::Int
-    gc_count::Int
-    gc_full_count::Int
-    # Solver options snapshot
-    solver_options::Dict{Symbol,Any}
-    # Metadata
-    julia_version::String
-    timestamp::DateTime
-    runner::String
-    n_threads::Int
-end
-
-struct MicroBenchmarkResult
-    package::String
-    package_version::String
-    commit::String
-    benchmark_name::String
-    N::Int
-    state_dim::Int
-    control_dim::Int
-    eval_benchmarks::Dict{Symbol,EvalBenchmark}
-    julia_version::String
-    timestamp::DateTime
-    runner::String
-    n_threads::Int
-end
-```
-
-- [ ] **Step 4: Update module to include schema and export types**
-
-```julia
-# src/HarmoniqsBenchmarks.jl
-module HarmoniqsBenchmarks
-
-export EvalBenchmark, BenchmarkResult, MicroBenchmarkResult
-
-include("schema.jl")
-
-end
-```
-
-- [ ] **Step 5: Run tests to verify they pass**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e 'using Pkg; Pkg.test()'
-```
-
-Expected: All tests PASS
-
-- [ ] **Step 6: Commit**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-git add src/schema.jl src/HarmoniqsBenchmarks.jl test/runtests.jl
-git commit -m "feat: add BenchmarkResult, MicroBenchmarkResult, EvalBenchmark schema types"
-```
-
----
-
-## Task 3: Implement JLD2 Storage
-
-**Files:**
-- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/storage.jl`
-- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/HarmoniqsBenchmarks.jl`
-- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test/runtests.jl`
-
-- [ ] **Step 1: Add storage tests**
-
-Append to `test/runtests.jl`, inside the top-level `@testset "HarmoniqsBenchmarks"`:
-
-```julia
-@testset "Storage" begin
-    mktempdir() do dir
-        r = BenchmarkResult(
-            package = "DirectTrajOpt",
-            package_version = "0.8.10",
-            commit = "abc1234",
-            benchmark_name = "storage_test",
-            N = 51, state_dim = 4, control_dim = 2,
-            n_constraints = 200, n_variables = 765,
-            wall_time_s = 1.5, iterations = 42,
-            objective_value = 0.001, constraint_violation = 1e-8,
-            solver_status = :Optimal, solver = "ipopt",
-            total_allocations_bytes = 1_000_000, total_allocs_count = 500,
-            gc_time_ns = 10_000, gc_count = 2, gc_full_count = 0,
-            solver_options = Dict{Symbol,Any}(:tol => 1e-8),
-            julia_version = string(VERSION),
-            timestamp = now(), runner = "local", n_threads = 1,
-        )
-
-        path = save_results(dir, "test_bench", [r])
-        @test isfile(path)
-        @test endswith(path, ".jld2")
-
-        loaded = load_results(path)
-        @test length(loaded) == 1
-        @test loaded[1].package == "DirectTrajOpt"
-        @test loaded[1].wall_time_s == 1.5
-        @test loaded[1].solver_options[:tol] == 1e-8
-    end
-
-    mktempdir() do dir
-        eb = EvalBenchmark(
-            times_ns = [100.0, 110.0],
-            gctimes_ns = [0.0, 0.0],
-            memory_bytes = 512, allocs = 1,
-        )
-        mr = MicroBenchmarkResult(
-            package = "DirectTrajOpt",
-            package_version = "0.8.10",
-            commit = "abc1234",
-            benchmark_name = "micro_storage_test",
-            N = 51, state_dim = 4, control_dim = 2,
-            eval_benchmarks = Dict(:eval_objective => eb),
-            julia_version = string(VERSION),
-            timestamp = now(), runner = "local", n_threads = 1,
-        )
-
-        path = save_micro_results(dir, "micro_test", mr)
-        @test isfile(path)
-
-        loaded = load_micro_results(path)
-        @test loaded.benchmark_name == "micro_storage_test"
-        @test loaded.eval_benchmarks[:eval_objective].min_ns == 100.0
-    end
-end
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e 'using Pkg; Pkg.test()'
-```
-
-Expected: FAIL — `save_results` not defined
-
-- [ ] **Step 3: Implement storage functions**
-
-```julia
-# src/storage.jl
-using JLD2
-
-"""
-    save_results(dir, name, results::Vector{BenchmarkResult}) -> String
-
-Save benchmark results to a JLD2 file in `dir`. Returns the file path.
-"""
-function save_results(dir::String, name::String, results::Vector{BenchmarkResult})
-    mkpath(dir)
-    commit = isempty(results) ? "unknown" : results[1].commit
-    filename = "$(name)_$(commit).jld2"
-    path = joinpath(dir, filename)
-    JLD2.jldsave(path; results=results)
-    return path
-end
-
-"""
-    load_results(path) -> Vector{BenchmarkResult}
-
-Load benchmark results from a JLD2 file.
-"""
-function load_results(path::String)
-    return JLD2.load(path, "results")
-end
-
-"""
-    save_micro_results(dir, name, result::MicroBenchmarkResult) -> String
-
-Save micro-benchmark results to a JLD2 file in `dir`. Returns the file path.
-"""
-function save_micro_results(dir::String, name::String, result::MicroBenchmarkResult)
-    mkpath(dir)
-    filename = "$(name)_$(result.commit).jld2"
-    path = joinpath(dir, filename)
-    JLD2.jldsave(path; result=result)
-    return path
-end
-
-"""
-    load_micro_results(path) -> MicroBenchmarkResult
-
-Load micro-benchmark results from a JLD2 file.
-"""
-function load_micro_results(path::String)
-    return JLD2.load(path, "result")
-end
-```
-
-- [ ] **Step 4: Update module**
-
-```julia
-# src/HarmoniqsBenchmarks.jl
-module HarmoniqsBenchmarks
-
-export EvalBenchmark, BenchmarkResult, MicroBenchmarkResult
-export save_results, load_results, save_micro_results, load_micro_results
-
-include("schema.jl")
-include("storage.jl")
-
-end
-```
-
-- [ ] **Step 5: Run tests to verify they pass**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e 'using Pkg; Pkg.test()'
-```
-
-Expected: All tests PASS
-
-- [ ] **Step 6: Commit**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-git add src/storage.jl src/HarmoniqsBenchmarks.jl test/runtests.jl
-git commit -m "feat: add JLD2 save/load for BenchmarkResult and MicroBenchmarkResult"
-```
-
----
-
-## Task 4: Implement build_evaluator Harness
-
-**Files:**
-- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/harness.jl`
-- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/HarmoniqsBenchmarks.jl`
-- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test/runtests.jl`
-
-- [ ] **Step 1: Add test for build_evaluator**
-
-Append to `test/runtests.jl`, inside top-level testset:
-
-```julia
-@testset "Harness" begin
-    using DirectTrajOpt
-    using NamedTrajectories
-    using SparseArrays
-    using ExponentialAction
-    using MathOptInterface
-    const MOI = MathOptInterface
-
-    # Build a simple bilinear problem (same as DirectTrajOpt test_utils.jl)
-    N = 10; Δt = 0.1; u_bound = 0.1; ω = 0.1
-    Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
-    Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
-    Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
-    G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
-
-    traj = NamedTrajectory(
-        (
-            x = 2rand(4, N) .- 1,
-            u = u_bound * (2rand(2, N) .- 1),
-            du = randn(2, N),
-            ddu = randn(2, N),
-            Δt = fill(Δt, N),
-        );
-        controls = (:ddu, :Δt),
-        timestep = :Δt,
-        bounds = (u = u_bound, Δt = (0.01, 0.5)),
-        initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
-        final = (u = zeros(2),),
-        goal = (x = [0.0, 1.0, 0.0, 0.0],),
-    )
-
-    integrators = [
-        BilinearIntegrator(G, :x, :u, traj),
-        DerivativeIntegrator(:u, :du, traj),
-        DerivativeIntegrator(:du, :ddu, traj),
-    ]
-
-    J = QuadraticRegularizer(:u, traj, 1.0)
-    prob = DirectTrajOptProblem(traj, J, integrators)
-
-    @testset "build_evaluator returns evaluator and Z vector" begin
-        evaluator, Z_vec = build_evaluator(prob)
-        @test evaluator isa MOI.AbstractNLPEvaluator
-        @test length(Z_vec) == traj.dim * traj.N + traj.global_dim
-
-        # Verify eval functions are callable
-        obj = MOI.eval_objective(evaluator, Z_vec)
-        @test obj isa Float64
-        @test isfinite(obj)
-    end
-
-    @testset "evaluator_dims returns correct sizes" begin
-        evaluator, Z_vec = build_evaluator(prob)
-        dims = evaluator_dims(evaluator)
-        @test dims.n_constraints == evaluator.n_constraints
-        @test dims.n_variables == length(Z_vec)
-        @test dims.n_jacobian_entries == length(evaluator.jacobian_structure)
-        @test dims.n_hessian_entries == length(evaluator.hessian_structure)
-    end
-end
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e 'using Pkg; Pkg.test()'
-```
-
-Expected: FAIL — `build_evaluator` not defined
-
-- [ ] **Step 3: Implement build_evaluator and evaluator_dims**
-
-```julia
-# src/harness.jl
-using DirectTrajOpt
-using NamedTrajectories
-using MathOptInterface
-const MOI = MathOptInterface
-
-"""
-    build_evaluator(prob::DirectTrajOptProblem; eval_hessian=true) -> (evaluator, Z_vec)
-
-Extract a MOI evaluator and the initial decision variable vector from a
-DirectTrajOptProblem. Used for micro-benchmarking individual eval functions.
-
-Returns:
-- `evaluator`: An `MOI.AbstractNLPEvaluator` ready for `MOI.eval_*` calls
-- `Z_vec`: The flat decision variable vector `[trajectory_data; global_data]`
-"""
-function build_evaluator(prob::DirectTrajOpt.Problems.DirectTrajOptProblem; eval_hessian::Bool=true)
-    evaluator = DirectTrajOpt.Solvers.Evaluator(prob; eval_hessian=eval_hessian, verbose=false)
-    traj = prob.trajectory
-    Z_vec = vcat(collect(traj.datavec), collect(traj.global_data))
-    return evaluator, Z_vec
-end
-
-"""
-    evaluator_dims(evaluator) -> NamedTuple
-
-Return key dimensions of the evaluator for buffer pre-allocation.
-"""
-function evaluator_dims(evaluator::DirectTrajOpt.Solvers.Evaluator)
-    return (
-        n_constraints = evaluator.n_constraints,
-        n_variables = evaluator.trajectory.dim * evaluator.trajectory.N + evaluator.trajectory.global_dim,
-        n_jacobian_entries = length(evaluator.jacobian_structure),
-        n_hessian_entries = length(evaluator.hessian_structure),
-    )
-end
-```
-
-- [ ] **Step 4: Update module**
-
-```julia
-# src/HarmoniqsBenchmarks.jl
-module HarmoniqsBenchmarks
-
-export EvalBenchmark, BenchmarkResult, MicroBenchmarkResult
-export save_results, load_results, save_micro_results, load_micro_results
-export build_evaluator, evaluator_dims
-
-include("schema.jl")
-include("storage.jl")
-include("harness.jl")
-
-end
-```
-
-- [ ] **Step 5: Run tests to verify they pass**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e 'using Pkg; Pkg.test()'
-```
-
-Expected: All tests PASS
-
-- [ ] **Step 6: Commit**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-git add src/harness.jl src/HarmoniqsBenchmarks.jl test/runtests.jl
-git commit -m "feat: add build_evaluator and evaluator_dims harness functions"
-```
-
----
-
-## Task 5: Implement benchmark_solve! Harness
-
-**Files:**
-- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/harness.jl`
-- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test/runtests.jl`
-
-- [ ] **Step 1: Add test for benchmark_solve!**
-
-Append inside the `@testset "Harness"` block in `test/runtests.jl`:
-
-```julia
-@testset "benchmark_solve! captures metrics" begin
-    # Rebuild a fresh problem (solve! mutates in place)
-    traj2 = NamedTrajectory(
-        (
-            x = 2rand(4, N) .- 1,
-            u = u_bound * (2rand(2, N) .- 1),
-            du = randn(2, N),
-            ddu = randn(2, N),
-            Δt = fill(Δt, N),
-        );
-        controls = (:ddu, :Δt),
-        timestep = :Δt,
-        bounds = (u = u_bound, Δt = (0.01, 0.5)),
-        initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
-        final = (u = zeros(2),),
-        goal = (x = [0.0, 1.0, 0.0, 0.0],),
-    )
-    integrators2 = [
-        BilinearIntegrator(G, :x, :u, traj2),
-        DerivativeIntegrator(:u, :du, traj2),
-        DerivativeIntegrator(:du, :ddu, traj2),
-    ]
-    J2 = QuadraticRegularizer(:u, traj2, 1.0)
-    prob2 = DirectTrajOptProblem(traj2, J2, integrators2)
-
-    result = benchmark_solve!(
-        prob2, IpoptOptions(max_iter=10, print_level=0);
-        benchmark_name = "test_solve",
-    )
-
-    @test result isa BenchmarkResult
-    @test result.package == "DirectTrajOpt"
-    @test result.solver == "ipopt"
-    @test result.wall_time_s > 0.0
-    @test result.iterations >= 0
-    @test result.total_allocations_bytes >= 0
-    @test result.gc_count >= 0
-    @test result.N == N
-    @test result.state_dim == 4
-    @test haskey(result.solver_options, :max_iter)
-    @test result.solver_options[:max_iter] == 10
-end
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e 'using Pkg; Pkg.test()'
-```
-
-Expected: FAIL — `benchmark_solve!` not defined
-
-- [ ] **Step 3: Implement benchmark_solve!**
-
-Append to `src/harness.jl`:
-
-```julia
-using Dates
-
-"""
-    benchmark_solve!(prob, options; benchmark_name, runner="local", kwargs...) -> BenchmarkResult
-
-Run `solve!(prob; options, kwargs...)` and capture timing, memory, GC stats, and solver options.
-"""
-function benchmark_solve!(
-    prob::DirectTrajOpt.Problems.DirectTrajOptProblem,
-    options::DirectTrajOpt.Solvers.AbstractSolverOptions;
-    benchmark_name::String = "unnamed",
-    runner::String = "local",
-    verbose::Bool = false,
-    kwargs...,
-)
-    traj = prob.trajectory
-
-    # Capture problem dimensions before solve
-    n_vars = traj.dim * traj.N + traj.global_dim
-    state_dim = _infer_state_dim(prob)
-    control_dim = _infer_control_dim(prob)
-    n_constraints_total = _count_constraints(prob, options)
-
-    # Snapshot solver options
-    opts_snapshot = Dict{Symbol,Any}()
-    for name in fieldnames(typeof(options))
-        opts_snapshot[name] = getfield(options, name)
-    end
-
-    # GC baseline
-    GC.gc()
-    gc_before = Base.gc_num()
-
-    # Timed solve
-    timed = @timed solve!(prob; options=options, verbose=verbose, kwargs...)
-
-    gc_after = Base.gc_num()
-
-    # Compute GC deltas
-    gc_time = timed.gctime  # in seconds, convert to ns
-    gc_count_delta = gc_after.pause - gc_before.pause
-    gc_full_delta = gc_after.full_sweep - gc_before.full_sweep
-
-    # Package version from Project.toml
-    pkg_version = _get_package_version("DirectTrajOpt")
-    commit = _get_git_commit()
-
-    return BenchmarkResult(
-        package = "DirectTrajOpt",
-        package_version = pkg_version,
-        commit = commit,
-        benchmark_name = benchmark_name,
-        N = traj.N,
-        state_dim = state_dim,
-        control_dim = control_dim,
-        n_constraints = n_constraints_total,
-        n_variables = n_vars,
-        wall_time_s = timed.time,
-        iterations = -1,  # TODO: extract from solver output when available
-        objective_value = NaN,  # TODO: extract from solver
-        constraint_violation = NaN,
-        solver_status = :Unknown,
-        solver = _solver_name(options),
-        total_allocations_bytes = timed.bytes,
-        total_allocs_count = -1,  # @timed doesn't give count; use gc_num delta
-        gc_time_ns = round(Int, timed.gctime * 1e9),
-        gc_count = gc_count_delta,
-        gc_full_count = gc_full_delta,
-        solver_options = opts_snapshot,
-        julia_version = string(VERSION),
-        timestamp = now(),
-        runner = runner,
-        n_threads = Threads.nthreads(),
-    )
-end
-
-# --- helpers ---
-
-function _solver_name(options::DirectTrajOpt.Solvers.AbstractSolverOptions)
-    name = string(typeof(options).name.name)
-    if occursin("Ipopt", name)
-        return "ipopt"
-    elseif occursin("MadNLP", name)
-        return "madnlp"
-    else
-        return lowercase(name)
-    end
-end
-
-function _infer_state_dim(prob)
-    traj = prob.trajectory
-    # Heuristic: look for common state variable names
-    for name in [:x, :ψ̃, :Ũ⃗, :ρ̃]
-        if haskey(traj.dims, name)
-            return traj.dims[name]
-        end
-    end
-    # Fallback: first non-control component
-    return first(values(traj.dims))
-end
-
-function _infer_control_dim(prob)
-    traj = prob.trajectory
-    total = 0
-    for name in traj.control_names
-        if name != traj.timestep_name
-            total += traj.dims[name]
-        end
-    end
-    return total
-end
-
-function _count_constraints(prob, options)
-    n_dynamics = sum(integrator.dim for integrator in prob.integrators; init=0)
-    n_nonlinear = sum(
-        c.dim for c in prob.constraints
-        if c isa DirectTrajOpt.Constraints.AbstractNonlinearConstraint;
-        init=0
-    )
-    return n_dynamics * (prob.trajectory.N - 1) + n_nonlinear
-end
-
-function _get_package_version(pkg_name::String)
-    try
-        deps = Pkg.dependencies()
-        for (_, info) in deps
-            if info.name == pkg_name
-                return string(info.version)
-            end
-        end
-    catch
-    end
-    return "unknown"
-end
-
-function _get_git_commit()
-    try
-        return strip(read(`git rev-parse --short HEAD`, String))
-    catch
-        return "unknown"
-    end
-end
-```
-
-- [ ] **Step 4: Add `Pkg` import to harness.jl**
-
-Add at the top of `src/harness.jl`:
-
-```julia
-import Pkg
-```
-
-- [ ] **Step 5: Update module exports**
-
-In `src/HarmoniqsBenchmarks.jl`, add to exports:
-
-```julia
-export benchmark_solve!
-```
-
-- [ ] **Step 6: Run tests to verify they pass**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e 'using Pkg; Pkg.test()'
-```
-
-Expected: All tests PASS
-
-- [ ] **Step 7: Commit**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-git add src/harness.jl src/HarmoniqsBenchmarks.jl test/runtests.jl
-git commit -m "feat: add benchmark_solve! harness with GC stats and options snapshot"
-```
-
----
-
-## Task 6: Implement BenchmarkTools→EvalBenchmark Conversion
-
-**Files:**
-- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/harness.jl`
-- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test/runtests.jl`
-
-- [ ] **Step 1: Add test for trial_to_eval_benchmark**
-
-Append inside `@testset "Harness"`:
-
-```julia
-@testset "trial_to_eval_benchmark extracts data from BenchmarkTools.Trial" begin
-    using BenchmarkTools
-    trial = @benchmark 1 + 1
-    eb = trial_to_eval_benchmark(trial)
-    @test eb isa EvalBenchmark
-    @test length(eb.times_ns) > 0
-    @test eb.min_ns > 0.0
-    @test eb.memory_bytes >= 0
-    @test eb.allocs >= 0
-end
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e 'using Pkg; Pkg.test()'
-```
-
-Expected: FAIL — `trial_to_eval_benchmark` not defined
-
-- [ ] **Step 3: Implement trial_to_eval_benchmark**
-
-Append to `src/harness.jl`:
-
-```julia
-using BenchmarkTools
-
-"""
-    trial_to_eval_benchmark(trial::BenchmarkTools.Trial) -> EvalBenchmark
-
-Convert a BenchmarkTools.Trial to an EvalBenchmark, extracting raw timing data.
-"""
-function trial_to_eval_benchmark(trial::BenchmarkTools.Trial)
-    return EvalBenchmark(
-        times_ns = Float64.(trial.times),
-        gctimes_ns = Float64.(trial.gctimes),
-        memory_bytes = trial.memory,
-        allocs = trial.allocs,
-    )
-end
-```
-
-- [ ] **Step 4: Export the function**
-
-Add `trial_to_eval_benchmark` to exports in `src/HarmoniqsBenchmarks.jl`.
-
-- [ ] **Step 5: Run tests to verify they pass**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e 'using Pkg; Pkg.test()'
-```
-
-Expected: All tests PASS
-
-- [ ] **Step 6: Commit**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-git add src/harness.jl src/HarmoniqsBenchmarks.jl test/runtests.jl
-git commit -m "feat: add trial_to_eval_benchmark for BenchmarkTools integration"
-```
-
----
-
-## Task 7: Implement compare_results Reporter
-
-**Files:**
-- Create: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/report.jl`
-- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/src/HarmoniqsBenchmarks.jl`
-- Modify: `/home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl/test/runtests.jl`
-
-- [ ] **Step 1: Add test for compare_results**
-
-Append to `test/runtests.jl`, inside top-level testset:
-
-```julia
-@testset "Report" begin
-    @testset "compare_results detects regressions" begin
-        baseline = BenchmarkResult(
-            package="DirectTrajOpt", package_version="0.8.9",
-            commit="aaa1111", benchmark_name="test",
-            N=51, state_dim=4, control_dim=2,
-            n_constraints=200, n_variables=765,
-            wall_time_s=1.0, iterations=50,
-            objective_value=0.001, constraint_violation=1e-8,
-            solver_status=:Optimal, solver="ipopt",
-            total_allocations_bytes=1_000_000, total_allocs_count=500,
-            gc_time_ns=10_000, gc_count=2, gc_full_count=0,
-            solver_options=Dict{Symbol,Any}(),
-            julia_version=string(VERSION), timestamp=now(),
-            runner="local", n_threads=1,
-        )
-
-        # 20% regression in wall time
-        current = BenchmarkResult(
-            package="DirectTrajOpt", package_version="0.8.10",
-            commit="bbb2222", benchmark_name="test",
-            N=51, state_dim=4, control_dim=2,
-            n_constraints=200, n_variables=765,
-            wall_time_s=1.2, iterations=50,
-            objective_value=0.001, constraint_violation=1e-8,
-            solver_status=:Optimal, solver="ipopt",
-            total_allocations_bytes=900_000, total_allocs_count=450,
-            gc_time_ns=10_000, gc_count=2, gc_full_count=0,
-            solver_options=Dict{Symbol,Any}(),
-            julia_version=string(VERSION), timestamp=now(),
-            runner="local", n_threads=1,
-        )
-
-        comparison = compare_results([baseline], [current])
-        @test length(comparison) == 1
-        row = comparison[1]
-        @test row.benchmark_name == "test"
-        @test row.wall_time_pct_change > 15.0  # 20% regression
-        @test row.alloc_bytes_pct_change < 0.0  # 10% improvement
-        @test row.has_regression == true         # wall time regressed >10%
-    end
-end
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e 'using Pkg; Pkg.test()'
-```
-
-Expected: FAIL — `compare_results` not defined
-
-- [ ] **Step 3: Implement compare_results**
-
-```julia
-# src/report.jl
-
-struct ComparisonRow
-    benchmark_name::String
-    solver::String
-    N::Int
-    state_dim::Int
-    # Wall time
-    baseline_wall_s::Float64
-    current_wall_s::Float64
-    wall_time_pct_change::Float64
-    # Allocations
-    baseline_alloc_bytes::Int
-    current_alloc_bytes::Int
-    alloc_bytes_pct_change::Float64
-    # Regression flag
-    has_regression::Bool
-end
-
-"""
-    compare_results(baseline, current; regression_threshold=10.0) -> Vector{ComparisonRow}
-
-Compare two sets of BenchmarkResults by matching on `benchmark_name`.
-Returns comparison rows with percent changes and regression flags.
-
-A regression is flagged when wall_time or allocations increase by more than
-`regression_threshold` percent.
-"""
-function compare_results(
-    baseline::Vector{BenchmarkResult},
-    current::Vector{BenchmarkResult};
-    regression_threshold::Float64 = 10.0,
-)
-    baseline_by_name = Dict(r.benchmark_name => r for r in baseline)
-    rows = ComparisonRow[]
-
-    for r in current
-        b = get(baseline_by_name, r.benchmark_name, nothing)
-        isnothing(b) && continue
-
-        wall_pct = _pct_change(b.wall_time_s, r.wall_time_s)
-        alloc_pct = _pct_change(Float64(b.total_allocations_bytes), Float64(r.total_allocations_bytes))
-        has_regression = wall_pct > regression_threshold || alloc_pct > regression_threshold
-
-        push!(rows, ComparisonRow(
-            r.benchmark_name, r.solver, r.N, r.state_dim,
-            b.wall_time_s, r.wall_time_s, wall_pct,
-            b.total_allocations_bytes, r.total_allocations_bytes, alloc_pct,
-            has_regression,
-        ))
-    end
-
-    return rows
-end
-
-function _pct_change(old::Float64, new::Float64)
-    old == 0.0 && return new == 0.0 ? 0.0 : 100.0
-    return (new - old) / abs(old) * 100.0
-end
-```
-
-- [ ] **Step 4: Update module**
-
-Add exports to `src/HarmoniqsBenchmarks.jl`:
-
-```julia
-export compare_results, ComparisonRow
-```
-
-And add the include:
-
-```julia
-include("report.jl")
-```
-
-- [ ] **Step 5: Run tests to verify they pass**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-julia --project=. -e 'using Pkg; Pkg.test()'
-```
-
-Expected: All tests PASS
-
-- [ ] **Step 6: Commit**
-
-```bash
-cd /home/jack/repos/harmoniqs/HarmoniqsBenchmarks.jl
-git add src/report.jl src/HarmoniqsBenchmarks.jl test/runtests.jl
-git commit -m "feat: add compare_results reporter with regression detection"
-```
-
----
-
-## Task 8: Set Up DirectTrajOpt.jl Benchmark Environment
-
-**Files:**
-- Create: `/home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/Project.toml`
-- Create: `/home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/.gitignore`
-- Create: `/home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/benchmarks.jl`
-
-- [ ] **Step 1: Create benchmark directory**
-
-```bash
-mkdir -p /home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/results
-```
-
-- [ ] **Step 2: Create .gitignore**
-
-```
-# benchmark/.gitignore
-results/
-```
-
-- [ ] **Step 3: Create benchmark/Project.toml**
-
-```toml
-[deps]
-BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
-DirectTrajOpt = "c823fa1f-8872-4af5-b810-2b9b72bbbf56"
-ExponentialAction = "e24c0720-ea99-47e8-929e-571b494574d3"
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
-HarmoniqsBenchmarks = "INSERT_UUID"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6"
-MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
-NamedTrajectories = "538bc3a1-5ab9-4fc3-b776-35ca1e893e08"
-SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
-TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
-TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
-```
-
-Replace `INSERT_UUID` with the UUID generated in Task 1.
-
-- [ ] **Step 4: Instantiate the benchmark environment**
-
-```bash
-cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
-julia --project=benchmark -e '
-    using Pkg
-    Pkg.develop(path=".")
-    Pkg.develop(path="../HarmoniqsBenchmarks.jl")
-    Pkg.develop(path="../NamedTrajectories.jl")
-    Pkg.instantiate()
-    using HarmoniqsBenchmarks
-    println("Benchmark env OK")
-'
-```
-
-Expected: "Benchmark env OK"
-
-- [ ] **Step 5: Create benchmarks.jl stub**
-
-```julia
-# benchmark/benchmarks.jl
-using TestItems
-```
-
-- [ ] **Step 6: Commit**
-
-```bash
-cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
-git add benchmark/Project.toml benchmark/.gitignore benchmark/benchmarks.jl
-git commit -m "feat: add benchmark/ environment for HarmoniqsBenchmarks integration"
-```
-
----
-
-## Task 9: Write Evaluator Micro-benchmarks
-
-**Files:**
-- Modify: `/home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/benchmarks.jl`
-
-- [ ] **Step 1: Write the micro-benchmark @testitem**
-
-```julia
-# benchmark/benchmarks.jl
-using TestItems
-
-@testitem "Evaluator micro-benchmarks: bilinear N=51" begin
-    using HarmoniqsBenchmarks
-    using BenchmarkTools
-    using DirectTrajOpt
-    using NamedTrajectories
-    using SparseArrays
-    using ExponentialAction
-    using MathOptInterface
-    const MOI = MathOptInterface
-    using Dates, Random
-
-    # Build a deterministic bilinear problem
-    Random.seed!(42)
-    N = 51; Δt = 0.1; u_bound = 0.1; ω = 0.1
-    Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
-    Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
-    Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
-    G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
-
-    traj = NamedTrajectory(
-        (
-            x = 2rand(4, N) .- 1,
-            u = u_bound * (2rand(2, N) .- 1),
-            du = randn(2, N),
-            ddu = randn(2, N),
-            Δt = fill(Δt, N),
-        );
-        controls = (:ddu, :Δt),
-        timestep = :Δt,
-        bounds = (u = u_bound, Δt = (0.01, 0.5)),
-        initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
-        final = (u = zeros(2),),
-        goal = (x = [0.0, 1.0, 0.0, 0.0],),
-    )
-
-    integrators = [
-        BilinearIntegrator(G, :x, :u, traj),
-        DerivativeIntegrator(:u, :du, traj),
-        DerivativeIntegrator(:du, :ddu, traj),
-    ]
-    J = QuadraticRegularizer(:u, traj, 1.0) + QuadraticRegularizer(:du, traj, 1.0)
-    prob = DirectTrajOptProblem(traj, J, integrators)
-
-    evaluator, Z_vec = build_evaluator(prob)
-    dims = evaluator_dims(evaluator)
-
-    # Pre-allocate buffers
-    g = zeros(dims.n_constraints)
-    grad = zeros(dims.n_variables)
-    H = zeros(dims.n_hessian_entries)
-    Jac = zeros(dims.n_jacobian_entries)
-    sigma = 1.0
-    mu = ones(dims.n_constraints)
-
-    # Run benchmarks
-    benchmarks = Dict{Symbol,EvalBenchmark}(
-        :eval_objective => trial_to_eval_benchmark(
-            @benchmark(MOI.eval_objective($evaluator, $Z_vec))
-        ),
-        :eval_gradient => trial_to_eval_benchmark(
-            @benchmark(MOI.eval_objective_gradient($evaluator, $grad, $Z_vec))
-        ),
-        :eval_constraint => trial_to_eval_benchmark(
-            @benchmark(MOI.eval_constraint($evaluator, $g, $Z_vec))
-        ),
-        :eval_jacobian => trial_to_eval_benchmark(
-            @benchmark(MOI.eval_constraint_jacobian($evaluator, $Jac, $Z_vec))
-        ),
-        :eval_hessian_lagrangian => trial_to_eval_benchmark(
-            @benchmark(MOI.eval_hessian_lagrangian($evaluator, $H, $Z_vec, $sigma, $mu))
-        ),
-    )
-
-    result = MicroBenchmarkResult(
-        package = "DirectTrajOpt",
-        package_version = "0.8.10",
-        commit = try strip(read(`git rev-parse --short HEAD`, String)) catch; "unknown" end,
-        benchmark_name = "evaluator_micro_bilinear_N51",
-        N = N, state_dim = 4, control_dim = 2,
-        eval_benchmarks = benchmarks,
-        julia_version = string(VERSION),
-        timestamp = now(),
-        runner = get(ENV, "BENCHMARK_RUNNER", "local"),
-        n_threads = Threads.nthreads(),
-    )
-
-    # Print summary
-    using Printf  # Printf is a standard library; `Base.Printf` does not exist on Julia ≥ 1.0
-    println("\n=== Evaluator Micro-benchmarks (bilinear N=$N) ===")
-    for (name, eb) in sort(collect(result.eval_benchmarks), by=first)
-        @printf("  %-25s  median: %8.1f ns  allocs: %d  memory: %d bytes\n",
-            name, eb.median_ns, eb.allocs, eb.memory_bytes)
-    end
-
-    # Save
-    results_dir = joinpath(@__DIR__, "results")
-    save_micro_results(results_dir, result.benchmark_name, result)
-    println("  Saved to $results_dir/")
-end
-```
-
-- [ ] **Step 2: Run the micro-benchmark to verify it works**
-
-```bash
-cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
-julia --project=benchmark -e '
-    using TestItemRunner
-    @run_package_tests(filter=ti -> occursin("micro", ti.name), benchmark)
-'
-```
-
-Expected: Benchmark runs, prints timing table, saves JLD2 to `benchmark/results/`
-
-- [ ] **Step 3: Verify the JLD2 output is loadable**
-
-```bash
-cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
-julia --project=benchmark -e '
-    using HarmoniqsBenchmarks
-    files = filter(f -> endswith(f, ".jld2"), readdir("benchmark/results", join=true))
-    @assert length(files) >= 1 "Expected at least one JLD2 file"
-    result = load_micro_results(files[1])
-    println("Loaded: $(result.benchmark_name)")
-    println("Functions benchmarked: $(keys(result.eval_benchmarks))")
-'
-```
-
-Expected: Loads successfully, shows function names
-
-- [ ] **Step 4: Commit**
-
-```bash
-cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
-git add benchmark/benchmarks.jl
-git commit -m "feat: add evaluator micro-benchmarks with BenchmarkTools"
-```
-
----
-
-## Task 10: Write Ipopt vs MadNLP Macro-benchmarks
-
-**Files:**
-- Modify: `/home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/benchmarks.jl`
-
-- [ ] **Step 1: Append the macro-benchmark @testitem**
-
-Append to `benchmark/benchmarks.jl`:
-
-```julia
-@testitem "Ipopt vs MadNLP: bilinear N=51" begin
-    using HarmoniqsBenchmarks
-    using DirectTrajOpt
-    using NamedTrajectories
-    using SparseArrays
-    using ExponentialAction
-    import MadNLP
-    using Dates, Random
-
-    # Resolve MadNLPOptions from the extension
-    const MadNLPSolverExt = [
-        mod for mod in reverse(Base.loaded_modules_order)
-        if Symbol(mod) == :MadNLPSolverExt
-    ][1]
-
-    function make_bilinear_problem(; seed=42)
-        Random.seed!(seed)
-        N = 51; Δt = 0.1; u_bound = 0.1; ω = 0.1
-        Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
-        Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
-        Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
-        G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
-
-        traj = NamedTrajectory(
-            (
-                x = 2rand(4, N) .- 1,
-                u = u_bound * (2rand(2, N) .- 1),
-                du = randn(2, N),
-                ddu = randn(2, N),
-                Δt = fill(Δt, N),
-            );
-            controls = (:ddu, :Δt),
-            timestep = :Δt,
-            bounds = (u = u_bound, Δt = (0.01, 0.5)),
-            initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
-            final = (u = zeros(2),),
-            goal = (x = [0.0, 1.0, 0.0, 0.0],),
-        )
-
-        integrators = [
-            BilinearIntegrator(G, :x, :u, traj),
-            DerivativeIntegrator(:u, :du, traj),
-            DerivativeIntegrator(:du, :ddu, traj),
-        ]
-        J = QuadraticRegularizer(:u, traj, 1.0) + QuadraticRegularizer(:du, traj, 1.0)
-        return DirectTrajOptProblem(traj, J, integrators)
-    end
-
-    # Ipopt solve
-    prob_ipopt = make_bilinear_problem()
-    result_ipopt = benchmark_solve!(
-        prob_ipopt,
-        IpoptOptions(max_iter=200, print_level=0);
-        benchmark_name = "bilinear_N51_ipopt",
-    )
-
-    # MadNLP solve (fresh problem)
-    prob_madnlp = make_bilinear_problem()
-    result_madnlp = benchmark_solve!(
-        prob_madnlp,
-        MadNLPSolverExt.MadNLPOptions(max_iter=200, print_level=1);
-        benchmark_name = "bilinear_N51_madnlp",
-    )
-
-    # Print comparison
-    println("\n=== Ipopt vs MadNLP: bilinear N=51 ===")
-    println("  Ipopt:  $(round(result_ipopt.wall_time_s, digits=3))s, $(result_ipopt.total_allocations_bytes ÷ 1024) KB alloc")
-    println("  MadNLP: $(round(result_madnlp.wall_time_s, digits=3))s, $(result_madnlp.total_allocations_bytes ÷ 1024) KB alloc")
-
-    # Save
-    results_dir = joinpath(@__DIR__, "results")
-    save_results(results_dir, "ipopt_vs_madnlp_N51", [result_ipopt, result_madnlp])
-end
-```
-
-- [ ] **Step 2: Run the macro-benchmark**
-
-```bash
-cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
-julia --project=benchmark -e '
-    using TestItemRunner
-    @run_package_tests(filter=ti -> occursin("Ipopt vs MadNLP", ti.name), benchmark)
-'
-```
-
-Expected: Both solvers run, prints wall time and allocation comparison
-
-- [ ] **Step 3: Commit**
-
-```bash
-cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
-git add benchmark/benchmarks.jl
-git commit -m "feat: add Ipopt vs MadNLP macro-benchmark"
-```
-
----
-
-## Task 11: Write Memory Scaling Study
-
-**Files:**
-- Modify: `/home/jack/repos/harmoniqs/DirectTrajOpt.jl/benchmark/benchmarks.jl`
-
-- [ ] **Step 1: Append the scaling study @testitem**
-
-Append to `benchmark/benchmarks.jl`:
-
-```julia
-@testitem "Memory scaling: N and state_dim sweep" begin
-    using HarmoniqsBenchmarks
-    using DirectTrajOpt
-    using NamedTrajectories
-    using SparseArrays
-    using ExponentialAction
-    import MadNLP
-    using Dates, Printf, Random
-
-    const MadNLPSolverExt = [
-        mod for mod in reverse(Base.loaded_modules_order)
-        if Symbol(mod) == :MadNLPSolverExt
-    ][1]
-
-    function make_scaled_problem(; N, state_dim, n_controls=2, seed=42)
-        Random.seed!(seed)
-
-        # Build random bilinear system at given state dimension
-        G_drift = sparse(randn(state_dim, state_dim))
-        G_drives = [sparse(randn(state_dim, state_dim)) for _ in 1:n_controls]
-        G(u) = G_drift + sum(u[i] * G_drives[i] for i in 1:n_controls)
-
-        x_init = zeros(state_dim); x_init[1] = 1.0
-        x_goal = zeros(state_dim); x_goal[2] = 1.0
-
-        traj = NamedTrajectory(
-            (
-                x = randn(state_dim, N),
-                u = 0.1 * randn(n_controls, N),
-                du = randn(n_controls, N),
-                Δt = fill(0.1, N),
-            );
-            controls = (:du, :Δt),
-            timestep = :Δt,
-            bounds = (u = 1.0, Δt = (0.01, 0.5)),
-            initial = (x = x_init, u = zeros(n_controls)),
-            final = (u = zeros(n_controls),),
-            goal = (x = x_goal,),
-        )
-
-        integrators = [
-            BilinearIntegrator(G, :x, :u, traj),
-            DerivativeIntegrator(:u, :du, traj),
-        ]
-        J = QuadraticRegularizer(:u, traj, 1.0)
-        return DirectTrajOptProblem(traj, J, integrators)
-    end
-
-    N_values = [25, 51, 101]
-    dim_values = [4, 8, 16]
-    results = BenchmarkResult[]
-
-    println("\n=== Memory Scaling Study ===")
-    @printf("  %5s | %5s | %12s | %12s | %12s | %12s\n",
-        "N", "dim", "Ipopt (s)", "Ipopt (KB)", "MadNLP (s)", "MadNLP (KB)")
-    @printf("  %5s-+-%5s-+-%12s-+-%12s-+-%12s-+-%12s\n",
-        "-"^5, "-"^5, "-"^12, "-"^12, "-"^12, "-"^12)
-
-    for N in N_values
-        for dim in dim_values
-            # Ipopt
-            prob = make_scaled_problem(; N=N, state_dim=dim)
-            r_ipopt = benchmark_solve!(
-                prob, IpoptOptions(max_iter=50, print_level=0);
-                benchmark_name = "scaling_N$(N)_d$(dim)_ipopt",
-            )
-            push!(results, r_ipopt)
-
-            # MadNLP
-            prob = make_scaled_problem(; N=N, state_dim=dim)
-            r_madnlp = benchmark_solve!(
-                prob, MadNLPSolverExt.MadNLPOptions(max_iter=50, print_level=1);
-                benchmark_name = "scaling_N$(N)_d$(dim)_madnlp",
-            )
-            push!(results, r_madnlp)
-
-            @printf("  %5d | %5d | %12.3f | %12d | %12.3f | %12d\n",
-                N, dim,
-                r_ipopt.wall_time_s, r_ipopt.total_allocations_bytes ÷ 1024,
-                r_madnlp.wall_time_s, r_madnlp.total_allocations_bytes ÷ 1024)
-        end
-    end
-
-    # Save all results
-    results_dir = joinpath(@__DIR__, "results")
-    save_results(results_dir, "memory_scaling", results)
-    println("\n  Saved $(length(results)) results to $results_dir/")
-end
-```
-
-- [ ] **Step 2: Run the scaling study**
-
-```bash
-cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
-julia --project=benchmark -e '
-    using TestItemRunner
-    @run_package_tests(filter=ti -> occursin("Memory scaling", ti.name), benchmark)
-'
-```
-
-Expected: Table printed with wall times and allocations for each (N, dim) combination
-
-- [ ] **Step 3: Commit**
-
-```bash
-cd /home/jack/repos/harmoniqs/DirectTrajOpt.jl
-git add benchmark/benchmarks.jl
-git commit -m "feat: add memory scaling study benchmark (N x state_dim sweep)"
-```
-
----
-
-## Verification Checklist
-
-After all tasks are complete:
-
-- [ ] `cd HarmoniqsBenchmarks.jl && julia --project=. -e 'using Pkg; Pkg.test()'` — all tests pass
-- [ ] `cd DirectTrajOpt.jl && julia --project=benchmark -e 'using TestItemRunner; @run_package_tests(benchmark)'` — all three benchmark @testitems run
-- [ ] `ls DirectTrajOpt.jl/benchmark/results/` — contains `.jld2` files for each benchmark
-- [ ] Load and compare results:
-  ```julia
-  using HarmoniqsBenchmarks
-  results = load_results("benchmark/results/ipopt_vs_madnlp_N51_<sha>.jld2")
-  println("Ipopt: $(results[1].wall_time_s)s, MadNLP: $(results[2].wall_time_s)s")
-  ```
-
----
-
-## Follow-up Plans (Not in Scope)
-
-- **Piccolissimo benchmark suite** — migrate existing `benchmark/complex_vs_real_ode.jl` and `constraint_comparison.jl` to use HarmoniqsBenchmarks schema
-- **Demo-repo problem generators** — clone bosonic-demo, nv-center-demo, atoms-demo, ions, fluxonium-demo, gkp-stanford and extract system Hamiltonians
-- **CI workflows** — `.github/workflows/benchmark.yml` for DirectTrajOpt and other packages
-- **Allocation profiling spike** — parallel worktree experiments with Profile.Allocs, AllocCheck.jl, --track-allocation
-- **Aggregator repo** — `harmoniqs-benchmarks` with cross-package comparison tables
diff --git a/docs/superpowers/specs/2026-04-15-altissimo-gpu-benchmarks-design.md b/docs/superpowers/specs/2026-04-15-altissimo-gpu-benchmarks-design.md
deleted file mode 100644
index 50b7959..0000000
--- a/docs/superpowers/specs/2026-04-15-altissimo-gpu-benchmarks-design.md
+++ /dev/null
@@ -1,198 +0,0 @@
-# Altissimo GPU Benchmark Suite — Design
-
-**Date:** 2026-04-15
-**Status:** Design (follow-up to HarmoniqsBenchmarks.jl core plan)
-**Depends on:** HarmoniqsBenchmarks.jl (schema, harness, storage)
-**Reference:** `gpu_benchmark.py` (Colab notebook from Raghav, T4 results)
-
-## Context
-
-Altissimo.jl is a GPU-accelerated augmented Lagrangian optimizer for quantum trajectory optimization. It uses matrix-free JVP/VJP callbacks, making it GPU-compatible where Ipopt (which requires sparse Jacobians/Hessians) is CPU-only. Raghav demonstrated 4.5x GPU speedup at 1024 state dim on a T4. This benchmark suite formalizes those measurements and tracks them across versions.
-
-Three benchmark categories, matching the existing Colab notebook structure (Benchmark 0 below adds a fourth, a three-way solver comparison):
-
-1. **Ipopt vs Altissimo (CPU)** — real quantum gate optimization
-2. **Altissimo CPU vs GPU scaling** — structured optimization at increasing state dim
-3. **cuDensityMat vs cuSPARSE** — Liouvillian operator action for open-system trajectory optimization
-
-## Benchmark 0: Three-Way Solver Comparison (Ipopt vs MadNLP-GPU vs Altissimo-GPU)
-
-The harmoniqs org maintains a MadNLP.jl fork with `MadNLPGPU` (in `lib/MadNLPGPU/`), which uses CUDSS for GPU-accelerated sparse KKT system solves. This enables a three-way comparison at increasing problem sizes:
-
-| Solver | Method | Linear Algebra | GPU? |
-|--------|--------|---------------|------|
-| Ipopt | Interior-point | MUMPS/Pardiso (sparse, CPU) | No |
-| MadNLP + MadNLPGPU | Interior-point | CUDSS (sparse, GPU) | Yes |
-| Altissimo | Augmented Lagrangian | Matrix-free JVP/VJP (GPU) | Yes |
-
-**Hypothesis:** At small state dims (sd < 256), Ipopt wins due to mature sparse factorization. At medium dims (256-1024), MadNLP-GPU may win due to GPU-accelerated CUDSS. At large dims (1024+), Altissimo wins due to matrix-free scaling (no sparse assembly).
-
-**Problem:** Same quantum-control-structured problem as Benchmark 2 below, swept across sd ∈ {64, 128, 256, 512, 1024, 2048}. For MadNLP-GPU, the problem requires Jacobian/Hessian sparsity (MOI interface), so it uses the same evaluator as Ipopt but with GPU-side linear solves.
-
-**Dependencies:**
-- `MadNLPGPU` from `harmoniqs/MadNLP.jl` (lib/MadNLPGPU)
-- `CUDA.jl` + `CUDSS.jl` for GPU linear algebra
-- DirectTrajOpt MadNLP extension for evaluator hookup
-
-**Metrics:** Wall time, iterations, convergence quality, total allocations, GPU memory usage, speedup vs Ipopt baseline.
-
-**Note:** MadNLP-GPU requires the KKT system to fit in GPU memory. For very large problems, the sparse Jacobian/Hessian may exceed VRAM, which is exactly where Altissimo's matrix-free approach has the advantage.
-
----
-
-## Benchmark 1: Ipopt vs Altissimo (CPU) — Quantum Gate Optimization
-
-Directly comparable: same X gate problem, same initial conditions, both on CPU.
-
-**Problem setup** (from Colab Part 2):
-- System: 1 qubit, H_drift = 0.5 σ_z, drives = [σ_x, σ_y], bounds = [1.0, 1.0]
-- Gate: X gate, T=10.0, N=100
-- Integrator: HermitianExponentialIntegrator
-- Template: SmoothPulseProblem(Q=100.0, R=1e-2, ddu_bound=1.0, Δt_bounds=(0.05, 0.15))
-- Deep copy for identical initial conditions
-
-**Metrics:**
-- Wall time (s)
-- Fidelity (infidelity = 1 - fidelity)
-- Total allocations (bytes)
-- GC time
-
-**Altissimo configuration** (reference values):
-```julia
-AltissimoOptions(
-    search_direction = :LBFGS,
-    lbfgs_memory = 50,
-    line_search = :StrongWolfe,
-    ls_max_evals = 100,
-    max_outer_iter = 20,
-    max_inner_iter = 500,
-    inner_tol = 1e-8,
-    ρ_init = 100.0,
-    ρ_max = 1e8,
-    polish = true,
-    polish_stall_min_iters = 10,
-    polish_δ_w = 1e-6,
-    polish_δ_c = 1e-8,
-)
-```
-
-**Integration with HarmoniqsBenchmarks:** Both produce `BenchmarkResult` with `solver="ipopt"` / `solver="altissimo"`. The `solver_options` field captures the full AltissimoOptions snapshot.
-
-## Benchmark 2: Altissimo CPU vs GPU Scaling
-
-The core scaling benchmark. Uses a quantum-control-structured problem (NOT a real quantum system) to isolate solver scaling behavior from physics complexity.
-
-**Problem structure** (from Colab Part 3):
-- Decision vector: z = [x_1; ...; x_N; u_1; ...; u_{N-1}]
-- Dynamics: x_{k+1} = Φ(u_k) x_k, where Φ(u) = A + Σⱼ uⱼ Cⱼ
-- A is orthogonal (norm-preserving, like unitary evolution)
-- Coupling scaled: ‖Cⱼ‖_spectral ≈ 0.4 independent of state_dim (σ_c = 0.2/√sd)
-- Target generated by forward simulation with known controls → guaranteed feasible
-- Objective: ½|x_N - x_target|² + (α/2) Σ|u_k|²
-- All callbacks GPU-native: cuBLAS matvec, broadcast, dot (no scalar indexing)
-
-**Sweep configurations** (from Colab):
-
-| state_dim | n_drives | N  | n_vars   | n_eq     |
-|-----------|----------|----|----------|----------|
-| 512       | 2        | 20 | 10,278   | 10,240   |
-| 1024      | 2        | 20 | 20,518   | 20,480   |
-| 2048      | 2        | 20 | 40,998   | 40,960   |
-| 4096      | 2        | 20 | 81,958   | 81,920   |
-
-**Metrics per (state_dim, device) pair:**
-- Wall time (s) — after JIT warmup
-- Objective value at convergence
-- Constraint violation ‖c‖
-- Converged (bool)
-- GPU speedup = CPU_time / GPU_time
-
-**Key implementation details:**
-- JIT warmup run before timed run
-- `CUDA.synchronize()` before and after timed run for accurate GPU timing
-- `build_callbacks()` returns obj!, grad!, hvp!, eq!, eq_jvp!, eq_vjp!
-- Optimizer: `Altissimo.LBFGS` with `Altissimo.StrongWolfe` line search
-- `initialize_z!` does forward propagation with u=0 for feasible init
-
-**Schema extension:** Add to `BenchmarkResult`:
-- `device::String` — "cpu" or "gpu"
-- `gpu_name::String` — e.g. "Tesla T4", "A100" (from `CUDA.name(CUDA.device())`)
-- `gpu_memory_bytes::Int` — VRAM (from `CUDA.totalmem`)
-
-OR: encode these in `solver_options` dict to avoid schema changes:
-```julia
-solver_options[:device] = "gpu"
-solver_options[:gpu_name] = CUDA.name(CUDA.device())
-solver_options[:gpu_memory_bytes] = CUDA.totalmem(CUDA.device())
-```
-
-Recommended: use `solver_options` dict to avoid breaking the schema for CPU-only packages.
-
-## Benchmark 3: cuDensityMat vs cuSPARSE — Liouvillian Operator
-
-This measures the fundamental operation for open-system trajectory optimization: applying a Liouvillian superoperator to a density matrix.
-
-**System:** M coupled cavities with Fock truncation d=3, Hilbert space D = 3^M.
-- Hamiltonian: H(t) = Σᵢ δᵢ(t) aᵢ†aᵢ + Σᵢ Kᵢ aᵢ†aᵢ†aᵢaᵢ + Σ⟨i,j⟩ gᵢⱼ(t)(aᵢ†aⱼ + h.c.)
-- Collapse operators: √κ aᵢ (photon loss)
-- Liouvillian: L = -i(H⊗I - I⊗Hᵀ) + Σₖ (Cₖ⊗Cₖ* - ½(Cₖ†Cₖ⊗I + I⊗Cₖᵀ Cₖ*))
-
-**Sweep:**
-
-| M | D    | ρ elements (D²) | cuDensityMat | cuSPARSE | Dense CPU |
-|---|------|-----------------|-------------|----------|-----------|
-| 2 | 9    | 81              | 0.27 ms     | 0.039 ms | 0.003 ms  |
-| 4 | 81   | 6,561           | 1.22 ms     | 0.048 ms | 31.8 ms   |
-| 6 | 729  | 531,441         | 6.45 ms     | 0.90 ms  | infeasible|
-| 8 | 6561 | 43,046,721      | 620 ms      | infeasible| infeasible|
-
-**Batched evolution** (trajectory optimization workload):
-
-| M | D  | Batch | Batched   | Sequential | Speedup |
-|---|----|-------|-----------|------------|---------|
-| 2 | 9  | 256   | 0.38 ms   | 70.1 ms    | 186x    |
-| 4 | 81 | 256   | 8.05 ms   | 280.7 ms   | 35x     |
-
-**Key insight:** cuSPARSE beats cuDensityMat for M ≤ 6 (tensor-network contraction overhead at small D). cuDensityMat wins at M=8+ where sparse Liouvillian can't be materialized (~50-70 GB). Batched evolution is critical for trajectory optimization (35-186x speedup).
-
-**Integration note:** This benchmark depends on CuQuantum.jl (harmoniqs org). The cuDensityMat portion requires the NVIDIA cuQuantum SDK and should run exclusively on EC2 GPU runners.
-
-## CI Runner Requirements
-
-| Benchmark | Runner | GPU Required |
-|-----------|--------|-------------|
-| Ipopt vs Altissimo (CPU) | `ubuntu-latest` (free) | No |
-| 3-way solver (Ipopt/MadNLP-GPU/Altissimo) | `[self-hosted, gpu]` (EC2) | Yes (T4 minimum, CUDSS for MadNLP) |
-| Altissimo CPU vs GPU scaling | `[self-hosted, gpu]` (EC2) | Yes (T4 minimum) |
-| cuSPARSE / cuDensityMat | `[self-hosted, gpu]` (EC2) | Yes (A100 recommended for M=8) |
-
-## Where Benchmarks Live
-
-- **Benchmark 1** (Ipopt vs Altissimo CPU): In `Piccolissimo.jl/benchmark/` since it uses `SmoothPulseProblem` + `HermitianExponentialIntegrator`
-- **Benchmark 2** (GPU scaling): In `Altissimo.jl/benchmark/` since it's Altissimo-specific with CUDA callbacks
-- **Benchmark 3** (Liouvillian): In `CuQuantum.jl/benchmark/` or `Piccolissimo.jl/benchmark/` (TBD based on where cuDensityMat integration lands)
-
-All use `HarmoniqsBenchmarks.jl` schema for consistent artifact format.
-
-## Adaptation for HarmoniqsBenchmarks Schema
-
-The Colab notebook uses ad-hoc timing (`@elapsed`, `CUDA.@elapsed`). To integrate with HarmoniqsBenchmarks:
-
-**Benchmark 2 adaptation:**
-- Wrap `run_one()` to return a `BenchmarkResult` instead of a NamedTuple
-- Add `solver_options` dict with Altissimo config + device info
-- Replace the ad-hoc timing calls with `@timed` for allocation tracking
-- Save JLD2 artifacts instead of printing tables
-
-**Benchmark 3 adaptation:**
-- Create a `LiouvillianBenchmarkResult` (or use a new `MicroBenchmarkResult` variant)
-- Key fields: M, D, D², nnz(L), method (:cusparse, :cudensitymat, :cpu_dense), time_ms, memory_bytes
-- Batched results include batch_size and sequential/batched comparison
-
-## Implementation Notes
-
-- The `apply_Phi!` / `apply_Phi_t!` pattern from the notebook should be extracted into Altissimo's callback builder, not reimplemented in benchmarks
-- `CUDA.synchronize()` is critical for accurate GPU timing — always call before starting and after stopping the timer
-- JIT warmup run is mandatory — first Julia/CUDA execution compiles kernels
-- Memory estimation before large allocations: check `CUDA.totalmem()` and skip if would exceed 80% VRAM
-- The coupling scaling fix (σ_c = 0.2/√sd) is essential for well-conditioned problems at large state dim — without it, ‖C‖ ~ 0.1√sd makes convergence erratic
diff --git a/docs/superpowers/specs/2026-04-15-benchmarking-design.md b/docs/superpowers/specs/2026-04-15-benchmarking-design.md
deleted file mode 100644
index bb9f943..0000000
--- a/docs/superpowers/specs/2026-04-15-benchmarking-design.md
+++ /dev/null
@@ -1,383 +0,0 @@
-# HarmoniqsBenchmarks.jl — Cross-Package Benchmarking Infrastructure
-
-**Date:** 2026-04-15
-**Status:** Design
-
-## Context
-
-The harmoniqs quantum optimal control stack (DirectTrajOpt, Piccolo, Piccolissimo, Altissimo, Intonato) needs a unified benchmarking system to:
-
-- Compare Ipopt vs MadNLP solver performance on the DirectTrajOpt `feat/madnlp-integration` branch
-- Collect statistically robust histograms of key evaluator functions (eval_hessian_lagrangian, eval_constraint_jacobian, etc.) for regression detection
-- Profile memory usage and allocations in MadNLP and across all packages, understanding how memory scales with knot points (N), state dimension, and control dimension
-- Track allocations in the optimization hot path to drive them toward zero
-- Publish version-tagged JLD2 artifacts so labs and enterprises can evaluate problem-size scaling
-
-This is driven by all three active workstreams needing memory/performance benchmarks (MadNLP integration, Altissimo GPU scaling at 1024 state dim, Intonato convergence tracking).
-
-## Architecture
-
-**Approach:** Shared `HarmoniqsBenchmarks.jl` package + per-package `benchmark/` directories + central aggregator repo.
-
-- `HarmoniqsBenchmarks.jl` — lightweight Julia package (own repo in harmoniqs org) providing schema, profiling harness, problem generators, and reporters
-- Each downstream package (DirectTrajOpt, Piccolo, Piccolissimo, Altissimo, Intonato) has a `benchmark/` directory with `@testitem`-based benchmarks using `HarmoniqsBenchmarks`
-- Central `harmoniqs-benchmarks` repo aggregates artifacts and generates cross-package comparison tables
-- Artifacts are JLD2 files stored in CI (GitHub Actions artifact upload), not a live dashboard
-
-## Schema
-
-### BenchmarkResult
-
-```julia
-struct BenchmarkResult
-    # Identity
-    package::String               # "DirectTrajOpt", "Piccolissimo", etc.
-    package_version::String       # semver tag
-    commit::String                # short SHA
-    benchmark_name::String        # "cz_gate_ipopt", "madnlp_scaling_N101_d16"
-
-    # Problem dimensions
-    N::Int                        # knot points
-    state_dim::Int                # state vector dimension
-    control_dim::Int              # number of controls
-    n_constraints::Int            # total nonlinear constraints
-    n_variables::Int              # total NLP variables
-
-    # Solve metrics
-    wall_time_s::Float64
-    iterations::Int
-    objective_value::Float64
-    constraint_violation::Float64
-    solver_status::Symbol         # :Optimal, :MaxIter, :Infeasible
-    solver::String                # "ipopt", "madnlp", "altissimo"
-
-    # Memory & allocations
-    total_allocations_bytes::Int
-    total_allocs_count::Int       # number of allocation events
-    peak_memory_bytes::Int
-
-    # GC stats
-    gc_time_ns::Int
-    gc_count::Int
-    gc_full_count::Int
-
-    # Solver options snapshot
-    solver_options::Dict{Symbol,Any}
-
-    # Metadata
-    julia_version::String
-    timestamp::DateTime
-    runner::String                # "github-actions", "ec2-gpu", "local"
-    n_threads::Int
-end
-```
-
-### MicroBenchmarkResult
-
-```julia
-struct MicroBenchmarkResult
-    # Identity (same as above)
-    package::String
-    package_version::String
-    commit::String
-    benchmark_name::String
-
-    # Problem dimensions
-    N::Int
-    state_dim::Int
-    control_dim::Int
-
-    # Per-function BenchmarkTools results
-    # Each value is a serialized BenchmarkTools.Trial containing:
-    #   times (ns), gctimes (ns), memory (bytes), allocs (count)
-    eval_benchmarks::Dict{Symbol, Any}
-    # Keys: :eval_objective, :eval_gradient, :eval_constraint,
-    #        :eval_jacobian, :eval_hessian_lagrangian
-
-    # Metadata
-    julia_version::String
-    timestamp::DateTime
-    runner::String
-    n_threads::Int
-end
-```
-
-## Benchmarking Layers
-
-### Layer 1: Micro-benchmarks (Eval Function Histograms)
-
-Use `BenchmarkTools.@benchmark` on individual MOI evaluator methods. This gives statistically robust distributions with proper warmup, plus allocation counts per call.
-
-```julia
-@testitem "Evaluator micro-benchmarks: CZ N=51" begin
-    using HarmoniqsBenchmarks, BenchmarkTools, Piccolissimo, Piccolo
-
-    prob = build_cz_problem(N=51)
-    evaluator, Z_vec = build_evaluator(prob)
-
-    # Pre-allocate output buffers
-    g = zeros(n_constraints(evaluator))
-    grad = zeros(n_variables(evaluator))
-    H = zeros(n_hessian_entries(evaluator))
-    J = zeros(n_jacobian_entries(evaluator))
-    sigma = 1.0
-    mu = ones(n_constraints(evaluator))
-
-    benchmarks = Dict(
-        :eval_objective          => @benchmark(MOI.eval_objective($evaluator, $Z_vec)),
-        :eval_gradient           => @benchmark(MOI.eval_objective_gradient($evaluator, $grad, $Z_vec)),
-        :eval_constraint         => @benchmark(MOI.eval_constraint($evaluator, $g, $Z_vec)),
-        :eval_jacobian           => @benchmark(MOI.eval_constraint_jacobian($evaluator, $J, $Z_vec)),
-        :eval_hessian_lagrangian => @benchmark(MOI.eval_hessian_lagrangian($evaluator, $H, $Z_vec, $sigma, $mu)),
-    )
-
-    save_micro_results("cz_N51_ipopt", benchmarks; prob)
-end
-```
-
-**Regression detection:** Compare median times and allocation counts across versions. A >10% regression in any eval function on the same problem size flags for review.
-
-### Layer 2: Macro-benchmarks (Full Solves)
-
-Use `@timed` for wall clock + total allocations on `solve!`. Full optimization is not repeatable in the BenchmarkTools sense (each call modifies the problem), so we capture single-run metrics.
-
-```julia
-@testitem "CZ gate Ipopt vs MadNLP" begin
-    using HarmoniqsBenchmarks, Piccolissimo, Piccolo
-
-    prob = build_cz_problem(N=51)
-    result_ipopt = benchmark_solve!(prob, IpoptOptions())
-
-    prob = build_cz_problem(N=51)  # fresh problem
-    result_madnlp = benchmark_solve!(prob, MadNLPOptions())
-
-    save_results("cz_gate_comparison", [result_ipopt, result_madnlp])
-end
-```
-
-### Layer 3: Scaling Studies
-
-Parameterized sweeps over problem dimensions to characterize memory and time growth.
-
-```julia
-@testitem "MadNLP memory scaling" begin
-    using HarmoniqsBenchmarks, Piccolissimo, Piccolo
-
-    results = BenchmarkResult[]
-    for N in [25, 51, 101, 201, 401]
-        for state_dim in [4, 8, 16, 32, 64]
-            prob = build_bilinear_problem(; N, state_dim, n_controls=2)
-            r = benchmark_solve!(prob, MadNLPOptions())
-            push!(results, r)
-        end
-    end
-    save_results("madnlp_memory_scaling", results)
-end
-```
-
-### Layer 4: Allocation Profiling
-
-Tools for tracking down and eliminating allocations in the optimization hot path.
-
-**Profile.Allocs** — captures per-allocation stack traces during a solve:
-```julia
-@testitem "Allocation profile: CZ solve" begin
-    using HarmoniqsBenchmarks, Profile, Piccolissimo, Piccolo
-
-    prob = build_cz_problem(N=51)
-    Profile.Allocs.clear()
-    Profile.Allocs.@profile sample_rate=1.0 solve!(prob)
-    alloc_results = Profile.Allocs.fetch()
-
-    save_alloc_profile("cz_N51_alloc_profile", alloc_results)
-    # Visualize locally: using PProf; PProf.Allocs.pprof(alloc_results)
-end
-```
-
-**AllocCheck.jl** — compile-time zero-allocation enforcement for evaluator hot paths. Can be added as an optional CI check:
-```julia
-@testitem "Zero-allocation check: evaluator methods" begin
-    using AllocCheck, DirectTrajOpt
-
-    # These should be allocation-free once optimized
-    @check_allocs MOI.eval_constraint(ev::Evaluator, g::Vector{Float64}, Z::Vector{Float64})
-    @check_allocs MOI.eval_constraint_jacobian(ev::Evaluator, J::Vector{Float64}, Z::Vector{Float64})
-    @check_allocs MOI.eval_hessian_lagrangian(ev::Evaluator, H::Vector{Float64}, Z::Vector{Float64}, s::Float64, m::Vector{Float64})
-end
-```
-
-**Per-line tracking** (local development, not CI):
-```bash
-julia --track-allocation=user --project=benchmark benchmark/benchmarks.jl
-# Generates .mem files with per-line allocation counts
-```
-
-**Implementation note:** The best allocation profiling approach for the evaluator hot path is TBD. During implementation, spike all three approaches (`Profile.Allocs`, `AllocCheck.jl`, `--track-allocation`) in parallel worktrees against a representative problem (e.g. CZ N=51) to determine which gives the most actionable results for tracking down and eliminating allocations in the MOI eval methods.
-
-## Problem Generators
-
-Deterministic, parameterized problem constructors for reproducibility.
-
-### DirectTrajOpt level
-- `build_bilinear_problem(; N=51, state_dim=4, n_controls=2, seed=42)` — random Hermitian system matrices, bilinear integrator + quadratic regularizer
-- `build_constrained_problem(; N=51, state_dim=4, n_nonlinear=3, seed=42)` — adds nonlinear knot-point constraints
-
-### Piccolo/Piccolissimo level
-- `build_cz_problem(; N=51, integrator=:hermitian_exp)` — 2-qubit CZ gate, exchange-only system (4-level), matches spin-qubit-demo
-- `build_cnot_problem(; N=101, integrator=:hermitian_exp)` — 2-qubit CNOT with 3 EDSR drives
-- `build_transmon_problem(; levels=3, N=51)` — single-qubit X gate on multi-level transmon
-
-### Altissimo level
-- `build_polish_problem(; N=51, state_dim=4)` — pre-solved Ipopt problem ready for Altissimo refinement
-- `build_gpu_scaling_problem(; state_dim=1024)` — large-state-dim problem for GPU benchmarking
-
-### Intonato level
-- `build_qilc_problem(; N=101, n_paulis=15, J_mismatch=1.3)` — QILC calibration loop with simulated experiment, matches spin-qubit-demo pattern
-
-### Demo-repo-derived problems
-
-The harmoniqs org has several hardware-platform demo repos that provide real-world benchmark problems. During implementation, clone and extract representative problem configurations from:
-
-| Repo | Platform | Typical Dimensions | Key Benchmark |
-|------|----------|-------------------|---------------|
-| `spin-qubit-demo` | Silicon spin qubits | N=51-101, 4-level, 1-3 drives | CZ, CNOT, QILC calibration |
-| `bosonic-demo` | Bosonic cavity QED | Higher Hilbert space dims | Cavity control |
-| `nv-center-demo` | NV centers | Spin-1 + nuclear spins | Dark matter sensing pulses |
-| `atoms-demo` | Neutral atoms | Rydberg levels | Multi-qubit gates |
-| `ions` | Trapped ions | Motional modes + qubits | MS gate, individual addressing |
-| `fluxonium-demo` | Fluxonium qubits | Multi-level transmon-like | Single-qubit gates |
-| `gkp-stanford` | GKP states | Bosonic Fock space | State preparation |
-
-These provide the "enterprise-scale" problem suite that demonstrates what problem sizes each solver can handle. Extract the system Hamiltonians and problem parameters from each demo, wrap them as generators in `HarmoniqsBenchmarks.problems/`.
-
-All generators use `Random.seed!(seed)` for determinism.
-
-## Harness Functions
-
-### build_evaluator(prob) -> (evaluator, Z_vec)
-
-Extracts the MOI evaluator and initial decision variable vector from a `DirectTrajOptProblem`. Used for micro-benchmarks so individual eval functions can be called directly.
-
-### benchmark_solve!(prob, options; kwargs...) -> BenchmarkResult
-
-```julia
-function benchmark_solve!(prob, options; kwargs...)
-    GC.gc()
-    gc_before = Base.gc_num()
-
-    timed = @timed solve!(prob; options, kwargs...)
-
-    gc_after = Base.gc_num()
-
-    return BenchmarkResult(
-        # ... populate from prob metadata, timed, gc delta, options snapshot
-    )
-end
-```
-
-### save_results(name, results) / save_micro_results(name, benchmarks)
-
-Write JLD2 to `benchmark/results/<name>_<commit_sha>.jld2`.
-
-### compare_results(baseline_path, current_path) -> ComparisonTable
-
-Load two result sets and produce a diff table with percent changes, flagging regressions.
-
-## CI Workflow
-
-### Per-package: `.github/workflows/benchmark.yml`
-
-```yaml
-name: Benchmarks
-on:
-  push:
-    tags: ['v*']
-  workflow_dispatch:
-    inputs:
-      baseline_tag:
-        description: 'Tag to compare against'
-        required: false
-
-jobs:
-  benchmark:
-    runs-on: ubuntu-latest   # free for OSS
-    steps:
-      - uses: actions/checkout@v4
-      - uses: julia-actions/setup-julia@v2
-        with:
-          version: '1.11'
-      - name: Instantiate benchmark env
-        run: julia --project=benchmark -e 'using Pkg; Pkg.instantiate()'
-      - name: Run benchmarks
-        run: julia --project=benchmark -t auto -e '
-          using TestItemRunner
-          @run_package_tests(benchmark)
-        '
-      - uses: actions/upload-artifact@v4
-        with:
-          name: benchmark-${{ github.ref_name }}-${{ github.sha }}
-          path: benchmark/results/
-          retention-days: 365
-
-  # GPU/large-scale benchmarks (Altissimo, large N)
-  benchmark-gpu:
-    if: contains(github.repository, 'Altissimo') || github.event_name == 'workflow_dispatch'
-    runs-on: [self-hosted, gpu]   # EC2 runners from CuQuantum.jl setup
-    steps:
-      # same as above but with CUDA-enabled Julia
-```
-
-### Central aggregator: `harmoniqs-benchmarks` repo
-
-Triggered by workflow_dispatch or cron. Downloads latest artifacts from each package repo, generates comparison tables, stores historical archive.
-
-## Package Structure
-
-```
-HarmoniqsBenchmarks.jl/
-  src/
-    HarmoniqsBenchmarks.jl       # module + exports
-    schema.jl                     # BenchmarkResult, MicroBenchmarkResult
-    harness.jl                    # benchmark_solve!, build_evaluator
-    storage.jl                    # save/load JLD2, save_alloc_profile
-    report.jl                     # compare_results, regression detection
-    problems/
-      bilinear.jl                 # DirectTrajOpt-level generators
-      quantum_gates.jl            # Piccolo/Piccolissimo-level generators
-      polish.jl                   # Altissimo-level generators
-      qilc.jl                     # Intonato-level generators
-  Project.toml                    # deps: BenchmarkTools, JLD2, Dates
-  README.md
-
-# Per downstream package:
-DirectTrajOpt.jl/
-  benchmark/
-    Project.toml                  # [deps] HarmoniqsBenchmarks, BenchmarkTools, TestItems, ...
-    benchmarks.jl                 # @testitems: micro, macro, scaling
-    results/                      # .gitignored JLD2 output
-```
-
-## Verification
-
-1. **Unit test the harness:** `benchmark_solve!` returns a valid `BenchmarkResult` with all fields populated
-2. **Run micro-benchmarks locally:** Confirm BenchmarkTools produces histograms for each eval function
-3. **Run scaling sweep:** Verify memory grows as expected with N and state_dim
-4. **CI dry run:** Trigger workflow_dispatch on DirectTrajOpt, confirm artifact upload
-5. **Cross-package comparison:** Run aggregator on two package artifacts, verify comparison table output
-6. **Allocation profiling:** Run Profile.Allocs on a solve, verify PProf flamegraph renders
-
-## Scope
-
-**In scope (this design):**
-- HarmoniqsBenchmarks.jl package creation
-- DirectTrajOpt benchmark suite (Ipopt vs MadNLP, scaling, micro-benchmarks, allocation profiling)
-- Piccolissimo benchmark suite (integrate existing benchmarks + new scaling)
-- CI workflows for DirectTrajOpt and Piccolissimo
-- Aggregator script in harmoniqs-benchmarks repo
-
-**Future work:**
-- Altissimo GPU benchmarks (requires CUDA runner validation)
-- Intonato convergence benchmarks (requires stable Phase 5)
-- Piccolo template benchmarks
-- AllocCheck CI gates (after hot paths are optimized)
-- Automated regression comments on PRs

From ba121d399f0e830545b867d1b6d615ac034f52bb Mon Sep 17 00:00:00 2001
From: Jack Champagne <jackchampagne.r@gmail.com>
Date: Thu, 16 Apr 2026 02:33:54 -0400
Subject: [PATCH 07/13] fix: exclude benchmark/ testitems from test suite

The benchmark @testitems require HarmoniqsBenchmarks, which is only
available in the benchmark/ project environment, not in the test extras.
Filter them out so `Pkg.test()` / julia-runtest CI doesn't pick them up.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 test/runtests.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 9f95075..d57ccc9 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,5 +2,5 @@ using DirectTrajOpt
 using TestItemRunner
 
 
-# Run all testitem tests in package
-@run_package_tests
+# Exclude benchmark/ testitems — those run in a separate project environment
+@run_package_tests filter=ti -> !contains(ti.filename, "benchmark")

From 35ddb07257026f8b103689de9ae60fcf14946d1a Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Fri, 17 Apr 2026 16:52:31 +0000
Subject: [PATCH 08/13] chore: autoformat

---
 benchmark/benchmarks.jl | 217 +++++++++++++++++++++++++++++++---------
 test/compare_solvers.jl |  18 ++--
 2 files changed, 174 insertions(+), 61 deletions(-)

diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
index a7ecc6f..28e6ee4 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/benchmarks.jl
@@ -6,19 +6,35 @@ using TestItems
     const MOI = MathOptInterface
 
     Random.seed!(42)
-    N = 51; Δt = 0.1; u_bound = 0.1; ω = 0.1
+    N = 51;
+    Δt = 0.1;
+    u_bound = 0.1;
+    ω = 0.1
     Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
     Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
     Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
     G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
 
     traj = NamedTrajectory(
-        (x=2rand(4,N).-1, u=u_bound*(2rand(2,N).-1), du=randn(2,N), ddu=randn(2,N), Δt=fill(Δt,N));
-        controls=(:ddu,:Δt), timestep=:Δt, bounds=(u=u_bound, Δt=(0.01,0.5)),
-        initial=(x=[1.0,0.0,0.0,0.0], u=zeros(2)), final=(u=zeros(2),),
-        goal=(x=[0.0,1.0,0.0,0.0],),
+        (
+            x = 2rand(4, N) .- 1,
+            u = u_bound*(2rand(2, N) .- 1),
+            du = randn(2, N),
+            ddu = randn(2, N),
+            Δt = fill(Δt, N),
+        );
+        controls = (:ddu, :Δt),
+        timestep = :Δt,
+        bounds = (u = u_bound, Δt = (0.01, 0.5)),
+        initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
+        final = (u = zeros(2),),
+        goal = (x = [0.0, 1.0, 0.0, 0.0],),
     )
-    integrators = [BilinearIntegrator(G,:x,:u,traj), DerivativeIntegrator(:u,:du,traj), DerivativeIntegrator(:du,:ddu,traj)]
+    integrators = [
+        BilinearIntegrator(G, :x, :u, traj),
+        DerivativeIntegrator(:u, :du, traj),
+        DerivativeIntegrator(:du, :ddu, traj),
+    ]
     J = QuadraticRegularizer(:u, traj, 1.0) + QuadraticRegularizer(:du, traj, 1.0)
     prob = DirectTrajOptProblem(traj, J, integrators)
 
@@ -33,24 +49,52 @@ using TestItems
     mu = ones(dims.n_constraints)
 
     benchmarks = Dict{Symbol,EvalBenchmark}(
-        :eval_objective => trial_to_eval_benchmark(@benchmark(MOI.eval_objective($evaluator, $Z_vec))),
-        :eval_gradient => trial_to_eval_benchmark(@benchmark(MOI.eval_objective_gradient($evaluator, $grad, $Z_vec))),
-        :eval_constraint => trial_to_eval_benchmark(@benchmark(MOI.eval_constraint($evaluator, $g, $Z_vec))),
-        :eval_jacobian => trial_to_eval_benchmark(@benchmark(MOI.eval_constraint_jacobian($evaluator, $Jac, $Z_vec))),
-        :eval_hessian_lagrangian => trial_to_eval_benchmark(@benchmark(MOI.eval_hessian_lagrangian($evaluator, $H, $Z_vec, $sigma, $mu))),
+        :eval_objective =>
+            trial_to_eval_benchmark(@benchmark(MOI.eval_objective($evaluator, $Z_vec))),
+        :eval_gradient => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_objective_gradient($evaluator, $grad, $Z_vec))
+        ),
+        :eval_constraint => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_constraint($evaluator, $g, $Z_vec))
+        ),
+        :eval_jacobian => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_constraint_jacobian($evaluator, $Jac, $Z_vec))
+        ),
+        :eval_hessian_lagrangian => trial_to_eval_benchmark(
+            @benchmark(MOI.eval_hessian_lagrangian($evaluator, $H, $Z_vec, $sigma, $mu))
+        ),
     )
 
     result = MicroBenchmarkResult(
-        package="DirectTrajOpt", package_version="0.8.10",
-        commit=(try String(strip(read(`git rev-parse --short HEAD`, String))) catch; "unknown" end),
-        benchmark_name="evaluator_micro_bilinear_N51", N=N, state_dim=4, control_dim=2,
-        eval_benchmarks=benchmarks, julia_version=string(VERSION),
-        timestamp=Dates.now(), runner=get(ENV, "BENCHMARK_RUNNER", "local"), n_threads=Threads.nthreads(),
+        package = "DirectTrajOpt",
+        package_version = "0.8.10",
+        commit = (
+            try
+                String(strip(read(`git rev-parse --short HEAD`, String)))
+            catch
+                ; "unknown"
+            end
+        ),
+        benchmark_name = "evaluator_micro_bilinear_N51",
+        N = N,
+        state_dim = 4,
+        control_dim = 2,
+        eval_benchmarks = benchmarks,
+        julia_version = string(VERSION),
+        timestamp = Dates.now(),
+        runner = get(ENV, "BENCHMARK_RUNNER", "local"),
+        n_threads = Threads.nthreads(),
     )
 
     println("\n=== Evaluator Micro-benchmarks (bilinear N=$N) ===")
-    for (name, eb) in sort(collect(result.eval_benchmarks), by=first)
-        @printf("  %-25s  median: %8.1f ns  allocs: %d  memory: %d bytes\n", name, eb.median_ns, eb.allocs, eb.memory_bytes)
+    for (name, eb) in sort(collect(result.eval_benchmarks), by = first)
+        @printf(
+            "  %-25s  median: %8.1f ns  allocs: %d  memory: %d bytes\n",
+            name,
+            eb.median_ns,
+            eb.allocs,
+            eb.memory_bytes
+        )
     end
 
     results_dir = joinpath(@__DIR__, "results")
@@ -63,36 +107,66 @@ end
     using SparseArrays, ExponentialAction, Random, Dates
     import MadNLP
 
-    const MadNLPSolverExt = [mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt][1]
+    const MadNLPSolverExt = [
+        mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt
+    ][1]
 
-    function make_bilinear_problem(; seed=42)
+    function make_bilinear_problem(; seed = 42)
         Random.seed!(seed)
-        N = 51; Δt = 0.1; u_bound = 0.1; ω = 0.1
+        N = 51;
+        Δt = 0.1;
+        u_bound = 0.1;
+        ω = 0.1
         Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
         Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
         Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
         G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
 
         traj = NamedTrajectory(
-            (x=2rand(4,N).-1, u=u_bound*(2rand(2,N).-1), du=randn(2,N), ddu=randn(2,N), Δt=fill(Δt,N));
-            controls=(:ddu,:Δt), timestep=:Δt, bounds=(u=u_bound, Δt=(0.01,0.5)),
-            initial=(x=[1.0,0.0,0.0,0.0], u=zeros(2)), final=(u=zeros(2),),
-            goal=(x=[0.0,1.0,0.0,0.0],),
+            (
+                x = 2rand(4, N) .- 1,
+                u = u_bound*(2rand(2, N) .- 1),
+                du = randn(2, N),
+                ddu = randn(2, N),
+                Δt = fill(Δt, N),
+            );
+            controls = (:ddu, :Δt),
+            timestep = :Δt,
+            bounds = (u = u_bound, Δt = (0.01, 0.5)),
+            initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
+            final = (u = zeros(2),),
+            goal = (x = [0.0, 1.0, 0.0, 0.0],),
         )
-        integrators = [BilinearIntegrator(G,:x,:u,traj), DerivativeIntegrator(:u,:du,traj), DerivativeIntegrator(:du,:ddu,traj)]
+        integrators = [
+            BilinearIntegrator(G, :x, :u, traj),
+            DerivativeIntegrator(:u, :du, traj),
+            DerivativeIntegrator(:du, :ddu, traj),
+        ]
         J = QuadraticRegularizer(:u, traj, 1.0) + QuadraticRegularizer(:du, traj, 1.0)
         return DirectTrajOptProblem(traj, J, integrators)
     end
 
     prob_ipopt = make_bilinear_problem()
-    result_ipopt = benchmark_solve!(prob_ipopt, IpoptOptions(max_iter=200, print_level=0); benchmark_name="bilinear_N51_ipopt")
+    result_ipopt = benchmark_solve!(
+        prob_ipopt,
+        IpoptOptions(max_iter = 200, print_level = 0);
+        benchmark_name = "bilinear_N51_ipopt",
+    )
 
     prob_madnlp = make_bilinear_problem()
-    result_madnlp = benchmark_solve!(prob_madnlp, MadNLPSolverExt.MadNLPOptions(max_iter=200, print_level=1); benchmark_name="bilinear_N51_madnlp")
+    result_madnlp = benchmark_solve!(
+        prob_madnlp,
+        MadNLPSolverExt.MadNLPOptions(max_iter = 200, print_level = 1);
+        benchmark_name = "bilinear_N51_madnlp",
+    )
 
     println("\n=== Ipopt vs MadNLP: bilinear N=51 ===")
-    println("  Ipopt:  $(round(result_ipopt.wall_time_s, digits=3))s, $(result_ipopt.total_allocations_bytes ÷ 1024) KB alloc")
-    println("  MadNLP: $(round(result_madnlp.wall_time_s, digits=3))s, $(result_madnlp.total_allocations_bytes ÷ 1024) KB alloc")
+    println(
+        "  Ipopt:  $(round(result_ipopt.wall_time_s, digits=3))s, $(result_ipopt.total_allocations_bytes ÷ 1024) KB alloc",
+    )
+    println(
+        "  MadNLP: $(round(result_madnlp.wall_time_s, digits=3))s, $(result_madnlp.total_allocations_bytes ÷ 1024) KB alloc",
+    )
 
     results_dir = joinpath(@__DIR__, "results")
     save_results(results_dir, "ipopt_vs_madnlp_N51", [result_ipopt, result_madnlp])
@@ -103,24 +177,37 @@ end
     using SparseArrays, ExponentialAction, Random, Dates, Printf
     import MadNLP
 
-    const MadNLPSolverExt = [mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt][1]
+    const MadNLPSolverExt = [
+        mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt
+    ][1]
 
-    function make_scaled_problem(; N, state_dim, n_controls=2, seed=42)
+    function make_scaled_problem(; N, state_dim, n_controls = 2, seed = 42)
         Random.seed!(seed)
         G_drift = sparse(randn(state_dim, state_dim))
-        G_drives = [sparse(randn(state_dim, state_dim)) for _ in 1:n_controls]
-        G(u) = G_drift + sum(u[i] * G_drives[i] for i in 1:n_controls)
+        G_drives = [sparse(randn(state_dim, state_dim)) for _ = 1:n_controls]
+        G(u) = G_drift + sum(u[i] * G_drives[i] for i = 1:n_controls)
 
-        x_init = zeros(state_dim); x_init[1] = 1.0
-        x_goal = zeros(state_dim); x_goal[min(2,state_dim)] = 1.0
+        x_init = zeros(state_dim);
+        x_init[1] = 1.0
+        x_goal = zeros(state_dim);
+        x_goal[min(2, state_dim)] = 1.0
 
         traj = NamedTrajectory(
-            (x=randn(state_dim,N), u=0.1*randn(n_controls,N), du=randn(n_controls,N), Δt=fill(0.1,N));
-            controls=(:du,:Δt), timestep=:Δt, bounds=(u=1.0, Δt=(0.01,0.5)),
-            initial=(x=x_init, u=zeros(n_controls)), final=(u=zeros(n_controls),),
-            goal=(x=x_goal,),
+            (
+                x = randn(state_dim, N),
+                u = 0.1*randn(n_controls, N),
+                du = randn(n_controls, N),
+                Δt = fill(0.1, N),
+            );
+            controls = (:du, :Δt),
+            timestep = :Δt,
+            bounds = (u = 1.0, Δt = (0.01, 0.5)),
+            initial = (x = x_init, u = zeros(n_controls)),
+            final = (u = zeros(n_controls),),
+            goal = (x = x_goal,),
         )
-        integrators = [BilinearIntegrator(G,:x,:u,traj), DerivativeIntegrator(:u,:du,traj)]
+        integrators =
+            [BilinearIntegrator(G, :x, :u, traj), DerivativeIntegrator(:u, :du, traj)]
         J = QuadraticRegularizer(:u, traj, 1.0)
         return DirectTrajOptProblem(traj, J, integrators)
     end
@@ -130,22 +217,52 @@ end
     results = BenchmarkResult[]
 
     println("\n=== Memory Scaling Study ===")
-    @printf("  %5s | %5s | %12s | %12s | %12s | %12s\n", "N", "dim", "Ipopt (s)", "Ipopt (KB)", "MadNLP (s)", "MadNLP (KB)")
-    @printf("  %5s-+-%5s-+-%12s-+-%12s-+-%12s-+-%12s\n", "-"^5, "-"^5, "-"^12, "-"^12, "-"^12, "-"^12)
+    @printf(
+        "  %5s | %5s | %12s | %12s | %12s | %12s\n",
+        "N",
+        "dim",
+        "Ipopt (s)",
+        "Ipopt (KB)",
+        "MadNLP (s)",
+        "MadNLP (KB)"
+    )
+    @printf(
+        "  %5s-+-%5s-+-%12s-+-%12s-+-%12s-+-%12s\n",
+        "-"^5,
+        "-"^5,
+        "-"^12,
+        "-"^12,
+        "-"^12,
+        "-"^12
+    )
 
     for N in N_values
         for dim in dim_values
-            prob = make_scaled_problem(; N=N, state_dim=dim)
-            r_ipopt = benchmark_solve!(prob, IpoptOptions(max_iter=50, print_level=0); benchmark_name="scaling_N$(N)_d$(dim)_ipopt")
+            prob = make_scaled_problem(; N = N, state_dim = dim)
+            r_ipopt = benchmark_solve!(
+                prob,
+                IpoptOptions(max_iter = 50, print_level = 0);
+                benchmark_name = "scaling_N$(N)_d$(dim)_ipopt",
+            )
             push!(results, r_ipopt)
 
-            prob = make_scaled_problem(; N=N, state_dim=dim)
-            r_madnlp = benchmark_solve!(prob, MadNLPSolverExt.MadNLPOptions(max_iter=50, print_level=1); benchmark_name="scaling_N$(N)_d$(dim)_madnlp")
+            prob = make_scaled_problem(; N = N, state_dim = dim)
+            r_madnlp = benchmark_solve!(
+                prob,
+                MadNLPSolverExt.MadNLPOptions(max_iter = 50, print_level = 1);
+                benchmark_name = "scaling_N$(N)_d$(dim)_madnlp",
+            )
             push!(results, r_madnlp)
 
-            @printf("  %5d | %5d | %12.3f | %12d | %12.3f | %12d\n",
-                N, dim, r_ipopt.wall_time_s, r_ipopt.total_allocations_bytes ÷ 1024,
-                r_madnlp.wall_time_s, r_madnlp.total_allocations_bytes ÷ 1024)
+            @printf(
+                "  %5d | %5d | %12.3f | %12d | %12.3f | %12d\n",
+                N,
+                dim,
+                r_ipopt.wall_time_s,
+                r_ipopt.total_allocations_bytes ÷ 1024,
+                r_madnlp.wall_time_s,
+                r_madnlp.total_allocations_bytes ÷ 1024
+            )
         end
     end
 
diff --git a/test/compare_solvers.jl b/test/compare_solvers.jl
index 77ac9a1..7dca12c 100644
--- a/test/compare_solvers.jl
+++ b/test/compare_solvers.jl
@@ -7,16 +7,12 @@ using SparseArrays
 using NamedTrajectories
 using DirectTrajOpt
 
-const MadNLPSolverExt = [mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt][1]
-
-function get_seeded_trajectory(seed;
-    N = 10,
-    Δt = 0.1,
-    u_bound = 0.1,
-    ω = 0.1,
-)
+const MadNLPSolverExt =
+    [mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt][1]
+
+function get_seeded_trajectory(seed; N = 10, Δt = 0.1, u_bound = 0.1, ω = 0.1)
     Random.seed!(seed)
-    
+
     Gx = sparse(Float64[
         0 0 0 1;
         0 0 1 0;
@@ -59,7 +55,7 @@ function get_seeded_trajectory(seed;
         );
         controls = (:ddu, :Δt),
         timestep = :Δt,
-        bounds = (u = (-u_bound, u_bound), Δt = (1., 1.)), # timestep variability is a major source of error as in the "multiple comparisons problem" so we make them constant here
+        bounds = (u = (-u_bound, u_bound), Δt = (1.0, 1.0)), # timestep variability is a major source of error as in the "multiple comparisons problem" so we make them constant here
         initial = (x = x_init, u = zeros(2)),
         final = (u = zeros(2),),
         goal = (x = x_goal,),
@@ -144,7 +140,7 @@ function get_solver_comparison(seed)
     return err, (ti, tm)
 end
 
-wins = Dict(:ipopt => 0, :madnlp => 0,)
+wins = Dict(:ipopt => 0, :madnlp => 0)
 for seed = 0:99
     err, (ti, tm) = get_solver_comparison(seed)
     (err < 1e-3) || exit(1)

From 1bf0416d7b421de69edf549d91a6adae1f74c031 Mon Sep 17 00:00:00 2001
From: Jack Champagne <jack@harmoniqs.co>
Date: Thu, 23 Apr 2026 08:55:49 -0400
Subject: [PATCH 09/13] Add MadNLP pass-through fields (linear_solver,
 array_type, etc.)

Restores the pass-through fields consumed by MadNLP's MOI layer
(`linear_solver`, `array_type`, `kkt_system`, `cudss_ordering`) so users
can select CUDSSSolver, CuArray, KKT variants, and cuDSS ordering through
the `MadNLPOptions` struct (required for MadNLPGPU/cuDSS-on-GPU flows).
`set_options!` now skips any field left as `nothing`, so MadNLP's own
defaults stand.
---
 ext/MadNLPSolverExt/solver.jl        | 7 ++++++-
 src/solvers/madnlp_solver/options.jl | 7 +++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/ext/MadNLPSolverExt/solver.jl b/ext/MadNLPSolverExt/solver.jl
index 514d29a..ca60e53 100644
--- a/ext/MadNLPSolverExt/solver.jl
+++ b/ext/MadNLPSolverExt/solver.jl
@@ -208,7 +208,12 @@ function DirectTrajOpt.set_options!(optimizer::AbstractOptimizer, options::MadNL
         if name in ignored_options
             continue
         end
-        # TODO: allow internal defaults, i.e. do not set the internal options dict unless the user actually specified the associated opt
+        # `nothing` means "use MadNLP's own default" — don't overwrite the optimizer's
+        # internal dict in that case. Applies to the pass-through fields
+        # (linear_solver, array_type, kkt_system, cudss_ordering).
+        if value === nothing
+            continue
+        end
         if name == :print_level
             optimizer.options[name] = MadNLP.LogLevels(value)
         elseif name == :hessian_approximation
diff --git a/src/solvers/madnlp_solver/options.jl b/src/solvers/madnlp_solver/options.jl
index 9a6fb73..53c10ae 100644
--- a/src/solvers/madnlp_solver/options.jl
+++ b/src/solvers/madnlp_solver/options.jl
@@ -7,6 +7,13 @@ export MadNLPOptions
     print_level::Int = 3 # (MadNLP.TRACE::MadNLP.LogLevels = 1, ..., MadNLP.ERROR::MadNLP.LogLevels = 6)
     hessian_approximation::String = "exact" # (exact = MadNLP.ExactHessian, compact_lbfgs = MadNLP.CompactLBFGS) # no other QN methods supported in conjunction with MadNLP.SparseCallback
 
+    # Pass-throughs consumed by MadNLP's MOI layer (not by MadNLP itself);
+    # leave as `nothing` to use MadNLP defaults. Only forwarded when non-nothing.
+    linear_solver::Any  = nothing  # e.g. MadNLPGPU.CUDSSSolver, MadNLP.LapackCPUSolver
+    array_type::Any     = nothing  # e.g. CUDA.CuArray for GPU
+    kkt_system::Any     = nothing  # e.g. MadNLP.SparseUnreducedKKTSystem
+    cudss_ordering::Any = nothing  # e.g. MadNLPGPU.AMD_ORDERING
+
     # # Only supported by DirectTrajOpt._solve, as an optional kwarg override of `hessian_approximation`;
     # #   `hessian_approximation = eval_hessian ? "exact" : "compact_lbfgs"`
     # eval_hessian::Bool = true

From 9239b1a1b2834c0ac69cb2e97f90f39dd78b3eb2 Mon Sep 17 00:00:00 2001
From: Jack Champagne <jack@harmoniqs.co>
Date: Thu, 23 Apr 2026 08:58:35 -0400
Subject: [PATCH 10/13] Format MadNLPOptions per JuliaFormatter

---
 src/solvers/madnlp_solver/options.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/solvers/madnlp_solver/options.jl b/src/solvers/madnlp_solver/options.jl
index 53c10ae..6d3a382 100644
--- a/src/solvers/madnlp_solver/options.jl
+++ b/src/solvers/madnlp_solver/options.jl
@@ -9,9 +9,9 @@ export MadNLPOptions
 
     # Pass-throughs consumed by MadNLP's MOI layer (not by MadNLP itself);
     # leave as `nothing` to use MadNLP defaults. Only forwarded when non-nothing.
-    linear_solver::Any  = nothing  # e.g. MadNLPGPU.CUDSSSolver, MadNLP.LapackCPUSolver
-    array_type::Any     = nothing  # e.g. CUDA.CuArray for GPU
-    kkt_system::Any     = nothing  # e.g. MadNLP.SparseUnreducedKKTSystem
+    linear_solver::Any = nothing  # e.g. MadNLPGPU.CUDSSSolver, MadNLP.LapackCPUSolver
+    array_type::Any = nothing  # e.g. CUDA.CuArray for GPU
+    kkt_system::Any = nothing  # e.g. MadNLP.SparseUnreducedKKTSystem
     cudss_ordering::Any = nothing  # e.g. MadNLPGPU.AMD_ORDERING
 
     # # Only supported by DirectTrajOpt._solve, as an optional kwarg override of `hessian_approximation`;

From 6b50bc76b4229c7261a6e798960ed1a87bb6e1d6 Mon Sep 17 00:00:00 2001
From: Jack Champagne <jackchampagne.r@gmail.com>
Date: Fri, 24 Apr 2026 16:46:20 -0400
Subject: [PATCH 11/13] chore: bump version to 0.8.11

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index ad27bd3..fe86d12 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "DirectTrajOpt"
 uuid = "c823fa1f-8872-4af5-b810-2b9b72bbbf56"
-version = "0.8.10"
+version = "0.8.11"
 authors = ["Aaron Trowbridge <aaron.j.trowbridge@gmail.com> and contributors"]
 
 [deps]

From 091c7b24bdae191a1f3a4893f520c06e3693f1d7 Mon Sep 17 00:00:00 2001
From: Jack Champagne <jackchampagne.r@gmail.com>
Date: Fri, 24 Apr 2026 16:47:43 -0400
Subject: [PATCH 12/13] Revert "chore: bump version to 0.8.11"

This reverts commit 6b50bc76b4229c7261a6e798960ed1a87bb6e1d6.
---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index fe86d12..ad27bd3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "DirectTrajOpt"
 uuid = "c823fa1f-8872-4af5-b810-2b9b72bbbf56"
-version = "0.8.11"
+version = "0.8.10"
 authors = ["Aaron Trowbridge <aaron.j.trowbridge@gmail.com> and contributors"]
 
 [deps]

From 83da0761265257f600586c693f11a9810c77b378 Mon Sep 17 00:00:00 2001
From: Jack Champagne <jackchampagne.r@gmail.com>
Date: Fri, 24 Apr 2026 16:53:14 -0400
Subject: [PATCH 13/13] Revert "Merge pull request #67 from
 harmoniqs/benchmarks/directtrajopt-initial"

This reverts commit 19fa68ad630ccf30141aee73ac4aba2a9207dd1a, reversing
changes made to 9239b1a1b2834c0ac69cb2e97f90f39dd78b3eb2.
---
 .github/workflows/benchmark.yml |  50 ------
 Project.toml                    |   2 +-
 benchmark/.gitignore            |   2 -
 benchmark/BenchmarkUtils.jl     |   1 +
 benchmark/Project.toml          |  20 ---
 benchmark/README.md             |  33 ----
 benchmark/benchmarks.jl         | 272 --------------------------------
 test/compare_solvers.jl         |  16 +-
 test/runtests.jl                |   4 +-
 9 files changed, 12 insertions(+), 388 deletions(-)
 delete mode 100644 .github/workflows/benchmark.yml
 delete mode 100644 benchmark/.gitignore
 create mode 100644 benchmark/BenchmarkUtils.jl
 delete mode 100644 benchmark/Project.toml
 delete mode 100644 benchmark/README.md
 delete mode 100644 benchmark/benchmarks.jl

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
deleted file mode 100644
index 1f20d76..0000000
--- a/.github/workflows/benchmark.yml
+++ /dev/null
@@ -1,50 +0,0 @@
-name: Benchmarks
-on:
-  push:
-    tags: ['v*']
-  pull_request:
-    paths:
-      - 'src/**'
-      - 'benchmark/**'
-      - '.github/workflows/benchmark.yml'
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
-
-jobs:
-  benchmark:
-    name: Benchmark suite
-    runs-on: ubuntu-latest
-    timeout-minutes: 60
-    permissions:
-      actions: write
-      contents: read
-    steps:
-      - uses: actions/checkout@v6
-
-      - uses: julia-actions/setup-julia@v2
-        with:
-          version: '1.11'
-          arch: x64
-
-      - uses: julia-actions/cache@v2
-
-      - name: Instantiate benchmark environment
-        run: julia --project=benchmark -e 'using Pkg; Pkg.instantiate()'
-
-      - name: Run benchmarks
-        run: |
-          julia --project=benchmark -t auto -e '
-            using TestItemRunner
-            TestItemRunner.run_tests("benchmark/")
-          '
-
-      - name: Upload benchmark artifacts
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: benchmark-${{ github.event.pull_request.number || github.ref_name }}-${{ github.sha }}
-          path: benchmark/results/
-          retention-days: 90
diff --git a/Project.toml b/Project.toml
index ee36b38..796bc0e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "DirectTrajOpt"
 uuid = "c823fa1f-8872-4af5-b810-2b9b72bbbf56"
-version = "0.9.1"
+version = "0.9.0"
 authors = ["Aaron Trowbridge <aaron.j.trowbridge@gmail.com> and contributors"]
 
 [deps]
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
deleted file mode 100644
index ca28c11..0000000
--- a/benchmark/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-results/
-Manifest.toml
diff --git a/benchmark/BenchmarkUtils.jl b/benchmark/BenchmarkUtils.jl
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/benchmark/BenchmarkUtils.jl
@@ -0,0 +1 @@
+
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
deleted file mode 100644
index b219215..0000000
--- a/benchmark/Project.toml
+++ /dev/null
@@ -1,20 +0,0 @@
-[deps]
-BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
-Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
-DirectTrajOpt = "c823fa1f-8872-4af5-b810-2b9b72bbbf56"
-ExponentialAction = "e24c0720-ea99-47e8-929e-571b494574d3"
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
-HarmoniqsBenchmarks = "f45d0b76-2d23-4568-9599-481e0da131db"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6"
-MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
-NamedTrajectories = "538bc3a1-5ab9-4fc3-b776-35ca1e893e08"
-Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
-Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
-TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
-TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
-
-[sources]
-DirectTrajOpt = {path = ".."}
-HarmoniqsBenchmarks = {url = "https://github.com/harmoniqs/HarmoniqsBenchmarks.jl"}
diff --git a/benchmark/README.md b/benchmark/README.md
deleted file mode 100644
index c0737c9..0000000
--- a/benchmark/README.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# DirectTrajOpt Benchmarks
-
-Benchmark suite for DirectTrajOpt.jl comparing Ipopt and MadNLP solver performance.
-
-## Running locally
-
-```bash
-# From DirectTrajOpt.jl root
-julia --project=benchmark -e 'using Pkg; Pkg.instantiate()'
-
-julia --project=benchmark -t auto -e '
-    using TestItemRunner
-    TestItemRunner.run_tests("benchmark/")
-'
-```
-
-Artifacts are saved as JLD2 files in `benchmark/results/` (gitignored).
-
-## Benchmark suites
-
-- **Evaluator micro-benchmarks** — `BenchmarkTools.@benchmark` timings for each MOI eval function (objective, gradient, constraint, jacobian, hessian_lagrangian) on bilinear N=51
-- **Ipopt vs MadNLP** — full solve comparison on bilinear N=51
-- **Memory scaling study** — N ∈ {25, 51, 101} × state_dim ∈ {4, 8, 16}
-
-## Schema
-
-Results use `BenchmarkResult` / `MicroBenchmarkResult` from [HarmoniqsBenchmarks.jl](https://github.com/harmoniqs/HarmoniqsBenchmarks.jl).
-
-Load with:
-```julia
-using HarmoniqsBenchmarks
-results = load_results("benchmark/results/ipopt_vs_madnlp_N51_<sha>.jld2")
-```
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
deleted file mode 100644
index 28e6ee4..0000000
--- a/benchmark/benchmarks.jl
+++ /dev/null
@@ -1,272 +0,0 @@
-using TestItems
-
-@testitem "Evaluator micro-benchmarks: bilinear N=51" begin
-    using HarmoniqsBenchmarks, BenchmarkTools, DirectTrajOpt, NamedTrajectories
-    using SparseArrays, ExponentialAction, MathOptInterface, Random, Dates, Printf
-    const MOI = MathOptInterface
-
-    Random.seed!(42)
-    N = 51;
-    Δt = 0.1;
-    u_bound = 0.1;
-    ω = 0.1
-    Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
-    Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
-    Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
-    G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
-
-    traj = NamedTrajectory(
-        (
-            x = 2rand(4, N) .- 1,
-            u = u_bound*(2rand(2, N) .- 1),
-            du = randn(2, N),
-            ddu = randn(2, N),
-            Δt = fill(Δt, N),
-        );
-        controls = (:ddu, :Δt),
-        timestep = :Δt,
-        bounds = (u = u_bound, Δt = (0.01, 0.5)),
-        initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
-        final = (u = zeros(2),),
-        goal = (x = [0.0, 1.0, 0.0, 0.0],),
-    )
-    integrators = [
-        BilinearIntegrator(G, :x, :u, traj),
-        DerivativeIntegrator(:u, :du, traj),
-        DerivativeIntegrator(:du, :ddu, traj),
-    ]
-    J = QuadraticRegularizer(:u, traj, 1.0) + QuadraticRegularizer(:du, traj, 1.0)
-    prob = DirectTrajOptProblem(traj, J, integrators)
-
-    evaluator, Z_vec = build_evaluator(prob)
-    dims = evaluator_dims(evaluator)
-
-    g = zeros(dims.n_constraints)
-    grad = zeros(dims.n_variables)
-    H = zeros(dims.n_hessian_entries)
-    Jac = zeros(dims.n_jacobian_entries)
-    sigma = 1.0
-    mu = ones(dims.n_constraints)
-
-    benchmarks = Dict{Symbol,EvalBenchmark}(
-        :eval_objective =>
-            trial_to_eval_benchmark(@benchmark(MOI.eval_objective($evaluator, $Z_vec))),
-        :eval_gradient => trial_to_eval_benchmark(
-            @benchmark(MOI.eval_objective_gradient($evaluator, $grad, $Z_vec))
-        ),
-        :eval_constraint => trial_to_eval_benchmark(
-            @benchmark(MOI.eval_constraint($evaluator, $g, $Z_vec))
-        ),
-        :eval_jacobian => trial_to_eval_benchmark(
-            @benchmark(MOI.eval_constraint_jacobian($evaluator, $Jac, $Z_vec))
-        ),
-        :eval_hessian_lagrangian => trial_to_eval_benchmark(
-            @benchmark(MOI.eval_hessian_lagrangian($evaluator, $H, $Z_vec, $sigma, $mu))
-        ),
-    )
-
-    result = MicroBenchmarkResult(
-        package = "DirectTrajOpt",
-        package_version = "0.8.10",
-        commit = (
-            try
-                String(strip(read(`git rev-parse --short HEAD`, String)))
-            catch
-                ; "unknown"
-            end
-        ),
-        benchmark_name = "evaluator_micro_bilinear_N51",
-        N = N,
-        state_dim = 4,
-        control_dim = 2,
-        eval_benchmarks = benchmarks,
-        julia_version = string(VERSION),
-        timestamp = Dates.now(),
-        runner = get(ENV, "BENCHMARK_RUNNER", "local"),
-        n_threads = Threads.nthreads(),
-    )
-
-    println("\n=== Evaluator Micro-benchmarks (bilinear N=$N) ===")
-    for (name, eb) in sort(collect(result.eval_benchmarks), by = first)
-        @printf(
-            "  %-25s  median: %8.1f ns  allocs: %d  memory: %d bytes\n",
-            name,
-            eb.median_ns,
-            eb.allocs,
-            eb.memory_bytes
-        )
-    end
-
-    results_dir = joinpath(@__DIR__, "results")
-    save_micro_results(results_dir, result.benchmark_name, result)
-    println("  Saved to $results_dir/")
-end
-
-@testitem "Ipopt vs MadNLP: bilinear N=51" begin
-    using HarmoniqsBenchmarks, DirectTrajOpt, NamedTrajectories
-    using SparseArrays, ExponentialAction, Random, Dates
-    import MadNLP
-
-    const MadNLPSolverExt = [
-        mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt
-    ][1]
-
-    function make_bilinear_problem(; seed = 42)
-        Random.seed!(seed)
-        N = 51;
-        Δt = 0.1;
-        u_bound = 0.1;
-        ω = 0.1
-        Gx = sparse(Float64[0 0 0 1; 0 0 1 0; 0 -1 0 0; -1 0 0 0])
-        Gy = sparse(Float64[0 -1 0 0; 1 0 0 0; 0 0 0 -1; 0 0 1 0])
-        Gz = sparse(Float64[0 0 1 0; 0 0 0 -1; -1 0 0 0; 0 1 0 0])
-        G(u) = ω * Gz + u[1] * Gx + u[2] * Gy
-
-        traj = NamedTrajectory(
-            (
-                x = 2rand(4, N) .- 1,
-                u = u_bound*(2rand(2, N) .- 1),
-                du = randn(2, N),
-                ddu = randn(2, N),
-                Δt = fill(Δt, N),
-            );
-            controls = (:ddu, :Δt),
-            timestep = :Δt,
-            bounds = (u = u_bound, Δt = (0.01, 0.5)),
-            initial = (x = [1.0, 0.0, 0.0, 0.0], u = zeros(2)),
-            final = (u = zeros(2),),
-            goal = (x = [0.0, 1.0, 0.0, 0.0],),
-        )
-        integrators = [
-            BilinearIntegrator(G, :x, :u, traj),
-            DerivativeIntegrator(:u, :du, traj),
-            DerivativeIntegrator(:du, :ddu, traj),
-        ]
-        J = QuadraticRegularizer(:u, traj, 1.0) + QuadraticRegularizer(:du, traj, 1.0)
-        return DirectTrajOptProblem(traj, J, integrators)
-    end
-
-    prob_ipopt = make_bilinear_problem()
-    result_ipopt = benchmark_solve!(
-        prob_ipopt,
-        IpoptOptions(max_iter = 200, print_level = 0);
-        benchmark_name = "bilinear_N51_ipopt",
-    )
-
-    prob_madnlp = make_bilinear_problem()
-    result_madnlp = benchmark_solve!(
-        prob_madnlp,
-        MadNLPSolverExt.MadNLPOptions(max_iter = 200, print_level = 1);
-        benchmark_name = "bilinear_N51_madnlp",
-    )
-
-    println("\n=== Ipopt vs MadNLP: bilinear N=51 ===")
-    println(
-        "  Ipopt:  $(round(result_ipopt.wall_time_s, digits=3))s, $(result_ipopt.total_allocations_bytes ÷ 1024) KB alloc",
-    )
-    println(
-        "  MadNLP: $(round(result_madnlp.wall_time_s, digits=3))s, $(result_madnlp.total_allocations_bytes ÷ 1024) KB alloc",
-    )
-
-    results_dir = joinpath(@__DIR__, "results")
-    save_results(results_dir, "ipopt_vs_madnlp_N51", [result_ipopt, result_madnlp])
-end
-
-@testitem "Memory scaling: N and state_dim sweep" begin
-    using HarmoniqsBenchmarks, DirectTrajOpt, NamedTrajectories
-    using SparseArrays, ExponentialAction, Random, Dates, Printf
-    import MadNLP
-
-    const MadNLPSolverExt = [
-        mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt
-    ][1]
-
-    function make_scaled_problem(; N, state_dim, n_controls = 2, seed = 42)
-        Random.seed!(seed)
-        G_drift = sparse(randn(state_dim, state_dim))
-        G_drives = [sparse(randn(state_dim, state_dim)) for _ = 1:n_controls]
-        G(u) = G_drift + sum(u[i] * G_drives[i] for i = 1:n_controls)
-
-        x_init = zeros(state_dim);
-        x_init[1] = 1.0
-        x_goal = zeros(state_dim);
-        x_goal[min(2, state_dim)] = 1.0
-
-        traj = NamedTrajectory(
-            (
-                x = randn(state_dim, N),
-                u = 0.1*randn(n_controls, N),
-                du = randn(n_controls, N),
-                Δt = fill(0.1, N),
-            );
-            controls = (:du, :Δt),
-            timestep = :Δt,
-            bounds = (u = 1.0, Δt = (0.01, 0.5)),
-            initial = (x = x_init, u = zeros(n_controls)),
-            final = (u = zeros(n_controls),),
-            goal = (x = x_goal,),
-        )
-        integrators =
-            [BilinearIntegrator(G, :x, :u, traj), DerivativeIntegrator(:u, :du, traj)]
-        J = QuadraticRegularizer(:u, traj, 1.0)
-        return DirectTrajOptProblem(traj, J, integrators)
-    end
-
-    N_values = [25, 51, 101]
-    dim_values = [4, 8, 16]
-    results = BenchmarkResult[]
-
-    println("\n=== Memory Scaling Study ===")
-    @printf(
-        "  %5s | %5s | %12s | %12s | %12s | %12s\n",
-        "N",
-        "dim",
-        "Ipopt (s)",
-        "Ipopt (KB)",
-        "MadNLP (s)",
-        "MadNLP (KB)"
-    )
-    @printf(
-        "  %5s-+-%5s-+-%12s-+-%12s-+-%12s-+-%12s\n",
-        "-"^5,
-        "-"^5,
-        "-"^12,
-        "-"^12,
-        "-"^12,
-        "-"^12
-    )
-
-    for N in N_values
-        for dim in dim_values
-            prob = make_scaled_problem(; N = N, state_dim = dim)
-            r_ipopt = benchmark_solve!(
-                prob,
-                IpoptOptions(max_iter = 50, print_level = 0);
-                benchmark_name = "scaling_N$(N)_d$(dim)_ipopt",
-            )
-            push!(results, r_ipopt)
-
-            prob = make_scaled_problem(; N = N, state_dim = dim)
-            r_madnlp = benchmark_solve!(
-                prob,
-                MadNLPSolverExt.MadNLPOptions(max_iter = 50, print_level = 1);
-                benchmark_name = "scaling_N$(N)_d$(dim)_madnlp",
-            )
-            push!(results, r_madnlp)
-
-            @printf(
-                "  %5d | %5d | %12.3f | %12d | %12.3f | %12d\n",
-                N,
-                dim,
-                r_ipopt.wall_time_s,
-                r_ipopt.total_allocations_bytes ÷ 1024,
-                r_madnlp.wall_time_s,
-                r_madnlp.total_allocations_bytes ÷ 1024
-            )
-        end
-    end
-
-    results_dir = joinpath(@__DIR__, "results")
-    save_results(results_dir, "memory_scaling", results)
-    println("\n  Saved $(length(results)) results to $results_dir/")
-end
diff --git a/test/compare_solvers.jl b/test/compare_solvers.jl
index 5fd5c3f..d13cb06 100644
--- a/test/compare_solvers.jl
+++ b/test/compare_solvers.jl
@@ -7,8 +7,8 @@ using SparseArrays
 using NamedTrajectories
 using DirectTrajOpt
 
-const MadNLPSolverExt =
-    [mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt][1]
+# const MadNLPSolverExt =
+#     [mod for mod in reverse(Base.loaded_modules_order) if Symbol(mod) == :MadNLPSolverExt][1]
 
 function get_seeded_trajectory(seed; N = 10, Δt = 0.1, u_bound = 0.1, ω = 0.1)
     Random.seed!(seed)
@@ -140,11 +140,11 @@ function get_solver_comparison(seed)
     return err, (ti, tm)
 end
 
-wins = Dict(:ipopt => 0, :madnlp => 0)
-for seed = 0:99
-    err, (ti, tm) = get_solver_comparison(seed)
-    (err < 1e-3) || exit(1)
-    wins[(ti < tm) ? :ipopt : :madnlp] += 1
-end
+# wins = Dict(:ipopt => 0, :madnlp => 0)
+# for seed = 0:99
+#     err, (ti, tm) = get_solver_comparison(seed)
+#     (err < 1e-3) || exit(1)
+#     wins[(ti < tm) ? :ipopt : :madnlp] += 1
+# end
 
 # @info "Wins: $(wins)"
diff --git a/test/runtests.jl b/test/runtests.jl
index d57ccc9..9f95075 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,5 +2,5 @@ using DirectTrajOpt
 using TestItemRunner
 
 
-# Exclude benchmark/ testitems — those run in a separate project environment
-@run_package_tests filter=ti -> !contains(ti.filename, "benchmark")
+# Run all testitem tests in package
+@run_package_tests