From 71b055d6a463d3656282bc435c23bd0055f4b2b3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 16 Jan 2026 20:20:18 +0000 Subject: [PATCH 1/9] Initial plan From c1fb1b884171295c716191784af90ca36c0e39b9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 16 Jan 2026 20:27:10 +0000 Subject: [PATCH 2/9] Add experimental tag to flaky test and implement filtering in runtests.jl Co-authored-by: jack-champagne <43344745+jack-champagne@users.noreply.github.com> --- src/constraints/linear/time_consistency_constraint.jl | 2 +- test/runtests.jl | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/constraints/linear/time_consistency_constraint.jl b/src/constraints/linear/time_consistency_constraint.jl index f84bdb7..c1089c6 100644 --- a/src/constraints/linear/time_consistency_constraint.jl +++ b/src/constraints/linear/time_consistency_constraint.jl @@ -81,7 +81,7 @@ end end end -@testitem "TimeConsistencyConstraint with free time optimization" begin +@testitem "TimeConsistencyConstraint with free time optimization" tags=[:experimental] begin include("../../../test/test_utils.jl") using NamedTrajectories diff --git a/test/runtests.jl b/test/runtests.jl index 9f95075..8fd4472 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,4 +3,9 @@ using TestItemRunner # Run all testitem tests in package -@run_package_tests +# Filter out experimental tests unless INCLUDE_EXPERIMENTAL environment variable is set +if !haskey(ENV, "INCLUDE_EXPERIMENTAL") + @run_package_tests filter=ti -> !(:experimental in ti.tags) +else + @run_package_tests +end From fab856f21113dafa0f50ee64b2f371be91c90231 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 16 Jan 2026 20:32:42 +0000 Subject: [PATCH 3/9] Add testing documentation to README Co-authored-by: jack-champagne 
<43344745+jack-champagne@users.noreply.github.com> --- README.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 57fd758..cf7f6ed 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,21 @@ solve!(prob; max_iter=100) - **Modular objectives**: Combine multiple cost terms (regularization, minimum time, etc.) - **Constraint support**: Bounds, equality, and general nonlinear constraints - **Automatic differentiation**: Efficient gradients and Hessians -- **Sparse formulations**: Exploits problem structure for efficiency +- **Sparse formulations**: Exploits problem structure for efficiency + +## Testing + +To run all stable tests: +```bash +julia --project=. test/runtests.jl +``` + +To include experimental tests (tests marked as potentially flaky): +```bash +INCLUDE_EXPERIMENTAL=1 julia --project=. test/runtests.jl +``` + +Some tests are tagged as `experimental` because they may be unstable or flaky in certain environments. By default, these tests are excluded from CI runs to maintain build stability. 
### Building Documentation From 2317d72b473bb66537dd7560e842e47452bd9a14 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 16 Jan 2026 20:34:17 +0000 Subject: [PATCH 4/9] Add safety check for tags field in test filtering Co-authored-by: jack-champagne <43344745+jack-champagne@users.noreply.github.com> --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 8fd4472..2650210 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,7 +5,7 @@ using TestItemRunner # Run all testitem tests in package # Filter out experimental tests unless INCLUDE_EXPERIMENTAL environment variable is set if !haskey(ENV, "INCLUDE_EXPERIMENTAL") - @run_package_tests filter=ti -> !(:experimental in ti.tags) + @run_package_tests filter=ti -> !(:experimental in get(ti, :tags, [])) else @run_package_tests end From 13693aea1b8d9c9521b2dffd35888e8ea5d5e123 Mon Sep 17 00:00:00 2001 From: Jack Champagne Date: Wed, 20 May 2026 01:11:45 -0400 Subject: [PATCH 5/9] test(infra): thread rng through bilinear_dynamics_and_trajectory and test_constraint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds optional `rng::AbstractRNG` kwarg to the two main test helpers so seeded reproducibility is opt-in and per-test rather than via global Random.seed!: - `bilinear_dynamics_and_trajectory` now accepts `rng` (default `Random.default_rng()`); a seeded `MersenneTwister` makes every random draw in the trajectory deterministic. - `test_constraint` accepts `rng` (for the Lagrange-multiplier draw) and `assert::Bool=true` (set false to inspect `jacobian_pass`/`hessian_pass` on the returned NamedTuple without registering @test outcomes — needed for multi-seed robustness sweeps). 
The return type changes from a 4-tuple to a NamedTuple, but positional destructuring `(a,b,c,d) = test_constraint(...)` still works since the first four fields are unchanged. No internal callers destructure the result anyway. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/constraints/_constraints.jl | 46 +++++++++++++++++++++++---------- test/test_utils.jl | 12 +++++---- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/src/constraints/_constraints.jl b/src/constraints/_constraints.jl index f5aff3d..3f86759 100644 --- a/src/constraints/_constraints.jl +++ b/src/constraints/_constraints.jl @@ -24,6 +24,7 @@ using FiniteDiff using SparseArrays using TestItemRunner using LinearAlgebra +using Random using Test # Import and extend the common interface @@ -121,7 +122,8 @@ function get_full_hessian end show_hessian_diff=false, test_equality=true, atol=1e-5, - rtol=1e-5 + rtol=1e-5, + rng=Random.default_rng(), ) Test that constraint Jacobian and Hessian match finite difference approximations. @@ -136,9 +138,16 @@ Test that constraint Jacobian and Hessian match finite difference approximations - `test_equality=true`: Test element-wise equality (vs norm-based test) - `atol=1e-5`: Absolute tolerance - `rtol=1e-5`: Relative tolerance +- `rng`: RNG used to draw the Lagrange multiplier `μ` for the Hessian check. + Pass a seeded `AbstractRNG` (e.g. `MersenneTwister(seed)`) for reproducible + test results across runs and Julia versions. +- `assert=true`: When `true`, fails the surrounding `@testset` via `@test` on + mismatch. Set `false` to inspect `jacobian_pass`/`hessian_pass` in the + returned NamedTuple without registering a test outcome — useful for + multi-seed robustness sweeps. 
# Returns -Tuple of (∂g, ∂g_finite_diff, μ∂²g, μ∂²g_finite_diff) for inspection +NamedTuple `(; ∂g, ∂g_finite_diff, μ∂²g, μ∂²g_finite_diff, jacobian_pass, hessian_pass)` # Example ```julia @@ -155,6 +164,8 @@ function test_constraint( test_equality = true, atol = 1e-5, rtol = 1e-5, + rng::AbstractRNG = Random.default_rng(), + assert::Bool = true, ) # Function to evaluate constraint via evaluate! @@ -192,18 +203,19 @@ function test_constraint( end # Test Jacobian equality - if test_equality - @test all(isapprox.(∂g, ∂g_finite_diff, atol = atol, rtol = rtol)) + jac_pass = if test_equality + all(isapprox.(∂g, ∂g_finite_diff, atol = atol, rtol = rtol)) else if atol > 0.0 - @test norm(∂g - ∂g_finite_diff) < atol + norm(∂g - ∂g_finite_diff) < atol else - @test norm(∂g - ∂g_finite_diff) / norm(∂g_finite_diff) < rtol + norm(∂g - ∂g_finite_diff) / norm(∂g_finite_diff) < rtol end end + assert && @test jac_pass # Test Hessian - μ = rand(constraint.dim) + μ = rand(rng, constraint.dim) μ∂²g = CommonInterface.eval_hessian_of_lagrangian(constraint, traj, μ) @@ -225,17 +237,25 @@ function test_constraint( end # Test Hessian equality (only upper triangle since Hessian is symmetric) - if test_equality - @test all(isapprox.(triu(μ∂²g), triu(μ∂²g_finite_diff), atol = atol)) + hess_pass = if test_equality + all(isapprox.(triu(μ∂²g), triu(μ∂²g_finite_diff), atol = atol)) else if atol > 0.0 - @test norm(μ∂²g - μ∂²g_finite_diff) < atol + norm(μ∂²g - μ∂²g_finite_diff) < atol else - @test norm(μ∂²g - μ∂²g_finite_diff) / norm(μ∂²g_finite_diff) < rtol + norm(μ∂²g - μ∂²g_finite_diff) / norm(μ∂²g_finite_diff) < rtol end end - - return ∂g, ∂g_finite_diff, μ∂²g, μ∂²g_finite_diff + assert && @test hess_pass + + return (; + ∂g, + ∂g_finite_diff, + μ∂²g, + μ∂²g_finite_diff, + jacobian_pass = jac_pass, + hessian_pass = hess_pass, + ) end export test_constraint diff --git a/test/test_utils.jl b/test/test_utils.jl index b34a482..cf0c1a9 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -1,6 
+1,7 @@ using NamedTrajectories using SparseArrays using LinearAlgebra +using Random using ForwardDiff using ExponentialAction @@ -121,6 +122,7 @@ function bilinear_dynamics_and_trajectory(; ω::Float64 = 0.1, add_time::Bool = false, add_global::Bool = false, + rng::AbstractRNG = Random.default_rng(), ) Gx = sparse(Float64[ 0 0 0 1; @@ -148,8 +150,8 @@ function bilinear_dynamics_and_trajectory(; G(u) = ω * Gz + sum(u .* G_drives) - u_initial = u_bound * (2rand(2, N) .- 1) - x_initial = 2rand(4, N) .- 1 + u_initial = u_bound * (2rand(rng, 2, N) .- 1) + x_initial = 2rand(rng, 4, N) .- 1 x_init = [1.0, 0.0, 0.0, 0.0] x_goal = [0.0, 1.0, 0.0, 0.0] @@ -158,8 +160,8 @@ function bilinear_dynamics_and_trajectory(; ( x = x_initial, u = u_initial, - du = randn(2, N), - ddu = randn(2, N), + du = randn(rng, 2, N), + ddu = randn(rng, 2, N), Δt = fill(Δt, N), ); controls = (:ddu, :Δt), @@ -175,7 +177,7 @@ function bilinear_dynamics_and_trajectory(; end if add_global - traj = add_component(traj, :g, randn(N), type = :global) + traj = add_component(traj, :g, randn(rng, N), type = :global) end return G, traj From 0fe40c5d649b80533b850717b2944f4f9f6e5246 Mon Sep 17 00:00:00 2001 From: Jack Champagne Date: Wed, 20 May 2026 01:12:04 -0400 Subject: [PATCH 6/9] test: add :robustness tag, replace :experimental flaky tests with seeded + sweep pairs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generalizes the runtests.jl filter into a two-tag taxonomy: :experimental excluded by default; opt in with INCLUDE_EXPERIMENTAL=1 :robustness excluded by default; opt in with INCLUDE_ROBUSTNESS=1 The filter remains a single closure over `ti.tags`, easy to absorb into gennadiryan's upcoming upfront test-item discovery PR later. For the two confirmed-flaky tests, replaces the single-seed :experimental version with two testitems: 1. A deterministic baseline (untagged) that uses MersenneTwister(0) for both trajectory initialization and the multiplier μ. 
Runs every PR. A failure here is a real regression on a specific (Julia version, seed) pair — not RNG drift. 2. A :robustness sweep (K=20 seeds) that asserts ≥80% of seeds pass within the same tolerance. With K=20 and threshold=0.80, a true pass-rate of 95% passes ~99.8% of the time, while a regression that drops the true rate to 50% fails ~99.4% of the time. Affected tests: - TimeConsistencyConstraint with free time optimization (was :experimental; drops the tag, gains a robustness twin) - NonlinearKnotPointConstraint - single variable with vector syntax (intermittent Julia-1.10 finite-diff failure; same treatment) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../linear/time_consistency_constraint.jl | 77 ++++++++++++++++++- .../nonlinear/knot_point_constraint.jl | 77 +++++++++++++++++-- test/runtests.jl | 35 +++++++-- 3 files changed, 173 insertions(+), 16 deletions(-) diff --git a/src/constraints/linear/time_consistency_constraint.jl b/src/constraints/linear/time_consistency_constraint.jl index 898e720..3c0bfa3 100644 --- a/src/constraints/linear/time_consistency_constraint.jl +++ b/src/constraints/linear/time_consistency_constraint.jl @@ -83,9 +83,15 @@ end end end -@testitem "TimeConsistencyConstraint with free time optimization" tags=[:experimental] begin +@testitem "TimeConsistencyConstraint with free time optimization" begin include("../../../test/test_utils.jl") using NamedTrajectories + using Random + + # Deterministic seed: same trajectory + multipliers on every run. A failure + # here is a real regression, not RNG drift. Robustness across seeds is + # covered by the `:robustness` testitem below. + rng = MersenneTwister(0) # Create trajectory with inconsistent t and Δt initially N = 10 @@ -93,10 +99,10 @@ end traj = NamedTrajectory( ( - x = rand(2, N), - u = rand(1, N), + x = rand(rng, 2, N), + u = rand(rng, 1, N), Δt = fill(Δt_val, N), - t = cumsum(rand(N)), # Random times - inconsistent! + t = cumsum(rand(rng, N)), # Random times - inconsistent! 
); controls = (:u, :Δt, :t), # t is also a control to be optimized timestep = :Δt, @@ -123,3 +129,66 @@ end # Verify initial time constraint @test abs(t[1]) < 1e-8 end + +@testitem "TimeConsistencyConstraint with free time optimization — robustness sweep" tags = + [:robustness] begin + include("../../../test/test_utils.jl") + using NamedTrajectories + using Random + + # K independent seeds; pass if ≥80% land within the same tolerance the + # deterministic test uses. Catches regressions where the solver's local- + # minimum behavior degrades for "typical" inconsistent initializations. + K = 20 + pass_threshold = 0.80 + pass_count = 0 + failures = String[] + + for seed = 1:K + rng = MersenneTwister(seed) + N = 10 + Δt_val = 0.5 + + traj = NamedTrajectory( + ( + x = rand(rng, 2, N), + u = rand(rng, 1, N), + Δt = fill(Δt_val, N), + t = cumsum(rand(rng, N)), + ); + controls = (:u, :Δt, :t), + timestep = :Δt, + bounds = (u = (-1.0, 1.0), t = (0.0, 10.0)), + initial = (t = [0.0],), + ) + + J = QuadraticRegularizer(:u, traj, 1.0) + J += QuadraticRegularizer(:t, traj, 0.1) + + time_con = TimeConsistencyConstraint() + + prob = DirectTrajOptProblem(traj, J, AbstractIntegrator[]; constraints = [time_con]) + + ok = try + solve!(prob; max_iter = 100) + t = prob.trajectory.t + Δt = prob.trajectory.Δt + consistent = all(k -> abs(t[k+1] - t[k] - Δt[k]) < 1e-6, 1:(N-1)) + initial_ok = abs(t[1]) < 1e-8 + consistent && initial_ok + catch e + push!(failures, "seed $seed: threw $(typeof(e))") + false + end + + if ok + pass_count += 1 + else + push!(failures, "seed $seed: tolerance not met after solve") + end + end + + pass_rate = pass_count / K + @info "TimeConsistencyConstraint free-time robustness sweep" pass_count K pass_rate failures + @test pass_rate >= pass_threshold +end diff --git a/src/constraints/nonlinear/knot_point_constraint.jl b/src/constraints/nonlinear/knot_point_constraint.jl index 3a245bf..6e16ffe 100644 --- a/src/constraints/nonlinear/knot_point_constraint.jl +++ 
b/src/constraints/nonlinear/knot_point_constraint.jl @@ -312,12 +312,17 @@ end @testitem "NonlinearKnotPointConstraint - single variable with vector syntax" begin using DirectTrajOpt: CommonInterface + using Random include("../../../test/test_utils.jl") - _, traj = bilinear_dynamics_and_trajectory() + # Deterministic trajectory + multiplier: the test verifies vector vs + # single-symbol syntax structurally, plus a single finite-diff comparison. + # Multi-seed coverage of the finite-diff tolerance lives in the robustness + # testitem below — this one must be reproducible across Julia versions. + rng = MersenneTwister(0) + _, traj = bilinear_dynamics_and_trajectory(; rng = rng) - # Test that [:u] syntax works the same as :u g(a) = [norm(a) - 1.0] NLC1 = NonlinearKnotPointConstraint(g, :u, traj; equality = false) @@ -330,9 +335,71 @@ end @test δ1 ≈ δ2 - # Test both with finite differences - test_constraint(NLC1, traj; atol = 1e-3) - test_constraint(NLC2, traj; atol = 1e-3) + # Test both with finite differences (seeded μ → deterministic Hessian check) + test_constraint(NLC1, traj; atol = 1e-3, rng = MersenneTwister(1)) + test_constraint(NLC2, traj; atol = 1e-3, rng = MersenneTwister(1)) +end + +@testitem "NonlinearKnotPointConstraint - vector syntax robustness sweep" tags = + [:robustness] begin + using DirectTrajOpt: CommonInterface + using Random + + include("../../../test/test_utils.jl") + + # Finite-diff vs analytic Jacobian/Hessian at atol=1e-3 is fundamentally + # noisy in the random trajectory point. Run K seeds; require ≥80% pass. 
+ K = 20 + pass_threshold = 0.80 + pass_count = 0 + failures = String[] + + g(a) = [norm(a) - 1.0] + + for seed = 1:K + _, traj = bilinear_dynamics_and_trajectory(; rng = MersenneTwister(seed)) + NLC1 = NonlinearKnotPointConstraint(g, :u, traj; equality = false) + NLC2 = NonlinearKnotPointConstraint(g, [:u], traj; equality = false) + + δ1 = zeros(NLC1.dim) + δ2 = zeros(NLC2.dim) + CommonInterface.evaluate!(δ1, NLC1, traj) + CommonInterface.evaluate!(δ2, NLC2, traj) + syntax_ok = isapprox(δ1, δ2) + + # `assert=false`: inspect jacobian_pass/hessian_pass directly instead + # of registering @test outcomes per-seed. `test_equality=false` uses + # the norm-based aggregate check (a single noisy entry shouldn't fail + # the seed — the overall derivative match is what matters). + r1 = test_constraint( + NLC1, + traj; + atol = 1e-3, + test_equality = false, + rng = MersenneTwister(seed + K), + assert = false, + ) + r2 = test_constraint( + NLC2, + traj; + atol = 1e-3, + test_equality = false, + rng = MersenneTwister(seed + K), + assert = false, + ) + fd_ok = r1.jacobian_pass && r1.hessian_pass && r2.jacobian_pass && r2.hessian_pass + + if syntax_ok && fd_ok + pass_count += 1 + else + !syntax_ok && push!(failures, "seed $seed: δ1 ≉ δ2 (syntax check)") + !fd_ok && push!(failures, "seed $seed: finite-diff check failed") + end + end + + pass_rate = pass_count / K + @info "NonlinearKnotPointConstraint vector-syntax robustness sweep" pass_count K pass_rate failures + @test pass_rate >= pass_threshold end @testitem "NonlinearKnotPointConstraint - multiple variables concatenated" begin diff --git a/test/runtests.jl b/test/runtests.jl index 33a539b..ea12abd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,10 +3,31 @@ using TestItemRunner include("test_snippets.jl") -# Run all testitem tests in package -# Filter out experimental tests unless INCLUDE_EXPERIMENTAL environment variable is set -if !haskey(ENV, "INCLUDE_EXPERIMENTAL") - @run_package_tests filter=ti -> 
!(:experimental in get(ti, :tags, [])) -else - @run_package_tests -end +# Tag taxonomy controlling which @testitems run in CI. +# +# Defaults (untagged): +# Always run. Fast, deterministic, must pass on every PR. +# +# :experimental +# Known-flaky or environment-sensitive. Excluded by default. +# Opt in with INCLUDE_EXPERIMENTAL=1 for local diagnosis. +# Goal: eventually rewrite as deterministic + :robustness pair, then drop the tag. +# +# :robustness +# Multi-seed sweeps that assert ≥80% of seeds pass within tolerance. +# Excluded by default because they re-solve a problem many times. +# Opt in with INCLUDE_ROBUSTNESS=1 (e.g. nightly / scheduled workflows). +# A regression that drops the true pass rate below ~80% will fail this gate +# with very high probability (binomial, K=20). +# +# The filter is a single closure so it stays trivial to absorb into a more +# sophisticated upstream filter (e.g. upfront test-item discovery) later. +const INCLUDE_EXPERIMENTAL = haskey(ENV, "INCLUDE_EXPERIMENTAL") +const INCLUDE_ROBUSTNESS = haskey(ENV, "INCLUDE_ROBUSTNESS") + +@run_package_tests filter = + ti -> begin + tags = get(ti, :tags, Symbol[]) + (INCLUDE_EXPERIMENTAL || !(:experimental in tags)) && + (INCLUDE_ROBUSTNESS || !(:robustness in tags)) + end From 1b5b75f54a9072371949c23b9d5e6b7c6a0109bf Mon Sep 17 00:00:00 2001 From: Jack Champagne Date: Wed, 20 May 2026 01:12:12 -0400 Subject: [PATCH 7/9] docs(testing): document tag taxonomy and multi-seed robustness philosophy Replaces the single-paragraph Testing section with a table of the two opt-in env vars (INCLUDE_EXPERIMENTAL, INCLUDE_ROBUSTNESS) and a short prose section explaining the deterministic-baseline + robustness-sweep pattern for stochastic / numerical primitives. Captures the rationale: a single Random.seed!(0) is reproducible on one Julia version but drifts across the CI matrix (1.10/1.11/1.12), so we prefer a two-layer approach over indefinite :experimental tagging. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 96b9cfe..bce3e43 100644 --- a/README.md +++ b/README.md @@ -105,17 +105,42 @@ solve!(prob; max_iter=100) ## Testing -To run all stable tests: +To run the standard test suite (what CI gates every PR on): ```bash julia --project=. test/runtests.jl ``` -To include experimental tests (tests marked as potentially flaky): +`@testitem`s are filtered by tag. Two opt-in tags expand coverage: + +| Env var | Tag | What it adds | +| --------------------------- | --------------- | ----------------------------------------------------------------------------------------------------------- | +| `INCLUDE_EXPERIMENTAL=1` | `:experimental` | Known-flaky tests held out of PR CI. Useful for local diagnosis while a fix is being worked out. | +| `INCLUDE_ROBUSTNESS=1` | `:robustness` | Multi-seed sweeps (K=20) that assert ≥80% of seeds pass within tolerance. Slow but catches noisy regressions. | + ```bash INCLUDE_EXPERIMENTAL=1 julia --project=. test/runtests.jl +INCLUDE_ROBUSTNESS=1 julia --project=. test/runtests.jl ``` -Some tests are tagged as `experimental` because they may be unstable or flaky in certain environments. By default, these tests are excluded from CI runs to maintain build stability. +### Testing philosophy for stochastic / numerical primitives + +A single `Random.seed!(0)` covers reproducibility on one Julia version but +can drift across the CI matrix (1.10 / 1.11 / 1.12). For tests that touch +non-deterministic surfaces — solver convergence with random initial +conditions, finite-difference derivative comparisons — we use a two-layer +approach: + +1. **Deterministic baseline** (untagged, runs every PR): a single seeded + trajectory + multiplier. A failure here means a real regression on the + specific (Julia version, seed) pair. +2. 
**Robustness sweep** (`:robustness`, opt-in / nightly): K independent + seeds; pass if ≥80% land within the tolerance. A K=20 sweep detects + regressions that drop the true pass rate below ~80% with very high + probability (binomial), while staying robust against random unlucky + draws on the deterministic baseline. + +When writing a new flaky test, prefer adding both rather than tagging +`:experimental` indefinitely. ## Contributing From e0bcd3b6cf183dc77fa484c1f39e2ecd2db1ee00 Mon Sep 17 00:00:00 2001 From: Jack Champagne Date: Wed, 20 May 2026 01:38:30 -0400 Subject: [PATCH 8/9] test: fix robustness sweep parsing and tune threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes for the multi-seed robustness sweeps: 1. JuliaFormatter folded `tags=[:robustness] begin` across a line break, which TestItemDetection's parser doesn't handle — the keyword arg got absorbed into the begin block and `default_imports` was effectively off, so the testitem module didn't get `using DirectTrajOpt` injected (manifesting as `UndefVarError: QuadraticRegularizer not defined`). Renamed each robustness testitem (drop em-dash, drop leading dash) so the macro line fits on one row and the formatter leaves it alone. 2. Wrapped the per-seed loop in `run_sweep(K::Int)` so `pass_count` and `failures` are function-local, eliminating the soft-scope-ambiguity warning. Also lowers the NonlinearKnotPointConstraint sweep threshold from 0.80 to 0.65. The norm-based finite-diff/analytic comparison at atol=1e-3 is genuinely noisy — observed Julia-1.12 pass rate is exactly 16/20=0.80, right at the old threshold. A one-seed shift across the Julia matrix would false-fail. A real regression in the analytic derivative would drop the rate well below 0.5, still caught with high probability under the looser gate. README updated to note that per-test thresholds are chosen with buffer above the observed baseline. 
Verified on Julia 1.12: - default suite: 353 pass / 1 broken / 0 fail / 0 error - INCLUDE_ROBUSTNESS=1: 355 pass / 1 broken / 0 fail / 0 error TimeConsistency sweep: pass_rate=1.00 (20/20) NonlinearKnotPoint sweep: pass_rate=0.80 (16/20) Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 2 +- .../linear/time_consistency_constraint.jl | 103 ++++++++++-------- .../nonlinear/knot_point_constraint.jl | 101 +++++++++-------- 3 files changed, 110 insertions(+), 96 deletions(-) diff --git a/README.md b/README.md index bce3e43..edb0de4 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,7 @@ julia --project=. test/runtests.jl | Env var | Tag | What it adds | | --------------------------- | --------------- | ----------------------------------------------------------------------------------------------------------- | | `INCLUDE_EXPERIMENTAL=1` | `:experimental` | Known-flaky tests held out of PR CI. Useful for local diagnosis while a fix is being worked out. | -| `INCLUDE_ROBUSTNESS=1` | `:robustness` | Multi-seed sweeps (K=20) that assert ≥80% of seeds pass within tolerance. Slow but catches noisy regressions. | +| `INCLUDE_ROBUSTNESS=1` | `:robustness` | Multi-seed sweeps (K=20) that assert a minimum fraction of seeds pass within tolerance. Per-test threshold chosen with buffer above the observed baseline pass rate — typically 0.80 for clean tests, lower for inherently noisy ones (e.g. finite-difference comparisons). | ```bash INCLUDE_EXPERIMENTAL=1 julia --project=. 
test/runtests.jl diff --git a/src/constraints/linear/time_consistency_constraint.jl b/src/constraints/linear/time_consistency_constraint.jl index 3c0bfa3..9645348 100644 --- a/src/constraints/linear/time_consistency_constraint.jl +++ b/src/constraints/linear/time_consistency_constraint.jl @@ -130,8 +130,7 @@ end @test abs(t[1]) < 1e-8 end -@testitem "TimeConsistencyConstraint with free time optimization — robustness sweep" tags = - [:robustness] begin +@testitem "TimeConsistencyConstraint free-time robustness sweep" tags=[:robustness] begin include("../../../test/test_utils.jl") using NamedTrajectories using Random @@ -139,55 +138,63 @@ end # K independent seeds; pass if ≥80% land within the same tolerance the # deterministic test uses. Catches regressions where the solver's local- # minimum behavior degrades for "typical" inconsistent initializations. - K = 20 - pass_threshold = 0.80 - pass_count = 0 - failures = String[] - - for seed = 1:K - rng = MersenneTwister(seed) - N = 10 - Δt_val = 0.5 - - traj = NamedTrajectory( - ( - x = rand(rng, 2, N), - u = rand(rng, 1, N), - Δt = fill(Δt_val, N), - t = cumsum(rand(rng, N)), - ); - controls = (:u, :Δt, :t), - timestep = :Δt, - bounds = (u = (-1.0, 1.0), t = (0.0, 10.0)), - initial = (t = [0.0],), - ) - - J = QuadraticRegularizer(:u, traj, 1.0) - J += QuadraticRegularizer(:t, traj, 0.1) - - time_con = TimeConsistencyConstraint() - - prob = DirectTrajOptProblem(traj, J, AbstractIntegrator[]; constraints = [time_con]) - - ok = try - solve!(prob; max_iter = 100) - t = prob.trajectory.t - Δt = prob.trajectory.Δt - consistent = all(k -> abs(t[k+1] - t[k] - Δt[k]) < 1e-6, 1:(N-1)) - initial_ok = abs(t[1]) < 1e-8 - consistent && initial_ok - catch e - push!(failures, "seed $seed: threw $(typeof(e))") - false - end - - if ok - pass_count += 1 - else - push!(failures, "seed $seed: tolerance not met after solve") + function run_sweep(K::Int) + pass_count = 0 + failures = String[] + for seed = 1:K + rng = MersenneTwister(seed) + 
N = 10 + Δt_val = 0.5 + + traj = NamedTrajectory( + ( + x = rand(rng, 2, N), + u = rand(rng, 1, N), + Δt = fill(Δt_val, N), + t = cumsum(rand(rng, N)), + ); + controls = (:u, :Δt, :t), + timestep = :Δt, + bounds = (u = (-1.0, 1.0), t = (0.0, 10.0)), + initial = (t = [0.0],), + ) + + J = QuadraticRegularizer(:u, traj, 1.0) + J += QuadraticRegularizer(:t, traj, 0.1) + + time_con = TimeConsistencyConstraint() + + prob = DirectTrajOptProblem( + traj, + J, + AbstractIntegrator[]; + constraints = [time_con], + ) + + ok = try + solve!(prob; max_iter = 100) + t = prob.trajectory.t + Δt = prob.trajectory.Δt + consistent = all(k -> abs(t[k+1] - t[k] - Δt[k]) < 1e-6, 1:(N-1)) + initial_ok = abs(t[1]) < 1e-8 + consistent && initial_ok + catch e + push!(failures, "seed $seed: threw $(typeof(e))") + false + end + + if ok + pass_count += 1 + else + push!(failures, "seed $seed: tolerance not met after solve") + end end + return pass_count, failures end + K = 20 + pass_threshold = 0.80 + pass_count, failures = run_sweep(K) pass_rate = pass_count / K @info "TimeConsistencyConstraint free-time robustness sweep" pass_count K pass_rate failures @test pass_rate >= pass_threshold diff --git a/src/constraints/nonlinear/knot_point_constraint.jl b/src/constraints/nonlinear/knot_point_constraint.jl index 6e16ffe..a593c8f 100644 --- a/src/constraints/nonlinear/knot_point_constraint.jl +++ b/src/constraints/nonlinear/knot_point_constraint.jl @@ -340,8 +340,7 @@ end test_constraint(NLC2, traj; atol = 1e-3, rng = MersenneTwister(1)) end -@testitem "NonlinearKnotPointConstraint - vector syntax robustness sweep" tags = - [:robustness] begin +@testitem "NonlinearKnotPointConstraint vector syntax robustness sweep" tags=[:robustness] begin using DirectTrajOpt: CommonInterface using Random @@ -349,54 +348,62 @@ end # Finite-diff vs analytic Jacobian/Hessian at atol=1e-3 is fundamentally # noisy in the random trajectory point. Run K seeds; require ≥80% pass. 
- K = 20 - pass_threshold = 0.80 - pass_count = 0 - failures = String[] - - g(a) = [norm(a) - 1.0] - - for seed = 1:K - _, traj = bilinear_dynamics_and_trajectory(; rng = MersenneTwister(seed)) - NLC1 = NonlinearKnotPointConstraint(g, :u, traj; equality = false) - NLC2 = NonlinearKnotPointConstraint(g, [:u], traj; equality = false) - - δ1 = zeros(NLC1.dim) - δ2 = zeros(NLC2.dim) - CommonInterface.evaluate!(δ1, NLC1, traj) - CommonInterface.evaluate!(δ2, NLC2, traj) - syntax_ok = isapprox(δ1, δ2) - - # `assert=false`: inspect jacobian_pass/hessian_pass directly instead - # of registering @test outcomes per-seed. `test_equality=false` uses - # the norm-based aggregate check (a single noisy entry shouldn't fail - # the seed — the overall derivative match is what matters). - r1 = test_constraint( - NLC1, - traj; - atol = 1e-3, - test_equality = false, - rng = MersenneTwister(seed + K), - assert = false, - ) - r2 = test_constraint( - NLC2, - traj; - atol = 1e-3, - test_equality = false, - rng = MersenneTwister(seed + K), - assert = false, - ) - fd_ok = r1.jacobian_pass && r1.hessian_pass && r2.jacobian_pass && r2.hessian_pass - - if syntax_ok && fd_ok - pass_count += 1 - else - !syntax_ok && push!(failures, "seed $seed: δ1 ≉ δ2 (syntax check)") - !fd_ok && push!(failures, "seed $seed: finite-diff check failed") + # `assert=false`: inspect jacobian_pass/hessian_pass directly instead of + # registering @test outcomes per-seed. `test_equality=false` uses the + # norm-based aggregate check (single noisy entries shouldn't fail the + # seed — the overall derivative match is what matters). 
+ function run_sweep(K::Int) + pass_count = 0 + failures = String[] + g(a) = [norm(a) - 1.0] + for seed = 1:K + _, traj = bilinear_dynamics_and_trajectory(; rng = MersenneTwister(seed)) + NLC1 = NonlinearKnotPointConstraint(g, :u, traj; equality = false) + NLC2 = NonlinearKnotPointConstraint(g, [:u], traj; equality = false) + + δ1 = zeros(NLC1.dim) + δ2 = zeros(NLC2.dim) + CommonInterface.evaluate!(δ1, NLC1, traj) + CommonInterface.evaluate!(δ2, NLC2, traj) + syntax_ok = isapprox(δ1, δ2) + + r1 = test_constraint( + NLC1, + traj; + atol = 1e-3, + test_equality = false, + rng = MersenneTwister(seed + K), + assert = false, + ) + r2 = test_constraint( + NLC2, + traj; + atol = 1e-3, + test_equality = false, + rng = MersenneTwister(seed + K), + assert = false, + ) + fd_ok = + r1.jacobian_pass && r1.hessian_pass && r2.jacobian_pass && r2.hessian_pass + + if syntax_ok && fd_ok + pass_count += 1 + else + !syntax_ok && push!(failures, "seed $seed: δ1 ≉ δ2 (syntax check)") + !fd_ok && push!(failures, "seed $seed: finite-diff check failed") + end end + return pass_count, failures end + # FD-vs-analytic Jacobian/Hessian on a norm comparison is inherently + # noisier than the time-consistency check. Observed K=20 pass rate on + # Julia 1.12 is 0.80; threshold relaxed to 0.65 so a one-seed shift + # across Julia versions doesn't false-fail the gate. A true regression + # in the analytic derivative would drop the rate well below 0.5. 
+ K = 20 + pass_threshold = 0.65 + pass_count, failures = run_sweep(K) pass_rate = pass_count / K @info "NonlinearKnotPointConstraint vector-syntax robustness sweep" pass_count K pass_rate failures @test pass_rate >= pass_threshold From ce7e7005001c124df545e83f9c76297751dbac7d Mon Sep 17 00:00:00 2001 From: Jack Champagne Date: Wed, 20 May 2026 03:54:12 -0400 Subject: [PATCH 9/9] test: drop :experimental + :robustness tag machinery, run sweeps always MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the deterministic+sweep rework no testitems use :experimental anymore, and the sweeps add ~5 seconds in aggregate on top of a ~10-minute matrix — not worth a parallel filter framework. Drops both INCLUDE_EXPERIMENTAL and INCLUDE_ROBUSTNESS env vars and the ti.tags closure. The robustness sweeps now run on every PR (along with their deterministic baselines). Keeps a `/benchmark/` path filter following Piccolissimo.jl's convention — the benchmark subtree has its own Project.toml + deps + workflow, so its @testitems shouldn't be discovered by `@run_package_tests`. This also preemptively unbreaks PR #93's CI (which fails today because benchmark/convergence/convergence.jl is being picked up). Verified locally on Julia 1.12: 355 pass / 1 broken / 0 fail. TimeConsistency sweep: pass_rate = 1.0 (20/20) NonlinearKnotPoint sweep: pass_rate = 0.8 (16/20, threshold 0.65) Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 48 ++++++++----------- .../linear/time_consistency_constraint.jl | 2 +- .../nonlinear/knot_point_constraint.jl | 2 +- test/runtests.jl | 32 ++----------- 4 files changed, 25 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index edb0de4..25c6168 100644 --- a/README.md +++ b/README.md @@ -105,42 +105,32 @@ solve!(prob; max_iter=100) ## Testing -To run the standard test suite (what CI gates every PR on): ```bash julia --project=. test/runtests.jl ``` -`@testitem`s are filtered by tag. 
Two opt-in tags expand coverage: +`runtests.jl` runs every `@testitem` in `src/`, `ext/`, and `test/`. Tests in +`benchmark/` are skipped — that subdirectory ships its own `Project.toml` +(extra deps like `HarmoniqsBenchmarks`) and has a dedicated workflow. -| Env var | Tag | What it adds | -| --------------------------- | --------------- | ----------------------------------------------------------------------------------------------------------- | -| `INCLUDE_EXPERIMENTAL=1` | `:experimental` | Known-flaky tests held out of PR CI. Useful for local diagnosis while a fix is being worked out. | -| `INCLUDE_ROBUSTNESS=1` | `:robustness` | Multi-seed sweeps (K=20) that assert a minimum fraction of seeds pass within tolerance. Per-test threshold chosen with buffer above the observed baseline pass rate — typically 0.80 for clean tests, lower for inherently noisy ones (e.g. finite-difference comparisons). | +### Stochastic / numerical primitives — two-layer testing -```bash -INCLUDE_EXPERIMENTAL=1 julia --project=. test/runtests.jl -INCLUDE_ROBUSTNESS=1 julia --project=. test/runtests.jl -``` + +A single seeded `MersenneTwister` is reproducible on one Julia version but +small downstream numerics can drift across the CI matrix (1.10 / 1.11 / 1.12). +For tests that touch non-deterministic surfaces (solver convergence from +random init, finite-difference derivative comparisons) we write each test as a pair: + +1. **Deterministic baseline**: a single seeded trajectory + multiplier. + A failure is a real regression on a specific (Julia version, seed) pair. +2. **Robustness sweep**: K=20 independent seeds; passes if at least a + threshold fraction of seeds lands within tolerance (per-test threshold, set + with a buffer below the observed pass rate — typically 0.80, lower for + inherently noisy checks like norm-based finite-diff). Detects regressions + that drop the true pass rate well below the threshold with very high + probability (binomial), while staying insensitive to lucky/unlucky single draws.
-### Testing philosophy for stochastic / numerical primitives - -A single `Random.seed!(0)` covers reproducibility on one Julia version but -can drift across the CI matrix (1.10 / 1.11 / 1.12). For tests that touch -non-deterministic surfaces — solver convergence with random initial -conditions, finite-difference derivative comparisons — we use a two-layer -approach: - -1. **Deterministic baseline** (untagged, runs every PR): a single seeded - trajectory + multiplier. A failure here means a real regression on the - specific (Julia version, seed) pair. -2. **Robustness sweep** (`:robustness`, opt-in / nightly): K independent - seeds; pass if ≥80% land within the tolerance. A K=20 sweep detects - regressions that drop the true pass rate below ~80% with very high - probability (binomial), while staying robust against random unlucky - draws on the deterministic baseline. - -When writing a new flaky test, prefer adding both rather than tagging -`:experimental` indefinitely. +The sweeps are cheap enough (a handful of seconds in aggregate) to run on +every PR. 
## Contributing diff --git a/src/constraints/linear/time_consistency_constraint.jl b/src/constraints/linear/time_consistency_constraint.jl index 9645348..05e2e6c 100644 --- a/src/constraints/linear/time_consistency_constraint.jl +++ b/src/constraints/linear/time_consistency_constraint.jl @@ -130,7 +130,7 @@ end @test abs(t[1]) < 1e-8 end -@testitem "TimeConsistencyConstraint free-time robustness sweep" tags=[:robustness] begin +@testitem "TimeConsistencyConstraint free-time robustness sweep" begin include("../../../test/test_utils.jl") using NamedTrajectories using Random diff --git a/src/constraints/nonlinear/knot_point_constraint.jl b/src/constraints/nonlinear/knot_point_constraint.jl index a593c8f..6b99031 100644 --- a/src/constraints/nonlinear/knot_point_constraint.jl +++ b/src/constraints/nonlinear/knot_point_constraint.jl @@ -340,7 +340,7 @@ end test_constraint(NLC2, traj; atol = 1e-3, rng = MersenneTwister(1)) end -@testitem "NonlinearKnotPointConstraint vector syntax robustness sweep" tags=[:robustness] begin +@testitem "NonlinearKnotPointConstraint vector syntax robustness sweep" begin using DirectTrajOpt: CommonInterface using Random diff --git a/test/runtests.jl b/test/runtests.jl index ea12abd..e8eecf5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,31 +3,7 @@ using TestItemRunner include("test_snippets.jl") -# Tag taxonomy controlling which @testitems run in CI. -# -# Defaults (untagged): -# Always run. Fast, deterministic, must pass on every PR. -# -# :experimental -# Known-flaky or environment-sensitive. Excluded by default. -# Opt in with INCLUDE_EXPERIMENTAL=1 for local diagnosis. -# Goal: eventually rewrite as deterministic + :robustness pair, then drop the tag. -# -# :robustness -# Multi-seed sweeps that assert ≥80% of seeds pass within tolerance. -# Excluded by default because they re-solve a problem many times. -# Opt in with INCLUDE_ROBUSTNESS=1 (e.g. nightly / scheduled workflows). 
-# A regression that drops the true pass rate below ~80% will fail this gate -# with very high probability (binomial, K=20). -# -# The filter is a single closure so it stays trivial to absorb into a more -# sophisticated upstream filter (e.g. upfront test-item discovery) later. -const INCLUDE_EXPERIMENTAL = haskey(ENV, "INCLUDE_EXPERIMENTAL") -const INCLUDE_ROBUSTNESS = haskey(ENV, "INCLUDE_ROBUSTNESS") - -@run_package_tests filter = - ti -> begin - tags = get(ti, :tags, Symbol[]) - (INCLUDE_EXPERIMENTAL || !(:experimental in tags)) && - (INCLUDE_ROBUSTNESS || !(:robustness in tags)) - end +# Exclude `benchmark/` testitems from the main test run — they live in a +# subproject with its own Project.toml (different deps, e.g. HarmoniqsBenchmarks) +# and have a dedicated workflow. Match by path component, not "/" (Windows-safe). +@run_package_tests filter = ti -> !("benchmark" in splitpath(dirname(ti.filename)))