From 3c010ea2061cfb75784fbab88dc9f81ab8ed26d5 Mon Sep 17 00:00:00 2001 From: Yan Zaretskiy Date: Tue, 7 Apr 2026 12:18:51 -0700 Subject: [PATCH 01/22] feat(barrier): add SOCP cone primitives and NT kernels Build the cone-local Jordan-product, scaling, and corrector kernels needed for the SOCP barrier path before augmented-system integration. --- cpp/src/barrier/second_order_cone.cuh | 919 ++++++++++++ cpp/tests/dual_simplex/CMakeLists.txt | 3 +- .../unit_tests/second_order_cone_test.cu | 1280 +++++++++++++++++ 3 files changed, 2201 insertions(+), 1 deletion(-) create mode 100644 cpp/src/barrier/second_order_cone.cuh create mode 100644 cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu diff --git a/cpp/src/barrier/second_order_cone.cuh b/cpp/src/barrier/second_order_cone.cuh new file mode 100644 index 0000000000..eb337b9ce1 --- /dev/null +++ b/cpp/src/barrier/second_order_cone.cuh @@ -0,0 +1,919 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include + +#include + +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cuopt::linear_programming::dual_simplex { + +// --------------------------------------------------------------------------- +// Shared reduction primitives +// --------------------------------------------------------------------------- + +template +using triplet_t = cuda::std::tuple; + +template +struct triplet_sum { + DI triplet_t operator()(const triplet_t& lhs, const triplet_t& rhs) const + { + const auto& [v0_l, v1_l, v2_l] = lhs; + const auto& [v0_r, v1_r, v2_r] = rhs; + return {v0_l + v0_r, v1_l + v1_r, v2_l + v2_r}; + } +}; + +template +using block_reduce_t = cub::BlockReduce; + +template +struct smem_reduce_t { + using ScalarReduce = block_reduce_t; + using TripletReduce = block_reduce_t, BLOCK_DIM>; + + union { + typename ScalarReduce::TempStorage scalar_temp; + typename TripletReduce::TempStorage triplet_temp; + f_t scalar_broadcast; + triplet_t triplet_broadcast; + }; +}; + +// --------------------------------------------------------------------------- +// reduce_broadcast: block-reduce a value, then broadcast to all threads. 
// ---------------------------------------------------------------------------

// Block-wide sum of a scalar; every thread in the CTA receives the aggregate.
template <typename f_t, int BLOCK_DIM>
DI f_t reduce_broadcast(f_t val, smem_reduce_t<f_t, BLOCK_DIM>& s)
{
  f_t agg = typename smem_reduce_t<f_t, BLOCK_DIM>::ScalarReduce(s.scalar_temp).Sum(val);
  // The broadcast slot aliases the CUB temp storage (union), so separate the
  // reduction from the broadcast write, and the write from the reads, with
  // full block barriers.
  __syncthreads();
  if (threadIdx.x == 0) { s.scalar_broadcast = agg; }
  __syncthreads();
  return s.scalar_broadcast;
}

// Block-wide component-wise sum of a triplet; every thread receives the aggregate.
template <typename f_t, int BLOCK_DIM>
DI triplet_t<f_t> reduce_broadcast(triplet_t<f_t> val, smem_reduce_t<f_t, BLOCK_DIM>& s)
{
  auto agg = typename smem_reduce_t<f_t, BLOCK_DIM>::TripletReduce(s.triplet_temp)
               .Reduce(val, triplet_sum<f_t>{});
  __syncthreads();
  if (threadIdx.x == 0) { s.triplet_broadcast = agg; }
  __syncthreads();
  return s.triplet_broadcast;
}

// Per-warp reduction storage: one CUB temp-storage slot and one broadcast slot
// per warp, overlaid in a union (they are never live at the same time).
template <typename f_t, int BLOCK_DIM>
struct smem_warp_reduce_t {
  static constexpr int warps_per_block = BLOCK_DIM / 32;

  using ScalarReduce  = cub::WarpReduce<f_t, 32>;
  using TripletReduce = cub::WarpReduce<triplet_t<f_t>, 32>;

  union {
    typename ScalarReduce::TempStorage scalar_temp[warps_per_block];
    typename TripletReduce::TempStorage triplet_temp[warps_per_block];
    f_t scalar_broadcast[warps_per_block];
    triplet_t<f_t> triplet_broadcast[warps_per_block];
  };
};

// ---------------------------------------------------------------------------
// reduce_broadcast: warp-reduce a value, then broadcast within the warp.
// ---------------------------------------------------------------------------

template <typename f_t, int BLOCK_DIM>
DI f_t reduce_broadcast(f_t val, smem_warp_reduce_t<f_t, BLOCK_DIM>& s)
{
  static_assert(BLOCK_DIM % 32 == 0, "Warp reduce requires warp-aligned CTAs");

  int warp = threadIdx.x >> 5;
  int lane = threadIdx.x & 31;
  f_t agg =
    typename smem_warp_reduce_t<f_t, BLOCK_DIM>::ScalarReduce(s.scalar_temp[warp]).Sum(val);
  if (lane == 0) { s.scalar_broadcast[warp] = agg; }
  __syncwarp();
  return s.scalar_broadcast[warp];
}

template <typename f_t, int BLOCK_DIM>
DI triplet_t<f_t> reduce_broadcast(triplet_t<f_t> val, smem_warp_reduce_t<f_t, BLOCK_DIM>& s)
{
  static_assert(BLOCK_DIM % 32 == 0, "Warp reduce requires warp-aligned CTAs");

  int warp = threadIdx.x >> 5;
  int lane = threadIdx.x & 31;
  auto agg = typename smem_warp_reduce_t<f_t, BLOCK_DIM>::TripletReduce(s.triplet_temp[warp])
               .Reduce(val, triplet_sum<f_t>{});
  if (lane == 0) { s.triplet_broadcast[warp] = agg; }
  __syncwarp();
  return s.triplet_broadcast[warp];
}

// ---------------------------------------------------------------------------
// Apply H^{-1} to one vector per cone (one thread-block per cone).
//
// H^{-1}z = (1/η)(w̄₀z₀ − ζ, z₁ + (−z₀ + ζ/(1+w̄₀))w̄₁), ζ = w̄₁ᵀz₁
// ---------------------------------------------------------------------------
template <typename i_t, typename f_t, int BLOCK_DIM>
__global__ __launch_bounds__(BLOCK_DIM) void apply_Hinv_kernel(const f_t* __restrict__ z,
                                                               f_t* __restrict__ out,
                                                               const f_t* __restrict__ w_bar,
                                                               const f_t* __restrict__ inv_eta,
                                                               const f_t* __restrict__ inv_1pw0,
                                                               const i_t* __restrict__ cone_offsets,
                                                               i_t K)
{
  __shared__ smem_reduce_t<f_t, BLOCK_DIM> smem;

  i_t cone = static_cast<i_t>(blockIdx.x);
  if (cone >= K) return;

  i_t off           = cone_offsets[cone];
  i_t q             = cone_offsets[cone + 1] - off;
  const f_t* w_cone = w_bar + off;
  const f_t* z_cone = z + off;
  f_t* out_cone     = out + off;

  f_t z0 = z_cone[0];
  f_t w0 = w_cone[0];

  // Phase 1: ζ = w̄₁ᵀ z₁
  f_t partial = f_t(0);
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    partial += w_cone[j] * z_cone[j];
  }
  f_t zeta = reduce_broadcast(partial, smem);

  // Phase 2: element-wise output
  f_t ie    = inv_eta[cone];
  f_t ipw   = inv_1pw0[cone];
  f_t coeff = -z0 + zeta * ipw;

  if (threadIdx.x == 0) { out_cone[0] = (w0 * z0 - zeta) * ie; }
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    out_cone[j] = (z_cone[j] + coeff * w_cone[j]) * ie;
  }
}

// ---------------------------------------------------------------------------
// Apply H^{-2} to one vector per cone (one thread-block per cone).
//
// H^{-2}v = η⁻²(2u(uᵀv) − Jv), u = Jw̄, J = diag(1,−1,…,−1).
//
// One dot product (uᵀv) plus element-wise work — same structure as apply_Hinv.
// ---------------------------------------------------------------------------
template <typename i_t, typename f_t, int BLOCK_DIM>
__global__ __launch_bounds__(BLOCK_DIM) void apply_Hinv2_kernel(
  const f_t* __restrict__ v,
  f_t* __restrict__ out,
  const f_t* __restrict__ w_bar,
  const f_t* __restrict__ inv_eta,
  const i_t* __restrict__ cone_offsets,
  i_t K)
{
  __shared__ smem_reduce_t<f_t, BLOCK_DIM> smem;

  i_t cone = static_cast<i_t>(blockIdx.x);
  if (cone >= K) return;

  i_t off           = cone_offsets[cone];
  i_t q             = cone_offsets[cone + 1] - off;
  const f_t* w_cone = w_bar + off;
  const f_t* v_cone = v + off;
  f_t* out_cone     = out + off;

  f_t v0 = v_cone[0];
  f_t w0 = w_cone[0];

  // Phase 1: uᵀv = w̄₀v₀ − Σ w̄_j v_j (tail dot, then subtract from head)
  f_t partial = f_t(0);
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    partial += w_cone[j] * v_cone[j];
  }
  f_t tail_dot = reduce_broadcast(partial, smem);
  f_t uTv      = w0 * v0 - tail_dot;

  // Phase 2: element-wise output
  f_t ie_sq = inv_eta[cone] * inv_eta[cone];
  f_t coeff = f_t(2) * uTv * ie_sq;

  if (threadIdx.x == 0) { out_cone[0] = coeff * w0 - ie_sq * v0; }
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    out_cone[j] = -coeff * w_cone[j] + ie_sq * v_cone[j];
  }
}

// ---------------------------------------------------------------------------
// Cone-algebra primitives for the deferred combined-step corrector:
//   r_K  = omega circ omega + dx_scaled circ dz_scaled - sigma mu e
//   corr = omega \ r_K
//   t_K  = H^{-1} corr
// ---------------------------------------------------------------------------

// ---------------------------------------------------------------------------
// Jordan product for packed SOC vectors (one CTA per cone).
//
// For a, b in Q^q: (a ∘ b)_0 = a^T b, (a ∘ b)_j = a_0 b_j + b_0 a_j.
// ---------------------------------------------------------------------------
template <typename i_t, typename f_t, int BLOCK_DIM>
__global__ __launch_bounds__(BLOCK_DIM) void jordan_product_kernel(
  const f_t* __restrict__ a,
  const f_t* __restrict__ b,
  f_t* __restrict__ out,
  const i_t* __restrict__ cone_offsets,
  i_t K)
{
  __shared__ smem_reduce_t<f_t, BLOCK_DIM> smem;

  i_t cone = static_cast<i_t>(blockIdx.x);
  if (cone >= K) return;

  i_t off           = cone_offsets[cone];
  i_t q             = cone_offsets[cone + 1] - off;
  const f_t* a_cone = a + off;
  const f_t* b_cone = b + off;
  f_t* out_cone     = out + off;

  f_t a0 = a_cone[0];
  f_t b0 = b_cone[0];

  // Head component needs the full dot product of the tails.
  f_t partial = f_t(0);
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    partial += a_cone[j] * b_cone[j];
  }
  f_t tail_dot = reduce_broadcast(partial, smem);

  if (threadIdx.x == 0) { out_cone[0] = a0 * b0 + tail_dot; }

  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    out_cone[j] = a0 * b_cone[j] + b0 * a_cone[j];
  }
}

// ---------------------------------------------------------------------------
// Inverse Jordan product for packed SOC vectors (one CTA per cone).
//
// For omega in int(Q^q) and vector r,
//   (omega \ r)_0 = (omega_0 r_0 − nu) / rho
//   (omega \ r)_j = ((nu/omega_0 − r_0)/rho) omega_j + r_j/omega_0
// where nu = omega_1^T r_1 and rho = ||omega||_J^2 (stored per-cone).
// ---------------------------------------------------------------------------
template <typename i_t, typename f_t, int BLOCK_DIM>
__global__ __launch_bounds__(BLOCK_DIM) void inverse_jordan_product_kernel(
  const f_t* __restrict__ omega,
  const f_t* __restrict__ r,
  const f_t* __restrict__ rho,
  f_t* __restrict__ out,
  const i_t* __restrict__ cone_offsets,
  i_t K)
{
  __shared__ smem_reduce_t<f_t, BLOCK_DIM> smem;

  i_t cone = static_cast<i_t>(blockIdx.x);
  if (cone >= K) return;

  i_t off               = cone_offsets[cone];
  i_t q                 = cone_offsets[cone + 1] - off;
  const f_t* omega_cone = omega + off;
  const f_t* r_cone     = r + off;
  f_t* out_cone         = out + off;

  f_t omega_0 = omega_cone[0];
  f_t r_0     = r_cone[0];

  // nu = omega_1^T r_1 (tail dot product).
  f_t partial = f_t(0);
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    partial += omega_cone[j] * r_cone[j];
  }
  f_t nu = reduce_broadcast(partial, smem);

  f_t rho_val   = rho[cone];
  f_t inv_rho   = f_t(1) / rho_val;
  f_t c_omega_j = ((nu / omega_0) - r_0) * inv_rho;
  f_t c_r_j     = f_t(1) / omega_0;

  if (threadIdx.x == 0) { out_cone[0] = (omega_0 * r_0 - nu) * inv_rho; }
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    out_cone[j] = c_omega_j * omega_cone[j] + c_r_j * r_cone[j];
  }
}

// ---------------------------------------------------------------------------
// Fused corrector for the combined-step SOC correction (one CTA per cone).
//
// Computes in a single kernel launch:
//   1. dx   = H^{-1} Δx_aff        (affine scaled direction)
//   2. dz   = −ω − dx              (complementary direction)
//   3. r_K  = ω∘ω + dx∘dz − σμ e   (combined cone residual)
//   4. corr = ω \ r_K              (inverse Jordan product)
//   5. t_K  = H^{-1} corr          (corrector for reduced RHS)
//
// Uses the `out` buffer as scratch (holds dx during phases 1–3) and writes
// the final t_K there, so zero extra temporary buffers are needed.
//
// Algebraic shortcut: the triplet (Σ ω_j², Σ ω_j dx_j, Σ dx_j²) computed
// for r_K_0 also yields ν = Σ ω_j r_K_j via a linear combination, avoiding
// a fourth reduction pass.
// ---------------------------------------------------------------------------
template <typename i_t, typename f_t, int BLOCK_DIM>
__global__ __launch_bounds__(BLOCK_DIM) void fused_corrector_kernel(
  const f_t* __restrict__ dx_aff,
  const f_t* __restrict__ omega,
  const f_t* __restrict__ w_bar,
  const f_t* __restrict__ inv_eta,
  const f_t* __restrict__ inv_1pw0,
  const f_t* __restrict__ rho,
  f_t sigma_mu,
  f_t* __restrict__ out,
  const i_t* __restrict__ cone_offsets,
  i_t K)
{
  __shared__ smem_reduce_t<f_t, BLOCK_DIM> smem;

  i_t cone = static_cast<i_t>(blockIdx.x);
  if (cone >= K) return;

  i_t off               = cone_offsets[cone];
  i_t q                 = cone_offsets[cone + 1] - off;
  const f_t* dx_a       = dx_aff + off;
  const f_t* omega_cone = omega + off;
  const f_t* w_cone     = w_bar + off;
  f_t* out_cone         = out + off;

  f_t ie      = inv_eta[cone];
  f_t ipw     = inv_1pw0[cone];
  f_t rho_val = rho[cone];
  f_t omega_0 = omega_cone[0];
  f_t w_0     = w_cone[0];
  f_t dx_a_0  = dx_a[0];

  // =================================================================
  // Phase A — reduce ζ = Σ_{j≥1} w̄_j (Δx_aff)_j for H^{-1}
  // =================================================================
  f_t partial = f_t(0);
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    partial += w_cone[j] * dx_a[j];
  }
  f_t zeta = reduce_broadcast(partial, smem);

  f_t dx_0    = (w_0 * dx_a_0 - zeta) * ie;
  f_t coeff_a = -dx_a_0 + zeta * ipw;
  f_t dz_0    = -omega_0 - dx_0;

  // =================================================================
  // Phase A→B — write dx to out; accumulate (A, B, C) for r_K and ν
  //   A = Σ ω_j², B = Σ ω_j dx_j, C = Σ dx_j²  (j ≥ 1)
  // =================================================================
  auto trip             = triplet_t<f_t>{};
  auto& [A_p, B_p, C_p] = trip;
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    f_t dx_j    = (dx_a[j] + coeff_a * w_cone[j]) * ie;
    out_cone[j] = dx_j;
    f_t omega_j = omega_cone[j];
    A_p += omega_j * omega_j;
    B_p += omega_j * dx_j;
    C_p += dx_j * dx_j;
  }
  auto [A, B, C] = reduce_broadcast(trip, smem);

  // =================================================================
  // Phase B — form r_K_0, derive ν, then inverse-Jordan scalars
  // =================================================================
  f_t r_K_0 = (omega_0 * omega_0 + A) + (dx_0 * dz_0 - B - C) - sigma_mu;
  f_t nu    = (f_t(2) * omega_0 - dx_0) * A - (omega_0 + f_t(2) * dx_0) * B;

  f_t inv_rho     = f_t(1) / rho_val;
  f_t corr_0      = (omega_0 * r_K_0 - nu) * inv_rho;
  f_t inv_omega_0 = f_t(1) / omega_0;
  f_t c_inv       = (nu * inv_omega_0 - r_K_0) * inv_rho;
  f_t p1          = c_inv + f_t(2) - dx_0 * inv_omega_0;
  f_t p2          = -(f_t(1) + f_t(2) * dx_0 * inv_omega_0);

  // =================================================================
  // Phase B→C — accumulate ζ₂ = Σ_{j≥1} w̄_j corr_j for final H^{-1}
  //   corr_j = p1 ω_j + p2 dx_j  (dx_j still in out_cone[j])
  // =================================================================
  f_t partial2 = f_t(0);
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    f_t corr_j = p1 * omega_cone[j] + p2 * out_cone[j];
    partial2 += w_cone[j] * corr_j;
  }
  f_t zeta2 = reduce_broadcast(partial2, smem);

  // =================================================================
  // Phase C — write t_K = H^{-1}(corr)
  // =================================================================
  f_t coeff_c = -corr_0 + zeta2 * ipw;

  if (threadIdx.x == 0) { out_cone[0] = (w_0 * corr_0 - zeta2) * ie; }
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    // Each thread re-derives corr_j from the dx_j it wrote itself, then
    // overwrites that same slot — no cross-thread hazard.
    f_t corr_j  = p1 * omega_cone[j] + p2 * out_cone[j];
    out_cone[j] = (corr_j + coeff_c * w_cone[j]) * ie;
  }
}

// ---------------------------------------------------------------------------
// Compute NT scaling from (s, lambda).
//
// Medium/large cones use one CTA per cone and stream s/lambda twice:
//   Pass 1: reduce ||s_1||^2, ||lambda_1||^2, and s^T lambda.
//   Pass 2: compute omega/w_bar directly from raw inputs and reduce ||w_bar_1||^2.
//
// Small cones (q <= 32) use one warp per cone and keep one element per lane in
// registers for the whole computation. In both paths, shared memory only stores
// per-warp partial reductions plus a small scalar broadcast struct.
// ---------------------------------------------------------------------------

constexpr int small_cone_limit  = 32;
constexpr int medium_cone_limit = 2048;
constexpr int small_block_dim   = 64;
constexpr int medium_block_dim  = 128;
constexpr int large_block_dim   = 256;

// Per-cone NT coefficients broadcast from the owning thread/lane to the rest
// of the CTA/warp for the second streaming pass.
template <typename f_t>
struct nt_broadcast_coeffs {
  f_t w_from_s;            // w_bar tail: coefficient of s_j
  f_t w_from_lambda;       // w_bar tail: coefficient of lambda_j
  f_t omega_s_coeff;       // omega tail: coefficient of s_j
  f_t omega_lambda_coeff;  // omega tail: coefficient of lambda_j
};

template <typename f_t, int BLOCK_DIM>
struct nt_block_storage {
  smem_reduce_t<f_t, BLOCK_DIM> reduce;
  nt_broadcast_coeffs<f_t> coeffs;
};

template <typename f_t, int BLOCK_DIM>
struct nt_warp_storage {
  static constexpr int warps_per_block = BLOCK_DIM / 32;

  smem_warp_reduce_t<f_t, BLOCK_DIM> reduce;
  nt_broadcast_coeffs<f_t> coeffs[warps_per_block];
};

template <typename i_t, typename f_t, int BLOCK_DIM>
__global__ __launch_bounds__(BLOCK_DIM) void nt_scaling_kernel(const f_t* __restrict__ s,
                                                               const f_t* __restrict__ lambda,
                                                               f_t* __restrict__ eta,
                                                               f_t* __restrict__ inv_eta,
                                                               f_t* __restrict__ inv_1pw0,
                                                               f_t* __restrict__ w_bar,
                                                               f_t* __restrict__ omega,
                                                               f_t* __restrict__ rho,
                                                               const i_t* __restrict__ cone_offsets,
                                                               const i_t* __restrict__ cone_ids,
                                                               i_t num_cones)
{
  static_assert(BLOCK_DIM % 32 == 0, "NT scaling kernel requires warp-aligned BLOCK_DIM");
  __shared__ nt_block_storage<f_t, BLOCK_DIM> storage;

  i_t cone_idx = static_cast<i_t>(blockIdx.x);
  if (cone_idx >= num_cones) return;

  i_t cone = cone_ids[cone_idx];
  i_t off  = cone_offsets[cone];
  i_t q    = cone_offsets[cone + 1] - off;

  f_t s0 = s[off];
  f_t l0 = lambda[off];

  // Pass 1: tail reductions over s and lambda.
  auto partial                   = triplet_t<f_t>{};
  auto& [s1_sq_p, l1_sq_p, sl_p] = partial;
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    f_t sj = s[off + j];
    f_t lj = lambda[off + j];
    s1_sq_p += sj * sj;
    l1_sq_p += lj * lj;
    sl_p += sj * lj;
  }

  auto [s1_sq, l1_sq, sl] = reduce_broadcast(partial, storage.reduce);
  f_t owner_eta     = f_t(0);
  f_t owner_inv_eta = f_t(0);
  f_t owner_rho     = f_t(0);
  f_t owner_omega_0 = f_t(0);
  if (threadIdx.x == 0) {
    // Clamp radicands to zero: near the cone boundary, roundoff can make these
    // slightly negative.
    f_t s_J       = sqrt(max(f_t(0), s0 * s0 - s1_sq));
    f_t l_J       = sqrt(max(f_t(0), l0 * l0 - l1_sq));
    f_t inv_s_J   = f_t(1) / s_J;
    f_t inv_l_J   = f_t(1) / l_J;
    owner_rho     = s_J * l_J;
    owner_eta     = sqrt(s_J / l_J);
    owner_inv_eta = f_t(1) / owner_eta;
    f_t scale     = sqrt(owner_rho);

    f_t s_dot_l = (s0 * l0 + sl) * inv_s_J * inv_l_J;
    f_t gamma   = sqrt(max(f_t(0), (f_t(1) + s_dot_l) * f_t(0.5)));
    f_t inv_2g  = f_t(1) / (f_t(2) * gamma);
    f_t sb0     = s0 * inv_s_J;
    f_t lb0     = l0 * inv_l_J;
    f_t D       = sb0 + lb0 + f_t(2) * gamma;
    f_t inv_D   = f_t(1) / D;
    f_t c_s     = (gamma + sb0) * inv_D;
    f_t c_l     = (gamma + lb0) * inv_D;

    storage.coeffs.w_from_s      = inv_2g * inv_s_J;
    storage.coeffs.w_from_lambda = -inv_2g * inv_l_J;
    // Name these by the raw tail element they multiply:
    //   omega_j = omega_s_coeff * s_j + omega_lambda_coeff * lambda_j.
    // The closed-form NT expression is cross-coupled, so c_l multiplies s_j
    // and c_s multiplies lambda_j.
    storage.coeffs.omega_s_coeff      = scale * c_l * inv_s_J;
    storage.coeffs.omega_lambda_coeff = scale * c_s * inv_l_J;
    owner_omega_0                     = gamma * scale;
  }
  __syncthreads();

  // Pass 2: materialize the w_bar / omega tails and reduce ||w_bar_1||^2.
  f_t w1_sq_partial = f_t(0);
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    f_t sj         = s[off + j];
    f_t lj         = lambda[off + j];
    f_t wj         = storage.coeffs.w_from_s * sj + storage.coeffs.w_from_lambda * lj;
    w_bar[off + j] = wj;
    omega[off + j] = storage.coeffs.omega_s_coeff * sj + storage.coeffs.omega_lambda_coeff * lj;
    w1_sq_partial += wj * wj;
  }

  f_t w1_sq = reduce_broadcast(w1_sq_partial, storage.reduce);
  if (threadIdx.x == 0) {
    f_t w0         = sqrt(f_t(1) + w1_sq);
    omega[off]     = owner_omega_0;
    w_bar[off]     = w0;
    eta[cone]      = owner_eta;
    inv_eta[cone]  = owner_inv_eta;
    inv_1pw0[cone] = f_t(1) / (f_t(1) + w0);
    rho[cone]      = owner_rho;
  }
}

template <typename i_t, typename f_t, int BLOCK_DIM>
__global__ __launch_bounds__(BLOCK_DIM) void nt_scaling_small_kernel(
  const f_t* __restrict__ s,
  const f_t* __restrict__ lambda,
  f_t* __restrict__ eta,
  f_t* __restrict__ inv_eta,
  f_t* __restrict__ inv_1pw0,
  f_t* __restrict__ w_bar,
  f_t* __restrict__ omega,
  f_t* __restrict__ rho,
  const i_t* __restrict__ cone_offsets,
  const i_t* __restrict__ cone_ids,
  i_t num_cones)
{
  static_assert(BLOCK_DIM % 32 == 0, "Small-cone NT kernel requires warp-aligned CTAs");
  __shared__ nt_warp_storage<f_t, BLOCK_DIM> storage;

  constexpr int warps_per_block = BLOCK_DIM / 32;
  i_t warp_idx =
    static_cast<i_t>(blockIdx.x) * warps_per_block + static_cast<i_t>(threadIdx.x >> 5);
  // Uniform per warp: all 32 lanes share warp_idx, so the early return does
  // not split a warp.
  if (warp_idx >= num_cones) return;

  int warp = threadIdx.x >> 5;
  int lane = threadIdx.x & 31;
  i_t cone = cone_ids[warp_idx];
  i_t off  = cone_offsets[cone];
  i_t q    = cone_offsets[cone + 1] - off;

  // One cone element per lane; q <= 32 by construction of the small bucket.
  f_t sj = (lane < q) ? s[off + lane] : f_t(0);
  f_t lj = (lane < q) ? lambda[off + lane] : f_t(0);

  auto partial            = triplet_t<f_t>{(lane > 0 && lane < q) ? sj * sj : f_t(0),
                                           (lane > 0 && lane < q) ? lj * lj : f_t(0),
                                           (lane > 0 && lane < q) ? sj * lj : f_t(0)};
  auto [s1_sq, l1_sq, sl] = reduce_broadcast(partial, storage.reduce);

  f_t owner_eta     = f_t(0);
  f_t owner_inv_eta = f_t(0);
  f_t owner_rho     = f_t(0);
  f_t owner_omega_0 = f_t(0);

  if (lane == 0) {
    f_t s0        = sj;
    f_t l0        = lj;
    f_t s_J       = sqrt(max(f_t(0), s0 * s0 - s1_sq));
    f_t l_J       = sqrt(max(f_t(0), l0 * l0 - l1_sq));
    f_t inv_s_J   = f_t(1) / s_J;
    f_t inv_l_J   = f_t(1) / l_J;
    owner_rho     = s_J * l_J;
    owner_eta     = sqrt(s_J / l_J);
    owner_inv_eta = f_t(1) / owner_eta;
    f_t scale     = sqrt(owner_rho);

    f_t s_dot_l = (s0 * l0 + sl) * inv_s_J * inv_l_J;
    f_t gamma   = sqrt(max(f_t(0), (f_t(1) + s_dot_l) * f_t(0.5)));
    f_t inv_2g  = f_t(1) / (f_t(2) * gamma);
    f_t sb0     = s0 * inv_s_J;
    f_t lb0     = l0 * inv_l_J;
    f_t D       = sb0 + lb0 + f_t(2) * gamma;
    f_t inv_D   = f_t(1) / D;
    f_t c_s     = (gamma + sb0) * inv_D;
    f_t c_l     = (gamma + lb0) * inv_D;

    storage.coeffs[warp].w_from_s            = inv_2g * inv_s_J;
    storage.coeffs[warp].w_from_lambda       = -inv_2g * inv_l_J;
    storage.coeffs[warp].omega_s_coeff       = scale * c_l * inv_s_J;
    storage.coeffs[warp].omega_lambda_coeff  = scale * c_s * inv_l_J;
    owner_omega_0                            = gamma * scale;
  }
  __syncwarp();

  f_t w1_sq = f_t(0);
  if (lane > 0 && lane < q) {
    f_t wj            = storage.coeffs[warp].w_from_s * sj + storage.coeffs[warp].w_from_lambda * lj;
    w_bar[off + lane] = wj;
    omega[off + lane] =
      storage.coeffs[warp].omega_s_coeff * sj + storage.coeffs[warp].omega_lambda_coeff * lj;
    w1_sq = wj * wj;
  }
  w1_sq = reduce_broadcast(w1_sq, storage.reduce);

  if (lane == 0) {
    f_t w0         = sqrt(f_t(1) + w1_sq);
    omega[off]     = owner_omega_0;
    w_bar[off]     = w0;
    eta[cone]      = owner_eta;
    inv_eta[cone]  = owner_inv_eta;
    inv_1pw0[cone] = f_t(1) / (f_t(1) + w0);
    rho[cone]      = owner_rho;
  }
}

// ---------------------------------------------------------------------------
// Step length for a single (u, du) pair in Q^q.
//
// Finds the largest alpha in [0, alpha_max] such that u + alpha*du in Q^q.
// The cone condition u_0 + alpha*du_0 >= ||u_1 + alpha*du_1|| reduces to a
// linear test plus a quadratic a*alpha^2 + 2b*alpha + c >= 0 where
//   a = du_0^2 - ||du_1||^2, b = u_0*du_0 - u_1^T du_1, c = u_0^2 - ||u_1||^2.
// ---------------------------------------------------------------------------
template <typename i_t, typename f_t, int BLOCK_DIM>
DI f_t
cone_step_length_single(const f_t* __restrict__ u,
                        const f_t* __restrict__ du,
                        i_t q,
                        typename block_reduce_t<triplet_t<f_t>, BLOCK_DIM>::TempStorage& temp,
                        f_t alpha)
{
  auto partial                       = triplet_t<f_t>{};
  auto& [du1_sq_p, u1du1_p, u1_sq_p] = partial;
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    f_t uj  = u[j];
    f_t duj = du[j];
    du1_sq_p += duj * duj;
    u1du1_p += uj * duj;
    u1_sq_p += uj * uj;
  }

  auto [du1_sq, u1du1, u1_sq] =
    block_reduce_t<triplet_t<f_t>, BLOCK_DIM>(temp).Reduce(partial, triplet_sum<f_t>{});
  // Barrier so the caller may immediately reuse `temp` for another reduction.
  __syncthreads();

  if (threadIdx.x == 0) {
    f_t a    = du[0] * du[0] - du1_sq;
    f_t b    = u[0] * du[0] - u1du1;
    f_t c    = max(f_t(0), u[0] * u[0] - u1_sq);
    f_t disc = b * b - a * c;

    // Linear constraint: u_0 + alpha * du_0 >= 0.
    if (du[0] < f_t(0)) { alpha = min(alpha, -u[0] / du[0]); }

    // Quadratic constraint.
    if ((a > f_t(0) && b > f_t(0)) || disc < f_t(0)) {
      // No positive root (parabola stays non-negative for alpha > 0).
    } else if (a == f_t(0)) {
      // Degenerate: 2b*alpha + c = 0.
      if (b < f_t(0)) { alpha = min(alpha, c / (f_t(-2) * b)); }
    } else if (c == f_t(0)) {
      // Starting exactly on the cone boundary: take a full step only if the
      // direction stays in the cone, otherwise the maximum feasible step is 0.
      alpha = (a >= f_t(0)) ? alpha : f_t(0);
    } else {
      // Numerically stable root pair: t avoids cancellation via copysign.
      f_t t  = -(b + copysign(sqrt(disc), b));
      f_t r1 = c / t;
      f_t r2 = t / a;
      if (r1 < f_t(0)) { r1 = alpha; }
      if (r2 < f_t(0)) { r2 = alpha; }
      alpha = min(alpha, min(r1, r2));
    }
  }
  // Only thread 0 holds the tightened alpha; other threads return the input.
  return alpha;
}

// ---------------------------------------------------------------------------
// Cone step length kernel (one block per cone).
//
// Computes, for each cone i, the largest alpha in [0, alpha_max] such that
// s_i + alpha * ds_i in Q^{q_i} AND lambda_i + alpha * dlambda_i in Q^{q_i}.
// The per-cone result is written to alpha[i].
// ---------------------------------------------------------------------------
template <typename i_t, typename f_t, int BLOCK_DIM>
__global__ __launch_bounds__(BLOCK_DIM) void step_length_kernel(
  const f_t* __restrict__ s,
  const f_t* __restrict__ ds,
  const f_t* __restrict__ lambda,
  const f_t* __restrict__ dlambda,
  f_t* __restrict__ alpha,
  const i_t* __restrict__ cone_offsets,
  i_t K,
  f_t alpha_max)
{
  __shared__ typename block_reduce_t<triplet_t<f_t>, BLOCK_DIM>::TempStorage temp_storage;

  i_t cone = static_cast<i_t>(blockIdx.x);
  if (cone >= K) return;

  i_t off = cone_offsets[cone];
  i_t q   = cone_offsets[cone + 1] - off;

  // BLOCK_DIM is non-deducible from the TempStorage reference, so pass the
  // template arguments explicitly.
  f_t alpha_s =
    cone_step_length_single<i_t, f_t, BLOCK_DIM>(s + off, ds + off, q, temp_storage, alpha_max);
  f_t alpha_l = cone_step_length_single<i_t, f_t, BLOCK_DIM>(
    lambda + off, dlambda + off, q, temp_storage, alpha_max);

  // Both partial results are only valid on thread 0, which owns the write.
  if (threadIdx.x == 0) { alpha[cone] = min(alpha_s, alpha_l); }
}

// ---------------------------------------------------------------------------
// Shift u into int(Q^q) if it is not already interior (one block per cone).
//
// alpha(u) = ||u_1|| - u_0. If alpha >= 0 (u on boundary or outside):
//   u_0 <- u_0 + 1 + max(0, alpha)  (shift along identity element e)
//
// Modifies u in place. Used once during initial-point computation.
// ---------------------------------------------------------------------------
template <typename i_t, typename f_t, int BLOCK_DIM>
__global__ __launch_bounds__(BLOCK_DIM) void interior_shift_kernel(
  f_t* __restrict__ u, const i_t* __restrict__ cone_offsets, i_t K)
{
  __shared__ typename block_reduce_t<f_t, BLOCK_DIM>::TempStorage temp_storage;

  i_t cone = static_cast<i_t>(blockIdx.x);
  if (cone >= K) return;

  i_t off = cone_offsets[cone];
  i_t q   = cone_offsets[cone + 1] - off;

  // Reduce ||u_1||^2; the aggregate is valid on thread 0 only, which is the
  // sole reader below.
  f_t tail_sq = f_t(0);
  for (i_t j = 1 + static_cast<i_t>(threadIdx.x); j < q; j += BLOCK_DIM) {
    f_t v = u[off + j];
    tail_sq += v * v;
  }
  tail_sq = block_reduce_t<f_t, BLOCK_DIM>(temp_storage).Sum(tail_sq);

  if (threadIdx.x == 0) {
    f_t u1_norm = sqrt(tail_sq);
    f_t gap     = u1_norm - u[off];
    if (gap >= f_t(0)) { u[off] += f_t(1) + gap; }
  }
}

/**
 * Owns device storage for second-order cone topology, iterates, and NT scaling.
 *
 * Flat arrays are packed by cone: elements [cone_offsets[i], cone_offsets[i+1])
 * belong to cone i, which has dimension cone_dims[i].
 *
 * Search directions, RHS vectors, and workspace live directly in
 * iteration_data_t (matching the existing LP/QP pattern where dx_aff, dual_rhs,
 * etc. are all top-level fields of iteration_data_t).
+ */ +template +struct cone_data_t { + // --- Topology (set once at construction) --- + i_t K; // number of second-order cones + i_t m_c; // total cone dimension = sum of cone_dims + + rmm::device_uvector cone_offsets; // [K+1] prefix sums of cone_dims + rmm::device_uvector cone_dims; // [K] dimension q_i of each cone + + // --- Primal/dual cone iterates (rewritten each iteration) --- + rmm::device_uvector s; // [m_c] cone slack: s_i in int(Q^{q_i}) + rmm::device_uvector lambda; // [m_c] cone dual: lambda_i in int(Q^{q_i}) + + // --- NT scaling state (recomputed each iteration from s, lambda) --- + rmm::device_uvector eta; // [K] scaling factor eta_i = (||s_i||_J / ||lambda_i||_J)^{1/2} + rmm::device_uvector inv_eta; // [K] cached 1/eta_i + rmm::device_uvector inv_1pw0; // [K] cached 1/(1 + wbar_0_i) + rmm::device_uvector w_bar; // [m_c] NT scaling direction, unit J-norm, packed by cone + rmm::device_uvector omega; // [m_c] scaled variable omega_i = H_i^{-1} s_i, packed by cone + rmm::device_uvector rho; // [K] ||omega_i||^2_J = ||s_i||_J * ||lambda_i||_J + rmm::device_uvector small_cone_ids; // [n_small] cone ids with q <= 32 + rmm::device_uvector medium_cone_ids; // [n_medium] cone ids with 32 < q <= 2048 + rmm::device_uvector large_cone_ids; // [n_large] cone ids with q > 2048 + + cone_data_t(i_t K_in, const std::vector& dims, rmm::cuda_stream_view stream) + : K(K_in), + m_c(std::accumulate(dims.begin(), dims.end(), i_t(0))), + cone_offsets(K_in + 1, stream), + cone_dims(K_in, stream), + s(m_c, stream), + lambda(m_c, stream), + eta(K_in, stream), + inv_eta(K_in, stream), + inv_1pw0(K_in, stream), + w_bar(m_c, stream), + omega(m_c, stream), + rho(K_in, stream), + small_cone_ids(0, stream), + medium_cone_ids(0, stream), + large_cone_ids(0, stream) + { + std::vector offsets(K + 1, 0); + std::vector small_ids; + std::vector medium_ids; + std::vector large_ids; + + for (i_t i = 0; i < K; ++i) { + offsets[i + 1] = offsets[i] + dims[i]; + if (dims[i] <= 
small_cone_limit) { + small_ids.push_back(i); + } else if (dims[i] <= medium_cone_limit) { + medium_ids.push_back(i); + } else { + large_ids.push_back(i); + } + } + + auto init_device_vec = [&](auto& d_vec, const auto& h_vec) { + if (!h_vec.empty()) { + d_vec.resize(h_vec.size(), stream); + raft::copy(d_vec.data(), h_vec.data(), h_vec.size(), stream); + } + }; + + raft::copy(cone_offsets.data(), offsets.data(), K + 1, stream); + raft::copy(cone_dims.data(), dims.data(), K, stream); + init_device_vec(small_cone_ids, small_ids); + init_device_vec(medium_cone_ids, medium_ids); + init_device_vec(large_cone_ids, large_ids); + } +}; + +template +void launch_nt_scaling(cone_data_t& cones, rmm::cuda_stream_view stream) +{ + auto launch_streaming_bucket = [&](auto& cone_ids, auto block_dim_ic) { + constexpr int block_dim = std::remove_cvref_t::value; + i_t bucket_size = static_cast(cone_ids.size()); + if (bucket_size == 0) return; + + nt_scaling_kernel + <<>>(cones.s.data(), + cones.lambda.data(), + cones.eta.data(), + cones.inv_eta.data(), + cones.inv_1pw0.data(), + cones.w_bar.data(), + cones.omega.data(), + cones.rho.data(), + cones.cone_offsets.data(), + cone_ids.data(), + bucket_size); + }; + + i_t small_count = static_cast(cones.small_cone_ids.size()); + if (small_count > 0) { + constexpr int warps_per_block = small_block_dim / 32; + i_t grid_dim = (small_count + warps_per_block - 1) / warps_per_block; + nt_scaling_small_kernel + <<>>(cones.s.data(), + cones.lambda.data(), + cones.eta.data(), + cones.inv_eta.data(), + cones.inv_1pw0.data(), + cones.w_bar.data(), + cones.omega.data(), + cones.rho.data(), + cones.cone_offsets.data(), + cones.small_cone_ids.data(), + small_count); + } + + launch_streaming_bucket(cones.medium_cone_ids, std::integral_constant{}); + launch_streaming_bucket(cones.large_cone_ids, std::integral_constant{}); +} + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/tests/dual_simplex/CMakeLists.txt 
b/cpp/tests/dual_simplex/CMakeLists.txt index 253ef95c83..1abeb62ded 100644 --- a/cpp/tests/dual_simplex/CMakeLists.txt +++ b/cpp/tests/dual_simplex/CMakeLists.txt @@ -1,9 +1,10 @@ # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # cmake-format: on ConfigureTest(DUAL_SIMPLEX_TEST ${CMAKE_CURRENT_SOURCE_DIR}/unit_tests/solve.cpp ${CMAKE_CURRENT_SOURCE_DIR}/unit_tests/solve_barrier.cu + ${CMAKE_CURRENT_SOURCE_DIR}/unit_tests/second_order_cone_test.cu ) diff --git a/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu b/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu new file mode 100644 index 0000000000..2f7d51dcae --- /dev/null +++ b/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu @@ -0,0 +1,1280 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuopt::linear_programming::dual_simplex::test { +namespace { + +template +auto build_offsets(const std::vector& dims) -> std::vector +{ + std::vector offsets(dims.size() + 1, 0); + for (std::size_t i = 0; i < dims.size(); ++i) { + offsets[i + 1] = offsets[i] + dims[i]; + } + return offsets; +} + +template +auto pack_cones(const std::vector>& cones) -> std::vector +{ + std::size_t total_size = 0; + for (const auto& cone : cones) { + total_size += cone.size(); + } + + std::vector packed; + packed.reserve(total_size); + for (const auto& cone : cones) { + packed.insert(packed.end(), cone.begin(), cone.end()); + } + return packed; +} + +template +auto slice_cone(const std::vector& packed, const std::vector& offsets, i_t cone) + -> std::vector +{ + auto begin = packed.begin() + offsets[cone]; + auto end = packed.begin() + offsets[cone + 1]; + return std::vector(begin, end); +} + +template +auto j_norm_sq(const std::vector& u) -> f_t +{ + if (u.empty()) { return f_t(0); } + + f_t tail_sq = f_t(0); + for (std::size_t j = 1; j < u.size(); ++j) { + tail_sq += u[j] * u[j]; + } + return u[0] * u[0] - tail_sq; +} + +template +auto tail_norm(const std::vector& u) -> f_t +{ + f_t tail_sq = f_t(0); + for (std::size_t j = 1; j < u.size(); ++j) { + tail_sq += u[j] * u[j]; + } + return std::sqrt(tail_sq); +} + +template +auto ref_apply_hinv_single(const std::vector& z, + const std::vector& w_bar, + f_t inv_eta, + f_t inv_1pw0) -> std::vector +{ + std::vector out(z.size(), f_t(0)); + if (z.empty()) { return out; } + + f_t zeta = f_t(0); + for (std::size_t j = 1; j < z.size(); ++j) { + zeta += w_bar[j] * z[j]; + } + + f_t coeff = -z[0] + zeta * inv_1pw0; + out[0] = (w_bar[0] * z[0] - zeta) * inv_eta; + for (std::size_t j = 1; j < z.size(); ++j) { + out[j] = (z[j] + coeff * w_bar[j]) * inv_eta; + } + return out; +} + 
// Reference NT scaling apply: out = H z for the SOC scaling point
// parameterized by (w_bar, eta), where H = eta * (2 w_bar w_bar^T J - ...)
// in arrow form. Mirrors the device kernel on the host for test checking.
template <typename f_t>
auto ref_apply_H_single(const std::vector<f_t>& z,
                        const std::vector<f_t>& w_bar,
                        f_t eta,
                        f_t inv_1pw0) -> std::vector<f_t>
{
  const std::size_t q = z.size();
  std::vector<f_t> out(q, f_t(0));
  if (q == 0) { return out; }

  // Tail inner product <w_bar_1, z_1>.
  f_t tail_dot = f_t(0);
  for (std::size_t j = 1; j < q; ++j) {
    tail_dot += w_bar[j] * z[j];
  }

  out[0]                  = (w_bar[0] * z[0] + tail_dot) * eta;
  const f_t tail_coeff    = z[0] + tail_dot * inv_1pw0;
  for (std::size_t j = 1; j < q; ++j) {
    out[j] = (z[j] + tail_coeff * w_bar[j]) * eta;
  }
  return out;
}

// Reference H^{-2} apply: out = inv_eta^2 * (2 w_bar <w_bar, v>_J - J v)
// written componentwise, where <.,.>_J is the Jordan (hyperbolic) inner
// product w0*v0 - <w_tail, v_tail>.
template <typename f_t>
auto ref_apply_hinv2_single(const std::vector<f_t>& v, const std::vector<f_t>& w_bar, f_t inv_eta)
  -> std::vector<f_t>
{
  const std::size_t q = v.size();
  std::vector<f_t> out(q, f_t(0));
  if (q == 0) { return out; }

  // J-inner product of the scaling point with v.
  f_t j_dot = w_bar[0] * v[0];
  for (std::size_t j = 1; j < q; ++j) {
    j_dot -= w_bar[j] * v[j];
  }

  const f_t scale = inv_eta * inv_eta;
  out[0]          = scale * (f_t(2) * w_bar[0] * j_dot - v[0]);
  for (std::size_t j = 1; j < q; ++j) {
    out[j] = scale * (v[j] - f_t(2) * w_bar[j] * j_dot);
  }
  return out;
}

// Host-side bundle of all per-cone NT scaling quantities produced by the
// device kernel; used to compare device output against the reference path.
template <typename f_t>
struct nt_scaling_reference_t {
  f_t eta{};       // geometric-mean scaling factor sqrt(|s|_J / |lambda|_J)
  f_t inv_eta{};   // 1 / eta
  f_t inv_1pw0{};  // 1 / (1 + w_bar[0]), reused by the H / H^{-1} applies
  f_t rho{};       // |s|_J * |lambda|_J = <omega, omega>_J
  std::vector<f_t> w_bar;  // normalized scaling point, |w_bar|_J = 1
  std::vector<f_t> omega;  // H^{-1} s (== H lambda by NT symmetry)
};

// CPU reference for the NT scaling computation of a single cone.
// Follows the standard construction: normalize s and lambda by their
// J-norms, form w_bar = (s_bar + J lambda_bar) / (2 gamma), and rebuild
// the head entry from the tail for numerical cleanliness (matching the
// kernel's cleanup path).
template <typename f_t>
auto ref_nt_scaling_single(const std::vector<f_t>& s, const std::vector<f_t>& lambda)
  -> nt_scaling_reference_t<f_t>
{
  EXPECT_EQ(s.size(), lambda.size());
  EXPECT_FALSE(s.empty());

  const f_t s_jsq = j_norm_sq(s);
  const f_t l_jsq = j_norm_sq(lambda);
  EXPECT_GT(s_jsq, f_t(0));
  EXPECT_GT(l_jsq, f_t(0));

  const f_t s_jn  = std::sqrt(s_jsq);
  const f_t l_jn  = std::sqrt(l_jsq);
  const f_t inv_s = f_t(1) / s_jn;
  const f_t inv_l = f_t(1) / l_jn;

  // Euclidean dot of the J-normalized points; strictly > 0 for interior pairs.
  f_t dot_bar = f_t(0);
  for (std::size_t j = 0; j < s.size(); ++j) {
    dot_bar += (s[j] * lambda[j]) * inv_s * inv_l;
  }

  const f_t gamma  = std::sqrt(std::max(f_t(0), (f_t(1) + dot_bar) * f_t(0.5)));
  const f_t inv_2g = f_t(1) / (f_t(2) * gamma);

  nt_scaling_reference_t<f_t> ref{};
  ref.eta     = std::sqrt(s_jn / l_jn);
  ref.inv_eta = f_t(1) / ref.eta;
  ref.rho     = s_jn * l_jn;
  ref.w_bar.assign(s.size(), f_t(0));

  f_t tail_sq = f_t(0);
  for (std::size_t j = 1; j < s.size(); ++j) {
    const f_t wj = inv_2g * (s[j] * inv_s - lambda[j] * inv_l);
    ref.w_bar[j] = wj;
    tail_sq += wj * wj;
  }

  // Match the kernel's numerical cleanup path for w_bar[0]: recompute the
  // head so that |w_bar|_J == 1 holds exactly up to rounding.
  ref.w_bar[0] = std::sqrt(f_t(1) + tail_sq);
  ref.inv_1pw0 = f_t(1) / (f_t(1) + ref.w_bar[0]);
  ref.omega    = ref_apply_hinv_single(s, ref.w_bar, ref.inv_eta, ref.inv_1pw0);
  return ref;
}

// CPU reference for the max step alpha <= alpha_max keeping u + alpha*du in
// the cone. Solves the boundary quadratic c + 2 b alpha + a alpha^2 = 0 in
// the J-metric, with the same degenerate-case branches as the kernel
// (Clarabel-style: a<0 downward parabola, a==0 linear, c==0 boundary start).
template <typename f_t>
auto ref_step_length_single(const std::vector<f_t>& u, const std::vector<f_t>& du, f_t alpha_max)
  -> f_t
{
  EXPECT_EQ(u.size(), du.size());
  EXPECT_FALSE(u.empty());

  f_t tail_dd = f_t(0);  // <du_1, du_1>
  f_t tail_ud = f_t(0);  // <u_1, du_1>
  f_t tail_uu = f_t(0);  // <u_1, u_1>
  for (std::size_t j = 1; j < u.size(); ++j) {
    tail_dd += du[j] * du[j];
    tail_ud += u[j] * du[j];
    tail_uu += u[j] * u[j];
  }

  const f_t a    = du[0] * du[0] - tail_dd;
  const f_t b    = u[0] * du[0] - tail_ud;
  const f_t c    = std::max(f_t(0), u[0] * u[0] - tail_uu);  // clamp: u is in-cone
  const f_t disc = b * b - a * c;

  // Linear head constraint: u[0] + alpha*du[0] >= 0.
  f_t alpha = alpha_max;
  if (du[0] < f_t(0)) { alpha = std::min(alpha, -u[0] / du[0]); }

  if ((a > f_t(0) && b > f_t(0)) || disc < f_t(0)) {
    // Quadratic stays non-negative for alpha > 0: no extra constraint.
  } else if (a < f_t(0)) {
    // Downward parabola: bind at the (guaranteed non-negative) larger root.
    alpha = std::min(alpha, (b + std::sqrt(std::max(f_t(0), disc))) / (-a));
  } else if (a == f_t(0)) {
    // Degenerate linear case: c + 2 b alpha >= 0.
    if (b < f_t(0)) { alpha = std::min(alpha, c / (f_t(-2) * b)); }
  } else if (c == f_t(0)) {
    // Starting on the boundary with an upward parabola.
    alpha = (a >= f_t(0)) ? alpha : f_t(0);
  } else if (b < f_t(0) && disc > f_t(0)) {
    // Upward parabola crossing zero: bind at the smaller positive root.
    alpha = std::min(alpha, (-b - std::sqrt(disc)) / a);
  }

  return alpha;
}

// CPU reference for the Jordan (arrow) product a o b:
// head = <a, b>, tail_j = a0*b_j + b0*a_j.
template <typename f_t>
auto ref_jordan_product_single(const std::vector<f_t>& a, const std::vector<f_t>& b)
  -> std::vector<f_t>
{
  EXPECT_EQ(a.size(), b.size());
  std::vector<f_t> out(a.size(), f_t(0));
  if (a.empty()) { return out; }

  f_t full_dot = f_t(0);
  for (std::size_t j = 0; j < a.size(); ++j) {
    full_dot += a[j] * b[j];
  }
  out[0] = full_dot;
  for (std::size_t j = 1; j < a.size(); ++j) {
    out[j] = a[0] * b[j] + b[0] * a[j];
  }
  return out;
}

// CPU reference for out = Arw(omega)^{-1} r with rho_val = det(Arw(omega))
// = omega0^2 - |omega_tail|^2, using the closed-form arrow-matrix inverse.
template <typename f_t>
auto ref_inverse_jordan_product_single(const std::vector<f_t>& omega,
                                       const std::vector<f_t>& r,
                                       f_t rho_val) -> std::vector<f_t>
{
  EXPECT_EQ(omega.size(), r.size());
  std::vector<f_t> out(omega.size(), f_t(0));
  if (omega.empty()) { return out; }

  // nu = <omega_tail, r_tail>.
  f_t nu = f_t(0);
  for (std::size_t j = 1; j < omega.size(); ++j) {
    nu += omega[j] * r[j];
  }

  const f_t inv_rho = f_t(1) / rho_val;
  const f_t head    = omega[0];
  out[0]            = (head * r[0] - nu) * inv_rho;

  const f_t omega_coeff = ((nu / head) - r[0]) * inv_rho;
  const f_t r_coeff     = f_t(1) / head;
  for (std::size_t j = 1; j < omega.size(); ++j) {
    out[j] = omega_coeff * omega[j] + r_coeff * r[j];
  }
  return out;
}

// CPU reference for the fused Mehrotra corrector of a single cone:
//   dx   = H^{-1} dx_aff
//   dz   = -omega - dx
//   r_K  = omega o omega + dx o dz - sigma_mu * e
//   out  = H^{-1} (Arw(omega)^{-1} r_K)
// Composes the reference primitives above so the fused kernel can be
// checked against the unfused sequence.
template <typename f_t>
auto ref_fused_corrector_single(const std::vector<f_t>& dx_aff,
                                const std::vector<f_t>& omega,
                                const std::vector<f_t>& w_bar,
                                f_t inv_eta,
                                f_t inv_1pw0,
                                f_t rho_val,
                                f_t sigma_mu) -> std::vector<f_t>
{
  const auto dx = ref_apply_hinv_single(dx_aff, w_bar, inv_eta, inv_1pw0);

  std::vector<f_t> dz(dx.size());
  for (std::size_t j = 0; j < dx.size(); ++j) {
    dz[j] = -omega[j] - dx[j];
  }

  auto residual    = ref_jordan_product_single(omega, omega);
  const auto cross = ref_jordan_product_single(dx, dz);
  for (std::size_t j = 0; j < residual.size(); ++j) {
    residual[j] += cross[j];
  }
  // Subtract sigma_mu from the head (the identity element's head entry).
  residual[0] -= sigma_mu;

  const auto corr = ref_inverse_jordan_product_single(omega, residual, rho_val);
  return ref_apply_hinv_single(corr, w_bar, inv_eta, inv_1pw0);
}

+template +auto ref_interior_shift_single(std::vector u) -> std::vector +{ + if (u.empty()) { return u; } + + f_t gap = tail_norm(u) - u[0]; + if (gap >= f_t(0)) { u[0] += f_t(1) + gap; } + return u; +} + +template +auto make_patterned_cone(int q, f_t head, f_t scale) -> std::vector +{ + std::vector cone(q, f_t(0)); + cone[0] = head; + for (int j = 1; j < q; ++j) { + f_t sign = (j % 2 == 0) ? f_t(1) : f_t(-1); + cone[j] = sign * scale * static_cast((j % 7) + 1); + } + return cone; +} + +} // namespace + +class second_order_cone_test : public ::testing::Test { + protected: + using i_t = int; + using f_t = double; + static constexpr int dim = 256; + + raft::handle_t handle_; + rmm::cuda_stream_view stream_ = handle_.get_stream(); + + template + auto make_device_vector(const std::vector& host) -> rmm::device_uvector + { + rmm::device_uvector device(host.size(), stream_); + if (!host.empty()) { raft::copy(device.data(), host.data(), host.size(), stream_); } + sync(); + return device; + } + + template + auto copy_to_host(const rmm::device_uvector& device) -> std::vector + { + std::vector host(device.size()); + if (!host.empty()) { raft::copy(host.data(), device.data(), host.size(), stream_); } + sync(); + return host; + } + + template + void copy_to_device(rmm::device_uvector& device, const std::vector& host) + { + ASSERT_EQ(device.size(), host.size()); + if (!host.empty()) { raft::copy(device.data(), host.data(), host.size(), stream_); } + sync(); + } + + void sync() { RAFT_CUDA_TRY(cudaStreamSynchronize(stream_.value())); } + + template + void expect_vector_near(const std::vector& actual, + const std::vector& expected, + t_t atol, + t_t rtol, + const char* label) + { + ASSERT_EQ(actual.size(), expected.size()) << label << " size mismatch"; + for (std::size_t i = 0; i < actual.size(); ++i) { + EXPECT_NEAR(actual[i], expected[i], atol + rtol * std::abs(expected[i])) + << label << "[" << i << "]"; + } + } + + void launch_apply_hinv(const rmm::device_uvector& z, + 
rmm::device_uvector& out, + const rmm::device_uvector& w_bar, + const rmm::device_uvector& inv_eta, + const rmm::device_uvector& inv_1pw0, + const rmm::device_uvector& cone_offsets, + i_t k) + { + apply_Hinv_kernel<<>>( + z.data(), out.data(), w_bar.data(), inv_eta.data(), inv_1pw0.data(), cone_offsets.data(), k); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + sync(); + } + + void launch_step_length(const rmm::device_uvector& s, + const rmm::device_uvector& ds, + const rmm::device_uvector& lambda, + const rmm::device_uvector& dlambda, + rmm::device_uvector& alpha, + const rmm::device_uvector& cone_offsets, + i_t k, + f_t alpha_max) + { + step_length_kernel<<>>(s.data(), + ds.data(), + lambda.data(), + dlambda.data(), + alpha.data(), + cone_offsets.data(), + k, + alpha_max); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + sync(); + } + + void launch_interior_shift(rmm::device_uvector& u, + const rmm::device_uvector& cone_offsets, + i_t k) + { + interior_shift_kernel<<>>(u.data(), cone_offsets.data(), k); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + sync(); + } + + void launch_apply_hinv2(const rmm::device_uvector& v, + rmm::device_uvector& out, + const rmm::device_uvector& w_bar, + const rmm::device_uvector& inv_eta, + const rmm::device_uvector& cone_offsets, + i_t k) + { + apply_Hinv2_kernel<<>>( + v.data(), out.data(), w_bar.data(), inv_eta.data(), cone_offsets.data(), k); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + sync(); + } + + void launch_jordan_product(const rmm::device_uvector& a, + const rmm::device_uvector& b, + rmm::device_uvector& out, + const rmm::device_uvector& cone_offsets, + i_t k) + { + jordan_product_kernel + <<>>(a.data(), b.data(), out.data(), cone_offsets.data(), k); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + sync(); + } + + void launch_inverse_jordan_product(const rmm::device_uvector& omega, + const rmm::device_uvector& r, + const rmm::device_uvector& rho, + rmm::device_uvector& out, + const rmm::device_uvector& cone_offsets, + i_t k) + { + 
inverse_jordan_product_kernel<<>>( + omega.data(), r.data(), rho.data(), out.data(), cone_offsets.data(), k); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + sync(); + } + + void launch_fused_corrector(const rmm::device_uvector& dx_aff, + const rmm::device_uvector& omega, + const rmm::device_uvector& w_bar, + const rmm::device_uvector& inv_eta, + const rmm::device_uvector& inv_1pw0, + const rmm::device_uvector& rho, + f_t sigma_mu, + rmm::device_uvector& out, + const rmm::device_uvector& cone_offsets, + i_t k) + { + fused_corrector_kernel<<>>(dx_aff.data(), + omega.data(), + w_bar.data(), + inv_eta.data(), + inv_1pw0.data(), + rho.data(), + sigma_mu, + out.data(), + cone_offsets.data(), + k); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + sync(); + } +}; + +TEST_F(second_order_cone_test, cone_data_topology_and_bucket_partitioning) +{ + std::vector dims{1, 32, 33, 2048, 2049}; + cone_data_t cones(static_cast(dims.size()), dims, stream_); + + auto expected_offsets = build_offsets(dims); + auto actual_offsets = copy_to_host(cones.cone_offsets); + auto actual_dims = copy_to_host(cones.cone_dims); + auto small_ids = copy_to_host(cones.small_cone_ids); + auto medium_ids = copy_to_host(cones.medium_cone_ids); + auto large_ids = copy_to_host(cones.large_cone_ids); + + EXPECT_EQ(cones.K, static_cast(dims.size())); + EXPECT_EQ(cones.m_c, expected_offsets.back()); + EXPECT_EQ(actual_offsets, expected_offsets); + EXPECT_EQ(actual_dims, dims); + EXPECT_EQ(small_ids, std::vector({0, 1})); + EXPECT_EQ(medium_ids, std::vector({2, 3})); + EXPECT_EQ(large_ids, std::vector({4})); +} + +TEST_F(second_order_cone_test, nt_scaling_matches_reference_for_small_cone) +{ + // Borrowed from the Clarabel regression input, but checked against our own + // host-side NT formulas. 
+ std::vector> s_cones{{1.5, 0.3, 0.4}}; + std::vector> lambda_cones{{2.0, 0.5, 0.5}}; + std::vector dims{3}; + + cone_data_t cones(1, dims, stream_); + copy_to_device(cones.s, pack_cones(s_cones)); + copy_to_device(cones.lambda, pack_cones(lambda_cones)); + + launch_nt_scaling(cones, stream_); + + auto eta = copy_to_host(cones.eta); + auto inv_eta = copy_to_host(cones.inv_eta); + auto inv_1pw0 = copy_to_host(cones.inv_1pw0); + auto rho = copy_to_host(cones.rho); + auto w_bar = copy_to_host(cones.w_bar); + auto omega = copy_to_host(cones.omega); + + auto ref = ref_nt_scaling_single(s_cones[0], lambda_cones[0]); + + EXPECT_NEAR(eta[0], ref.eta, 1e-12); + EXPECT_NEAR(inv_eta[0], ref.inv_eta, 1e-12); + EXPECT_NEAR(inv_1pw0[0], ref.inv_1pw0, 1e-12); + EXPECT_NEAR(rho[0], ref.rho, 1e-12); + expect_vector_near(w_bar, ref.w_bar, 1e-12, 1e-10, "w_bar"); + expect_vector_near(omega, ref.omega, 1e-12, 1e-10, "omega"); + + EXPECT_NEAR(j_norm_sq(w_bar), f_t(1), 1e-12); + EXPECT_NEAR(j_norm_sq(omega), rho[0], 1e-12); + + auto omega_from_apply_hinv = ref_apply_hinv_single(s_cones[0], w_bar, inv_eta[0], inv_1pw0[0]); + expect_vector_near(omega, omega_from_apply_hinv, 1e-12, 1e-10, "omega_consistency"); +} + +TEST_F(second_order_cone_test, nt_scaling_matches_reference_across_bucket_sizes) +{ + std::vector> s_cones{ + {2.0}, make_patterned_cone(33, 4.0, 0.01), make_patterned_cone(2049, 5.0, 0.001)}; + std::vector> lambda_cones{ + {0.5}, make_patterned_cone(33, 3.0, 0.0075), make_patterned_cone(2049, 4.0, 0.00075)}; + std::vector dims{1, 33, 2049}; + auto offsets = build_offsets(dims); + + cone_data_t cones(static_cast(dims.size()), dims, stream_); + copy_to_device(cones.s, pack_cones(s_cones)); + copy_to_device(cones.lambda, pack_cones(lambda_cones)); + + launch_nt_scaling(cones, stream_); + + auto eta = copy_to_host(cones.eta); + auto inv_eta = copy_to_host(cones.inv_eta); + auto inv_1pw0 = copy_to_host(cones.inv_1pw0); + auto rho = copy_to_host(cones.rho); + auto w_bar = 
copy_to_host(cones.w_bar); + auto omega = copy_to_host(cones.omega); + + for (i_t cone = 0; cone < static_cast(dims.size()); ++cone) { + auto ref = ref_nt_scaling_single(s_cones[cone], lambda_cones[cone]); + + EXPECT_NEAR(eta[cone], ref.eta, 1e-10) << "cone " << cone; + EXPECT_NEAR(inv_eta[cone], ref.inv_eta, 1e-10) << "cone " << cone; + EXPECT_NEAR(inv_1pw0[cone], ref.inv_1pw0, 1e-10) << "cone " << cone; + EXPECT_NEAR(rho[cone], ref.rho, 1e-10) << "cone " << cone; + + auto actual_w_bar = slice_cone(w_bar, offsets, cone); + auto actual_omega = slice_cone(omega, offsets, cone); + expect_vector_near(actual_w_bar, ref.w_bar, 1e-10, 1e-8, "w_bar"); + expect_vector_near(actual_omega, ref.omega, 1e-10, 1e-8, "omega"); + + EXPECT_NEAR(j_norm_sq(actual_w_bar), f_t(1), 1e-10) << "cone " << cone; + EXPECT_NEAR(j_norm_sq(actual_omega), rho[cone], 1e-10) << "cone " << cone; + } +} + +TEST_F(second_order_cone_test, nt_scaling_omega_equals_H_times_lambda) +{ + std::vector> s_cones{{5.0, 1.0, -1.0, 0.5, 0.3}}; + std::vector> lambda_cones{{4.0, 0.5, 1.0, -0.3, 0.2}}; + std::vector dims{5}; + + cone_data_t cones(1, dims, stream_); + copy_to_device(cones.s, pack_cones(s_cones)); + copy_to_device(cones.lambda, pack_cones(lambda_cones)); + + launch_nt_scaling(cones, stream_); + + auto eta = copy_to_host(cones.eta); + auto inv_1pw0 = copy_to_host(cones.inv_1pw0); + auto w_bar = copy_to_host(cones.w_bar); + auto omega = copy_to_host(cones.omega); + + // NT symmetry: omega should equal both H^{-1}s and H*lambda. + auto H_lambda = ref_apply_H_single(lambda_cones[0], w_bar, eta[0], inv_1pw0[0]); + expect_vector_near(omega, H_lambda, 1e-10, 1e-8, "omega_vs_H_lambda"); +} + +TEST_F(second_order_cone_test, nt_scaling_near_boundary_is_stable) +{ + // s and lambda barely inside the cone: ||tail||^2 ≈ head^2. 
+ std::vector> s_cones{{1.00002, 0.6, 0.8, 1e-4, -2e-4}}; + std::vector> lambda_cones{{1.000015, 0.8, 0.6, -3e-5, 2e-5}}; + std::vector dims{5}; + + cone_data_t cones(1, dims, stream_); + copy_to_device(cones.s, pack_cones(s_cones)); + copy_to_device(cones.lambda, pack_cones(lambda_cones)); + + launch_nt_scaling(cones, stream_); + + auto eta = copy_to_host(cones.eta); + auto inv_eta = copy_to_host(cones.inv_eta); + auto inv_1pw0 = copy_to_host(cones.inv_1pw0); + auto w_bar = copy_to_host(cones.w_bar); + auto omega = copy_to_host(cones.omega); + + EXPECT_NEAR(j_norm_sq(w_bar), f_t(1), 1e-8) << "w_bar J-norm not 1 near boundary"; + EXPECT_GT(w_bar[0], tail_norm(w_bar)) << "w_bar not interior near boundary"; + + // Round-trip: H(omega) should equal s. + auto H_omega = ref_apply_H_single(omega, w_bar, eta[0], inv_1pw0[0]); + expect_vector_near(H_omega, pack_cones(s_cones), 1e-8, 1e-6, "H_omega_vs_s_near_boundary"); + + // Symmetry: omega should also equal H*lambda. + auto H_lambda = ref_apply_H_single(lambda_cones[0], w_bar, eta[0], inv_1pw0[0]); + expect_vector_near(omega, H_lambda, 1e-8, 1e-6, "omega_vs_H_lambda_near_boundary"); +} + +TEST_F(second_order_cone_test, apply_hinv_matches_reference_for_packed_cones) +{ + std::vector dims{1, 3, 5}; + auto offsets = build_offsets(dims); + + std::vector> z_cones{{3.0}, {2.0, -1.0, 0.5}, {1.0, 0.25, -0.75, 0.5, -0.125}}; + std::vector> w_bar_cones{ + {1.0}, {0.0, 0.15, -0.05}, {0.0, 0.10, -0.20, 0.05, 0.15}}; + std::vector inv_eta_host{0.5, 1.25, 0.75}; + std::vector inv_1pw0_host(inv_eta_host.size(), 0.0); + + for (std::size_t cone = 0; cone < w_bar_cones.size(); ++cone) { + f_t w1_sq = f_t(0); + for (std::size_t j = 1; j < w_bar_cones[cone].size(); ++j) { + w1_sq += w_bar_cones[cone][j] * w_bar_cones[cone][j]; + } + w_bar_cones[cone][0] = std::sqrt(f_t(1) + w1_sq); + inv_1pw0_host[cone] = f_t(1) / (f_t(1) + w_bar_cones[cone][0]); + } + + auto z = make_device_vector(pack_cones(z_cones)); + auto w_bar = 
make_device_vector(pack_cones(w_bar_cones)); + auto inv_eta = make_device_vector(inv_eta_host); + auto inv_1pw0 = make_device_vector(inv_1pw0_host); + auto d_offsets = make_device_vector(offsets); + rmm::device_uvector out(z.size(), stream_); + + launch_apply_hinv(z, out, w_bar, inv_eta, inv_1pw0, d_offsets, static_cast(dims.size())); + + auto actual_out = copy_to_host(out); + auto expected = pack_cones(std::vector>{ + ref_apply_hinv_single(z_cones[0], w_bar_cones[0], inv_eta_host[0], inv_1pw0_host[0]), + ref_apply_hinv_single(z_cones[1], w_bar_cones[1], inv_eta_host[1], inv_1pw0_host[1]), + ref_apply_hinv_single(z_cones[2], w_bar_cones[2], inv_eta_host[2], inv_1pw0_host[2])}); + + expect_vector_near(actual_out, expected, 1e-12, 1e-10, "apply_hinv"); +} + +TEST_F(second_order_cone_test, step_length_matches_reference_and_handles_q1) +{ + std::vector dims{1, 3}; + auto offsets = build_offsets(dims); + + std::vector> s_cones{{2.0}, {5.0, 1.0, 1.0}}; + std::vector> ds_cones{{-3.0}, {-0.5, 0.1, 0.1}}; + std::vector> lambda_cones{{5.0}, {5.0, 1.0, 1.0}}; + std::vector> dlambda_cones{{1.0}, {-0.5, 0.1, 0.1}}; + f_t alpha_max = 10.0; + + auto s = make_device_vector(pack_cones(s_cones)); + auto ds = make_device_vector(pack_cones(ds_cones)); + auto lambda = make_device_vector(pack_cones(lambda_cones)); + auto dlambda = make_device_vector(pack_cones(dlambda_cones)); + auto d_offsets = make_device_vector(offsets); + rmm::device_uvector alpha(dims.size(), stream_); + + launch_step_length( + s, ds, lambda, dlambda, alpha, d_offsets, static_cast(dims.size()), alpha_max); + + auto actual_alpha = copy_to_host(alpha); + std::vector expected_alpha(dims.size(), alpha_max); + for (std::size_t cone = 0; cone < dims.size(); ++cone) { + expected_alpha[cone] = + std::min(ref_step_length_single(s_cones[cone], ds_cones[cone], alpha_max), + ref_step_length_single(lambda_cones[cone], dlambda_cones[cone], alpha_max)); + } + + expect_vector_near(actual_alpha, expected_alpha, 1e-12, 1e-10, 
"step_length"); + EXPECT_NEAR(actual_alpha[0], 2.0 / 3.0, 1e-12); + EXPECT_NEAR(actual_alpha[1], 5.5903758157691508, 1e-10); +} + +TEST_F(second_order_cone_test, step_length_matches_reference_for_large_cone) +{ + std::vector dims{513}; + auto offsets = build_offsets(dims); + + std::vector> s_cones{{make_patterned_cone(dims[0], 5.0, 0.01)}}; + std::vector> ds_cones{{make_patterned_cone(dims[0], -0.25, 0.002)}}; + std::vector> lambda_cones{{make_patterned_cone(dims[0], 6.0, 0.009)}}; + std::vector> dlambda_cones{{make_patterned_cone(dims[0], -0.15, 0.0015)}}; + f_t alpha_max = 20.0; + + auto s = make_device_vector(pack_cones(s_cones)); + auto ds = make_device_vector(pack_cones(ds_cones)); + auto lambda = make_device_vector(pack_cones(lambda_cones)); + auto dlambda = make_device_vector(pack_cones(dlambda_cones)); + auto d_offsets = make_device_vector(offsets); + rmm::device_uvector alpha(dims.size(), stream_); + + launch_step_length( + s, ds, lambda, dlambda, alpha, d_offsets, static_cast(dims.size()), alpha_max); + + auto actual_alpha = copy_to_host(alpha); + std::vector expected_alpha(dims.size(), alpha_max); + for (std::size_t cone = 0; cone < dims.size(); ++cone) { + expected_alpha[cone] = + std::min(ref_step_length_single(s_cones[cone], ds_cones[cone], alpha_max), + ref_step_length_single(lambda_cones[cone], dlambda_cones[cone], alpha_max)); + } + + expect_vector_near(actual_alpha, expected_alpha, 1e-12, 1e-10, "step_length_large"); + EXPECT_GT(actual_alpha[0], 0.0); + EXPECT_LT(actual_alpha[0], alpha_max); +} + +TEST_F(second_order_cone_test, step_length_boundary_c_zero_matches_clarabel_branch) +{ + std::vector dims{3}; + auto offsets = build_offsets(dims); + + // Boundary point: c = u^T J u = 1^2 - 1^2 - 0^2 = 0. + // Direction: a = du^T J du = 1^2 - 1^2 - 1^2 = -1 < 0. + // Clarabel's c == 0 branch returns 0 in this case because the direction + // leaves the cone immediately. 
+ std::vector> s_cones{{1.0, 1.0, 0.0}}; + std::vector> ds_cones{{1.0, 1.0, 1.0}}; + std::vector> lambda_cones{{1.0, 1.0, 0.0}}; + std::vector> dlambda_cones{{1.0, 1.0, 1.0}}; + f_t alpha_max = 10.0; + + auto s = make_device_vector(pack_cones(s_cones)); + auto ds = make_device_vector(pack_cones(ds_cones)); + auto lambda = make_device_vector(pack_cones(lambda_cones)); + auto dlambda = make_device_vector(pack_cones(dlambda_cones)); + auto d_offsets = make_device_vector(offsets); + rmm::device_uvector alpha(dims.size(), stream_); + + launch_step_length( + s, ds, lambda, dlambda, alpha, d_offsets, static_cast(dims.size()), alpha_max); + + auto actual_alpha = copy_to_host(alpha); + ASSERT_EQ(actual_alpha.size(), 1); + EXPECT_EQ(actual_alpha[0], 0.0); +} + +TEST_F(second_order_cone_test, step_length_degenerate_a_zero) +{ + std::vector dims{2}; + auto offsets = build_offsets(dims); + + // u=(2,0), du=(-1,1): a = du_0^2 - du_1^2 = 1 - 1 = 0 (degenerate quadratic). + // Linear constraint: alpha <= 2. Degenerate branch: alpha = c/(-2b) = 4/2 = 2. + // But the linear constraint also gives alpha <= 2, so result is min(2, 2) = 2... + // Actually b = u0*du0 - u1*du1 = 2*(-1) - 0 = -2, c = u0^2 - u1^2 = 4. + // Degenerate: alpha = c/(-2b) = 4/4 = 1. And linear: alpha <= -u0/du0 = 2. + // So alpha = 1. 
+ std::vector> s_cones{{2.0, 0.0}}; + std::vector> ds_cones{{-1.0, 1.0}}; + std::vector> lambda_cones{{5.0, 0.0}}; + std::vector> dlambda_cones{{0.0, 0.0}}; + + auto s = make_device_vector(pack_cones(s_cones)); + auto ds = make_device_vector(pack_cones(ds_cones)); + auto lambda = make_device_vector(pack_cones(lambda_cones)); + auto dlambda = make_device_vector(pack_cones(dlambda_cones)); + auto d_offsets = make_device_vector(offsets); + rmm::device_uvector alpha(1, stream_); + + launch_step_length(s, ds, lambda, dlambda, alpha, d_offsets, 1, 10.0); + + auto actual = copy_to_host(alpha); + EXPECT_NEAR(actual[0], 1.0, 1e-14); +} + +TEST_F(second_order_cone_test, step_length_safe_direction_returns_alpha_max) +{ + std::vector dims{3}; + auto offsets = build_offsets(dims); + + // Interior point with direction along the identity element — stays in cone forever. + std::vector> s_cones{{10.0, 0.0, 0.0}}; + std::vector> ds_cones{{1.0, 0.0, 0.0}}; + std::vector> lambda_cones{{10.0, 0.0, 0.0}}; + std::vector> dlambda_cones{{0.0, 0.1, 0.0}}; + + auto s = make_device_vector(pack_cones(s_cones)); + auto ds = make_device_vector(pack_cones(ds_cones)); + auto lambda = make_device_vector(pack_cones(lambda_cones)); + auto dlambda = make_device_vector(pack_cones(dlambda_cones)); + auto d_offsets = make_device_vector(offsets); + rmm::device_uvector alpha(1, stream_); + + launch_step_length(s, ds, lambda, dlambda, alpha, d_offsets, 1, 1.0); + + auto actual = copy_to_host(alpha); + EXPECT_DOUBLE_EQ(actual[0], 1.0); +} + +TEST_F(second_order_cone_test, step_length_boundary_tightness) +{ + std::vector dims{5}; + auto offsets = build_offsets(dims); + + std::vector> s_cones{{4.0, 1.0, -1.0, 0.5, 0.3}}; + std::vector> ds_cones{{-2.0, 1.0, 0.5, -0.3, 0.1}}; + std::vector> lambda_cones{{5.0, 0.5, 1.0, -0.3, 0.2}}; + std::vector> dlambda_cones{{-1.0, 2.0, 1.0, -0.5, 0.4}}; + + auto s = make_device_vector(pack_cones(s_cones)); + auto ds = make_device_vector(pack_cones(ds_cones)); + auto lambda = 
make_device_vector(pack_cones(lambda_cones)); + auto dlambda = make_device_vector(pack_cones(dlambda_cones)); + auto d_offsets = make_device_vector(offsets); + rmm::device_uvector alpha(1, stream_); + + launch_step_length(s, ds, lambda, dlambda, alpha, d_offsets, 1, 100.0); + + auto a = copy_to_host(alpha)[0]; + ASSERT_GT(a, 0.0); + + // At alpha, at least one of (s, lambda) should be on the cone boundary. + auto s_bnd = s_cones[0]; + auto l_bnd = lambda_cones[0]; + for (std::size_t j = 0; j < s_bnd.size(); ++j) { + s_bnd[j] += a * ds_cones[0][j]; + l_bnd[j] += a * dlambda_cones[0][j]; + } + f_t res_s = j_norm_sq(s_bnd); + f_t res_l = j_norm_sq(l_bnd); + EXPECT_GE(res_s, -1e-10) << "s left the cone"; + EXPECT_GE(res_l, -1e-10) << "lambda left the cone"; + EXPECT_NEAR(std::min(res_s, res_l), 0.0, 1e-10) << "neither hit the boundary"; + + // At (1 − ε) α, both should be strictly interior. + f_t a_int = a * (1.0 - 1e-8); + auto s_int = s_cones[0]; + auto l_int = lambda_cones[0]; + for (std::size_t j = 0; j < s_int.size(); ++j) { + s_int[j] += a_int * ds_cones[0][j]; + l_int[j] += a_int * dlambda_cones[0][j]; + } + EXPECT_GT(j_norm_sq(s_int), 0.0) << "s not interior at (1-eps)*alpha"; + EXPECT_GT(j_norm_sq(l_int), 0.0) << "lambda not interior at (1-eps)*alpha"; +} + +TEST_F(second_order_cone_test, interior_shift_matches_reference_and_preserves_tail) +{ + std::vector dims{1, 3, 4}; + auto offsets = build_offsets(dims); + + std::vector> cones{{-0.25}, {2.0, 0.3, 0.4}, {0.5, 0.6, 0.8, 0.0}}; + auto packed = pack_cones(cones); + auto expected = pack_cones(std::vector>{ref_interior_shift_single(cones[0]), + ref_interior_shift_single(cones[1]), + ref_interior_shift_single(cones[2])}); + + auto u = make_device_vector(packed); + auto d_offsets = make_device_vector(offsets); + launch_interior_shift(u, d_offsets, static_cast(dims.size())); + + auto actual = copy_to_host(u); + expect_vector_near(actual, expected, 1e-12, 1e-10, "interior_shift"); + + for (std::size_t cone = 0; 
cone < dims.size(); ++cone) { + auto shifted = slice_cone(actual, offsets, static_cast(cone)); + EXPECT_GT(shifted[0], tail_norm(shifted)); + for (std::size_t j = 1; j < shifted.size(); ++j) { + EXPECT_EQ(shifted[j], cones[cone][j]) << "cone " << cone << " tail " << j; + } + } +} + +TEST_F(second_order_cone_test, apply_hinv2_matches_reference_for_packed_cones) +{ + std::vector dims{1, 3, 5}; + auto offsets = build_offsets(dims); + + std::vector> v_cones{{3.0}, {2.0, -1.0, 0.5}, {1.0, 0.25, -0.75, 0.5, -0.125}}; + std::vector> w_bar_cones{ + {1.0}, {0.0, 0.15, -0.05}, {0.0, 0.10, -0.20, 0.05, 0.15}}; + std::vector inv_eta_host{0.5, 1.25, 0.75}; + + for (std::size_t cone = 0; cone < w_bar_cones.size(); ++cone) { + f_t w1_sq = f_t(0); + for (std::size_t j = 1; j < w_bar_cones[cone].size(); ++j) { + w1_sq += w_bar_cones[cone][j] * w_bar_cones[cone][j]; + } + w_bar_cones[cone][0] = std::sqrt(f_t(1) + w1_sq); + } + + auto v = make_device_vector(pack_cones(v_cones)); + auto w_bar = make_device_vector(pack_cones(w_bar_cones)); + auto inv_eta = make_device_vector(inv_eta_host); + auto d_offsets = make_device_vector(offsets); + rmm::device_uvector out(v.size(), stream_); + + launch_apply_hinv2(v, out, w_bar, inv_eta, d_offsets, static_cast(dims.size())); + + auto actual = copy_to_host(out); + auto expected = pack_cones(std::vector>{ + ref_apply_hinv2_single(v_cones[0], w_bar_cones[0], inv_eta_host[0]), + ref_apply_hinv2_single(v_cones[1], w_bar_cones[1], inv_eta_host[1]), + ref_apply_hinv2_single(v_cones[2], w_bar_cones[2], inv_eta_host[2])}); + + expect_vector_near(actual, expected, 1e-12, 1e-10, "apply_hinv2"); +} + +TEST_F(second_order_cone_test, apply_hinv2_equals_double_hinv_with_nt_scaling) +{ + std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; + std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; + std::vector dims{3, 5}; + auto offsets = build_offsets(dims); + + cone_data_t cones(static_cast(dims.size()), dims, stream_); 
+ copy_to_device(cones.s, pack_cones(s_cones)); + copy_to_device(cones.lambda, pack_cones(lambda_cones)); + + launch_nt_scaling(cones, stream_); + + std::vector> v_cones{{1.0, -0.3, 0.2}, {0.5, 0.1, -0.15, 0.25, -0.1}}; + auto d_v = make_device_vector(pack_cones(v_cones)); + auto d_offsets = make_device_vector(offsets); + + // H^{-2} v (single kernel) + rmm::device_uvector d_hinv2(cones.omega.size(), stream_); + launch_apply_hinv2( + d_v, d_hinv2, cones.w_bar, cones.inv_eta, d_offsets, static_cast(dims.size())); + + // H^{-1}(H^{-1} v) (two passes) + rmm::device_uvector d_tmp(cones.omega.size(), stream_); + rmm::device_uvector d_double(cones.omega.size(), stream_); + launch_apply_hinv(d_v, + d_tmp, + cones.w_bar, + cones.inv_eta, + cones.inv_1pw0, + d_offsets, + static_cast(dims.size())); + launch_apply_hinv(d_tmp, + d_double, + cones.w_bar, + cones.inv_eta, + cones.inv_1pw0, + d_offsets, + static_cast(dims.size())); + + auto hinv2_actual = copy_to_host(d_hinv2); + auto double_actual = copy_to_host(d_double); + expect_vector_near(hinv2_actual, double_actual, 1e-10, 1e-8, "hinv2_vs_double_hinv"); +} + +TEST_F(second_order_cone_test, apply_hinv2_strided_loop_for_large_cone) +{ + std::vector dims{513}; + auto offsets = build_offsets(dims); + + auto s_cone = make_patterned_cone(dims[0], 5.0, 0.005); + auto lambda_cone = make_patterned_cone(dims[0], 4.0, 0.004); + + cone_data_t cones(1, dims, stream_); + copy_to_device(cones.s, s_cone); + copy_to_device(cones.lambda, lambda_cone); + + launch_nt_scaling(cones, stream_); + + auto v_cone = make_patterned_cone(dims[0], 3.0, 0.006); + auto d_v = make_device_vector(v_cone); + auto d_offsets = make_device_vector(offsets); + + // Direct H^{-2} apply + rmm::device_uvector d_hinv2(cones.omega.size(), stream_); + launch_apply_hinv2(d_v, d_hinv2, cones.w_bar, cones.inv_eta, d_offsets, 1); + + // Reference: two H^{-1} passes + rmm::device_uvector d_tmp(cones.omega.size(), stream_); + rmm::device_uvector d_double(cones.omega.size(), 
stream_); + launch_apply_hinv(d_v, d_tmp, cones.w_bar, cones.inv_eta, cones.inv_1pw0, d_offsets, 1); + launch_apply_hinv(d_tmp, d_double, cones.w_bar, cones.inv_eta, cones.inv_1pw0, d_offsets, 1); + + auto hinv2_actual = copy_to_host(d_hinv2); + auto double_actual = copy_to_host(d_double); + expect_vector_near(hinv2_actual, double_actual, 1e-8, 1e-6, "hinv2_large"); + + // Also check against CPU reference + auto w_bar_host = copy_to_host(cones.w_bar); + auto inv_eta_host = copy_to_host(cones.inv_eta); + auto ref = ref_apply_hinv2_single(v_cone, w_bar_host, inv_eta_host[0]); + expect_vector_near(hinv2_actual, ref, 1e-8, 1e-6, "hinv2_large_ref"); +} + +TEST_F(second_order_cone_test, jordan_product_matches_reference_for_packed_cones) +{ + std::vector dims{1, 3, 4}; + auto offsets = build_offsets(dims); + + std::vector> a_cones{{2.0}, {2.0, 1.0, -0.5}, {3.0, 0.25, -0.75, 0.5}}; + std::vector> b_cones{{4.0}, {1.5, -0.5, 0.25}, {2.0, -0.25, 0.5, 1.0}}; + + auto a = make_device_vector(pack_cones(a_cones)); + auto b = make_device_vector(pack_cones(b_cones)); + auto d_offsets = make_device_vector(offsets); + rmm::device_uvector out(a.size(), stream_); + + launch_jordan_product(a, b, out, d_offsets, static_cast(dims.size())); + + auto actual = copy_to_host(out); + auto expected = + pack_cones(std::vector>{ref_jordan_product_single(a_cones[0], b_cones[0]), + ref_jordan_product_single(a_cones[1], b_cones[1]), + ref_jordan_product_single(a_cones[2], b_cones[2])}); + + expect_vector_near(actual, expected, 1e-12, 1e-10, "jordan_product"); +} + +TEST_F(second_order_cone_test, inverse_jordan_product_matches_reference_and_identity) +{ + std::vector dims{1, 3, 5}; + auto offsets = build_offsets(dims); + + std::vector> omega_cones{ + {2.0}, {2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; + std::vector> r_cones{{4.0}, {1.0, -0.25, 0.5}, {2.0, 0.5, -0.25, 0.25, 0.75}}; + + std::vector rho_host; + for (const auto& w : omega_cones) { + rho_host.push_back(j_norm_sq(w)); + } + + auto 
d_omega = make_device_vector(pack_cones(omega_cones)); + auto d_r = make_device_vector(pack_cones(r_cones)); + auto d_rho = make_device_vector(rho_host); + auto d_offsets = make_device_vector(offsets); + rmm::device_uvector d_out(d_omega.size(), stream_); + + launch_inverse_jordan_product( + d_omega, d_r, d_rho, d_out, d_offsets, static_cast(dims.size())); + + auto actual = copy_to_host(d_out); + auto expected = pack_cones(std::vector>{ + ref_inverse_jordan_product_single(omega_cones[0], r_cones[0], rho_host[0]), + ref_inverse_jordan_product_single(omega_cones[1], r_cones[1], rho_host[1]), + ref_inverse_jordan_product_single(omega_cones[2], r_cones[2], rho_host[2])}); + + expect_vector_near(actual, expected, 1e-12, 1e-10, "inverse_jordan_product"); + + // Identity check: omega circ (omega \ r) = r + auto d_inv = make_device_vector(actual); + rmm::device_uvector d_roundtrip(d_omega.size(), stream_); + launch_jordan_product(d_omega, d_inv, d_roundtrip, d_offsets, static_cast(dims.size())); + + auto roundtrip = copy_to_host(d_roundtrip); + auto r_packed = pack_cones(r_cones); + expect_vector_near(roundtrip, r_packed, 1e-10, 1e-8, "inverse_identity"); +} + +TEST_F(second_order_cone_test, jordan_and_inverse_jordan_strided_loop_for_large_cone) +{ + std::vector dims{513}; + auto offsets = build_offsets(dims); + + auto a_cone = make_patterned_cone(dims[0], 5.0, 0.005); + auto b_cone = make_patterned_cone(dims[0], 4.0, 0.004); + auto omega_cone = make_patterned_cone(dims[0], 6.0, 0.003); + f_t rho_val = j_norm_sq(omega_cone); + ASSERT_GT(rho_val, 0.0); + + std::vector> a_cones{a_cone}; + std::vector> b_cones{b_cone}; + std::vector> omega_cones{omega_cone}; + + auto d_a = make_device_vector(pack_cones(a_cones)); + auto d_b = make_device_vector(pack_cones(b_cones)); + auto d_omega = make_device_vector(pack_cones(omega_cones)); + auto d_rho = make_device_vector(std::vector{rho_val}); + auto d_offsets = make_device_vector(offsets); + + // Jordan product: strided path + 
rmm::device_uvector d_jp(d_a.size(), stream_); + launch_jordan_product(d_a, d_b, d_jp, d_offsets, 1); + auto jp_actual = copy_to_host(d_jp); + auto jp_expected = ref_jordan_product_single(a_cone, b_cone); + expect_vector_near(jp_actual, jp_expected, 1e-10, 1e-8, "jordan_large"); + + // Inverse Jordan product: strided path + identity + auto r_cone = make_patterned_cone(dims[0], 3.0, 0.006); + std::vector> r_cones{r_cone}; + + auto d_r = make_device_vector(pack_cones(r_cones)); + rmm::device_uvector d_inv(d_omega.size(), stream_); + launch_inverse_jordan_product(d_omega, d_r, d_rho, d_inv, d_offsets, 1); + + auto inv_actual = copy_to_host(d_inv); + auto inv_expected = ref_inverse_jordan_product_single(omega_cone, r_cone, rho_val); + expect_vector_near(inv_actual, inv_expected, 1e-10, 1e-8, "inv_jordan_large"); + + // Round-trip identity on the large cone + auto d_inv_vec = make_device_vector(inv_actual); + rmm::device_uvector d_rt(d_omega.size(), stream_); + launch_jordan_product(d_omega, d_inv_vec, d_rt, d_offsets, 1); + auto rt_actual = copy_to_host(d_rt); + expect_vector_near(rt_actual, r_cone, 1e-8, 1e-6, "identity_large"); +} + +TEST_F(second_order_cone_test, inverse_jordan_product_with_nt_scaling_rho) +{ + std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; + std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; + std::vector dims{3, 5}; + auto offsets = build_offsets(dims); + + cone_data_t cones(static_cast(dims.size()), dims, stream_); + copy_to_device(cones.s, pack_cones(s_cones)); + copy_to_device(cones.lambda, pack_cones(lambda_cones)); + + launch_nt_scaling(cones, stream_); + + auto omega_host = copy_to_host(cones.omega); + auto rho_host = copy_to_host(cones.rho); + + // Build an arbitrary r vector, run inverse Jordan with NT-produced rho/omega + std::vector> r_cones{{1.0, -0.3, 0.2}, {0.5, 0.1, -0.15, 0.25, -0.1}}; + auto d_r = make_device_vector(pack_cones(r_cones)); + auto d_offsets = 
make_device_vector(offsets); + rmm::device_uvector d_out(cones.omega.size(), stream_); + + launch_inverse_jordan_product( + cones.omega, d_r, cones.rho, d_out, d_offsets, static_cast(dims.size())); + + auto inv_actual = copy_to_host(d_out); + + // Verify host reference matches using NT-produced values + for (i_t c = 0; c < static_cast(dims.size()); ++c) { + auto omega_c = slice_cone(omega_host, offsets, c); + auto r_c = r_cones[c]; + auto ref = ref_inverse_jordan_product_single(omega_c, r_c, rho_host[c]); + auto actual = slice_cone(inv_actual, offsets, c); + expect_vector_near(actual, ref, 1e-10, 1e-8, "nt_rho_inv_jordan"); + } + + // Round-trip identity with NT-produced omega + rmm::device_uvector d_rt(cones.omega.size(), stream_); + auto d_inv = make_device_vector(inv_actual); + launch_jordan_product(cones.omega, d_inv, d_rt, d_offsets, static_cast(dims.size())); + auto rt_actual = copy_to_host(d_rt); + auto r_packed = pack_cones(r_cones); + expect_vector_near(rt_actual, r_packed, 1e-8, 1e-6, "nt_identity"); +} + +TEST_F(second_order_cone_test, fused_corrector_matches_reference_with_nt_scaling) +{ + std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; + std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; + std::vector dims{3, 5}; + auto offsets = build_offsets(dims); + + cone_data_t cones(static_cast(dims.size()), dims, stream_); + copy_to_device(cones.s, pack_cones(s_cones)); + copy_to_device(cones.lambda, pack_cones(lambda_cones)); + + launch_nt_scaling(cones, stream_); + + std::vector> dx_aff_cones{{0.3, -0.1, 0.2}, {-0.5, 0.2, 0.1, -0.3, 0.15}}; + f_t sigma_mu = 0.1; + + auto d_dx_aff = make_device_vector(pack_cones(dx_aff_cones)); + auto d_offsets = make_device_vector(offsets); + rmm::device_uvector d_out(cones.omega.size(), stream_); + + launch_fused_corrector(d_dx_aff, + cones.omega, + cones.w_bar, + cones.inv_eta, + cones.inv_1pw0, + cones.rho, + sigma_mu, + d_out, + d_offsets, + static_cast(dims.size())); + + 
auto actual = copy_to_host(d_out); + auto omega_host = copy_to_host(cones.omega); + auto w_bar_host = copy_to_host(cones.w_bar); + auto inv_eta_h = copy_to_host(cones.inv_eta); + auto inv_1pw0_h = copy_to_host(cones.inv_1pw0); + auto rho_h = copy_to_host(cones.rho); + + for (i_t c = 0; c < static_cast(dims.size()); ++c) { + auto ref = ref_fused_corrector_single(dx_aff_cones[c], + slice_cone(omega_host, offsets, c), + slice_cone(w_bar_host, offsets, c), + inv_eta_h[c], + inv_1pw0_h[c], + rho_h[c], + sigma_mu); + auto act = slice_cone(actual, offsets, c); + expect_vector_near(act, ref, 1e-10, 1e-8, "fused_corrector"); + } +} + +TEST_F(second_order_cone_test, fused_corrector_strided_loop_for_large_cone) +{ + std::vector dims{513}; + auto offsets = build_offsets(dims); + + auto s_cone = make_patterned_cone(dims[0], 5.0, 0.005); + auto lambda_cone = make_patterned_cone(dims[0], 4.0, 0.004); + + cone_data_t cones(1, dims, stream_); + copy_to_device(cones.s, s_cone); + copy_to_device(cones.lambda, lambda_cone); + + launch_nt_scaling(cones, stream_); + + auto dx_aff_cone = make_patterned_cone(dims[0], 0.5, 0.003); + f_t sigma_mu = 0.25; + + auto d_dx_aff = make_device_vector(dx_aff_cone); + auto d_offsets = make_device_vector(offsets); + rmm::device_uvector d_out(cones.omega.size(), stream_); + + launch_fused_corrector(d_dx_aff, + cones.omega, + cones.w_bar, + cones.inv_eta, + cones.inv_1pw0, + cones.rho, + sigma_mu, + d_out, + d_offsets, + 1); + + auto actual = copy_to_host(d_out); + auto omega_host = copy_to_host(cones.omega); + auto w_bar_host = copy_to_host(cones.w_bar); + auto inv_eta_h = copy_to_host(cones.inv_eta); + auto inv_1pw0_h = copy_to_host(cones.inv_1pw0); + auto rho_h = copy_to_host(cones.rho); + + auto ref = ref_fused_corrector_single( + dx_aff_cone, omega_host, w_bar_host, inv_eta_h[0], inv_1pw0_h[0], rho_h[0], sigma_mu); + expect_vector_near(actual, ref, 1e-8, 1e-6, "fused_corrector_large"); +} + +} // namespace 
cuopt::linear_programming::dual_simplex::test From 9ee0ed76ba5d482d8e25b314d296b53b5be20a2f Mon Sep 17 00:00:00 2001 From: Yan Zaretskiy Date: Tue, 7 Apr 2026 12:26:17 -0700 Subject: [PATCH 02/22] feat(barrier): integrate H^-2 cone blocks into the augmented system Wire the SOCP cone Hessian updates into the augmented solve path and fold in the follow-up cleanup that removes now-redundant kernel plumbing. --- cpp/src/barrier/barrier.cu | 117 +++++- cpp/src/barrier/second_order_cone.cuh | 322 ++++++++++------ .../unit_tests/second_order_cone_test.cu | 360 +++++++++++++++--- 3 files changed, 625 insertions(+), 174 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 902e691e64..bb848e6037 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -34,6 +35,7 @@ #include #include +#include #include #include @@ -163,6 +165,8 @@ class iteration_data_t { d_inv_diag(lp.num_cols, lp.handle_ptr->get_stream()), d_cols_to_remove(0, lp.handle_ptr->get_stream()), d_augmented_diagonal_indices_(0, lp.handle_ptr->get_stream()), + d_cone_csr_indices_(0, lp.handle_ptr->get_stream()), + d_cone_Q_values_(0, lp.handle_ptr->get_stream()), use_augmented(false), has_factorization(false), num_factorizations(0), @@ -321,6 +325,11 @@ class iteration_data_t { use_augmented = !Q_diagonal; } + if (cones_.has_value() && !use_augmented) { + n_dense_columns = 0; + use_augmented = true; + } + if (use_augmented) { settings.log.printf("Linear system : augmented\n"); } else { @@ -427,16 +436,87 @@ class iteration_data_t { i_t factorization_size = n + m; const f_t dual_perturb = 0.0; const f_t primal_perturb = 1e-6; + + const bool has_cones = cones_.has_value() && cones_->K > 0; + const i_t m_c = has_cones ? 
cones_->m_c : 0; + i_t total_block_nnz = 0; + + std::vector cone_offsets_host; + std::vector cone_block_offsets_host; + if (has_cones) { + cone_offsets_host.resize(cones_->K + 1); + cone_block_offsets_host.resize(cones_->K + 1); + raft::copy( + cone_offsets_host.data(), cones_->cone_offsets.data(), cones_->K + 1, stream_view_); + raft::copy( + cone_block_offsets_host.data(), cones_->block_offsets.data(), cones_->K + 1, stream_view_); + handle_ptr->sync_stream(); + total_block_nnz = cone_block_offsets_host[cones_->K]; + } + if (first_call) { - i_t new_nnz = 2 * nnzA + n + m + nnzQ; + i_t new_nnz = 2 * nnzA + n + m + nnzQ + total_block_nnz; csr_matrix_t augmented_CSR(n + m, n + m, new_nnz); std::vector augmented_diagonal_indices(n + m, -1); + std::vector cone_csr_indices_host(total_block_nnz, -1); + std::vector cone_Q_values_host(total_block_nnz, f_t(0)); i_t q = 0; i_t off_diag_Qnz = 0; for (i_t i = 0; i < n; i++) { augmented_CSR.row_start[i] = q; - if (nnzQ == 0) { + + const bool is_cone_row = has_cones && i >= cone_var_start_ && i < cone_var_start_ + m_c; + + if (is_cone_row) { + // Determine which cone this variable belongs to and its local row + i_t local_idx = i - cone_var_start_; + i_t k = 0; + while (k + 1 < cones_->K && cone_offsets_host[k + 1] <= local_idx) { + k++; + } + i_t local_r = local_idx - cone_offsets_host[k]; + i_t q_k = cone_offsets_host[k + 1] - cone_offsets_host[k]; + i_t cone_col_start = cone_var_start_ + cone_offsets_host[k]; + i_t block_base = cone_block_offsets_host[k] + local_r * q_k; + + // Merge-join: Q entries (sorted) with dense cone block columns (contiguous) + i_t qp = (nnzQ > 0) ? Q.col_start[i] : 0; + i_t q_end = (nnzQ > 0) ? 
Q.col_start[i + 1] : 0; + + // Q entries before cone block + while (qp < q_end && Q.i[qp] < cone_col_start) { + augmented_CSR.j[q] = Q.i[qp]; + augmented_CSR.x[q++] = -Q.x[qp]; + off_diag_Qnz++; + qp++; + } + + // Dense cone block, absorbing any Q entries that fall inside + for (i_t c = 0; c < q_k; c++) { + i_t col = cone_col_start + c; + f_t q_val = (c == local_r) ? dual_perturb : f_t(0); + + if (qp < q_end && Q.i[qp] == col) { + q_val += Q.x[qp]; + qp++; + } + + cone_csr_indices_host[block_base + c] = q; + cone_Q_values_host[block_base + c] = q_val; + if (col == i) { augmented_diagonal_indices[i] = q; } + augmented_CSR.j[q] = col; + augmented_CSR.x[q++] = f_t(0); + } + + // Q entries after cone block + while (qp < q_end) { + augmented_CSR.j[q] = Q.i[qp]; + augmented_CSR.x[q++] = -Q.x[qp]; + off_diag_Qnz++; + qp++; + } + } else if (nnzQ == 0) { augmented_diagonal_indices[i] = q; augmented_CSR.j[q] = i; augmented_CSR.x[q++] = -diag[i] - dual_perturb; @@ -489,8 +569,9 @@ class iteration_data_t { augmented_CSR.nz_max = q; augmented_CSR.j.resize(q); augmented_CSR.x.resize(q); - settings_.log.debug("augmented nz %d predicted %d\n", q, off_diag_Qnz + nnzA + n); - cuopt_assert(q == 2 * nnzA + n + m + off_diag_Qnz, "augmented nnz != predicted"); + i_t expected_nnz = 2 * nnzA + (n - m_c) + total_block_nnz + m + off_diag_Qnz; + settings_.log.debug("augmented nz %d predicted %d\n", q, expected_nnz); + cuopt_assert(q == expected_nnz, "augmented nnz != predicted"); cuopt_assert(A.col_start[n] == AT.col_start[m], "A nz != AT nz"); device_augmented.copy(augmented_CSR, handle_ptr->get_stream()); @@ -500,6 +581,20 @@ class iteration_data_t { augmented_diagonal_indices.data(), augmented_diagonal_indices.size(), handle_ptr->get_stream()); + + if (has_cones) { + d_cone_csr_indices_.resize(total_block_nnz, handle_ptr->get_stream()); + raft::copy(d_cone_csr_indices_.data(), + cone_csr_indices_host.data(), + total_block_nnz, + handle_ptr->get_stream()); + 
d_cone_Q_values_.resize(total_block_nnz, handle_ptr->get_stream()); + raft::copy(d_cone_Q_values_.data(), + cone_Q_values_host.data(), + total_block_nnz, + handle_ptr->get_stream()); + } + handle_ptr->sync_stream(); #ifdef CHECK_SYMMETRY csc_matrix_t augmented_transpose(1, 1, 1); @@ -538,6 +633,15 @@ class iteration_data_t { span_x[span_diag_indices[j]] = primal_perturb_value; }); RAFT_CHECK_CUDA(handle_ptr->get_stream()); + + if (has_cones) { + scatter_hinv2_into_augmented(*cones_, + device_augmented.x, + d_cone_csr_indices_, + d_cone_Q_values_, + handle_ptr->get_stream()); + RAFT_CHECK_CUDA(handle_ptr->get_stream()); + } } } @@ -1530,9 +1634,14 @@ class iteration_data_t { std::vector Qdiag; bool Q_diagonal; rmm::device_uvector d_augmented_diagonal_indices_; + rmm::device_uvector d_cone_csr_indices_; + rmm::device_uvector d_cone_Q_values_; bool indefinite_Q; cusparse_view_t cusparse_Q_view_; + std::optional> cones_; + i_t cone_var_start_ = 0; + bool use_augmented; i_t symbolic_status; diff --git a/cpp/src/barrier/second_order_cone.cuh b/cpp/src/barrier/second_order_cone.cuh index eb337b9ce1..ca23ed39cc 100644 --- a/cpp/src/barrier/second_order_cone.cuh +++ b/cpp/src/barrier/second_order_cone.cuh @@ -12,12 +12,18 @@ #include +#include #include #include #include +#include + +#include +#include +#include #include #include #include @@ -135,24 +141,25 @@ DI triplet_t reduce_broadcast(triplet_t val, smem_warp_reduce_t -__global__ __launch_bounds__(BLOCK_DIM) void apply_Hinv_kernel(const f_t* __restrict__ z, - f_t* __restrict__ out, - const f_t* __restrict__ w_bar, - const f_t* __restrict__ inv_eta, - const f_t* __restrict__ inv_1pw0, - const i_t* __restrict__ cone_offsets, - i_t K) +__global__ __launch_bounds__(BLOCK_DIM) void apply_Hinv_kernel( + raft::device_span z, + raft::device_span out, + raft::device_span w_bar, + raft::device_span inv_eta, + raft::device_span inv_1pw0, + raft::device_span cone_offsets, + i_t K) { __shared__ smem_reduce_t smem; i_t cone = 
static_cast(blockIdx.x); if (cone >= K) return; - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - const f_t* w_cone = w_bar + off; - const f_t* z_cone = z + off; - f_t* out_cone = out + off; + i_t off = cone_offsets[cone]; + i_t q = cone_offsets[cone + 1] - off; + auto w_cone = w_bar.subspan(off, q); + auto z_cone = z.subspan(off, q); + auto out_cone = out.subspan(off, q); f_t z0 = z_cone[0]; f_t w0 = w_cone[0]; @@ -184,11 +191,11 @@ __global__ __launch_bounds__(BLOCK_DIM) void apply_Hinv_kernel(const f_t* __rest // --------------------------------------------------------------------------- template __global__ __launch_bounds__(BLOCK_DIM) void apply_Hinv2_kernel( - const f_t* __restrict__ v, - f_t* __restrict__ out, - const f_t* __restrict__ w_bar, - const f_t* __restrict__ inv_eta, - const i_t* __restrict__ cone_offsets, + raft::device_span v, + raft::device_span out, + raft::device_span w_bar, + raft::device_span inv_eta, + raft::device_span cone_offsets, i_t K) { __shared__ smem_reduce_t smem; @@ -196,11 +203,11 @@ __global__ __launch_bounds__(BLOCK_DIM) void apply_Hinv2_kernel( i_t cone = static_cast(blockIdx.x); if (cone >= K) return; - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - const f_t* w_cone = w_bar + off; - const f_t* v_cone = v + off; - f_t* out_cone = out + off; + i_t off = cone_offsets[cone]; + i_t q = cone_offsets[cone + 1] - off; + auto w_cone = w_bar.subspan(off, q); + auto v_cone = v.subspan(off, q); + auto out_cone = out.subspan(off, q); f_t v0 = v_cone[0]; f_t w0 = w_cone[0]; @@ -237,10 +244,10 @@ __global__ __launch_bounds__(BLOCK_DIM) void apply_Hinv2_kernel( // --------------------------------------------------------------------------- template __global__ __launch_bounds__(BLOCK_DIM) void jordan_product_kernel( - const f_t* __restrict__ a, - const f_t* __restrict__ b, - f_t* __restrict__ out, - const i_t* __restrict__ cone_offsets, + raft::device_span a, + raft::device_span b, + 
raft::device_span out, + raft::device_span cone_offsets, i_t K) { __shared__ smem_reduce_t smem; @@ -248,11 +255,11 @@ __global__ __launch_bounds__(BLOCK_DIM) void jordan_product_kernel( i_t cone = static_cast(blockIdx.x); if (cone >= K) return; - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - const f_t* a_cone = a + off; - const f_t* b_cone = b + off; - f_t* out_cone = out + off; + i_t off = cone_offsets[cone]; + i_t q = cone_offsets[cone + 1] - off; + auto a_cone = a.subspan(off, q); + auto b_cone = b.subspan(off, q); + auto out_cone = out.subspan(off, q); f_t a0 = a_cone[0]; f_t b0 = b_cone[0]; @@ -280,11 +287,11 @@ __global__ __launch_bounds__(BLOCK_DIM) void jordan_product_kernel( // --------------------------------------------------------------------------- template __global__ __launch_bounds__(BLOCK_DIM) void inverse_jordan_product_kernel( - const f_t* __restrict__ omega, - const f_t* __restrict__ r, - const f_t* __restrict__ rho, - f_t* __restrict__ out, - const i_t* __restrict__ cone_offsets, + raft::device_span omega, + raft::device_span r, + raft::device_span rho, + raft::device_span out, + raft::device_span cone_offsets, i_t K) { __shared__ smem_reduce_t smem; @@ -292,11 +299,11 @@ __global__ __launch_bounds__(BLOCK_DIM) void inverse_jordan_product_kernel( i_t cone = static_cast(blockIdx.x); if (cone >= K) return; - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - const f_t* omega_cone = omega + off; - const f_t* r_cone = r + off; - f_t* out_cone = out + off; + i_t off = cone_offsets[cone]; + i_t q = cone_offsets[cone + 1] - off; + auto omega_cone = omega.subspan(off, q); + auto r_cone = r.subspan(off, q); + auto out_cone = out.subspan(off, q); f_t omega_0 = omega_cone[0]; f_t r_0 = r_cone[0]; @@ -337,15 +344,15 @@ __global__ __launch_bounds__(BLOCK_DIM) void inverse_jordan_product_kernel( // --------------------------------------------------------------------------- template __global__ 
__launch_bounds__(BLOCK_DIM) void fused_corrector_kernel( - const f_t* __restrict__ dx_aff, - const f_t* __restrict__ omega, - const f_t* __restrict__ w_bar, - const f_t* __restrict__ inv_eta, - const f_t* __restrict__ inv_1pw0, - const f_t* __restrict__ rho, + raft::device_span dx_aff, + raft::device_span omega, + raft::device_span w_bar, + raft::device_span inv_eta, + raft::device_span inv_1pw0, + raft::device_span rho, f_t sigma_mu, - f_t* __restrict__ out, - const i_t* __restrict__ cone_offsets, + raft::device_span out, + raft::device_span cone_offsets, i_t K) { __shared__ smem_reduce_t smem; @@ -353,12 +360,12 @@ __global__ __launch_bounds__(BLOCK_DIM) void fused_corrector_kernel( i_t cone = static_cast(blockIdx.x); if (cone >= K) return; - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - const f_t* dx_a = dx_aff + off; - const f_t* omega_cone = omega + off; - const f_t* w_cone = w_bar + off; - f_t* out_cone = out + off; + i_t off = cone_offsets[cone]; + i_t q = cone_offsets[cone + 1] - off; + auto dx_a = dx_aff.subspan(off, q); + auto omega_cone = omega.subspan(off, q); + auto w_cone = w_bar.subspan(off, q); + auto out_cone = out.subspan(off, q); f_t ie = inv_eta[cone]; f_t ipw = inv_1pw0[cone]; @@ -473,17 +480,18 @@ struct nt_warp_storage { }; template -__global__ __launch_bounds__(BLOCK_DIM) void nt_scaling_kernel(const f_t* __restrict__ s, - const f_t* __restrict__ lambda, - f_t* __restrict__ eta, - f_t* __restrict__ inv_eta, - f_t* __restrict__ inv_1pw0, - f_t* __restrict__ w_bar, - f_t* __restrict__ omega, - f_t* __restrict__ rho, - const i_t* __restrict__ cone_offsets, - const i_t* __restrict__ cone_ids, - i_t num_cones) +__global__ __launch_bounds__(BLOCK_DIM) void nt_scaling_kernel( + raft::device_span s, + raft::device_span lambda, + raft::device_span eta, + raft::device_span inv_eta, + raft::device_span inv_1pw0, + raft::device_span w_bar, + raft::device_span omega, + raft::device_span rho, + raft::device_span cone_offsets, + 
raft::device_span cone_ids, + i_t num_cones) { static_assert(BLOCK_DIM % 32 == 0, "NT scaling kernel requires warp-aligned BLOCK_DIM"); __shared__ nt_block_storage storage; @@ -571,16 +579,16 @@ __global__ __launch_bounds__(BLOCK_DIM) void nt_scaling_kernel(const f_t* __rest template __global__ __launch_bounds__(BLOCK_DIM) void nt_scaling_small_kernel( - const f_t* __restrict__ s, - const f_t* __restrict__ lambda, - f_t* __restrict__ eta, - f_t* __restrict__ inv_eta, - f_t* __restrict__ inv_1pw0, - f_t* __restrict__ w_bar, - f_t* __restrict__ omega, - f_t* __restrict__ rho, - const i_t* __restrict__ cone_offsets, - const i_t* __restrict__ cone_ids, + raft::device_span s, + raft::device_span lambda, + raft::device_span eta, + raft::device_span inv_eta, + raft::device_span inv_1pw0, + raft::device_span w_bar, + raft::device_span omega, + raft::device_span rho, + raft::device_span cone_offsets, + raft::device_span cone_ids, i_t num_cones) { static_assert(BLOCK_DIM % 32 == 0, "Small-cone NT kernel requires warp-aligned CTAs"); @@ -671,12 +679,12 @@ __global__ __launch_bounds__(BLOCK_DIM) void nt_scaling_small_kernel( // --------------------------------------------------------------------------- template DI f_t -cone_step_length_single(const f_t* __restrict__ u, - const f_t* __restrict__ du, - i_t q, +cone_step_length_single(raft::device_span u, + raft::device_span du, typename block_reduce_t, BLOCK_DIM>::TempStorage& temp, f_t alpha) { + i_t q = static_cast(u.size()); auto partial = triplet_t{}; auto& [du1_sq_p, u1du1_p, u1_sq_p] = partial; for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { @@ -731,12 +739,12 @@ cone_step_length_single(const f_t* __restrict__ u, // --------------------------------------------------------------------------- template __global__ __launch_bounds__(BLOCK_DIM) void step_length_kernel( - const f_t* __restrict__ s, - const f_t* __restrict__ ds, - const f_t* __restrict__ lambda, - const f_t* __restrict__ dlambda, - f_t* 
__restrict__ alpha, - const i_t* __restrict__ cone_offsets, + raft::device_span s, + raft::device_span ds, + raft::device_span lambda, + raft::device_span dlambda, + raft::device_span alpha, + raft::device_span cone_offsets, i_t K, f_t alpha_max) { @@ -748,10 +756,10 @@ __global__ __launch_bounds__(BLOCK_DIM) void step_length_kernel( i_t off = cone_offsets[cone]; i_t q = cone_offsets[cone + 1] - off; - f_t alpha_s = - cone_step_length_single(s + off, ds + off, q, temp_storage, alpha_max); + f_t alpha_s = cone_step_length_single( + s.subspan(off, q), ds.subspan(off, q), temp_storage, alpha_max); f_t alpha_l = cone_step_length_single( - lambda + off, dlambda + off, q, temp_storage, alpha_max); + lambda.subspan(off, q), dlambda.subspan(off, q), temp_storage, alpha_max); if (threadIdx.x == 0) { alpha[cone] = min(alpha_s, alpha_l); } } @@ -766,7 +774,7 @@ __global__ __launch_bounds__(BLOCK_DIM) void step_length_kernel( // --------------------------------------------------------------------------- template __global__ __launch_bounds__(BLOCK_DIM) void interior_shift_kernel( - f_t* __restrict__ u, const i_t* __restrict__ cone_offsets, i_t K) + raft::device_span u, raft::device_span cone_offsets, i_t K) { __shared__ typename block_reduce_t::TempStorage temp_storage; @@ -791,11 +799,15 @@ __global__ __launch_bounds__(BLOCK_DIM) void interior_shift_kernel( } /** - * Owns device storage for second-order cone topology, iterates, and NT scaling. + * Device storage for second-order cone topology, NT scaling, and iterate views. * * Flat arrays are packed by cone: elements [cone_offsets[i], cone_offsets[i+1]) * belong to cone i, which has dimension cone_dims[i]. * + * Primal/dual iterates (s, lambda) are non-owning spans, pre-sliced by the + * caller to cover the cone portion of the global x/z vectors. The caller + * must keep the underlying memory alive. 
+ * * Search directions, RHS vectors, and workspace live directly in * iteration_data_t (matching the existing LP/QP pattern where dx_aff, dual_rhs, * etc. are all top-level fields of iteration_data_t). @@ -806,12 +818,13 @@ struct cone_data_t { i_t K; // number of second-order cones i_t m_c; // total cone dimension = sum of cone_dims - rmm::device_uvector cone_offsets; // [K+1] prefix sums of cone_dims - rmm::device_uvector cone_dims; // [K] dimension q_i of each cone + rmm::device_uvector cone_offsets; // [K+1] prefix sums of cone_dims + rmm::device_uvector cone_dims; // [K] dimension q_i of each cone + rmm::device_uvector block_offsets; // [K+1] prefix sums of q_i^2 (for dense block build) - // --- Primal/dual cone iterates (rewritten each iteration) --- - rmm::device_uvector s; // [m_c] cone slack: s_i in int(Q^{q_i}) - rmm::device_uvector lambda; // [m_c] cone dual: lambda_i in int(Q^{q_i}) + // --- Primal/dual cone iterates (non-owning views, set by caller) --- + raft::device_span s; // [m_c] cone slack: s_i in int(Q^{q_i}) + raft::device_span lambda; // [m_c] cone dual: lambda_i in int(Q^{q_i}) // --- NT scaling state (recomputed each iteration from s, lambda) --- rmm::device_uvector eta; // [K] scaling factor eta_i = (||s_i||_J / ||lambda_i||_J)^{1/2} @@ -824,13 +837,18 @@ struct cone_data_t { rmm::device_uvector medium_cone_ids; // [n_medium] cone ids with 32 < q <= 2048 rmm::device_uvector large_cone_ids; // [n_large] cone ids with q > 2048 - cone_data_t(i_t K_in, const std::vector& dims, rmm::cuda_stream_view stream) + cone_data_t(i_t K_in, + const std::vector& dims, + raft::device_span s_in, + raft::device_span lambda_in, + rmm::cuda_stream_view stream) : K(K_in), m_c(std::accumulate(dims.begin(), dims.end(), i_t(0))), cone_offsets(K_in + 1, stream), cone_dims(K_in, stream), - s(m_c, stream), - lambda(m_c, stream), + block_offsets(K_in + 1, stream), + s(s_in), + lambda(lambda_in), eta(K_in, stream), inv_eta(K_in, stream), inv_1pw0(K_in, stream), @@ 
-842,12 +860,14 @@ struct cone_data_t { large_cone_ids(0, stream) { std::vector offsets(K + 1, 0); + std::vector blk_offsets(K + 1, 0); std::vector small_ids; std::vector medium_ids; std::vector large_ids; for (i_t i = 0; i < K; ++i) { - offsets[i + 1] = offsets[i] + dims[i]; + offsets[i + 1] = offsets[i] + dims[i]; + blk_offsets[i + 1] = blk_offsets[i] + dims[i] * dims[i]; if (dims[i] <= small_cone_limit) { small_ids.push_back(i); } else if (dims[i] <= medium_cone_limit) { @@ -866,12 +886,76 @@ struct cone_data_t { raft::copy(cone_offsets.data(), offsets.data(), K + 1, stream); raft::copy(cone_dims.data(), dims.data(), K, stream); + raft::copy(block_offsets.data(), blk_offsets.data(), K + 1, stream); init_device_vec(small_cone_ids, small_ids); init_device_vec(medium_cone_ids, medium_ids); init_device_vec(large_cone_ids, large_ids); } }; +// --------------------------------------------------------------------------- +// Compute flat H^{-2} cone-block entries and scatter them into the augmented +// CSR value array. +// +// The caller provides one flat entry per dense cone-block element: +// - `csr_indices[e]` gives the destination slot in `augmented_x` +// - `q_values[e]` stores any pre-merged Q contribution for that slot +// +// For each flat entry we identify its owning cone from `block_offsets`, +// recover local (r, c) coordinates, evaluate H_k^{-2}(r, c), and scatter +// -(H_k^{-2}(r, c) + q_values[e]) +// into `augmented_x[csr_indices[e]]`. 
+// --------------------------------------------------------------------------- +template +void scatter_hinv2_into_augmented(const cone_data_t& cones, + rmm::device_uvector& augmented_x, + const rmm::device_uvector& csr_indices, + const rmm::device_uvector& q_values, + rmm::cuda_stream_view stream) +{ + i_t count = static_cast(csr_indices.size()); + if (count == 0) return; + + auto values = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [span_w_bar = cuopt::make_span(cones.w_bar), + span_inv_eta = cuopt::make_span(cones.inv_eta), + span_block_offsets = cuopt::make_span(cones.block_offsets), + span_cone_offsets = cuopt::make_span(cones.cone_offsets), + span_q_values = cuopt::make_span(q_values)] __device__(i_t e) -> f_t { + i_t lo = 0; + i_t hi = static_cast(span_block_offsets.size()) - 1; + while (lo + 1 < hi) { + i_t mid = lo + (hi - lo) / 2; + if (span_block_offsets[mid] <= e) { + lo = mid; + } else { + hi = mid; + } + } + i_t cone = lo; + i_t off = span_cone_offsets[cone]; + i_t q = span_cone_offsets[cone + 1] - off; + i_t blk_off = span_block_offsets[cone]; + i_t local = e - blk_off; + i_t r = local / q; + i_t c = local % q; + + f_t ie_sq = span_inv_eta[cone] * span_inv_eta[cone]; + f_t w0 = span_w_bar[off]; + f_t u_r = (r == 0) ? w0 : -span_w_bar[off + r]; + f_t u_c = (c == 0) ? w0 : -span_w_bar[off + c]; + f_t val = f_t(2) * u_r * ie_sq * u_c; + f_t diag_correction = (r == 0) ? 
-ie_sq : ie_sq; + if (r == c) { val += diag_correction; } + + return -val - span_q_values[e]; + }); + + thrust::scatter( + rmm::exec_policy(stream), values, values + count, csr_indices.begin(), augmented_x.begin()); +} + template void launch_nt_scaling(cone_data_t& cones, rmm::cuda_stream_view stream) { @@ -881,16 +965,16 @@ void launch_nt_scaling(cone_data_t& cones, rmm::cuda_stream_view strea if (bucket_size == 0) return; nt_scaling_kernel - <<>>(cones.s.data(), - cones.lambda.data(), - cones.eta.data(), - cones.inv_eta.data(), - cones.inv_1pw0.data(), - cones.w_bar.data(), - cones.omega.data(), - cones.rho.data(), - cones.cone_offsets.data(), - cone_ids.data(), + <<>>(cones.s, + cones.lambda, + cuopt::make_span(cones.eta), + cuopt::make_span(cones.inv_eta), + cuopt::make_span(cones.inv_1pw0), + cuopt::make_span(cones.w_bar), + cuopt::make_span(cones.omega), + cuopt::make_span(cones.rho), + cuopt::make_span(cones.cone_offsets), + cuopt::make_span(cone_ids), bucket_size); }; @@ -899,16 +983,16 @@ void launch_nt_scaling(cone_data_t& cones, rmm::cuda_stream_view strea constexpr int warps_per_block = small_block_dim / 32; i_t grid_dim = (small_count + warps_per_block - 1) / warps_per_block; nt_scaling_small_kernel - <<>>(cones.s.data(), - cones.lambda.data(), - cones.eta.data(), - cones.inv_eta.data(), - cones.inv_1pw0.data(), - cones.w_bar.data(), - cones.omega.data(), - cones.rho.data(), - cones.cone_offsets.data(), - cones.small_cone_ids.data(), + <<>>(cones.s, + cones.lambda, + cuopt::make_span(cones.eta), + cuopt::make_span(cones.inv_eta), + cuopt::make_span(cones.inv_1pw0), + cuopt::make_span(cones.w_bar), + cuopt::make_span(cones.omega), + cuopt::make_span(cones.rho), + cuopt::make_span(cones.cone_offsets), + cuopt::make_span(cones.small_cone_ids), small_count); } diff --git a/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu b/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu index 2f7d51dcae..611a236562 100644 --- 
a/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu +++ b/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu @@ -121,6 +121,24 @@ auto ref_apply_H_single(const std::vector& z, return out; } +template +auto ref_build_hinv2_block_single(const std::vector& w_bar, f_t inv_eta) -> std::vector +{ + std::size_t q = w_bar.size(); + std::vector block(q * q, f_t(0)); + f_t ie_sq = inv_eta * inv_eta; + + for (std::size_t r = 0; r < q; ++r) { + f_t u_r = (r == 0) ? w_bar[0] : -w_bar[r]; + for (std::size_t c = 0; c < q; ++c) { + f_t u_c = (c == 0) ? w_bar[0] : -w_bar[c]; + f_t j_rc = (r == c) ? ((r == 0) ? f_t(1) : f_t(-1)) : f_t(0); + block[r * q + c] = ie_sq * (f_t(2) * u_r * u_c - j_rc); + } + } + return block; +} + template auto ref_apply_hinv2_single(const std::vector& v, const std::vector& w_bar, f_t inv_eta) -> std::vector @@ -391,8 +409,13 @@ class second_order_cone_test : public ::testing::Test { const rmm::device_uvector& cone_offsets, i_t k) { - apply_Hinv_kernel<<>>( - z.data(), out.data(), w_bar.data(), inv_eta.data(), inv_1pw0.data(), cone_offsets.data(), k); + apply_Hinv_kernel<<>>(cuopt::make_span(z), + cuopt::make_span(out), + cuopt::make_span(w_bar), + cuopt::make_span(inv_eta), + cuopt::make_span(inv_1pw0), + cuopt::make_span(cone_offsets), + k); RAFT_CUDA_TRY(cudaPeekAtLastError()); sync(); } @@ -406,12 +429,12 @@ class second_order_cone_test : public ::testing::Test { i_t k, f_t alpha_max) { - step_length_kernel<<>>(s.data(), - ds.data(), - lambda.data(), - dlambda.data(), - alpha.data(), - cone_offsets.data(), + step_length_kernel<<>>(cuopt::make_span(s), + cuopt::make_span(ds), + cuopt::make_span(lambda), + cuopt::make_span(dlambda), + cuopt::make_span(alpha), + cuopt::make_span(cone_offsets), k, alpha_max); RAFT_CUDA_TRY(cudaPeekAtLastError()); @@ -422,7 +445,8 @@ class second_order_cone_test : public ::testing::Test { const rmm::device_uvector& cone_offsets, i_t k) { - interior_shift_kernel<<>>(u.data(), cone_offsets.data(), k); + 
interior_shift_kernel + <<>>(cuopt::make_span(u), cuopt::make_span(cone_offsets), k); RAFT_CUDA_TRY(cudaPeekAtLastError()); sync(); } @@ -434,8 +458,12 @@ class second_order_cone_test : public ::testing::Test { const rmm::device_uvector& cone_offsets, i_t k) { - apply_Hinv2_kernel<<>>( - v.data(), out.data(), w_bar.data(), inv_eta.data(), cone_offsets.data(), k); + apply_Hinv2_kernel<<>>(cuopt::make_span(v), + cuopt::make_span(out), + cuopt::make_span(w_bar), + cuopt::make_span(inv_eta), + cuopt::make_span(cone_offsets), + k); RAFT_CUDA_TRY(cudaPeekAtLastError()); sync(); } @@ -446,8 +474,11 @@ class second_order_cone_test : public ::testing::Test { const rmm::device_uvector& cone_offsets, i_t k) { - jordan_product_kernel - <<>>(a.data(), b.data(), out.data(), cone_offsets.data(), k); + jordan_product_kernel<<>>(cuopt::make_span(a), + cuopt::make_span(b), + cuopt::make_span(out), + cuopt::make_span(cone_offsets), + k); RAFT_CUDA_TRY(cudaPeekAtLastError()); sync(); } @@ -459,8 +490,13 @@ class second_order_cone_test : public ::testing::Test { const rmm::device_uvector& cone_offsets, i_t k) { - inverse_jordan_product_kernel<<>>( - omega.data(), r.data(), rho.data(), out.data(), cone_offsets.data(), k); + inverse_jordan_product_kernel + <<>>(cuopt::make_span(omega), + cuopt::make_span(r), + cuopt::make_span(rho), + cuopt::make_span(out), + cuopt::make_span(cone_offsets), + k); RAFT_CUDA_TRY(cudaPeekAtLastError()); sync(); } @@ -476,25 +512,34 @@ class second_order_cone_test : public ::testing::Test { const rmm::device_uvector& cone_offsets, i_t k) { - fused_corrector_kernel<<>>(dx_aff.data(), - omega.data(), - w_bar.data(), - inv_eta.data(), - inv_1pw0.data(), - rho.data(), + fused_corrector_kernel<<>>(cuopt::make_span(dx_aff), + cuopt::make_span(omega), + cuopt::make_span(w_bar), + cuopt::make_span(inv_eta), + cuopt::make_span(inv_1pw0), + cuopt::make_span(rho), sigma_mu, - out.data(), - cone_offsets.data(), + cuopt::make_span(out), + cuopt::make_span(cone_offsets), 
k); RAFT_CUDA_TRY(cudaPeekAtLastError()); sync(); } + + void launch_cone_block_scatter(const cone_data_t& cones, + rmm::device_uvector& aug_x, + const rmm::device_uvector& csr_indices, + const rmm::device_uvector& q_values) + { + scatter_hinv2_into_augmented(cones, aug_x, csr_indices, q_values, stream_); + sync(); + } }; TEST_F(second_order_cone_test, cone_data_topology_and_bucket_partitioning) { std::vector dims{1, 32, 33, 2048, 2049}; - cone_data_t cones(static_cast(dims.size()), dims, stream_); + cone_data_t cones(static_cast(dims.size()), dims, {}, {}, stream_); auto expected_offsets = build_offsets(dims); auto actual_offsets = copy_to_host(cones.cone_offsets); @@ -520,9 +565,9 @@ TEST_F(second_order_cone_test, nt_scaling_matches_reference_for_small_cone) std::vector> lambda_cones{{2.0, 0.5, 0.5}}; std::vector dims{3}; - cone_data_t cones(1, dims, stream_); - copy_to_device(cones.s, pack_cones(s_cones)); - copy_to_device(cones.lambda, pack_cones(lambda_cones)); + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); launch_nt_scaling(cones, stream_); @@ -558,9 +603,13 @@ TEST_F(second_order_cone_test, nt_scaling_matches_reference_across_bucket_sizes) std::vector dims{1, 33, 2049}; auto offsets = build_offsets(dims); - cone_data_t cones(static_cast(dims.size()), dims, stream_); - copy_to_device(cones.s, pack_cones(s_cones)); - copy_to_device(cones.lambda, pack_cones(lambda_cones)); + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + cone_data_t cones(static_cast(dims.size()), + dims, + cuopt::make_span(d_s), + cuopt::make_span(d_lambda), + stream_); launch_nt_scaling(cones, stream_); @@ -595,9 +644,9 @@ TEST_F(second_order_cone_test, nt_scaling_omega_equals_H_times_lambda) std::vector> lambda_cones{{4.0, 0.5, 1.0, -0.3, 0.2}}; std::vector 
dims{5}; - cone_data_t cones(1, dims, stream_); - copy_to_device(cones.s, pack_cones(s_cones)); - copy_to_device(cones.lambda, pack_cones(lambda_cones)); + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); launch_nt_scaling(cones, stream_); @@ -618,9 +667,9 @@ TEST_F(second_order_cone_test, nt_scaling_near_boundary_is_stable) std::vector> lambda_cones{{1.000015, 0.8, 0.6, -3e-5, 2e-5}}; std::vector dims{5}; - cone_data_t cones(1, dims, stream_); - copy_to_device(cones.s, pack_cones(s_cones)); - copy_to_device(cones.lambda, pack_cones(lambda_cones)); + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); launch_nt_scaling(cones, stream_); @@ -947,9 +996,13 @@ TEST_F(second_order_cone_test, apply_hinv2_equals_double_hinv_with_nt_scaling) std::vector dims{3, 5}; auto offsets = build_offsets(dims); - cone_data_t cones(static_cast(dims.size()), dims, stream_); - copy_to_device(cones.s, pack_cones(s_cones)); - copy_to_device(cones.lambda, pack_cones(lambda_cones)); + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + cone_data_t cones(static_cast(dims.size()), + dims, + cuopt::make_span(d_s), + cuopt::make_span(d_lambda), + stream_); launch_nt_scaling(cones, stream_); @@ -993,9 +1046,9 @@ TEST_F(second_order_cone_test, apply_hinv2_strided_loop_for_large_cone) auto s_cone = make_patterned_cone(dims[0], 5.0, 0.005); auto lambda_cone = make_patterned_cone(dims[0], 4.0, 0.004); - cone_data_t cones(1, dims, stream_); - copy_to_device(cones.s, s_cone); - copy_to_device(cones.lambda, lambda_cone); + auto d_s = make_device_vector(s_cone); + auto d_lambda = make_device_vector(lambda_cone); + cone_data_t 
cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); launch_nt_scaling(cones, stream_); @@ -1024,6 +1077,159 @@ TEST_F(second_order_cone_test, apply_hinv2_strided_loop_for_large_cone) expect_vector_near(hinv2_actual, ref, 1e-8, 1e-6, "hinv2_large_ref"); } +TEST_F(second_order_cone_test, scatter_hinv2_into_augmented_matches_reference_with_nt_scaling) +{ + std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; + std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; + std::vector dims{3, 5}; + auto offsets = build_offsets(dims); + + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + cone_data_t cones(static_cast(dims.size()), + dims, + cuopt::make_span(d_s), + cuopt::make_span(d_lambda), + stream_); + + launch_nt_scaling(cones, stream_); + + auto block_offsets_host = copy_to_host(cones.block_offsets); + i_t total_blk = dims[0] * dims[0] + dims[1] * dims[1]; + std::vector q_vals(total_blk, f_t(0)); + std::vector csr_indices(total_blk); + constexpr i_t aug_offset = 2; + for (i_t e = 0; e < total_blk; ++e) { + csr_indices[e] = aug_offset + (total_blk - 1 - e); + } + auto d_csr_indices = make_device_vector(csr_indices); + auto d_q_values = make_device_vector(q_vals); + rmm::device_uvector d_aug_x(total_blk + aug_offset, stream_); + RAFT_CUDA_TRY( + cudaMemsetAsync(d_aug_x.data(), 0, sizeof(f_t) * (total_blk + aug_offset), stream_)); + launch_cone_block_scatter(cones, d_aug_x, d_csr_indices, d_q_values); + + auto actual = copy_to_host(d_aug_x); + auto w_bar_host = copy_to_host(cones.w_bar); + auto inv_eta_h = copy_to_host(cones.inv_eta); + + for (i_t e = 0; e < aug_offset; ++e) { + EXPECT_EQ(actual[e], f_t(0)) << "untouched prefix entry " << e; + } + + i_t blk_off = 0; + for (i_t c = 0; c < static_cast(dims.size()); ++c) { + auto w_c = slice_cone(w_bar_host, offsets, c); + auto ref = ref_build_hinv2_block_single(w_c, inv_eta_h[c]); + i_t blk_sz = 
dims[c] * dims[c]; + for (i_t e = 0; e < blk_sz; ++e) { + EXPECT_NEAR(actual[csr_indices[blk_off + e]], -ref[e], 1e-10 + 1e-8 * std::abs(ref[e])) + << "cone " << c << " entry " << e; + } + blk_off += blk_sz; + } +} + +TEST_F(second_order_cone_test, scatter_hinv2_into_augmented_matvec_matches_apply_hinv2) +{ + std::vector> s_cones{{5.0, 1.0, -1.0, 0.5, 0.3}}; + std::vector> lambda_cones{{4.0, 0.5, 1.0, -0.3, 0.2}}; + std::vector dims{5}; + i_t q = dims[0]; + + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); + + launch_nt_scaling(cones, stream_); + + i_t total_blk = q * q; + std::vector csr_indices(total_blk); + std::iota(csr_indices.begin(), csr_indices.end(), 0); + std::vector q_vals(total_blk, f_t(0)); + auto d_csr_indices = make_device_vector(csr_indices); + auto d_q_values = make_device_vector(q_vals); + rmm::device_uvector d_aug_x(total_blk, stream_); + RAFT_CUDA_TRY(cudaMemsetAsync(d_aug_x.data(), 0, sizeof(f_t) * total_blk, stream_)); + launch_cone_block_scatter(cones, d_aug_x, d_csr_indices, d_q_values); + + auto scattered = copy_to_host(d_aug_x); + std::vector block(total_blk); + for (i_t e = 0; e < total_blk; ++e) { + block[e] = -scattered[e]; + } + + std::vector> test_vectors{ + {1.0, 0.0, 0.0, 0.0, 0.0}, {0.0, 1.0, 0.0, 0.0, 0.0}, {0.3, -0.1, 0.2, -0.5, 0.15}}; + + auto w_bar_host = copy_to_host(cones.w_bar); + auto inv_eta_h = copy_to_host(cones.inv_eta); + + for (const auto& v : test_vectors) { + // Host mat-vec: y = block * v + std::vector y(q, f_t(0)); + for (i_t r = 0; r < q; ++r) { + for (i_t c = 0; c < q; ++c) { + y[r] += block[r * q + c] * v[c]; + } + } + + auto ref = ref_apply_hinv2_single(v, w_bar_host, inv_eta_h[0]); + expect_vector_near(y, ref, 1e-10, 1e-8, "block_matvec_vs_apply"); + } +} + +TEST_F(second_order_cone_test, scatter_hinv2_into_augmented_large_cone) +{ + std::vector 
dims{513}; + + auto s_cone = make_patterned_cone(dims[0], 5.0, 0.005); + auto lambda_cone = make_patterned_cone(dims[0], 4.0, 0.004); + + auto d_s = make_device_vector(s_cone); + auto d_lambda = make_device_vector(lambda_cone); + cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); + + launch_nt_scaling(cones, stream_); + + i_t total_blk = dims[0] * dims[0]; + std::vector csr_indices(total_blk); + std::iota(csr_indices.begin(), csr_indices.end(), 0); + std::vector q_vals(total_blk, f_t(0)); + auto d_csr_indices = make_device_vector(csr_indices); + auto d_q_values = make_device_vector(q_vals); + rmm::device_uvector d_aug_x(total_blk, stream_); + RAFT_CUDA_TRY(cudaMemsetAsync(d_aug_x.data(), 0, sizeof(f_t) * total_blk, stream_)); + launch_cone_block_scatter(cones, d_aug_x, d_csr_indices, d_q_values); + + auto scattered = copy_to_host(d_aug_x); + std::vector block(total_blk); + for (i_t e = 0; e < total_blk; ++e) { + block[e] = -scattered[e]; + } + auto w_bar_host = copy_to_host(cones.w_bar); + auto inv_eta_h = copy_to_host(cones.inv_eta); + + // Spot-check: block * e_0 should match apply_Hinv2(e_0) + i_t q = dims[0]; + std::vector col0(q); + for (i_t r = 0; r < q; ++r) { + col0[r] = block[r * q]; + } + std::vector e0(q, f_t(0)); + e0[0] = f_t(1); + auto ref = ref_apply_hinv2_single(e0, w_bar_host, inv_eta_h[0]); + expect_vector_near(col0, ref, 1e-8, 1e-6, "hinv2_block_col0_large"); + + // Symmetry check: block[r][c] == block[c][r] + for (i_t r = 0; r < std::min(q, i_t(50)); ++r) { + for (i_t c = r + 1; c < std::min(q, i_t(50)); ++c) { + EXPECT_NEAR(block[r * q + c], block[c * q + r], 1e-10) + << "asymmetry at (" << r << "," << c << ")"; + } + } +} + TEST_F(second_order_cone_test, jordan_product_matches_reference_for_packed_cones) { std::vector dims{1, 3, 4}; @@ -1144,9 +1350,13 @@ TEST_F(second_order_cone_test, inverse_jordan_product_with_nt_scaling_rho) std::vector dims{3, 5}; auto offsets = build_offsets(dims); - cone_data_t 
cones(static_cast(dims.size()), dims, stream_); - copy_to_device(cones.s, pack_cones(s_cones)); - copy_to_device(cones.lambda, pack_cones(lambda_cones)); + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + cone_data_t cones(static_cast(dims.size()), + dims, + cuopt::make_span(d_s), + cuopt::make_span(d_lambda), + stream_); launch_nt_scaling(cones, stream_); @@ -1189,9 +1399,13 @@ TEST_F(second_order_cone_test, fused_corrector_matches_reference_with_nt_scaling std::vector dims{3, 5}; auto offsets = build_offsets(dims); - cone_data_t cones(static_cast(dims.size()), dims, stream_); - copy_to_device(cones.s, pack_cones(s_cones)); - copy_to_device(cones.lambda, pack_cones(lambda_cones)); + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + cone_data_t cones(static_cast(dims.size()), + dims, + cuopt::make_span(d_s), + cuopt::make_span(d_lambda), + stream_); launch_nt_scaling(cones, stream_); @@ -1241,9 +1455,9 @@ TEST_F(second_order_cone_test, fused_corrector_strided_loop_for_large_cone) auto s_cone = make_patterned_cone(dims[0], 5.0, 0.005); auto lambda_cone = make_patterned_cone(dims[0], 4.0, 0.004); - cone_data_t cones(1, dims, stream_); - copy_to_device(cones.s, s_cone); - copy_to_device(cones.lambda, lambda_cone); + auto d_s = make_device_vector(s_cone); + auto d_lambda = make_device_vector(lambda_cone); + cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); launch_nt_scaling(cones, stream_); @@ -1277,4 +1491,48 @@ TEST_F(second_order_cone_test, fused_corrector_strided_loop_for_large_cone) expect_vector_near(actual, ref, 1e-8, 1e-6, "fused_corrector_large"); } +TEST_F(second_order_cone_test, cone_block_scatter_with_q_overlap) +{ + std::vector> s_cones{{3.0, 0.5, -0.3}}; + std::vector> lambda_cones{{2.0, -0.2, 0.4}}; + std::vector dims{3}; + i_t K = 1; + i_t q_k = 3; + i_t total_block_nnz = q_k * 
q_k; + + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + cone_data_t cones(K, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); + launch_nt_scaling(cones, stream_); + + f_t dual_perturb = 1e-6; + std::vector q_vals(total_block_nnz, f_t(0)); + q_vals[0] = 0.5 + dual_perturb; + q_vals[4] = 0.3 + dual_perturb; + q_vals[8] = 0.1 + dual_perturb; + q_vals[1] = 0.05; + q_vals[3] = 0.05; + + std::vector cone_csr_indices(total_block_nnz); + std::iota(cone_csr_indices.begin(), cone_csr_indices.end(), 0); + auto d_cone_csr_indices = make_device_vector(cone_csr_indices); + auto d_cone_Q_values = make_device_vector(q_vals); + + rmm::device_uvector d_aug_x(total_block_nnz, stream_); + RAFT_CUDA_TRY(cudaMemsetAsync(d_aug_x.data(), 0, sizeof(f_t) * total_block_nnz, stream_)); + + launch_cone_block_scatter(cones, d_aug_x, d_cone_csr_indices, d_cone_Q_values); + + auto actual = copy_to_host(d_aug_x); + auto w_bar_h = copy_to_host(cones.w_bar); + auto inv_eta_h = copy_to_host(cones.inv_eta); + auto ref_block = ref_build_hinv2_block_single(w_bar_h, inv_eta_h[0]); + + for (i_t e = 0; e < total_block_nnz; ++e) { + f_t expected = -ref_block[e] - q_vals[e]; + EXPECT_NEAR(actual[e], expected, 1e-10 + 1e-8 * std::abs(expected)) + << "entry " << e << " (Q overlap test)"; + } +} + } // namespace cuopt::linear_programming::dual_simplex::test From 41f763c65dc633bfef01acb671a7b7a843a9c0d6 Mon Sep 17 00:00:00 2001 From: Yan Zaretskiy Date: Tue, 7 Apr 2026 12:26:29 -0700 Subject: [PATCH 03/22] fix(barrier): stabilize SOCP barrier convergence Tighten the augmented-system SOCP updates so the barrier path converges on the mixed LP/QP cone cases covered by the new regression tests. 
--- cpp/src/barrier/barrier.cu | 355 +++++++++++++++--- cpp/src/barrier/dense_vector.hpp | 22 ++ cpp/src/barrier/second_order_cone.cuh | 193 ++++++++++ cpp/src/dual_simplex/presolve.cpp | 82 ++-- cpp/src/dual_simplex/presolve.hpp | 2 + cpp/src/dual_simplex/scaling.cpp | 13 +- cpp/src/dual_simplex/solve.cpp | 20 + cpp/src/dual_simplex/user_problem.hpp | 2 + .../unit_tests/second_order_cone_test.cu | 189 +++++++--- .../dual_simplex/unit_tests/solve_barrier.cu | 131 +++++++ 10 files changed, 883 insertions(+), 126 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index bb848e6037..efa941bfe8 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -213,6 +213,8 @@ class iteration_data_t { d_complementarity_xz_rhs_(lp.num_cols, lp.handle_ptr->get_stream()), d_complementarity_wv_rhs_(0, lp.handle_ptr->get_stream()), d_dual_rhs_(lp.num_cols, lp.handle_ptr->get_stream()), + d_cone_rhs_term_(0, lp.handle_ptr->get_stream()), + d_cone_hinv2_dx_(0, lp.handle_ptr->get_stream()), d_Q_diag_(0, lp.handle_ptr->get_stream()), d_Qx_(Qin.m, lp.handle_ptr->get_stream()), restrict_u_(0), @@ -220,7 +222,9 @@ class iteration_data_t { sum_reduce_helper_(lp.handle_ptr->get_stream()), indefinite_Q(false), Q_diagonal(false), - symbolic_status(0) + symbolic_status(0), + cone_combined_step_(false), + cone_sigma_mu_(f_t(0)) { raft::common::nvtx::range fun_scope("Barrier: LP Data Creation"); @@ -262,6 +266,20 @@ class iteration_data_t { raft::copy(d_Q_diag_.data(), Qdiag.data(), Qdiag.size(), stream_view_); } + if (!lp.second_order_cone_dims.empty()) { + cone_var_start_ = lp.cone_var_start; + i_t total_cone_dim = + std::accumulate(lp.second_order_cone_dims.begin(), lp.second_order_cone_dims.end(), i_t(0)); + cuopt_assert(cone_var_start_ >= 0, "cone_var_start must be nonnegative"); + cuopt_assert(cone_var_start_ + total_cone_dim <= lp.num_cols, + "cone variables exceed problem dimension"); + 
cones_.emplace(static_cast(lp.second_order_cone_dims.size()), + lp.second_order_cone_dims, + raft::device_span{}, + raft::device_span{}, + stream_view_); + } + // Allocating GPU flag data for Form ADAT RAFT_CUDA_TRY(cub::DeviceSelect::Flagged( nullptr, @@ -1515,8 +1533,9 @@ class iteration_data_t { f_t beta, rmm::device_uvector& y) { - const i_t m = A.m; - const i_t n = A.n; + const i_t m = A.m; + const i_t n = A.n; + const bool has_cones = cones_.has_value() && cones_->K > 0; rmm::device_uvector d_x1(n, handle_ptr->get_stream()); rmm::device_uvector d_x2(m, handle_ptr->get_stream()); @@ -1535,12 +1554,25 @@ class iteration_data_t { // diag.pairwise_product(x1, r1); // r1 <- D * x_1 pairwise_multiply(d_x1.data(), d_diag_.data(), d_r1.data(), n, stream_view_); + if (has_cones) { + thrust::fill_n( + rmm::exec_policy(stream_view_), d_r1.begin() + cone_var_start_, cones_->m_c, f_t(0)); + } // r1 <- Q x1 + D x1 if (Q.n > 0) { // matrix_vector_multiply(Q, 1.0, x1, 1.0, r1); cusparse_Q_view_.spmv(1.0, d_x1, 1.0, d_r1); } + if (has_cones) { + accumulate_cone_hinv2_matvec( + raft::device_span(d_x1.data() + cone_var_start_, cones_->m_c), + *cones_, + d_cone_hinv2_dx_, + raft::device_span(d_r1.data() + cone_var_start_, cones_->m_c), + stream_view_); + RAFT_CHECK_CUDA(stream_view_); + } // y1 <- - alpha * r1 + beta * y1 // y1.axpy(-alpha, r1, beta); @@ -1739,6 +1771,8 @@ class iteration_data_t { rmm::device_uvector d_complementarity_xz_rhs_; rmm::device_uvector d_complementarity_wv_rhs_; rmm::device_uvector d_dual_rhs_; + rmm::device_uvector d_cone_rhs_term_; + rmm::device_uvector d_cone_hinv2_dx_; rmm::device_uvector d_Q_diag_; rmm::device_uvector d_Qx_; @@ -1748,6 +1782,9 @@ class iteration_data_t { transform_reduce_helper_t transform_reduce_helper_; sum_reduce_helper_t sum_reduce_helper_; + bool cone_combined_step_; + f_t cone_sigma_mu_; + rmm::cuda_stream_view stream_view_; const simplex_solver_settings_t& settings_; @@ -1846,6 +1883,7 @@ int 
barrier_solver_t::initial_point(iteration_data_t& data) { raft::common::nvtx::range fun_scope("Barrier: initial_point"); const bool use_augmented = data.use_augmented; + const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; // Perform a numerical factorization i_t status; @@ -2004,8 +2042,10 @@ int barrier_solver_t::initial_point(iteration_data_t& data) data.v[k] = -c[j] + epsilon; } } - // Now hande the case with no upper bounds + // Now hande the case with no upper bounds (skip cone variables) + const i_t cone_end = has_cones ? data.cone_var_start_ + data.cones_->m_c : 0; for (i_t j = 0; j < lp.num_cols; j++) { + if (has_cones && j >= data.cone_var_start_ && j < cone_end) continue; if (lp.upper[j] == inf) { if (c[j] > epsilon_adjust) { data.z[j] = c[j]; @@ -2034,7 +2074,8 @@ int barrier_solver_t::initial_point(iteration_data_t& data) data.v.multiply_scalar(-1.0); data.v.ensure_positive(epsilon_adjust); - data.z.ensure_positive(epsilon_adjust); + data.z.ensure_positive_skip_range( + epsilon_adjust, data.cone_var_start_, has_cones ? data.cones_->m_c : 0); } else { // First compute rhs = A*Dinv*c dense_vector_t rhs(lp.num_rows); @@ -2058,7 +2099,8 @@ int barrier_solver_t::initial_point(iteration_data_t& data) data.gather_upper_bounds(data.z, data.v); data.v.multiply_scalar(-1.0); data.v.ensure_positive(epsilon_adjust); - data.z.ensure_positive(epsilon_adjust); + data.z.ensure_positive_skip_range( + epsilon_adjust, data.cone_var_start_, has_cones ? 
data.cones_->m_c : 0); } // Verify A'*y + z - E*v - Q*x = c @@ -2075,9 +2117,33 @@ int barrier_solver_t::initial_point(iteration_data_t& data) settings.log.printf("||A^T y + z - E*v - Q*x - c ||: %e\n", vector_norm2(data.dual_residual)); #endif - // Make sure (w, x, v, z) > 0 + // Make sure (w, x, v, z) > 0; skip cone vars — handled by shift_into_interior below data.w.ensure_positive(epsilon_adjust); - data.x.ensure_positive(epsilon_adjust); + data.x.ensure_positive_skip_range( + epsilon_adjust, data.cone_var_start_, has_cones ? data.cones_->m_c : 0); + + if (has_cones) { + const auto& dims = lp.second_order_cone_dims; + i_t cs = data.cone_var_start_; + i_t off = 0; + + for (i_t k = 0; k < static_cast(dims.size()); ++k) { + i_t q_k = dims[k]; + auto shift_into_interior = [&](auto& vec) { + f_t tail_sq = f_t(0); + for (i_t j = 1; j < q_k; ++j) { + f_t v = vec[cs + off + j]; + tail_sq += v * v; + } + f_t gap = std::sqrt(tail_sq) - vec[cs + off]; + if (gap >= f_t(0)) { vec[cs + off] += f_t(1) + gap; } + }; + shift_into_interior(data.x); + shift_into_interior(data.z); + off += q_k; + } + } + #ifdef PRINT_INFO settings.log.printf("min v %e min z %e\n", data.v.minimum(), data.z.minimum()); #endif @@ -2272,16 +2338,51 @@ f_t barrier_solver_t::gpu_max_step_to_boundary(iteration_data_t& x, const rmm::device_uvector& dx) { + const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; + const bool skip_cone_range = + has_cones && static_cast(x.size()) >= data.cone_var_start_ + data.cones_->m_c; + + auto ratio_test = [] HD(const thrust::tuple t) { + const f_t dx = thrust::get<0>(t); + const f_t x = thrust::get<1>(t); + if (dx < f_t(0.0)) return -x / dx; + return f_t(1.0); + }; + + if (skip_cone_range) { + i_t cs = data.cone_var_start_; + i_t mc = data.cones_->m_c; + f_t alpha = f_t(1); + + if (cs > 0) { + alpha = std::min(alpha, + data.transform_reduce_helper_.transform_reduce( + thrust::make_zip_iterator(dx.data(), x.data()), + thrust::minimum(), + ratio_test, + 
f_t(1), + cs, + stream_view_)); + } + i_t tail_start = cs + mc; + i_t tail_size = static_cast(x.size()) - tail_start; + if (tail_size > 0) { + alpha = std::min(alpha, + data.transform_reduce_helper_.transform_reduce( + thrust::make_zip_iterator(dx.data() + tail_start, x.data() + tail_start), + thrust::minimum(), + ratio_test, + f_t(1), + tail_size, + stream_view_)); + } + return alpha; + } + return data.transform_reduce_helper_.transform_reduce( thrust::make_zip_iterator(dx.data(), x.data()), thrust::minimum(), - [] HD(const thrust::tuple t) { - const f_t dx = thrust::get<0>(t); - const f_t x = thrust::get<1>(t); - - if (dx < f_t(0.0)) return -x / dx; - return f_t(1.0); - }, + ratio_test, f_t(1.0), x.size(), stream_view_); @@ -2300,6 +2401,9 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_tK > 0; + const i_t m_c = has_cones ? data.cones_->m_c : 0; + const i_t cone_var_start = data.cone_var_start_; { raft::common::nvtx::range fun_scope("Barrier: GPU allocation and copies"); @@ -2340,6 +2444,24 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_ts = raft::device_span(data.d_x_.data() + cone_var_start, m_c); + data.cones_->lambda = raft::device_span(data.d_z_.data() + cone_var_start, m_c); + launch_nt_scaling(*data.cones_, stream_view_); + + if (data.cone_combined_step_) { + compute_combined_cone_rhs_term( + raft::device_span(data.d_dx_aff_.data() + cone_var_start, m_c), + *data.cones_, + data.cone_sigma_mu_, + data.d_cone_rhs_term_, + stream_view_); + } else { + compute_affine_cone_rhs_term(*data.cones_, data.d_cone_rhs_term_, stream_view_); + } + RAFT_CHECK_CUDA(stream_view_); + } + // Solves the linear system // // dw dx dy dv dz @@ -2353,13 +2475,18 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t{}, stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); + + if (has_cones) { + thrust::fill_n( + rmm::exec_policy(stream_view_), data.d_diag_.begin() + cone_var_start, m_c, f_t(1)); + } // diag = z ./ x + E * 
(v ./ w) * E' if (data.n_upper_bounds > 0) { @@ -2450,22 +2577,45 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t thrust::tuple { - const f_t tmp = tmp3 + -(complementarity_xz_rhs / x) + dual_rhs; - return {tmp, inv_diag * tmp}; - }, - stream_view_.value()); + if (has_cones) { + thrust::for_each_n(rmm::exec_policy(stream_view_), + thrust::make_counting_iterator(0), + lp.num_cols, + [span_inv_diag = cuopt::make_span(data.d_inv_diag), + span_tmp3 = cuopt::make_span(data.d_tmp3_), + span_tmp4 = cuopt::make_span(data.d_tmp4_), + span_xz_rhs = cuopt::make_span(data.d_complementarity_xz_rhs_), + span_x = cuopt::make_span(data.d_x_), + span_dual_rhs = cuopt::make_span(data.d_dual_rhs_), + span_cone_rhs = cuopt::make_span(data.d_cone_rhs_term_), + cone_start = cone_var_start, + cone_size = m_c] __device__(i_t j) { + bool is_cone = (j >= cone_start && j < cone_start + cone_size); + f_t cone_rhs = is_cone ? span_cone_rhs[j - cone_start] : f_t(0); + f_t xz_term = is_cone ? f_t(0) : -(span_xz_rhs[j] / span_x[j]); + f_t tmp = span_tmp3[j] + xz_term + span_dual_rhs[j] + cone_rhs; + span_tmp3[j] = tmp; + span_tmp4[j] = span_inv_diag[j] * tmp; + }); + } else { + cub::DeviceTransform::Transform( + cuda::std::make_tuple(data.d_inv_diag.data(), + data.d_tmp3_.data(), + data.d_complementarity_xz_rhs_.data(), + data.d_x_.data(), + data.d_dual_rhs_.data()), + thrust::make_zip_iterator(data.d_tmp3_.data(), data.d_tmp4_.data()), + lp.num_cols, + [] HD(f_t inv_diag, f_t tmp3, f_t complementarity_xz_rhs, f_t x, f_t dual_rhs) + -> thrust::tuple { + const f_t tmp = tmp3 + -(complementarity_xz_rhs / x) + dual_rhs; + return {tmp, inv_diag * tmp}; + }, + stream_view_.value()); + } RAFT_CHECK_CUDA(stream_view_); raft::copy(data.d_r1_.data(), data.d_tmp3_.data(), data.d_tmp3_.size(), stream_view_); raft::copy(data.d_r1_prime_.data(), data.d_tmp3_.data(), data.d_tmp3_.size(), stream_view_); @@ -2749,18 +2899,42 @@ i_t 
barrier_solver_t::gpu_compute_search_direction(iteration_data_t(data.d_dx_.data() + cone_var_start, m_c), + *data.cones_, + data.d_cone_rhs_term_, + data.d_cone_hinv2_dx_, + raft::device_span(data.d_dz_.data() + cone_var_start, m_c), + stream_view_); + + thrust::for_each_n(rmm::exec_policy(stream_view_), + thrust::make_counting_iterator(0), + data.d_dz_.size(), + [span_xz_rhs = cuopt::make_span(data.d_complementarity_xz_rhs_), + span_z = cuopt::make_span(data.d_z_), + span_dx = cuopt::make_span(data.d_dx_), + span_x = cuopt::make_span(data.d_x_), + span_dz = cuopt::make_span(data.d_dz_), + cone_start = cone_var_start, + cone_size = m_c] __device__(i_t j) { + if (j < cone_start || j >= cone_start + cone_size) { + span_dz[j] = (span_xz_rhs[j] - span_z[j] * span_dx[j]) / span_x[j]; + } + }); + } else { + // dz = (complementarity_xz_rhs - z.* dx) ./ x; + cub::DeviceTransform::Transform( + cuda::std::make_tuple(data.d_complementarity_xz_rhs_.data(), + data.d_z_.data(), + data.d_dx_.data(), + data.d_x_.data()), + data.d_dz_.data(), + data.d_dz_.size(), + [] HD(f_t complementarity_xz_rhs, f_t z, f_t dx, f_t x) { + return (complementarity_xz_rhs - z * dx) / x; + }, + stream_view_.value()); + } RAFT_CHECK_CUDA(stream_view_); raft::copy(dz.data(), data.d_dz_.data(), data.d_dz_.size(), stream_view_); } @@ -2944,10 +3118,13 @@ template void barrier_solver_t::compute_affine_rhs(iteration_data_t& data) { raft::common::nvtx::range fun_scope("Barrier: compute_affine_rhs"); + const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; - data.primal_rhs = data.primal_residual; - data.bound_rhs = data.bound_residual; - data.dual_rhs = data.dual_residual; + data.primal_rhs = data.primal_residual; + data.bound_rhs = data.bound_residual; + data.dual_rhs = data.dual_residual; + data.cone_combined_step_ = false; + data.cone_sigma_mu_ = f_t(0); raft::copy(data.d_complementarity_xz_rhs_.data(), data.d_complementarity_xz_residual_.data(), @@ -2966,6 +3143,12 @@ void 
barrier_solver_t::compute_affine_rhs(iteration_data_t& [] HD(f_t xz_rhs) { return -xz_rhs; }, stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); + if (has_cones) { + thrust::fill_n(rmm::exec_policy(stream_view_), + data.d_complementarity_xz_rhs_.begin() + data.cone_var_start_, + data.cones_->m_c, + f_t(0)); + } // w.*v -> -w .* v cub::DeviceTransform::Transform( data.d_complementarity_wv_rhs_.data(), @@ -2981,6 +3164,7 @@ void barrier_solver_t::compute_target_mu( iteration_data_t& data, f_t mu, f_t& mu_aff, f_t& sigma, f_t& new_mu) { raft::common::nvtx::range fun_scope("Barrier: compute_target_mu"); + const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; f_t complementarity_aff_sum = 0.0; // TMP no copy and data should always be on the GPU @@ -2999,6 +3183,21 @@ void barrier_solver_t::compute_target_mu( f_t step_dual_aff = std::min(gpu_max_step_to_boundary(data, data.d_v_, data.d_dv_aff_), gpu_max_step_to_boundary(data, data.d_z_, data.d_dz_aff_)); + if (has_cones) { + i_t cs = data.cone_var_start_; + i_t mc = data.cones_->m_c; + step_primal_aff = std::min( + step_primal_aff, + compute_cone_step_length(*data.cones_, + raft::device_span(data.d_x_.data() + cs, mc), + raft::device_span(data.d_dx_aff_.data() + cs, mc), + raft::device_span(data.d_z_.data() + cs, mc), + raft::device_span(data.d_dz_aff_.data() + cs, mc), + step_primal_aff, + stream_view_)); + step_dual_aff = step_primal_aff; + } + if (data.Q.n > 0) { step_primal_aff = step_dual_aff = std::min(step_primal_aff, step_dual_aff); } // Compute complementarity_xz_aff_sum = sum(x_aff * z_aff), @@ -3050,9 +3249,12 @@ void barrier_solver_t::compute_target_mu( stream_view_); complementarity_aff_sum = complementarity_xz_aff_sum + complementarity_wv_aff_sum; - - mu_aff = (complementarity_aff_sum) / - (static_cast(data.x.size()) + static_cast(data.n_upper_bounds)); + f_t mu_denom = static_cast(data.x.size()) + static_cast(data.n_upper_bounds); + if (has_cones) { + mu_denom -= 
static_cast(data.cones_->m_c); + mu_denom += static_cast(data.cones_->K); + } + mu_aff = complementarity_aff_sum / mu_denom; sigma = std::max(0.0, std::min(1.0, std::pow(mu_aff / mu, 3.0))); new_mu = sigma * mu_aff; } @@ -3061,6 +3263,7 @@ template void barrier_solver_t::compute_cc_rhs(iteration_data_t& data, f_t& new_mu) { raft::common::nvtx::range fun_scope("Barrier: compute_cc_rhs"); + const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; cub::DeviceTransform::Transform( cuda::std::make_tuple(data.d_dx_aff_.data(), data.d_dz_aff_.data()), @@ -3076,11 +3279,19 @@ void barrier_solver_t::compute_cc_rhs(iteration_data_t& data [new_mu] HD(f_t dw_aff, f_t dv_aff) { return -(dw_aff * dv_aff) + new_mu; }, stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); + if (has_cones) { + thrust::fill_n(rmm::exec_policy(stream_view_), + data.d_complementarity_xz_rhs_.begin() + data.cone_var_start_, + data.cones_->m_c, + f_t(0)); + } // TMP should be CPU to 0 if CPU and GPU to 0 if GPU data.primal_rhs.set_scalar(0.0); data.bound_rhs.set_scalar(0.0); data.dual_rhs.set_scalar(0.0); + data.cone_combined_step_ = has_cones; + data.cone_sigma_mu_ = has_cones ? 
new_mu : f_t(0); } template @@ -3153,6 +3364,8 @@ void barrier_solver_t::compute_primal_dual_step_length(iteration_data_ f_t& step_dual) { raft::common::nvtx::range fun_scope("Barrier: compute_primal_dual_step_length"); + const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; + f_t max_step_primal = 0.0; f_t max_step_dual = 0.0; max_step_primal = std::min(gpu_max_step_to_boundary(data, data.d_w_, data.d_dw_), @@ -3160,6 +3373,25 @@ void barrier_solver_t::compute_primal_dual_step_length(iteration_data_ max_step_dual = std::min(gpu_max_step_to_boundary(data, data.d_v_, data.d_dv_), gpu_max_step_to_boundary(data, data.d_z_, data.d_dz_)); + if (has_cones) { + i_t cs = data.cone_var_start_; + i_t mc = data.cones_->m_c; + f_t cone_primal = + compute_single_cone_step_length(*data.cones_, + raft::device_span(data.d_x_.data() + cs, mc), + raft::device_span(data.d_dx_.data() + cs, mc), + f_t(1), + stream_view_); + f_t cone_dual = + compute_single_cone_step_length(*data.cones_, + raft::device_span(data.d_z_.data() + cs, mc), + raft::device_span(data.d_dz_.data() + cs, mc), + f_t(1), + stream_view_); + max_step_primal = std::min(max_step_primal, cone_primal); + max_step_dual = std::min(max_step_dual, cone_dual); + } + step_primal = step_scale * max_step_primal; step_dual = step_scale * max_step_dual; @@ -3253,6 +3485,13 @@ template void barrier_solver_t::compute_mu(iteration_data_t& data, f_t& mu) { raft::common::nvtx::range fun_scope("Barrier: compute_mu"); + const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; + + f_t mu_denom = static_cast(data.x.size()) + static_cast(data.n_upper_bounds); + if (has_cones) { + mu_denom -= static_cast(data.cones_->m_c); + mu_denom += static_cast(data.cones_->K); + } mu = (data.sum_reduce_helper_.sum(data.d_complementarity_xz_residual_.begin(), data.d_complementarity_xz_residual_.size(), @@ -3260,7 +3499,7 @@ void barrier_solver_t::compute_mu(iteration_data_t& data, f_ 
data.sum_reduce_helper_.sum(data.d_complementarity_wv_residual_.begin(), data.d_complementarity_wv_residual_.size(), stream_view_)) / - (static_cast(data.x.size()) + static_cast(data.n_upper_bounds)); + mu_denom; } template @@ -3585,8 +3824,13 @@ lp_status_t barrier_solver_t::solve(f_t start_time, f_t complementarity_residual_norm = std::max(vector_norm_inf(data.complementarity_xz_residual, stream_view_), vector_norm_inf(data.complementarity_wv_residual, stream_view_)); - f_t mu = (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) / - (static_cast(n) + static_cast(num_upper_bounds)); + f_t mu_denom = static_cast(n) + static_cast(num_upper_bounds); + if (data.cones_.has_value() && data.cones_->K > 0) { + mu_denom -= static_cast(data.cones_->m_c); + mu_denom += static_cast(data.cones_->K); + } + f_t mu = + (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) / mu_denom; f_t norm_b = vector_norm_inf(data.b, stream_view_); f_t norm_c = vector_norm_inf(data.c, stream_view_); @@ -3627,9 +3871,14 @@ lp_status_t barrier_solver_t::solve(f_t start_time, relative_complementarity_residual, elapsed_time); + f_t duality_gap_abs = std::abs(primal_objective - dual_objective); + f_t duality_gap_rel = + duality_gap_abs / + std::max(f_t(1), std::min(std::abs(primal_objective), std::abs(dual_objective))); bool converged = primal_residual_norm < settings.barrier_relative_feasibility_tol && dual_residual_norm < settings.barrier_relative_optimality_tol && - complementarity_residual_norm < settings.barrier_relative_complementarity_tol; + (duality_gap_abs < settings.barrier_relative_complementarity_tol || + duality_gap_rel < settings.barrier_relative_complementarity_tol); data.d_complementarity_xz_residual_.resize(data.complementarity_xz_residual.size(), stream_view_); diff --git a/cpp/src/barrier/dense_vector.hpp b/cpp/src/barrier/dense_vector.hpp index f73a9a5fce..009b9ea955 100644 --- a/cpp/src/barrier/dense_vector.hpp +++ 
b/cpp/src/barrier/dense_vector.hpp @@ -184,6 +184,28 @@ class dense_vector_t : public std::vector { } } + void ensure_positive_skip_range(f_t epsilon_adjust, i_t skip_start, i_t skip_count) + { + if (skip_count == 0) { + ensure_positive(epsilon_adjust); + return; + } + const i_t n = this->size(); + const i_t skip_end = skip_start + skip_count; + f_t min_val = std::numeric_limits::max(); + for (i_t i = 0; i < n; i++) { + if (i >= skip_start && i < skip_end) continue; + min_val = std::min(min_val, (*this)[i]); + } + if (min_val <= 0.0) { + const f_t delta = -min_val + epsilon_adjust; + for (i_t i = 0; i < n; i++) { + if (i >= skip_start && i < skip_end) continue; + (*this)[i] += delta; + } + } + } + void bound_away_from_zero(f_t epsilon_adjust) { const i_t n = this->size(); diff --git a/cpp/src/barrier/second_order_cone.cuh b/cpp/src/barrier/second_order_cone.cuh index ca23ed39cc..4d69e787d1 100644 --- a/cpp/src/barrier/second_order_cone.cuh +++ b/cpp/src/barrier/second_order_cone.cuh @@ -764,6 +764,33 @@ __global__ __launch_bounds__(BLOCK_DIM) void step_length_kernel( if (threadIdx.x == 0) { alpha[cone] = min(alpha_s, alpha_l); } } +// --------------------------------------------------------------------------- +// Single-variable cone step length kernel (one block per cone). +// Like step_length_kernel but only checks u + alpha*du in Q^{q_i}. 
+// --------------------------------------------------------------------------- +template +__global__ __launch_bounds__(BLOCK_DIM) void step_length_single_kernel( + raft::device_span u, + raft::device_span du, + raft::device_span alpha, + raft::device_span cone_offsets, + i_t K, + f_t alpha_max) +{ + __shared__ typename block_reduce_t, BLOCK_DIM>::TempStorage temp_storage; + + i_t cone = static_cast(blockIdx.x); + if (cone >= K) return; + + i_t off = cone_offsets[cone]; + i_t q = cone_offsets[cone + 1] - off; + + f_t a = cone_step_length_single( + u.subspan(off, q), du.subspan(off, q), temp_storage, alpha_max); + + if (threadIdx.x == 0) { alpha[cone] = a; } +} + // --------------------------------------------------------------------------- // Shift u into int(Q^q) if it is not already interior (one block per cone). // @@ -893,6 +920,104 @@ struct cone_data_t { } }; +template +void compute_affine_cone_rhs_term(const cone_data_t& cones, + rmm::device_uvector& out, + rmm::cuda_stream_view stream) +{ + out.resize(cones.m_c, stream); + if (cones.K == 0) return; + + apply_Hinv2_kernel + <<>>(cones.s, + cuopt::make_span(out), + cuopt::make_span(cones.w_bar), + cuopt::make_span(cones.inv_eta), + cuopt::make_span(cones.cone_offsets), + cones.K); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void compute_combined_cone_rhs_term(raft::device_span dx_aff, + const cone_data_t& cones, + f_t sigma_mu, + rmm::device_uvector& out, + rmm::cuda_stream_view stream) +{ + out.resize(cones.m_c, stream); + if (cones.K == 0) return; + + fused_corrector_kernel + <<>>(dx_aff, + cuopt::make_span(cones.omega), + cuopt::make_span(cones.w_bar), + cuopt::make_span(cones.inv_eta), + cuopt::make_span(cones.inv_1pw0), + cuopt::make_span(cones.rho), + sigma_mu, + cuopt::make_span(out), + cuopt::make_span(cones.cone_offsets), + cones.K); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void recover_cone_dz(raft::device_span dx, + const cone_data_t& cones, + const rmm::device_uvector& 
cone_rhs_term, + rmm::device_uvector& hinv2_dx, + raft::device_span dz, + rmm::cuda_stream_view stream) +{ + hinv2_dx.resize(cones.m_c, stream); + if (cones.K == 0) return; + + apply_Hinv2_kernel + <<>>(dx, + cuopt::make_span(hinv2_dx), + cuopt::make_span(cones.w_bar), + cuopt::make_span(cones.inv_eta), + cuopt::make_span(cones.cone_offsets), + cones.K); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + cones.m_c, + [span_rhs = cuopt::make_span(cone_rhs_term), + span_hinv2 = cuopt::make_span(hinv2_dx), + span_dz = dz] __device__(i_t j) { span_dz[j] = -span_rhs[j] - span_hinv2[j]; }); +} + +template +void accumulate_cone_hinv2_matvec(raft::device_span x, + const cone_data_t& cones, + rmm::device_uvector& hinv2_x, + raft::device_span out, + rmm::cuda_stream_view stream) +{ + hinv2_x.resize(cones.m_c, stream); + if (cones.K == 0) return; + + apply_Hinv2_kernel + <<>>(x, + cuopt::make_span(hinv2_x), + cuopt::make_span(cones.w_bar), + cuopt::make_span(cones.inv_eta), + cuopt::make_span(cones.cone_offsets), + cones.K); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + cones.m_c, + [span_hinv2 = cuopt::make_span(hinv2_x), span_out = out] __device__(i_t j) { + span_out[j] += span_hinv2[j]; + }); +} + // --------------------------------------------------------------------------- // Compute flat H^{-2} cone-block entries and scatter them into the augmented // CSR value array. @@ -956,6 +1081,74 @@ void scatter_hinv2_into_augmented(const cone_data_t& cones, rmm::exec_policy(stream), values, values + count, csr_indices.begin(), augmented_x.begin()); } +// --------------------------------------------------------------------------- +// Compute the maximum feasible step length for the cone portion of (x, z). +// +// Launches step_length_kernel (one CTA per cone), then reduces the per-cone +// results to a single scalar. 
Returns min over all cones of the step length +// that keeps both x_K + alpha*dx_K and z_K + alpha*dz_K in their cones. +// --------------------------------------------------------------------------- +template +f_t compute_cone_step_length(const cone_data_t& cones, + raft::device_span x_K, + raft::device_span dx_K, + raft::device_span z_K, + raft::device_span dz_K, + f_t alpha_max, + rmm::cuda_stream_view stream) +{ + if (cones.K == 0) return alpha_max; + + rmm::device_uvector d_alpha(cones.K, stream); + step_length_kernel + <<>>(x_K, + dx_K, + z_K, + dz_K, + cuopt::make_span(d_alpha), + cuopt::make_span(cones.cone_offsets), + cones.K, + alpha_max); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + f_t result = thrust::reduce( + rmm::exec_policy(stream), d_alpha.begin(), d_alpha.end(), alpha_max, thrust::minimum()); + return result; +} + +template +f_t compute_single_cone_step_length(const cone_data_t& cones, + raft::device_span u_K, + raft::device_span du_K, + f_t alpha_max, + rmm::cuda_stream_view stream) +{ + if (cones.K == 0) return alpha_max; + + rmm::device_uvector d_alpha(cones.K, stream); + step_length_single_kernel<<>>( + u_K, du_K, cuopt::make_span(d_alpha), cuopt::make_span(cones.cone_offsets), cones.K, alpha_max); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + return thrust::reduce( + rmm::exec_policy(stream), d_alpha.begin(), d_alpha.end(), alpha_max, thrust::minimum()); +} + +// --------------------------------------------------------------------------- +// Shift cone slices of a vector into the strict interior of their cones. +// Operates on a subspan of the global vector (pre-sliced to cone portion). 
+// --------------------------------------------------------------------------- +template +void launch_interior_shift(raft::device_span u_K, + const cone_data_t& cones, + rmm::cuda_stream_view stream) +{ + if (cones.K == 0) return; + interior_shift_kernel + <<>>(u_K, cuopt::make_span(cones.cone_offsets), cones.K); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + template void launch_nt_scaling(cone_data_t& cones, rmm::cuda_stream_view stream) { diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp index c5ef847106..ab5f990809 100644 --- a/cpp/src/dual_simplex/presolve.cpp +++ b/cpp/src/dual_simplex/presolve.cpp @@ -22,22 +22,15 @@ namespace cuopt::linear_programming::dual_simplex { template i_t remove_empty_cols(lp_problem_t& problem, i_t& num_empty_cols, - presolve_info_t& presolve_info) + presolve_info_t& presolve_info, + const std::vector& is_cone_variable) { constexpr bool verbose = false; if (verbose) { printf("Removing %d empty columns\n", num_empty_cols); } - // We have a variable x_j that does not appear in any rows - // The cost function - // sum_{k != j} c_k * x_k + c_j * x_j - // becomes - // sum_{k != j} c_k * x_k + c_j * l_j if c_j > 0 - // or - // sum_{k != j} c_k * x_k + c_j * u_j if c_j < 0 presolve_info.removed_variables.reserve(num_empty_cols); presolve_info.removed_values.reserve(num_empty_cols); presolve_info.removed_reduced_costs.reserve(num_empty_cols); - // Check to see if a variable participates in a quadratic objective std::vector has_quadratic_term(problem.num_cols, false); if (problem.Q.n > 0) { @@ -45,7 +38,6 @@ i_t remove_empty_cols(lp_problem_t& problem, const i_t row_start = problem.Q.row_start[j]; const i_t row_end = problem.Q.row_start[j + 1]; if (row_end - row_start == 0) { continue; } - // Q is symmetric, so its sufficient to check only the row size has_quadratic_term[j] = true; } } @@ -55,11 +47,12 @@ i_t remove_empty_cols(lp_problem_t& problem, for (i_t j = 0; j < problem.num_cols; ++j) { bool remove_var 
= false; if ((problem.A.col_start[j + 1] - problem.A.col_start[j]) == 0) { - if (problem.objective[j] >= 0 && problem.lower[j] > -inf && !has_quadratic_term[j]) { + bool non_removable = has_quadratic_term[j] || is_cone_variable[j]; + if (problem.objective[j] >= 0 && problem.lower[j] > -inf && !non_removable) { presolve_info.removed_values.push_back(problem.lower[j]); problem.obj_constant += problem.objective[j] * problem.lower[j]; remove_var = true; - } else if (problem.objective[j] <= 0 && problem.upper[j] < inf && !has_quadratic_term[j]) { + } else if (problem.objective[j] <= 0 && problem.upper[j] < inf && !non_removable) { presolve_info.removed_values.push_back(problem.upper[j]); problem.obj_constant += problem.objective[j] * problem.upper[j]; remove_var = true; @@ -570,16 +563,18 @@ void convert_user_problem(const user_problem_t& user_problem, } // Copy info from user_problem to problem - problem.num_rows = user_problem.num_rows; - problem.num_cols = user_problem.num_cols; - problem.A = user_problem.A; - problem.objective = user_problem.objective; - problem.obj_scale = user_problem.obj_scale; - problem.obj_constant = user_problem.obj_constant; - problem.objective_is_integral = user_problem.objective_is_integral; - problem.rhs = user_problem.rhs; - problem.lower = user_problem.lower; - problem.upper = user_problem.upper; + problem.num_rows = user_problem.num_rows; + problem.num_cols = user_problem.num_cols; + problem.A = user_problem.A; + problem.objective = user_problem.objective; + problem.obj_scale = user_problem.obj_scale; + problem.obj_constant = user_problem.obj_constant; + problem.objective_is_integral = user_problem.objective_is_integral; + problem.rhs = user_problem.rhs; + problem.lower = user_problem.lower; + problem.upper = user_problem.upper; + problem.cone_var_start = user_problem.cone_var_start; + problem.second_order_cone_dims = user_problem.second_order_cone_dims; // Make a copy of row_sense so we can modify it std::vector row_sense = 
user_problem.row_sense; @@ -636,6 +631,7 @@ void convert_user_problem(const user_problem_t& user_problem, settings.log.debug( "equality rows %d less rows %d columns %d\n", equal_rows, less_rows, problem.num_cols); if (settings.barrier && settings.dualize != 0 && user_problem.Q_values.size() == 0 && + user_problem.second_order_cone_dims.empty() && (settings.dualize == 1 || (settings.dualize == -1 && less_rows > 1.2 * problem.num_cols && equal_rows < 2e4))) { settings.log.debug("Dualizing in presolve\n"); @@ -821,10 +817,26 @@ i_t presolve(const lp_problem_t& original, { problem = original; std::vector row_sense(problem.num_rows, '='); + auto build_is_cone_variable = [](const lp_problem_t& current_problem) { + std::vector is_cone_variable(current_problem.num_cols, false); + if (!current_problem.second_order_cone_dims.empty()) { + i_t cone_end = current_problem.cone_var_start; + for (auto q_k : current_problem.second_order_cone_dims) { + cone_end += q_k; + } + for (i_t j = current_problem.cone_var_start; j < cone_end; ++j) { + is_cone_variable[j] = true; + } + } + return is_cone_variable; + }; + auto is_cone_variable = build_is_cone_variable(problem); // Check for free variables i_t free_variables = 0; for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -inf && problem.upper[j] == inf) { free_variables++; } + if (problem.lower[j] == -inf && problem.upper[j] == inf && !is_cone_variable[j]) { + free_variables++; + } } if (settings.barrier_presolve && free_variables > 0) { @@ -835,7 +847,7 @@ i_t presolve(const lp_problem_t& original, current_free_variables.reserve(problem.num_cols); constraints_to_check.reserve(problem.num_rows); for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -inf && problem.upper[j] == inf) { + if (problem.lower[j] == -inf && problem.upper[j] == inf && !is_cone_variable[j]) { current_free_variables.push_back(j); const i_t col_start = problem.A.col_start[j]; const i_t col_end = problem.A.col_start[j + 1]; @@ 
-975,7 +987,9 @@ i_t presolve(const lp_problem_t& original, i_t new_free_variables = 0; for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -inf && problem.upper[j] == inf) { new_free_variables++; } + if (problem.lower[j] == -inf && problem.upper[j] == inf && !is_cone_variable[j]) { + new_free_variables++; + } } if (removed_free_variables != 0) { settings.log.printf("Bounded %d free variables\n", removed_free_variables); @@ -1134,9 +1148,18 @@ i_t presolve(const lp_problem_t& original, } if (num_empty_cols > 0) { settings.log.printf("Presolve attempt to remove %d empty cols\n", num_empty_cols); - remove_empty_cols(problem, num_empty_cols, presolve_info); + remove_empty_cols(problem, num_empty_cols, presolve_info, is_cone_variable); } + is_cone_variable = build_is_cone_variable(problem); + + // Check for free variables (exclude cone variables — they are naturally unbounded) + free_variables = 0; + for (i_t j = 0; j < problem.num_cols; j++) { + if (problem.lower[j] == -inf && problem.upper[j] == inf && !is_cone_variable[j]) { + free_variables++; + } + } problem.Q.check_matrix("Before free variable expansion"); if (settings.barrier_presolve && free_variables > 0) { @@ -1156,7 +1179,7 @@ i_t presolve(const lp_problem_t& original, i_t num_cols = problem.num_cols + free_variables; i_t nnz = problem.A.col_start[problem.num_cols]; for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -inf && problem.upper[j] == inf) { + if (problem.lower[j] == -inf && problem.upper[j] == inf && !is_cone_variable[j]) { nnz += (problem.A.col_start[j + 1] - problem.A.col_start[j]); } } @@ -1173,7 +1196,7 @@ i_t presolve(const lp_problem_t& original, i_t q = problem.A.col_start[problem.num_cols]; i_t col = problem.num_cols; for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -inf && problem.upper[j] == inf) { + if (problem.lower[j] == -inf && problem.upper[j] == inf && !is_cone_variable[j]) { for (i_t p = problem.A.col_start[j]; p < 
problem.A.col_start[j + 1]; p++) { i_t i = problem.A.i[p]; f_t aij = problem.A.x[p]; @@ -1286,7 +1309,8 @@ i_t presolve(const lp_problem_t& original, problem.num_cols = num_cols; } - if (settings.barrier_presolve && settings.folding != 0 && problem.Q.n == 0) { + if (settings.barrier_presolve && settings.folding != 0 && problem.Q.n == 0 && + problem.second_order_cone_dims.empty()) { folding(problem, settings, presolve_info); } diff --git a/cpp/src/dual_simplex/presolve.hpp b/cpp/src/dual_simplex/presolve.hpp index d570ea933e..b052f67e9c 100644 --- a/cpp/src/dual_simplex/presolve.hpp +++ b/cpp/src/dual_simplex/presolve.hpp @@ -49,6 +49,8 @@ struct lp_problem_t { f_t obj_constant; f_t obj_scale; // 1.0 for min, -1.0 for max bool objective_is_integral{false}; + i_t cone_var_start{0}; + std::vector second_order_cone_dims; void write_problem(const std::string& path) const { diff --git a/cpp/src/dual_simplex/scaling.cpp b/cpp/src/dual_simplex/scaling.cpp index 1531c91486..54fe5dcf46 100644 --- a/cpp/src/dual_simplex/scaling.cpp +++ b/cpp/src/dual_simplex/scaling.cpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -29,9 +29,20 @@ i_t column_scaling(const lp_problem_t& unscaled, } column_scaling.resize(n); + + i_t cone_start = unscaled.cone_var_start; + i_t cone_end = cone_start; + for (auto q_k : unscaled.second_order_cone_dims) { + cone_end += q_k; + } + f_t max = 0; f_t min = std::numeric_limits::max(); for (i_t j = 0; j < n; ++j) { + if (j >= cone_start && j < cone_end) { + column_scaling[j] = 1.0; + continue; + } const i_t col_start = scaled.A.col_start[j]; const i_t col_end = scaled.A.col_start[j + 1]; f_t sum = 0.0; diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index b7c619f246..9bc12bb925 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -345,6 +345,26 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us barrier_settings.barrier_presolve = true; dualize_info_t dualize_info; convert_user_problem(user_problem, barrier_settings, original_lp, new_slacks, dualize_info); + + if (!user_problem.second_order_cone_dims.empty()) { + i_t cone_end = user_problem.cone_var_start; + for (auto q_k : user_problem.second_order_cone_dims) { + cone_end += q_k; + } + for (i_t j = user_problem.cone_var_start; j < cone_end; ++j) { + if (user_problem.lower[j] != 0.0 && user_problem.lower[j] > -1e30) { + settings.log.printf("Error: explicit lower bound on conic variable %d is not supported\n", + j); + return lp_status_t::NUMERICAL_ISSUES; + } + if (user_problem.upper[j] < 1e30) { + settings.log.printf("Error: explicit upper bound on conic variable %d is not supported\n", + j); + return lp_status_t::NUMERICAL_ISSUES; + } + } + } + lp_solution_t lp_solution(original_lp.num_rows, original_lp.num_cols); // Presolve the linear program diff --git a/cpp/src/dual_simplex/user_problem.hpp b/cpp/src/dual_simplex/user_problem.hpp index 73c4c391be..8b0588064c 100644 --- a/cpp/src/dual_simplex/user_problem.hpp +++ b/cpp/src/dual_simplex/user_problem.hpp @@ 
-52,6 +52,8 @@ struct user_problem_t { std::vector Q_offsets; std::vector Q_indices; std::vector Q_values; + i_t cone_var_start{0}; + std::vector second_order_cone_dims; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu b/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu index 611a236562..035fde5ac6 100644 --- a/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu +++ b/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu @@ -502,27 +502,38 @@ class second_order_cone_test : public ::testing::Test { } void launch_fused_corrector(const rmm::device_uvector& dx_aff, - const rmm::device_uvector& omega, - const rmm::device_uvector& w_bar, - const rmm::device_uvector& inv_eta, - const rmm::device_uvector& inv_1pw0, - const rmm::device_uvector& rho, + const cone_data_t& cones, f_t sigma_mu, - rmm::device_uvector& out, - const rmm::device_uvector& cone_offsets, - i_t k) + rmm::device_uvector& out) { - fused_corrector_kernel<<>>(cuopt::make_span(dx_aff), - cuopt::make_span(omega), - cuopt::make_span(w_bar), - cuopt::make_span(inv_eta), - cuopt::make_span(inv_1pw0), - cuopt::make_span(rho), - sigma_mu, - cuopt::make_span(out), - cuopt::make_span(cone_offsets), - k); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + compute_combined_cone_rhs_term(cuopt::make_span(dx_aff), cones, sigma_mu, out, stream_); + sync(); + } + + void launch_affine_cone_rhs(const cone_data_t& cones, rmm::device_uvector& out) + { + compute_affine_cone_rhs_term(cones, out, stream_); + sync(); + } + + void launch_recover_cone_dz(const rmm::device_uvector& dx, + const cone_data_t& cones, + const rmm::device_uvector& cone_rhs_term, + rmm::device_uvector& hinv2_dx, + rmm::device_uvector& dz) + { + recover_cone_dz( + cuopt::make_span(dx), cones, cone_rhs_term, hinv2_dx, cuopt::make_span(dz), stream_); + sync(); + } + + void launch_accumulate_cone_hinv2(const rmm::device_uvector& x, + const cone_data_t& cones, + 
rmm::device_uvector& hinv2_x, + rmm::device_uvector& out) + { + accumulate_cone_hinv2_matvec( + cuopt::make_span(x), cones, hinv2_x, cuopt::make_span(out), stream_); sync(); } @@ -1077,6 +1088,75 @@ TEST_F(second_order_cone_test, apply_hinv2_strided_loop_for_large_cone) expect_vector_near(hinv2_actual, ref, 1e-8, 1e-6, "hinv2_large_ref"); } +TEST_F(second_order_cone_test, affine_cone_rhs_matches_hinv2_of_primal) +{ + std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; + std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; + std::vector dims{3, 5}; + auto offsets = build_offsets(dims); + + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + cone_data_t cones(static_cast(dims.size()), + dims, + cuopt::make_span(d_s), + cuopt::make_span(d_lambda), + stream_); + launch_nt_scaling(cones, stream_); + + rmm::device_uvector d_out(cones.m_c, stream_); + launch_affine_cone_rhs(cones, d_out); + + auto actual = copy_to_host(d_out); + auto w_bar_host = copy_to_host(cones.w_bar); + auto inv_eta_h = copy_to_host(cones.inv_eta); + + for (i_t c = 0; c < static_cast(dims.size()); ++c) { + auto ref = ref_apply_hinv2_single(s_cones[c], slice_cone(w_bar_host, offsets, c), inv_eta_h[c]); + auto act = slice_cone(actual, offsets, c); + expect_vector_near(act, ref, 1e-10, 1e-8, "affine_cone_rhs"); + } +} + +TEST_F(second_order_cone_test, accumulate_cone_hinv2_matvec_matches_reference) +{ + std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; + std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; + std::vector> x_cones{{0.3, -0.1, 0.2}, {-0.5, 0.2, 0.1, -0.3, 0.15}}; + std::vector> base_cones{{1.0, 2.0, 3.0}, {0.5, -0.5, 0.25, -0.25, 0.75}}; + std::vector dims{3, 5}; + auto offsets = build_offsets(dims); + + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + auto d_x = 
make_device_vector(pack_cones(x_cones)); + auto d_out = make_device_vector(pack_cones(base_cones)); + cone_data_t cones(static_cast(dims.size()), + dims, + cuopt::make_span(d_s), + cuopt::make_span(d_lambda), + stream_); + launch_nt_scaling(cones, stream_); + + rmm::device_uvector d_hinv2_x(cones.m_c, stream_); + launch_accumulate_cone_hinv2(d_x, cones, d_hinv2_x, d_out); + + auto actual = copy_to_host(d_out); + auto w_bar_host = copy_to_host(cones.w_bar); + auto inv_eta_h = copy_to_host(cones.inv_eta); + + for (i_t c = 0; c < static_cast(dims.size()); ++c) { + auto ref_hinv2 = + ref_apply_hinv2_single(x_cones[c], slice_cone(w_bar_host, offsets, c), inv_eta_h[c]); + auto actual_c = slice_cone(actual, offsets, c); + std::vector ref(actual_c.size()); + for (i_t j = 0; j < static_cast(ref.size()); ++j) { + ref[j] = base_cones[c][j] + ref_hinv2[j]; + } + expect_vector_near(actual_c, ref, 1e-10, 1e-8, "accumulate_cone_hinv2"); + } +} + TEST_F(second_order_cone_test, scatter_hinv2_into_augmented_matches_reference_with_nt_scaling) { std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; @@ -1412,20 +1492,10 @@ TEST_F(second_order_cone_test, fused_corrector_matches_reference_with_nt_scaling std::vector> dx_aff_cones{{0.3, -0.1, 0.2}, {-0.5, 0.2, 0.1, -0.3, 0.15}}; f_t sigma_mu = 0.1; - auto d_dx_aff = make_device_vector(pack_cones(dx_aff_cones)); - auto d_offsets = make_device_vector(offsets); + auto d_dx_aff = make_device_vector(pack_cones(dx_aff_cones)); rmm::device_uvector d_out(cones.omega.size(), stream_); - launch_fused_corrector(d_dx_aff, - cones.omega, - cones.w_bar, - cones.inv_eta, - cones.inv_1pw0, - cones.rho, - sigma_mu, - d_out, - d_offsets, - static_cast(dims.size())); + launch_fused_corrector(d_dx_aff, cones, sigma_mu, d_out); auto actual = copy_to_host(d_out); auto omega_host = copy_to_host(cones.omega); @@ -1464,20 +1534,10 @@ TEST_F(second_order_cone_test, fused_corrector_strided_loop_for_large_cone) auto dx_aff_cone = 
make_patterned_cone(dims[0], 0.5, 0.003); f_t sigma_mu = 0.25; - auto d_dx_aff = make_device_vector(dx_aff_cone); - auto d_offsets = make_device_vector(offsets); + auto d_dx_aff = make_device_vector(dx_aff_cone); rmm::device_uvector d_out(cones.omega.size(), stream_); - launch_fused_corrector(d_dx_aff, - cones.omega, - cones.w_bar, - cones.inv_eta, - cones.inv_1pw0, - cones.rho, - sigma_mu, - d_out, - d_offsets, - 1); + launch_fused_corrector(d_dx_aff, cones, sigma_mu, d_out); auto actual = copy_to_host(d_out); auto omega_host = copy_to_host(cones.omega); @@ -1535,4 +1595,47 @@ TEST_F(second_order_cone_test, cone_block_scatter_with_q_overlap) } } +TEST_F(second_order_cone_test, recover_cone_dz_matches_reference) +{ + std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; + std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; + std::vector> dx_cones{{0.3, -0.1, 0.2}, {-0.5, 0.2, 0.1, -0.3, 0.15}}; + std::vector dims{3, 5}; + auto offsets = build_offsets(dims); + + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + auto d_dx = make_device_vector(pack_cones(dx_cones)); + cone_data_t cones(static_cast(dims.size()), + dims, + cuopt::make_span(d_s), + cuopt::make_span(d_lambda), + stream_); + launch_nt_scaling(cones, stream_); + + rmm::device_uvector d_rhs(cones.m_c, stream_); + launch_affine_cone_rhs(cones, d_rhs); + + rmm::device_uvector d_hinv2_dx(cones.m_c, stream_); + rmm::device_uvector d_dz(cones.m_c, stream_); + launch_recover_cone_dz(d_dx, cones, d_rhs, d_hinv2_dx, d_dz); + + auto actual = copy_to_host(d_dz); + auto rhs_actual = copy_to_host(d_rhs); + auto w_bar_host = copy_to_host(cones.w_bar); + auto inv_eta_h = copy_to_host(cones.inv_eta); + + for (i_t c = 0; c < static_cast(dims.size()); ++c) { + auto ref_hinv2 = + ref_apply_hinv2_single(dx_cones[c], slice_cone(w_bar_host, offsets, c), inv_eta_h[c]); + auto rhs_c = slice_cone(rhs_actual, offsets, c); + 
auto act = slice_cone(actual, offsets, c); + std::vector ref(act.size()); + for (i_t j = 0; j < static_cast(ref.size()); ++j) { + ref[j] = -rhs_c[j] - ref_hinv2[j]; + } + expect_vector_near(act, ref, 1e-10, 1e-8, "recover_cone_dz"); + } +} + } // namespace cuopt::linear_programming::dual_simplex::test diff --git a/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu b/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu index abfe37c9fd..1bf604bff8 100644 --- a/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu +++ b/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -174,4 +175,134 @@ TEST(barrier, dual_variable_greater_than) EXPECT_NEAR(solution.z[1], 0.0, 1e-5); } +TEST(barrier, cone_metadata_preserved_through_barrier_setup) +{ + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 1; + constexpr int n = 5; + constexpr int nz = 5; + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective.assign(n, 0.0); + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start.resize(n + 1); + for (int j = 0; j < n; ++j) { + user_problem.A.col_start[j] = j; + user_problem.A.i[j] = 0; + user_problem.A.x[j] = 1.0; + } + user_problem.A.col_start[n] = nz; + user_problem.rhs = {1.0}; + user_problem.row_sense = {'L'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + user_problem.num_range_rows = 0; + user_problem.second_order_cone_dims = {2, 2}; + user_problem.cone_var_start = 1; + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = false; + settings.dualize = 0; + settings.scale_columns = false; + + std::vector new_slacks; + dualize_info_t dualize_info; + lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); + 
convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); + + EXPECT_EQ(original_lp.second_order_cone_dims, user_problem.second_order_cone_dims); + EXPECT_EQ(original_lp.cone_var_start, user_problem.cone_var_start); + + lp_problem_t barrier_lp(user_problem.handle_ptr, + original_lp.num_rows, + original_lp.num_cols, + original_lp.A.col_start[original_lp.num_cols]); + std::vector column_scales; + column_scaling(original_lp, settings, barrier_lp, column_scales); + + EXPECT_EQ(barrier_lp.second_order_cone_dims, user_problem.second_order_cone_dims); + EXPECT_EQ(barrier_lp.cone_var_start, user_problem.cone_var_start); +} + +TEST(barrier, socp_min_x0_subject_to_norm_constraint) +{ + // minimize x_0 + // subject to x_1 = 1 + // (x_0, x_1, x_2) in Q^3 + // + // Optimal: x* = (1, 1, 0), obj* = 1 + + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 1; + constexpr int n = 3; + constexpr int nz = 1; + + user_problem.num_rows = m; + user_problem.num_cols = n; + + user_problem.objective = {1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start = {0, 0, 1, 1}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + + user_problem.rhs = {1.0}; + user_problem.row_sense = {'E'}; + + user_problem.lower = {0.0, 0.0, 0.0}; + user_problem.upper = {inf, inf, inf}; + + user_problem.num_range_rows = 0; + user_problem.problem_name = "socp_norm_cone"; + + user_problem.cone_var_start = 0; + user_problem.second_order_cone_dims = {3}; + + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = false; + settings.dualize = 0; + settings.set_log(true); + settings.log.log_to_console = false; + settings.log.enable_log_to_file(); + 
settings.log.set_log_file("/tmp/socp_barrier_test.log"); + + lp_solution_t solution(m, n); + printf("=== Calling solve_linear_program_with_barrier ===\n"); + fflush(stdout); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + printf("=== status=%d obj=%e x=[%e %e %e] ===\n", + static_cast(status), + solution.objective, + solution.x[0], + solution.x[1], + solution.x[2]); + fflush(stdout); + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[2]), 0.0, 1e-4); +} + } // namespace cuopt::linear_programming::dual_simplex::test From c99368f754ec9fb89008bea158e058a0f1e5f70e Mon Sep 17 00:00:00 2001 From: Yan Zaretskiy Date: Tue, 7 Apr 2026 12:27:08 -0700 Subject: [PATCH 04/22] refactor(dual_simplex): keep cone layout packed through presolve Flatten the remaining SOCP barrier plumbing, remove superseded cone kernels, and harden presolve so linear columns stay ahead of the trailing cone block. 
--- cpp/src/barrier/barrier.cu | 363 ++-- cpp/src/barrier/second_order_cone.cuh | 1695 +++++++---------- cpp/src/dual_simplex/presolve.cpp | 454 +++-- cpp/src/dual_simplex/scaling.cpp | 12 +- cpp/src/dual_simplex/solve.cpp | 10 + cpp/src/dual_simplex/vector_math.cuh | 21 +- .../infeasibility_information.cu | 6 - .../unit_tests/second_order_cone_test.cu | 656 +++---- .../dual_simplex/unit_tests/solve_barrier.cu | 704 ++++++- 9 files changed, 2199 insertions(+), 1722 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index efa941bfe8..1d937a4c5e 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -45,6 +45,7 @@ #include #include #include +#include namespace cuopt::linear_programming::dual_simplex { @@ -210,10 +211,10 @@ class iteration_data_t { d_dw_residual_(0, lp.handle_ptr->get_stream()), d_wv_residual_(0, lp.handle_ptr->get_stream()), d_bound_rhs_(0, lp.handle_ptr->get_stream()), - d_complementarity_xz_rhs_(lp.num_cols, lp.handle_ptr->get_stream()), + d_complementarity_xz_rhs_(0, lp.handle_ptr->get_stream()), d_complementarity_wv_rhs_(0, lp.handle_ptr->get_stream()), d_dual_rhs_(lp.num_cols, lp.handle_ptr->get_stream()), - d_cone_rhs_term_(0, lp.handle_ptr->get_stream()), + d_complementarity_target_(lp.num_cols, lp.handle_ptr->get_stream()), d_cone_hinv2_dx_(0, lp.handle_ptr->get_stream()), d_Q_diag_(0, lp.handle_ptr->get_stream()), d_Qx_(Qin.m, lp.handle_ptr->get_stream()), @@ -273,12 +274,17 @@ class iteration_data_t { cuopt_assert(cone_var_start_ >= 0, "cone_var_start must be nonnegative"); cuopt_assert(cone_var_start_ + total_cone_dim <= lp.num_cols, "cone variables exceed problem dimension"); + cuopt_assert(cone_var_start_ + total_cone_dim == lp.num_cols, + "barrier expects [linear | cone] layout"); cones_.emplace(static_cast(lp.second_order_cone_dims.size()), lp.second_order_cone_dims, raft::device_span{}, raft::device_span{}, stream_view_); } + i_t linear_xz_rhs_size = lp.num_cols; + if 
(cones_.has_value() && cones_->K > 0) { linear_xz_rhs_size -= cones_->m_c; } + d_complementarity_xz_rhs_.resize(linear_xz_rhs_size, stream_view_); // Allocating GPU flag data for Form ADAT RAFT_CUDA_TRY(cub::DeviceSelect::Flagged( @@ -1771,7 +1777,7 @@ class iteration_data_t { rmm::device_uvector d_complementarity_xz_rhs_; rmm::device_uvector d_complementarity_wv_rhs_; rmm::device_uvector d_dual_rhs_; - rmm::device_uvector d_cone_rhs_term_; + rmm::device_uvector d_complementarity_target_; rmm::device_uvector d_cone_hinv2_dx_; rmm::device_uvector d_Q_diag_; @@ -2327,10 +2333,32 @@ void barrier_solver_t::gpu_compute_residual_norms(const rmm::device_uv primal_residual_norm = std::max(device_vector_norm_inf(data.d_primal_residual_, stream_view_), device_vector_norm_inf(data.d_bound_residual_, stream_view_)); - dual_residual_norm = device_vector_norm_inf(data.d_dual_residual_, stream_view_); + dual_residual_norm = device_vector_norm_inf(data.d_dual_residual_, stream_view_); + const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; + const i_t linear_xz_size = + has_cones ? 
data.cone_var_start_ : static_cast(data.d_complementarity_xz_residual_.size()); + auto linear_xz_span = + raft::device_span(data.d_complementarity_xz_residual_.data(), linear_xz_size); complementarity_residual_norm = - std::max(device_vector_norm_inf(data.d_complementarity_xz_residual_, stream_view_), + std::max(device_vector_norm_inf(linear_xz_span, stream_view_), device_vector_norm_inf(data.d_complementarity_wv_residual_, stream_view_)); + if (has_cones) { + f_t cone_complementarity_norm = f_t(0); + auto cone_dot = data.cones_->scratch.hinv2_tail_dot(); + segmented_sum(data.d_complementarity_xz_residual_.data() + data.cone_var_start_, + cuopt::make_span(data.cones_->cone_offsets), + data.cones_->K, + cone_dot, + data.cones_->scratch.segmented_reduce_workspace, + stream_view_); + cone_complementarity_norm = thrust::reduce(rmm::exec_policy(stream_view_), + cone_dot.begin(), + cone_dot.end(), + f_t(0), + thrust::maximum()); + complementarity_residual_norm = + std::max(complementarity_residual_norm, cone_complementarity_norm); + } } template @@ -2404,6 +2432,37 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_tK > 0; const i_t m_c = has_cones ? data.cones_->m_c : 0; const i_t cone_var_start = data.cone_var_start_; + const i_t linear_size = has_cones ? 
cone_var_start : lp.num_cols; + + auto fill_linear_target = [&](raft::device_span target, + raft::device_span xz_rhs, + raft::device_span x) { + if (target.empty()) return; + cub::DeviceTransform::Transform( + cuda::std::make_tuple(xz_rhs.data(), x.data()), + target.data(), + target.size(), + [] HD(f_t complementarity_xz_rhs, f_t x_val) { return complementarity_xz_rhs / x_val; }, + stream_view_.value()); + RAFT_CHECK_CUDA(stream_view_); + }; + + auto recover_linear_dz = [&](raft::device_span target, + raft::device_span z, + raft::device_span dx_span, + raft::device_span x, + raft::device_span dz_span) { + if (dz_span.empty()) return; + cub::DeviceTransform::Transform( + cuda::std::make_tuple(target.data(), z.data(), dx_span.data(), x.data()), + dz_span.data(), + dz_span.size(), + [] HD(f_t target_val, f_t z_val, f_t dx_val, f_t x_val) { + return target_val - (z_val * dx_val) / x_val; + }, + stream_view_.value()); + RAFT_CHECK_CUDA(stream_view_); + }; { raft::common::nvtx::range fun_scope("Barrier: GPU allocation and copies"); @@ -2449,17 +2508,30 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_tlambda = raft::device_span(data.d_z_.data() + cone_var_start, m_c); launch_nt_scaling(*data.cones_, stream_view_); + cuopt_assert(cone_var_start + m_c == lp.num_cols, "barrier expects [linear | cone] layout"); + fill_linear_target( + raft::device_span(data.d_complementarity_target_.data(), linear_size), + raft::device_span(data.d_complementarity_xz_rhs_.data(), linear_size), + raft::device_span(data.d_x_.data(), linear_size)); + + auto cone_target = + raft::device_span(data.d_complementarity_target_.data() + cone_var_start, m_c); if (data.cone_combined_step_) { compute_combined_cone_rhs_term( raft::device_span(data.d_dx_aff_.data() + cone_var_start, m_c), *data.cones_, data.cone_sigma_mu_, - data.d_cone_rhs_term_, - stream_view_); + cone_target, + stream_view_, + f_t(-1)); } else { - compute_affine_cone_rhs_term(*data.cones_, data.d_cone_rhs_term_, 
stream_view_); + compute_affine_cone_rhs_term(*data.cones_, cone_target, stream_view_, f_t(-1)); } RAFT_CHECK_CUDA(stream_view_); + } else { + fill_linear_target(cuopt::make_span(data.d_complementarity_target_), + cuopt::make_span(data.d_complementarity_xz_rhs_), + cuopt::make_span(data.d_x_)); } // Solves the linear system @@ -2556,7 +2628,7 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t(0), - lp.num_cols, - [span_inv_diag = cuopt::make_span(data.d_inv_diag), - span_tmp3 = cuopt::make_span(data.d_tmp3_), - span_tmp4 = cuopt::make_span(data.d_tmp4_), - span_xz_rhs = cuopt::make_span(data.d_complementarity_xz_rhs_), - span_x = cuopt::make_span(data.d_x_), - span_dual_rhs = cuopt::make_span(data.d_dual_rhs_), - span_cone_rhs = cuopt::make_span(data.d_cone_rhs_term_), - cone_start = cone_var_start, - cone_size = m_c] __device__(i_t j) { - bool is_cone = (j >= cone_start && j < cone_start + cone_size); - f_t cone_rhs = is_cone ? span_cone_rhs[j - cone_start] : f_t(0); - f_t xz_term = is_cone ? 
f_t(0) : -(span_xz_rhs[j] / span_x[j]); - f_t tmp = span_tmp3[j] + xz_term + span_dual_rhs[j] + cone_rhs; - span_tmp3[j] = tmp; - span_tmp4[j] = span_inv_diag[j] * tmp; - }); - } else { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(data.d_inv_diag.data(), - data.d_tmp3_.data(), - data.d_complementarity_xz_rhs_.data(), - data.d_x_.data(), - data.d_dual_rhs_.data()), - thrust::make_zip_iterator(data.d_tmp3_.data(), data.d_tmp4_.data()), - lp.num_cols, - [] HD(f_t inv_diag, f_t tmp3, f_t complementarity_xz_rhs, f_t x, f_t dual_rhs) - -> thrust::tuple { - const f_t tmp = tmp3 + -(complementarity_xz_rhs / x) + dual_rhs; - return {tmp, inv_diag * tmp}; - }, - stream_view_.value()); - } + cub::DeviceTransform::Transform( + cuda::std::make_tuple(data.d_inv_diag.data(), + data.d_tmp3_.data(), + data.d_dual_rhs_.data(), + data.d_complementarity_target_.data()), + thrust::make_zip_iterator(data.d_tmp3_.data(), data.d_tmp4_.data()), + lp.num_cols, + [] HD(f_t inv_diag, f_t tmp3, f_t dual_rhs, f_t target) -> thrust::tuple { + const f_t tmp = tmp3 + dual_rhs - target; + return {tmp, inv_diag * tmp}; + }, + stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); raft::copy(data.d_r1_.data(), data.d_tmp3_.data(), data.d_tmp3_.size(), stream_view_); raft::copy(data.d_r1_prime_.data(), data.d_tmp3_.data(), data.d_tmp3_.size(), stream_view_); @@ -2900,40 +2947,28 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t(data.d_dx_.data() + cone_var_start, m_c), - *data.cones_, - data.d_cone_rhs_term_, - data.d_cone_hinv2_dx_, - raft::device_span(data.d_dz_.data() + cone_var_start, m_c), - stream_view_); - - thrust::for_each_n(rmm::exec_policy(stream_view_), - thrust::make_counting_iterator(0), - data.d_dz_.size(), - [span_xz_rhs = cuopt::make_span(data.d_complementarity_xz_rhs_), - span_z = cuopt::make_span(data.d_z_), - span_dx = cuopt::make_span(data.d_dx_), - span_x = cuopt::make_span(data.d_x_), - span_dz = cuopt::make_span(data.d_dz_), - cone_start = 
cone_var_start, - cone_size = m_c] __device__(i_t j) { - if (j < cone_start || j >= cone_start + cone_size) { - span_dz[j] = (span_xz_rhs[j] - span_z[j] * span_dx[j]) / span_x[j]; - } - }); + recover_cone_dz_from_target( + raft::device_span(data.d_dx_.data() + cone_var_start, m_c), + *data.cones_, + raft::device_span(data.d_complementarity_target_.data() + cone_var_start, m_c), + data.d_cone_hinv2_dx_, + raft::device_span(data.d_dz_.data() + cone_var_start, m_c), + stream_view_); + + if (cone_var_start > 0) { + recover_linear_dz( + raft::device_span(data.d_complementarity_target_.data(), cone_var_start), + raft::device_span(data.d_z_.data(), cone_var_start), + raft::device_span(data.d_dx_.data(), cone_var_start), + raft::device_span(data.d_x_.data(), cone_var_start), + raft::device_span(data.d_dz_.data(), cone_var_start)); + } } else { - // dz = (complementarity_xz_rhs - z.* dx) ./ x; - cub::DeviceTransform::Transform( - cuda::std::make_tuple(data.d_complementarity_xz_rhs_.data(), - data.d_z_.data(), - data.d_dx_.data(), - data.d_x_.data()), - data.d_dz_.data(), - data.d_dz_.size(), - [] HD(f_t complementarity_xz_rhs, f_t z, f_t dx, f_t x) { - return (complementarity_xz_rhs - z * dx) / x; - }, - stream_view_.value()); + recover_linear_dz(cuopt::make_span(data.d_complementarity_target_), + cuopt::make_span(data.d_z_), + cuopt::make_span(data.d_dx_), + cuopt::make_span(data.d_x_), + cuopt::make_span(data.d_dz_)); } RAFT_CHECK_CUDA(stream_view_); raft::copy(dz.data(), data.d_dz_.data(), data.d_dz_.size(), stream_view_); @@ -2943,18 +2978,29 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t out, + raft::device_span rhs, + raft::device_span z, + raft::device_span dz_span, + raft::device_span dx_span, + raft::device_span x) { + if (out.empty()) return; + cub::DeviceTransform::Transform( + cuda::std::make_tuple(rhs.data(), z.data(), dz_span.data(), dx_span.data(), x.data()), + out.data(), + out.size(), + [] HD(f_t complementarity_xz_rhs, f_t z_val, 
f_t dz_val, f_t dx_val, f_t x_val) { + return z_val * dx_val + x_val * dz_val - complementarity_xz_rhs; + }, + stream_view_.value()); + }; + compute_linear_xz_residual( + raft::device_span(data.d_xz_residual_.data(), linear_size), + raft::device_span(data.d_complementarity_xz_rhs_.data(), linear_size), + raft::device_span(data.d_z_.data(), linear_size), + raft::device_span(data.d_dz_.data(), linear_size), + raft::device_span(data.d_dx_.data(), linear_size), + raft::device_span(data.d_x_.data(), linear_size)); RAFT_CHECK_CUDA(stream_view_); const f_t xz_residual_norm = device_vector_norm_inf(data.d_xz_residual_, stream_view_); @@ -3118,7 +3164,8 @@ template void barrier_solver_t::compute_affine_rhs(iteration_data_t& data) { raft::common::nvtx::range fun_scope("Barrier: compute_affine_rhs"); - const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; + const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; + const i_t linear_size = has_cones ? data.cone_var_start_ : lp.num_cols; data.primal_rhs = data.primal_residual; data.bound_rhs = data.bound_residual; @@ -3126,29 +3173,24 @@ void barrier_solver_t::compute_affine_rhs(iteration_data_t& data.cone_combined_step_ = false; data.cone_sigma_mu_ = f_t(0); - raft::copy(data.d_complementarity_xz_rhs_.data(), - data.d_complementarity_xz_residual_.data(), - data.d_complementarity_xz_residual_.size(), - stream_view_); raft::copy(data.d_complementarity_wv_rhs_.data(), data.d_complementarity_wv_residual_.data(), data.d_complementarity_wv_residual_.size(), stream_view_); - // x.*z -> -x .* z - cub::DeviceTransform::Transform( - data.d_complementarity_xz_rhs_.data(), - data.d_complementarity_xz_rhs_.data(), - data.d_complementarity_xz_rhs_.size(), - [] HD(f_t xz_rhs) { return -xz_rhs; }, - stream_view_.value()); + auto negate_linear_rhs = [&](raft::device_span out, raft::device_span residual) { + if (out.empty()) return; + cub::DeviceTransform::Transform( + residual.data(), + out.data(), + out.size(), + 
[] HD(f_t xz_rhs) { return -xz_rhs; }, + stream_view_.value()); + }; + negate_linear_rhs( + raft::device_span(data.d_complementarity_xz_rhs_.data(), linear_size), + raft::device_span(data.d_complementarity_xz_residual_.data(), linear_size)); RAFT_CHECK_CUDA(stream_view_); - if (has_cones) { - thrust::fill_n(rmm::exec_policy(stream_view_), - data.d_complementarity_xz_rhs_.begin() + data.cone_var_start_, - data.cones_->m_c, - f_t(0)); - } // w.*v -> -w .* v cub::DeviceTransform::Transform( data.d_complementarity_wv_rhs_.data(), @@ -3184,21 +3226,24 @@ void barrier_solver_t::compute_target_mu( gpu_max_step_to_boundary(data, data.d_z_, data.d_dz_aff_)); if (has_cones) { - i_t cs = data.cone_var_start_; - i_t mc = data.cones_->m_c; - step_primal_aff = std::min( - step_primal_aff, + i_t cs = data.cone_var_start_; + i_t mc = data.cones_->m_c; + auto [cone_p, cone_d] = compute_cone_step_length(*data.cones_, raft::device_span(data.d_x_.data() + cs, mc), raft::device_span(data.d_dx_aff_.data() + cs, mc), raft::device_span(data.d_z_.data() + cs, mc), raft::device_span(data.d_dz_aff_.data() + cs, mc), - step_primal_aff, - stream_view_)); - step_dual_aff = step_primal_aff; + std::min(step_primal_aff, step_dual_aff), + stream_view_); + f_t cone_aff = std::min(cone_p, cone_d); + step_primal_aff = std::min(step_primal_aff, cone_aff); + step_dual_aff = step_primal_aff; } - if (data.Q.n > 0) { step_primal_aff = step_dual_aff = std::min(step_primal_aff, step_dual_aff); } + if (data.Q.n > 0 || has_cones) { + step_primal_aff = step_dual_aff = std::min(step_primal_aff, step_dual_aff); + } // Compute complementarity_xz_aff_sum = sum(x_aff * z_aff), // where x_aff = x + step_primal_aff * dx_aff and z_aff = z + step_dual_aff * dz_aff @@ -3256,21 +3301,30 @@ void barrier_solver_t::compute_target_mu( } mu_aff = complementarity_aff_sum / mu_denom; sigma = std::max(0.0, std::min(1.0, std::pow(mu_aff / mu, 3.0))); - new_mu = sigma * mu_aff; + new_mu = sigma * mu; } template void 
barrier_solver_t::compute_cc_rhs(iteration_data_t& data, f_t& new_mu) { raft::common::nvtx::range fun_scope("Barrier: compute_cc_rhs"); - const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; + const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; + const i_t linear_size = has_cones ? data.cone_var_start_ : lp.num_cols; - cub::DeviceTransform::Transform( - cuda::std::make_tuple(data.d_dx_aff_.data(), data.d_dz_aff_.data()), - data.d_complementarity_xz_rhs_.data(), - data.d_complementarity_xz_rhs_.size(), - [new_mu] HD(f_t dx_aff, f_t dz_aff) { return -(dx_aff * dz_aff) + new_mu; }, - stream_view_.value()); + auto fill_linear_cc_rhs = [&](raft::device_span out, + raft::device_span dx_aff, + raft::device_span dz_aff) { + if (out.empty()) return; + cub::DeviceTransform::Transform( + cuda::std::make_tuple(dx_aff.data(), dz_aff.data()), + out.data(), + out.size(), + [new_mu] HD(f_t dx_aff_val, f_t dz_aff_val) { return -(dx_aff_val * dz_aff_val) + new_mu; }, + stream_view_.value()); + }; + fill_linear_cc_rhs(raft::device_span(data.d_complementarity_xz_rhs_.data(), linear_size), + raft::device_span(data.d_dx_aff_.data(), linear_size), + raft::device_span(data.d_dz_aff_.data(), linear_size)); RAFT_CHECK_CUDA(stream_view_); cub::DeviceTransform::Transform( cuda::std::make_tuple(data.d_dw_aff_.data(), data.d_dv_aff_.data()), @@ -3279,13 +3333,6 @@ void barrier_solver_t::compute_cc_rhs(iteration_data_t& data [new_mu] HD(f_t dw_aff, f_t dv_aff) { return -(dw_aff * dv_aff) + new_mu; }, stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); - if (has_cones) { - thrust::fill_n(rmm::exec_policy(stream_view_), - data.d_complementarity_xz_rhs_.begin() + data.cone_var_start_, - data.cones_->m_c, - f_t(0)); - } - // TMP should be CPU to 0 if CPU and GPU to 0 if GPU data.primal_rhs.set_scalar(0.0); data.bound_rhs.set_scalar(0.0); @@ -3376,18 +3423,14 @@ void barrier_solver_t::compute_primal_dual_step_length(iteration_data_ if (has_cones) { i_t cs = 
data.cone_var_start_; i_t mc = data.cones_->m_c; - f_t cone_primal = - compute_single_cone_step_length(*data.cones_, - raft::device_span(data.d_x_.data() + cs, mc), - raft::device_span(data.d_dx_.data() + cs, mc), - f_t(1), - stream_view_); - f_t cone_dual = - compute_single_cone_step_length(*data.cones_, - raft::device_span(data.d_z_.data() + cs, mc), - raft::device_span(data.d_dz_.data() + cs, mc), - f_t(1), - stream_view_); + auto [cone_primal, cone_dual] = + compute_cone_step_length(*data.cones_, + raft::device_span(data.d_x_.data() + cs, mc), + raft::device_span(data.d_dx_.data() + cs, mc), + raft::device_span(data.d_z_.data() + cs, mc), + raft::device_span(data.d_dz_.data() + cs, mc), + f_t(1), + stream_view_); max_step_primal = std::min(max_step_primal, cone_primal); max_step_dual = std::min(max_step_dual, cone_dual); } @@ -3395,7 +3438,7 @@ void barrier_solver_t::compute_primal_dual_step_length(iteration_data_ step_primal = step_scale * max_step_primal; step_dual = step_scale * max_step_dual; - if (data.Q.n > 0) { step_primal = step_dual = std::min(step_primal, step_dual); } + if (data.Q.n > 0 || has_cones) { step_primal = step_dual = std::min(step_primal, step_dual); } } template @@ -3821,9 +3864,28 @@ lp_status_t barrier_solver_t::solve(f_t start_time, std::max(vector_norm_inf(data.primal_residual, stream_view_), vector_norm_inf(data.bound_residual, stream_view_)); f_t dual_residual_norm = vector_norm_inf(data.dual_residual, stream_view_); + const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; + const i_t linear_xz_size = + has_cones ? 
data.cone_var_start_ : static_cast(data.complementarity_xz_residual.size()); + auto linear_xz_span = + raft::host_span(data.complementarity_xz_residual.data(), linear_xz_size); f_t complementarity_residual_norm = - std::max(vector_norm_inf(data.complementarity_xz_residual, stream_view_), + std::max(vector_norm_inf(linear_xz_span, stream_view_), vector_norm_inf(data.complementarity_wv_residual, stream_view_)); + if (has_cones) { + f_t cone_complementarity_norm = f_t(0); + i_t off = data.cone_var_start_; + for (auto q_k : lp.second_order_cone_dims) { + f_t cone_dot = f_t(0); + for (i_t j = 0; j < q_k; ++j) { + cone_dot += data.complementarity_xz_residual[off + j]; + } + cone_complementarity_norm = std::max(cone_complementarity_norm, cone_dot); + off += q_k; + } + complementarity_residual_norm = + std::max(complementarity_residual_norm, cone_complementarity_norm); + } f_t mu_denom = static_cast(n) + static_cast(num_upper_bounds); if (data.cones_.has_value() && data.cones_->K > 0) { mu_denom -= static_cast(data.cones_->m_c); @@ -3880,11 +3942,14 @@ lp_status_t barrier_solver_t::solve(f_t start_time, (duality_gap_abs < settings.barrier_relative_complementarity_tol || duality_gap_rel < settings.barrier_relative_complementarity_tol); + const i_t linear_xz_rhs_size = + has_cones ? 
data.cone_var_start_ : static_cast(data.complementarity_xz_rhs.size()); + data.d_complementarity_xz_residual_.resize(data.complementarity_xz_residual.size(), stream_view_); data.d_complementarity_wv_residual_.resize(data.complementarity_wv_residual.size(), stream_view_); - data.d_complementarity_xz_rhs_.resize(data.complementarity_xz_rhs.size(), stream_view_); + data.d_complementarity_xz_rhs_.resize(linear_xz_rhs_size, stream_view_); data.d_complementarity_wv_rhs_.resize(data.complementarity_wv_rhs.size(), stream_view_); raft::copy(data.d_complementarity_xz_residual_.data(), data.complementarity_xz_residual.data(), @@ -3896,7 +3961,7 @@ lp_status_t barrier_solver_t::solve(f_t start_time, stream_view_); raft::copy(data.d_complementarity_xz_rhs_.data(), data.complementarity_xz_rhs.data(), - data.complementarity_xz_rhs.size(), + linear_xz_rhs_size, stream_view_); raft::copy(data.d_complementarity_wv_rhs_.data(), data.complementarity_wv_rhs.data(), diff --git a/cpp/src/barrier/second_order_cone.cuh b/cpp/src/barrier/second_order_cone.cuh index 4d69e787d1..3487707f12 100644 --- a/cpp/src/barrier/second_order_cone.cuh +++ b/cpp/src/barrier/second_order_cone.cuh @@ -15,814 +15,319 @@ #include #include -#include - -#include +#include #include #include #include -#include #include +#include #include -#include #include namespace cuopt::linear_programming::dual_simplex { // --------------------------------------------------------------------------- -// Shared reduction primitives -// --------------------------------------------------------------------------- - -template -using triplet_t = cuda::std::tuple; - -template -struct triplet_sum { - DI triplet_t operator()(const triplet_t& lhs, const triplet_t& rhs) const - { - const auto& [v0_l, v1_l, v2_l] = lhs; - const auto& [v0_r, v1_r, v2_r] = rhs; - return {v0_l + v0_r, v1_l + v1_r, v2_l + v2_r}; - } -}; - -template -using block_reduce_t = cub::BlockReduce; - -template -struct smem_reduce_t { - using ScalarReduce = 
block_reduce_t; - using TripletReduce = block_reduce_t, BLOCK_DIM>; - - union { - typename ScalarReduce::TempStorage scalar_temp; - typename TripletReduce::TempStorage triplet_temp; - f_t scalar_broadcast; - triplet_t triplet_broadcast; - }; -}; - -// --------------------------------------------------------------------------- -// reduce_broadcast: block-reduce a value, then broadcast to all threads. +// Flat cone kernels: segmented reductions compute per-cone scalars, then a +// single elementwise launch applies the result across all packed cone entries. +// This keeps the cone math vectorized instead of one block per cone. // --------------------------------------------------------------------------- -template -DI f_t reduce_broadcast(f_t val, smem_reduce_t& s) -{ - f_t agg = typename smem_reduce_t::ScalarReduce(s.scalar_temp).Sum(val); - __syncthreads(); - if (threadIdx.x == 0) { s.scalar_broadcast = agg; } - __syncthreads(); - return s.scalar_broadcast; -} +constexpr int flat_block_dim = 256; -template -DI triplet_t reduce_broadcast(triplet_t val, smem_reduce_t& s) +template +__global__ void apply_hinv2_write_kernel(raft::device_span v, + raft::device_span out, + raft::device_span w_bar, + raft::device_span inv_eta, + raft::device_span tail_dot, + raft::device_span cone_offsets, + raft::device_span element_cone_ids, + f_t output_scale) { - auto agg = typename smem_reduce_t::TripletReduce(s.triplet_temp) - .Reduce(val, triplet_sum{}); - __syncthreads(); - if (threadIdx.x == 0) { s.triplet_broadcast = agg; } - __syncthreads(); - return s.triplet_broadcast; + i_t flat_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (flat_idx >= static_cast(out.size())) return; + + i_t cone = element_cone_ids[flat_idx]; + i_t cone_off = cone_offsets[cone]; + i_t local_idx = flat_idx - cone_off; + + f_t ie_sq = inv_eta[cone] * inv_eta[cone]; + f_t u_tv = w_bar[cone_off] * v[cone_off] - tail_dot[cone]; + f_t coeff = f_t(2) * u_tv * ie_sq; + int sign = (local_idx == 0) * 
2 - 1; + f_t value = coeff * w_bar[flat_idx] - ie_sq * v[flat_idx]; + out[flat_idx] = output_scale * value * sign; } -template -struct smem_warp_reduce_t { - static constexpr int warps_per_block = BLOCK_DIM / 32; - - using ScalarReduce = cub::WarpReduce; - using TripletReduce = cub::WarpReduce, 32>; - - union { - typename ScalarReduce::TempStorage scalar_temp[warps_per_block]; - typename TripletReduce::TempStorage triplet_temp[warps_per_block]; - f_t scalar_broadcast[warps_per_block]; - triplet_t triplet_broadcast[warps_per_block]; - }; +template +struct corrector_raw_t { + f_t zeta; + f_t xi; + f_t psi; }; -// --------------------------------------------------------------------------- -// reduce_broadcast: warp-reduce a value, then broadcast within the warp. -// --------------------------------------------------------------------------- - -template -DI f_t reduce_broadcast(f_t val, smem_warp_reduce_t& s) -{ - static_assert(BLOCK_DIM % 32 == 0, "Warp reduce requires warp-aligned CTAs"); - - int warp = threadIdx.x >> 5; - int lane = threadIdx.x & 31; - f_t agg = typename smem_warp_reduce_t::ScalarReduce(s.scalar_temp[warp]).Sum(val); - if (lane == 0) { s.scalar_broadcast[warp] = agg; } - __syncwarp(); - return s.scalar_broadcast[warp]; -} - -template -DI triplet_t reduce_broadcast(triplet_t val, smem_warp_reduce_t& s) -{ - static_assert(BLOCK_DIM % 32 == 0, "Warp reduce requires warp-aligned CTAs"); - - int warp = threadIdx.x >> 5; - int lane = threadIdx.x & 31; - auto agg = typename smem_warp_reduce_t::TripletReduce(s.triplet_temp[warp]) - .Reduce(val, triplet_sum{}); - if (lane == 0) { s.triplet_broadcast[warp] = agg; } - __syncwarp(); - return s.triplet_broadcast[warp]; -} - -// --------------------------------------------------------------------------- -// Apply H^{-1} to one vector per cone (one thread-block per cone). 
-// -// H^{-1}z = (1/η)(w̄₀z₀ − ζ, z₁ + (−z₀ + ζ/(1+w̄₀))w̄₁), ζ = w̄₁ᵀz₁ -// --------------------------------------------------------------------------- -template -__global__ __launch_bounds__(BLOCK_DIM) void apply_Hinv_kernel( - raft::device_span z, - raft::device_span out, - raft::device_span w_bar, - raft::device_span inv_eta, - raft::device_span inv_1pw0, - raft::device_span cone_offsets, - i_t K) -{ - __shared__ smem_reduce_t smem; - - i_t cone = static_cast(blockIdx.x); - if (cone >= K) return; - - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - auto w_cone = w_bar.subspan(off, q); - auto z_cone = z.subspan(off, q); - auto out_cone = out.subspan(off, q); - - f_t z0 = z_cone[0]; - f_t w0 = w_cone[0]; - - // Phase 1: ζ = w̄₁ᵀ z₁ - f_t partial = f_t(0); - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - partial += w_cone[j] * z_cone[j]; +template +struct corrector_raw_sum_t { + HD corrector_raw_t operator()(const corrector_raw_t& lhs, + const corrector_raw_t& rhs) const + { + return {lhs.zeta + rhs.zeta, lhs.xi + rhs.xi, lhs.psi + rhs.psi}; } - f_t zeta = reduce_broadcast(partial, smem); - - // Phase 2: element-wise output - f_t ie = inv_eta[cone]; - f_t ipw = inv_1pw0[cone]; - f_t coeff = -z0 + zeta * ipw; +}; - if (threadIdx.x == 0) { out_cone[0] = (w0 * z0 - zeta) * ie; } - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - out_cone[j] = (z_cone[j] + coeff * w_cone[j]) * ie; +template +struct cone_scratch_t { + i_t K; + rmm::device_uvector> corrector_raw; // [K] {zeta, xi, psi} + rmm::device_uvector scalar_slots; // [6 * K] reusable K-length scalar scratch slots + rmm::device_uvector step_alpha_primal; + rmm::device_uvector step_alpha_dual; + rmm::device_uvector segmented_reduce_workspace; + + cone_scratch_t(i_t K_in, rmm::cuda_stream_view stream) + : K(K_in), + corrector_raw(K_in, stream), + scalar_slots(6 * K_in, stream), + step_alpha_primal(K_in, stream), + step_alpha_dual(K_in, stream), + 
segmented_reduce_workspace(0, stream) + { } -} -// --------------------------------------------------------------------------- -// Apply H^{-2} to one vector per cone (one thread-block per cone). -// -// H^{-2}v = η⁻²(2u(uᵀv) − Jv), u = Jw̄, J = diag(1,−1,…,−1). -// -// One dot product (uᵀv) plus element-wise work — same structure as apply_Hinv. -// --------------------------------------------------------------------------- -template -__global__ __launch_bounds__(BLOCK_DIM) void apply_Hinv2_kernel( - raft::device_span v, - raft::device_span out, - raft::device_span w_bar, - raft::device_span inv_eta, - raft::device_span cone_offsets, - i_t K) -{ - __shared__ smem_reduce_t smem; + raft::device_span hinv2_tail_dot() { return slot_span(0); } + raft::device_span step_s_du1_sq() { return slot_span(0); } + raft::device_span step_s_u1du1() { return slot_span(1); } + raft::device_span step_s_u1_sq() { return slot_span(2); } + raft::device_span step_l_du1_sq() { return slot_span(3); } + raft::device_span step_l_u1du1() { return slot_span(4); } + raft::device_span step_l_u1_sq() { return slot_span(5); } - i_t cone = static_cast(blockIdx.x); - if (cone >= K) return; - - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - auto w_cone = w_bar.subspan(off, q); - auto v_cone = v.subspan(off, q); - auto out_cone = out.subspan(off, q); + raft::device_span nt_s1_sq() { return slot_span(0); } + raft::device_span nt_l1_sq() { return slot_span(1); } + raft::device_span nt_sl() { return slot_span(2); } - f_t v0 = v_cone[0]; - f_t w0 = w_cone[0]; - - // Phase 1: uᵀv = w̄₀v₀ − Σ w̄_j v_j (tail dot, then subtract from head) - f_t partial = f_t(0); - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - partial += w_cone[j] * v_cone[j]; - } - f_t tail_dot = reduce_broadcast(partial, smem); - f_t uTv = w0 * v0 - tail_dot; + raft::device_span step_alpha_primal_span() { return cuopt::make_span(step_alpha_primal); } + raft::device_span step_alpha_dual_span() 
{ return cuopt::make_span(step_alpha_dual); } - // Phase 2: element-wise output - f_t ie_sq = inv_eta[cone] * inv_eta[cone]; - f_t coeff = f_t(2) * uTv * ie_sq; - - if (threadIdx.x == 0) { out_cone[0] = coeff * w0 - ie_sq * v0; } - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - out_cone[j] = -coeff * w_cone[j] + ie_sq * v_cone[j]; + private: + raft::device_span slot_span(i_t slot) + { + return raft::device_span(scalar_slots.data() + slot * K, K); } -} - -// --------------------------------------------------------------------------- -// Cone-algebra primitives for the deferred combined-step corrector: -// r_K = omega circ omega + dx_scaled circ dz_scaled - sigma mu e -// corr = omega \ r_K -// t_K = H^{-1} corr -// --------------------------------------------------------------------------- +}; -// --------------------------------------------------------------------------- -// Jordan product for packed SOC vectors (one CTA per cone). -// -// For a, b in Q^q: (a ∘ b)_0 = a^T b, (a ∘ b)_j = a_0 b_j + b_0 a_j. 
-// --------------------------------------------------------------------------- -template -__global__ __launch_bounds__(BLOCK_DIM) void jordan_product_kernel( - raft::device_span a, - raft::device_span b, - raft::device_span out, - raft::device_span cone_offsets, - i_t K) +template +__global__ void fused_corrector_write_kernel(raft::device_span s, + raft::device_span lambda, + raft::device_span dx_aff, + raft::device_span omega, + raft::device_span w_bar, + raft::device_span inv_eta, + raft::device_span inv_1pw0, + raft::device_span rho, + raft::device_span> raw, + raft::device_span out, + raft::device_span cone_offsets, + raft::device_span element_cone_ids, + f_t sigma_mu, + f_t output_scale) { - __shared__ smem_reduce_t smem; - - i_t cone = static_cast(blockIdx.x); - if (cone >= K) return; - - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - auto a_cone = a.subspan(off, q); - auto b_cone = b.subspan(off, q); - auto out_cone = out.subspan(off, q); - - f_t a0 = a_cone[0]; - f_t b0 = b_cone[0]; - - f_t partial = f_t(0); - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - partial += a_cone[j] * b_cone[j]; + i_t flat_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (flat_idx >= static_cast(out.size())) return; + + i_t cone = element_cone_ids[flat_idx]; + i_t cone_off = cone_offsets[cone]; + i_t local_idx = flat_idx - cone_off; + f_t ie = inv_eta[cone]; + f_t ipw = inv_1pw0[cone]; + f_t w0 = w_bar[cone_off]; + f_t omega0 = omega[cone_off]; + f_t dx_a0 = dx_aff[cone_off]; + auto raw_vals = raw[cone]; + f_t coeff_a = -dx_a0 + raw_vals.zeta * ipw; + f_t dx0 = (w0 * dx_a0 - raw_vals.zeta) * ie; + f_t dz0 = -omega0 - dx0; + f_t w_sq_sum = max(f_t(0), w0 * w0 - f_t(1)); + f_t w_omega_sum = f_t(0.5) * (ie * s[cone_off] - lambda[cone_off] / ie); + f_t omega_sq_sum = max(f_t(0), omega0 * omega0 - rho[cone]); + f_t omega_dx_sum = ie * (raw_vals.xi + coeff_a * w_omega_sum); + f_t dx_sq_sum = + ie * ie * (raw_vals.psi + 
f_t(2) * coeff_a * raw_vals.zeta + coeff_a * coeff_a * w_sq_sum); + f_t r_K_0 = (omega0 * omega0 + omega_sq_sum) + (dx0 * dz0 - omega_dx_sum - dx_sq_sum) - sigma_mu; + f_t nu = (f_t(2) * omega0 - dx0) * omega_sq_sum - (omega0 + f_t(2) * dx0) * omega_dx_sum; + f_t inv_rho = f_t(1) / rho[cone]; + f_t corr0 = (omega0 * r_K_0 - nu) * inv_rho; + f_t inv_omega0 = f_t(1) / omega0; + f_t c_inv = (nu * inv_omega0 - r_K_0) * inv_rho; + f_t p1 = c_inv + f_t(2) - dx0 * inv_omega0; + f_t p2 = -(f_t(1) + f_t(2) * dx0 * inv_omega0); + f_t w_dx_sum = ie * (raw_vals.zeta + coeff_a * w_sq_sum); + f_t zeta2 = p1 * w_omega_sum + p2 * w_dx_sum; + f_t coeff_c = -corr0 + zeta2 * ipw; + + if (local_idx == 0) { + out[flat_idx] = output_scale * ((w0 * corr0 - zeta2) * ie); + return; } - f_t tail_dot = reduce_broadcast(partial, smem); - if (threadIdx.x == 0) { out_cone[0] = a0 * b0 + tail_dot; } - - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - out_cone[j] = a0 * b_cone[j] + b0 * a_cone[j]; - } + f_t dx_j = (dx_aff[flat_idx] + coeff_a * w_bar[flat_idx]) * ie; + f_t corr_j = p1 * omega[flat_idx] + p2 * dx_j; + out[flat_idx] = output_scale * ((corr_j + coeff_c * w_bar[flat_idx]) * ie); } // --------------------------------------------------------------------------- -// Inverse Jordan product for packed SOC vectors (one CTA per cone). -// -// For omega in int(Q^q) and vector r, -// (omega \ r)_0 = (omega_0 r_0 − nu) / rho -// (omega \ r)_j = ((nu/omega_0 − r_0)/rho) omega_j + r_j/omega_0 -// where nu = omega_1^T r_1 and rho = ||omega||_J^2 (stored per-cone). +// Flattened NT scaling / step-length kernels. +// All follow the same pattern: segmented reduction to per-cone scalars, then +// flat or scalar kernels to write the packed cone outputs. 
// --------------------------------------------------------------------------- -template -__global__ __launch_bounds__(BLOCK_DIM) void inverse_jordan_product_kernel( - raft::device_span omega, - raft::device_span r, - raft::device_span rho, - raft::device_span out, - raft::device_span cone_offsets, - i_t K) -{ - __shared__ smem_reduce_t smem; - - i_t cone = static_cast(blockIdx.x); - if (cone >= K) return; - - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - auto omega_cone = omega.subspan(off, q); - auto r_cone = r.subspan(off, q); - auto out_cone = out.subspan(off, q); - - f_t omega_0 = omega_cone[0]; - f_t r_0 = r_cone[0]; - f_t partial = f_t(0); - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - partial += omega_cone[j] * r_cone[j]; - } - f_t nu = reduce_broadcast(partial, smem); - - f_t rho_val = rho[cone]; - f_t inv_rho = f_t(1) / rho_val; - f_t c_omega_j = ((nu / omega_0) - r_0) * inv_rho; - f_t c_r_j = f_t(1) / omega_0; - - if (threadIdx.x == 0) { out_cone[0] = (omega_0 * r_0 - nu) * inv_rho; } - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - out_cone[j] = c_omega_j * omega_cone[j] + c_r_j * r_cone[j]; - } -} - -// --------------------------------------------------------------------------- -// Fused corrector for the combined-step SOC correction (one CTA per cone). -// -// Computes in a single kernel launch: -// 1. dx = H^{-1} Δx_aff (affine scaled direction) -// 2. dz = −ω − dx (complementary direction) -// 3. r_K = ω∘ω + dx∘dz − σμ e (combined cone residual) -// 4. corr = ω \ r_K (inverse Jordan product) -// 5. t_K = H^{-1} corr (corrector for reduced RHS) -// -// Uses the `out` buffer as scratch (holds dx during phases 1–3) and writes -// the final t_K there, so zero extra temporary buffers are needed. -// -// Algebraic shortcut: the triplet (Σ ω_j², Σ ω_j dx_j, Σ dx_j²) computed -// for r_K_0 also yields ν = Σ ω_j r_K_j via a linear combination, avoiding -// a fourth reduction pass. 
-// --------------------------------------------------------------------------- -template -__global__ __launch_bounds__(BLOCK_DIM) void fused_corrector_kernel( - raft::device_span dx_aff, - raft::device_span omega, - raft::device_span w_bar, - raft::device_span inv_eta, - raft::device_span inv_1pw0, - raft::device_span rho, - f_t sigma_mu, - raft::device_span out, - raft::device_span cone_offsets, - i_t K) +template +__global__ void nt_scaling_scalar_kernel(raft::device_span s, + raft::device_span lambda, + raft::device_span cone_offsets, + raft::device_span s1_sq, + raft::device_span l1_sq, + raft::device_span sl, + raft::device_span inv_eta, + raft::device_span inv_1pw0, + raft::device_span w_bar, + raft::device_span omega, + raft::device_span rho, + i_t K) { - __shared__ smem_reduce_t smem; - - i_t cone = static_cast(blockIdx.x); + i_t cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); if (cone >= K) return; - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - auto dx_a = dx_aff.subspan(off, q); - auto omega_cone = omega.subspan(off, q); - auto w_cone = w_bar.subspan(off, q); - auto out_cone = out.subspan(off, q); - - f_t ie = inv_eta[cone]; - f_t ipw = inv_1pw0[cone]; - f_t rho_val = rho[cone]; - f_t omega_0 = omega_cone[0]; - f_t w_0 = w_cone[0]; - f_t dx_a_0 = dx_a[0]; - - // ================================================================= - // Phase A — reduce ζ = Σ_{j≥1} w̄_j (Δx_aff)_j for H^{-1} - // ================================================================= - f_t partial = f_t(0); - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - partial += w_cone[j] * dx_a[j]; - } - f_t zeta = reduce_broadcast(partial, smem); - - f_t dx_0 = (w_0 * dx_a_0 - zeta) * ie; - f_t coeff_a = -dx_a_0 + zeta * ipw; - f_t dz_0 = -omega_0 - dx_0; - - // ================================================================= - // Phase A→B — write dx to out; accumulate (A, B, C) for r_K and ν - // A = Σ ω_j², B = Σ ω_j dx_j, C = Σ 
dx_j² (j ≥ 1) - // ================================================================= - auto trip = triplet_t{}; - auto& [A_p, B_p, C_p] = trip; - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - f_t dx_j = (dx_a[j] + coeff_a * w_cone[j]) * ie; - out_cone[j] = dx_j; - f_t omega_j = omega_cone[j]; - A_p += omega_j * omega_j; - B_p += omega_j * dx_j; - C_p += dx_j * dx_j; - } - auto [A, B, C] = reduce_broadcast(trip, smem); - - // ================================================================= - // Phase B — form r_K_0, derive ν, then inverse-Jordan scalars - // ================================================================= - f_t r_K_0 = (omega_0 * omega_0 + A) + (dx_0 * dz_0 - B - C) - sigma_mu; - f_t nu = (f_t(2) * omega_0 - dx_0) * A - (omega_0 + f_t(2) * dx_0) * B; - - f_t inv_rho = f_t(1) / rho_val; - f_t corr_0 = (omega_0 * r_K_0 - nu) * inv_rho; - f_t inv_omega_0 = f_t(1) / omega_0; - f_t c_inv = (nu * inv_omega_0 - r_K_0) * inv_rho; - f_t p1 = c_inv + f_t(2) - dx_0 * inv_omega_0; - f_t p2 = -(f_t(1) + f_t(2) * dx_0 * inv_omega_0); - - // ================================================================= - // Phase B→C — accumulate ζ₂ = Σ_{j≥1} w̄_j corr_j for final H^{-1} - // corr_j = p1 ω_j + p2 dx_j (dx_j still in out_cone[j]) - // ================================================================= - f_t partial2 = f_t(0); - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - f_t corr_j = p1 * omega_cone[j] + p2 * out_cone[j]; - partial2 += w_cone[j] * corr_j; - } - f_t zeta2 = reduce_broadcast(partial2, smem); - - // ================================================================= - // Phase C — write t_K = H^{-1}(corr) - // ================================================================= - f_t coeff_c = -corr_0 + zeta2 * ipw; - - if (threadIdx.x == 0) { out_cone[0] = (w_0 * corr_0 - zeta2) * ie; } - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - f_t corr_j = p1 * omega_cone[j] + p2 * 
out_cone[j]; - out_cone[j] = (corr_j + coeff_c * w_cone[j]) * ie; - } -} - -// --------------------------------------------------------------------------- -// Compute NT scaling from (s, lambda). -// -// Medium/large cones use one CTA per cone and stream s/lambda twice: -// Pass 1: reduce ||s_1||^2, ||lambda_1||^2, and s^T lambda. -// Pass 2: compute omega/w_bar directly from raw inputs and reduce ||w_bar_1||^2. -// -// Small cones (q <= 32) use one warp per cone and keep one element per lane in -// registers for the whole computation. In both paths, shared memory only stores -// per-warp partial reductions plus a small scalar broadcast struct. -// --------------------------------------------------------------------------- - -constexpr int small_cone_limit = 32; -constexpr int medium_cone_limit = 2048; -constexpr int small_block_dim = 64; -constexpr int medium_block_dim = 128; -constexpr int large_block_dim = 256; - -template -struct nt_broadcast_coeffs { - f_t w_from_s; - f_t w_from_lambda; - f_t omega_s_coeff; - f_t omega_lambda_coeff; -}; - -template -struct nt_block_storage { - smem_reduce_t reduce; - nt_broadcast_coeffs coeffs; -}; - -template -struct nt_warp_storage { - static constexpr int warps_per_block = BLOCK_DIM / 32; - - smem_warp_reduce_t reduce; - nt_broadcast_coeffs coeffs[warps_per_block]; -}; - -template -__global__ __launch_bounds__(BLOCK_DIM) void nt_scaling_kernel( - raft::device_span s, - raft::device_span lambda, - raft::device_span eta, - raft::device_span inv_eta, - raft::device_span inv_1pw0, - raft::device_span w_bar, - raft::device_span omega, - raft::device_span rho, - raft::device_span cone_offsets, - raft::device_span cone_ids, - i_t num_cones) -{ - static_assert(BLOCK_DIM % 32 == 0, "NT scaling kernel requires warp-aligned BLOCK_DIM"); - __shared__ nt_block_storage storage; - - i_t cone_idx = static_cast(blockIdx.x); - if (cone_idx >= num_cones) return; - - i_t cone = cone_ids[cone_idx]; - i_t off = cone_offsets[cone]; - i_t q = 
cone_offsets[cone + 1] - off; - - f_t s0 = s[off]; - f_t l0 = lambda[off]; - - auto partial = triplet_t{}; - auto& [s1_sq_p, l1_sq_p, sl_p] = partial; - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - f_t sj = s[off + j]; - f_t lj = lambda[off + j]; - s1_sq_p += sj * sj; - l1_sq_p += lj * lj; - sl_p += sj * lj; - } - - auto [s1_sq, l1_sq, sl] = reduce_broadcast(partial, storage.reduce); - f_t owner_eta = f_t(0); - f_t owner_inv_eta = f_t(0); - f_t owner_rho = f_t(0); - f_t owner_omega_0 = f_t(0); - if (threadIdx.x == 0) { - // Clamp radicands to zero: near the cone boundary, roundoff can make these - // slightly negative. - f_t s_J = sqrt(max(f_t(0), s0 * s0 - s1_sq)); - f_t l_J = sqrt(max(f_t(0), l0 * l0 - l1_sq)); - f_t inv_s_J = f_t(1) / s_J; - f_t inv_l_J = f_t(1) / l_J; - owner_rho = s_J * l_J; - owner_eta = sqrt(s_J / l_J); - owner_inv_eta = f_t(1) / owner_eta; - f_t scale = sqrt(owner_rho); - - f_t s_dot_l = (s0 * l0 + sl) * inv_s_J * inv_l_J; - f_t gamma = sqrt(max(f_t(0), (f_t(1) + s_dot_l) * f_t(0.5))); - f_t inv_2g = f_t(1) / (f_t(2) * gamma); - f_t sb0 = s0 * inv_s_J; - f_t lb0 = l0 * inv_l_J; - f_t D = sb0 + lb0 + f_t(2) * gamma; - f_t inv_D = f_t(1) / D; - f_t c_s = (gamma + sb0) * inv_D; - f_t c_l = (gamma + lb0) * inv_D; - - storage.coeffs.w_from_s = inv_2g * inv_s_J; - storage.coeffs.w_from_lambda = -inv_2g * inv_l_J; - // Name these by the raw tail element they multiply: - // omega_j = omega_s_coeff * s_j + omega_lambda_coeff * lambda_j. - // The closed-form NT expression is cross-coupled, so c_l multiplies s_j - // and c_s multiplies lambda_j. 
- storage.coeffs.omega_s_coeff = scale * c_l * inv_s_J; - storage.coeffs.omega_lambda_coeff = scale * c_s * inv_l_J; - owner_omega_0 = gamma * scale; - } - __syncthreads(); - - f_t w1_sq_partial = f_t(0); - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - f_t sj = s[off + j]; - f_t lj = lambda[off + j]; - f_t wj = storage.coeffs.w_from_s * sj + storage.coeffs.w_from_lambda * lj; - w_bar[off + j] = wj; - omega[off + j] = storage.coeffs.omega_s_coeff * sj + storage.coeffs.omega_lambda_coeff * lj; - w1_sq_partial += wj * wj; - } - - f_t w1_sq = reduce_broadcast(w1_sq_partial, storage.reduce); - if (threadIdx.x == 0) { - f_t w0 = sqrt(f_t(1) + w1_sq); - omega[off] = owner_omega_0; - w_bar[off] = w0; - eta[cone] = owner_eta; - inv_eta[cone] = owner_inv_eta; - inv_1pw0[cone] = f_t(1) / (f_t(1) + w0); - rho[cone] = owner_rho; - } + i_t off = cone_offsets[cone]; + f_t s0 = s[off]; + f_t l0 = lambda[off]; + + f_t s_J = sqrt(max(f_t(0), s0 * s0 - s1_sq[cone])); + f_t l_J = sqrt(max(f_t(0), l0 * l0 - l1_sq[cone])); + f_t inv_s_J = f_t(1) / s_J; + f_t inv_l_J = f_t(1) / l_J; + f_t rho_val = s_J * l_J; + f_t inv_eta_v = sqrt(l_J / s_J); + f_t scale = sqrt(rho_val); + + f_t s_dot_l = (s0 * l0 + sl[cone]) * inv_s_J * inv_l_J; + f_t gamma = sqrt(max(f_t(0), (f_t(1) + s_dot_l) * f_t(0.5))); + f_t inv_2g = f_t(1) / (f_t(2) * gamma); + f_t sb0 = s0 * inv_s_J; + f_t lb0 = l0 * inv_l_J; + + f_t w0 = (sb0 + lb0) * inv_2g; + inv_eta[cone] = inv_eta_v; + inv_1pw0[cone] = f_t(1) / (f_t(1) + w0); + w_bar[off] = w0; + omega[off] = gamma * scale; + rho[cone] = rho_val; } -template -__global__ __launch_bounds__(BLOCK_DIM) void nt_scaling_small_kernel( - raft::device_span s, - raft::device_span lambda, - raft::device_span eta, - raft::device_span inv_eta, - raft::device_span inv_1pw0, - raft::device_span w_bar, - raft::device_span omega, - raft::device_span rho, - raft::device_span cone_offsets, - raft::device_span cone_ids, - i_t num_cones) +template +__global__ void 
nt_scaling_tail_kernel(raft::device_span s, + raft::device_span lambda, + raft::device_span inv_eta, + raft::device_span rho, + raft::device_span w_bar, + raft::device_span omega, + raft::device_span cone_offsets, + raft::device_span element_cone_ids) { - static_assert(BLOCK_DIM % 32 == 0, "Small-cone NT kernel requires warp-aligned CTAs"); - __shared__ nt_warp_storage storage; - - constexpr int warps_per_block = BLOCK_DIM / 32; - i_t warp_idx = - static_cast(blockIdx.x) * warps_per_block + static_cast(threadIdx.x >> 5); - if (warp_idx >= num_cones) return; - - int warp = threadIdx.x >> 5; - int lane = threadIdx.x & 31; - i_t cone = cone_ids[warp_idx]; - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - - f_t sj = (lane < q) ? s[off + lane] : f_t(0); - f_t lj = (lane < q) ? lambda[off + lane] : f_t(0); - - auto partial = triplet_t{(lane > 0 && lane < q) ? sj * sj : f_t(0), - (lane > 0 && lane < q) ? lj * lj : f_t(0), - (lane > 0 && lane < q) ? sj * lj : f_t(0)}; - auto [s1_sq, l1_sq, sl] = reduce_broadcast(partial, storage.reduce); - - f_t owner_eta = f_t(0); - f_t owner_inv_eta = f_t(0); - f_t owner_rho = f_t(0); - f_t owner_omega_0 = f_t(0); - - if (lane == 0) { - f_t s0 = sj; - f_t l0 = lj; - f_t s_J = sqrt(max(f_t(0), s0 * s0 - s1_sq)); - f_t l_J = sqrt(max(f_t(0), l0 * l0 - l1_sq)); - f_t inv_s_J = f_t(1) / s_J; - f_t inv_l_J = f_t(1) / l_J; - owner_rho = s_J * l_J; - owner_eta = sqrt(s_J / l_J); - owner_inv_eta = f_t(1) / owner_eta; - f_t scale = sqrt(owner_rho); - - f_t s_dot_l = (s0 * l0 + sl) * inv_s_J * inv_l_J; - f_t gamma = sqrt(max(f_t(0), (f_t(1) + s_dot_l) * f_t(0.5))); - f_t inv_2g = f_t(1) / (f_t(2) * gamma); - f_t sb0 = s0 * inv_s_J; - f_t lb0 = l0 * inv_l_J; - f_t D = sb0 + lb0 + f_t(2) * gamma; - f_t inv_D = f_t(1) / D; - f_t c_s = (gamma + sb0) * inv_D; - f_t c_l = (gamma + lb0) * inv_D; - - storage.coeffs[warp].w_from_s = inv_2g * inv_s_J; - storage.coeffs[warp].w_from_lambda = -inv_2g * inv_l_J; - 
storage.coeffs[warp].omega_s_coeff = scale * c_l * inv_s_J; - storage.coeffs[warp].omega_lambda_coeff = scale * c_s * inv_l_J; - owner_omega_0 = gamma * scale; - } - __syncwarp(); - - f_t w1_sq = f_t(0); - if (lane > 0 && lane < q) { - f_t wj = storage.coeffs[warp].w_from_s * sj + storage.coeffs[warp].w_from_lambda * lj; - w_bar[off + lane] = wj; - omega[off + lane] = - storage.coeffs[warp].omega_s_coeff * sj + storage.coeffs[warp].omega_lambda_coeff * lj; - w1_sq = wj * wj; - } - w1_sq = reduce_broadcast(w1_sq, storage.reduce); - - if (lane == 0) { - f_t w0 = sqrt(f_t(1) + w1_sq); - omega[off] = owner_omega_0; - w_bar[off] = w0; - eta[cone] = owner_eta; - inv_eta[cone] = owner_inv_eta; - inv_1pw0[cone] = f_t(1) / (f_t(1) + w0); - rho[cone] = owner_rho; - } + i_t flat_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (flat_idx >= static_cast(w_bar.size())) return; + + i_t cone = element_cone_ids[flat_idx]; + i_t cone_off = cone_offsets[cone]; + if (flat_idx == cone_off) return; + + f_t s0 = s[cone_off]; + f_t l0 = lambda[cone_off]; + f_t inv_eta_val = inv_eta[cone]; + f_t rho_val = rho[cone]; + f_t scale = sqrt(rho_val); + + f_t s_J = scale / inv_eta_val; + f_t l_J = scale * inv_eta_val; + f_t inv_s_J = f_t(1) / s_J; + f_t inv_l_J = f_t(1) / l_J; + + f_t gamma = omega[cone_off] / scale; + f_t inv_2g = f_t(1) / (f_t(2) * gamma); + f_t sb0 = s0 * inv_s_J; + f_t lb0 = l0 * inv_l_J; + f_t D = sb0 + lb0 + f_t(2) * gamma; + f_t inv_D = f_t(1) / D; + f_t c_s = (gamma + sb0) * inv_D; + f_t c_l = (gamma + lb0) * inv_D; + + f_t w_from_s = inv_2g * inv_s_J; + f_t w_from_lambda = -inv_2g * inv_l_J; + f_t omega_s_coeff = scale * c_l * inv_s_J; + f_t omega_lambda_coeff = scale * c_s * inv_l_J; + + f_t sj = s[flat_idx]; + f_t lj = lambda[flat_idx]; + w_bar[flat_idx] = w_from_s * sj + w_from_lambda * lj; + omega[flat_idx] = omega_s_coeff * sj + omega_lambda_coeff * lj; } -// --------------------------------------------------------------------------- -// Step length 
for a single (u, du) pair in Q^q. -// -// Finds the largest alpha in [0, alpha_max] such that u + alpha*du in Q^q. -// The cone condition u_0 + alpha*du_0 >= ||u_1 + alpha*du_1|| reduces to a -// linear test plus a quadratic a*alpha^2 + 2b*alpha + c >= 0 where -// a = du_0^2 - ||du_1||^2, b = u_0*du_0 - u_1^T du_1, c = u_0^2 - ||u_1||^2. -// --------------------------------------------------------------------------- -template -DI f_t -cone_step_length_single(raft::device_span u, - raft::device_span du, - typename block_reduce_t, BLOCK_DIM>::TempStorage& temp, - f_t alpha) +template +DI f_t cone_step_length_from_scalars(f_t u0, f_t du0, f_t du1_sq, f_t u1du1, f_t c, f_t alpha_max) { - i_t q = static_cast(u.size()); - auto partial = triplet_t{}; - auto& [du1_sq_p, u1du1_p, u1_sq_p] = partial; - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - f_t uj = u[j]; - f_t duj = du[j]; - du1_sq_p += duj * duj; - u1du1_p += uj * duj; - u1_sq_p += uj * uj; - } - - auto [du1_sq, u1du1, u1_sq] = - block_reduce_t, BLOCK_DIM>(temp).Reduce(partial, triplet_sum{}); - __syncthreads(); - - if (threadIdx.x == 0) { - f_t a = du[0] * du[0] - du1_sq; - f_t b = u[0] * du[0] - u1du1; - f_t c = max(f_t(0), u[0] * u[0] - u1_sq); - f_t disc = b * b - a * c; - - // Linear constraint: u_0 + alpha * du_0 >= 0. - if (du[0] < f_t(0)) { alpha = min(alpha, -u[0] / du[0]); } - - // Quadratic constraint. - if ((a > f_t(0) && b > f_t(0)) || disc < f_t(0)) { - // No positive root (parabola stays non-negative for alpha > 0). - } else if (a == f_t(0)) { - // Degenerate: 2b*alpha + c = 0. - if (b < f_t(0)) { alpha = min(alpha, c / (f_t(-2) * b)); } - } else if (c == f_t(0)) { - // Starting exactly on the cone boundary: take a full step only if the - // direction stays in the cone, otherwise the maximum feasible step is 0. - alpha = (a >= f_t(0)) ? 
alpha : f_t(0); - } else { - f_t t = -(b + copysign(sqrt(disc), b)); - f_t r1 = c / t; - f_t r2 = t / a; - if (r1 < f_t(0)) { r1 = alpha; } - if (r2 < f_t(0)) { r2 = alpha; } - alpha = min(alpha, min(r1, r2)); - } + f_t a = du0 * du0 - du1_sq; + f_t b = u0 * du0 - u1du1; + f_t disc = b * b - a * c; + f_t alpha = alpha_max; + + if (du0 < f_t(0)) { alpha = min(alpha, -u0 / du0); } + if ((a > f_t(0) && b > f_t(0)) || disc < f_t(0)) { + return alpha; + } else if (a == f_t(0)) { + if (b < f_t(0)) { alpha = min(alpha, c / (f_t(-2) * b)); } + } else if (c == f_t(0)) { + alpha = (a >= f_t(0)) ? alpha : f_t(0); + } else { + f_t t = -(b + copysign(sqrt(disc), b)); + f_t r1 = c / t; + f_t r2 = t / a; + if (r1 < f_t(0)) { r1 = alpha; } + if (r2 < f_t(0)) { r2 = alpha; } + alpha = min(alpha, min(r1, r2)); } return alpha; } -// --------------------------------------------------------------------------- -// Cone step length kernel (one block per cone). -// -// Computes, for each cone i, the largest alpha in [0, alpha_max] such that -// s_i + alpha * ds_i in Q^{q_i} AND lambda_i + alpha * dlambda_i in Q^{q_i}. -// The per-cone result is written to alpha[i]. 
-// --------------------------------------------------------------------------- -template -__global__ __launch_bounds__(BLOCK_DIM) void step_length_kernel( - raft::device_span s, - raft::device_span ds, - raft::device_span lambda, - raft::device_span dlambda, - raft::device_span alpha, - raft::device_span cone_offsets, - i_t K, - f_t alpha_max) -{ - __shared__ typename block_reduce_t, BLOCK_DIM>::TempStorage temp_storage; - - i_t cone = static_cast(blockIdx.x); - if (cone >= K) return; - - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - - f_t alpha_s = cone_step_length_single( - s.subspan(off, q), ds.subspan(off, q), temp_storage, alpha_max); - f_t alpha_l = cone_step_length_single( - lambda.subspan(off, q), dlambda.subspan(off, q), temp_storage, alpha_max); - - if (threadIdx.x == 0) { alpha[cone] = min(alpha_s, alpha_l); } -} - -// --------------------------------------------------------------------------- -// Single-variable cone step length kernel (one block per cone). -// Like step_length_kernel but only checks u + alpha*du in Q^{q_i}. -// --------------------------------------------------------------------------- -template -__global__ __launch_bounds__(BLOCK_DIM) void step_length_single_kernel( - raft::device_span u, - raft::device_span du, - raft::device_span alpha, - raft::device_span cone_offsets, - i_t K, - f_t alpha_max) -{ - __shared__ typename block_reduce_t, BLOCK_DIM>::TempStorage temp_storage; - - i_t cone = static_cast(blockIdx.x); - if (cone >= K) return; - - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - - f_t a = cone_step_length_single( - u.subspan(off, q), du.subspan(off, q), temp_storage, alpha_max); - - if (threadIdx.x == 0) { alpha[cone] = a; } -} - -// --------------------------------------------------------------------------- -// Shift u into int(Q^q) if it is not already interior (one block per cone). -// -// alpha(u) = ||u_1|| - u_0. 
If alpha >= 0 (u on boundary or outside): -// u_0 <- u_0 + 1 + max(0, alpha) (shift along identity element e) -// -// Modifies u in place. Used once during initial-point computation. -// --------------------------------------------------------------------------- -template -__global__ __launch_bounds__(BLOCK_DIM) void interior_shift_kernel( - raft::device_span u, raft::device_span cone_offsets, i_t K) +template +__global__ void step_length_pair_kernel(raft::device_span s, + raft::device_span ds, + raft::device_span lambda, + raft::device_span dlambda, + raft::device_span alpha_primal, + raft::device_span alpha_dual, + raft::device_span s_du1_sq, + raft::device_span s_u1du1, + raft::device_span s_u1_sq, + raft::device_span l_du1_sq, + raft::device_span l_u1du1, + raft::device_span l_u1_sq, + raft::device_span cone_offsets, + f_t alpha_max, + i_t K) { - __shared__ typename block_reduce_t::TempStorage temp_storage; - - i_t cone = static_cast(blockIdx.x); + i_t cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); if (cone >= K) return; i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; + f_t s_c = max(f_t(0), s[off] * s[off] - s_u1_sq[cone]); + f_t l_c = max(f_t(0), lambda[off] * lambda[off] - l_u1_sq[cone]); - f_t tail_sq = f_t(0); - for (i_t j = 1 + static_cast(threadIdx.x); j < q; j += BLOCK_DIM) { - f_t v = u[off + j]; - tail_sq += v * v; - } - tail_sq = block_reduce_t(temp_storage).Sum(tail_sq); - - if (threadIdx.x == 0) { - f_t u1_norm = sqrt(tail_sq); - f_t gap = u1_norm - u[off]; - if (gap >= f_t(0)) { u[off] += f_t(1) + gap; } - } + alpha_primal[cone] = cone_step_length_from_scalars( + s[off], ds[off], s_du1_sq[cone], s_u1du1[cone], s_c, alpha_max); + alpha_dual[cone] = cone_step_length_from_scalars( + lambda[off], dlambda[off], l_du1_sq[cone], l_u1du1[cone], l_c, alpha_max); } /** @@ -835,9 +340,9 @@ __global__ __launch_bounds__(BLOCK_DIM) void interior_shift_kernel( * caller to cover the cone portion of the global x/z vectors. 
The caller * must keep the underlying memory alive. * - * Search directions, RHS vectors, and workspace live directly in - * iteration_data_t (matching the existing LP/QP pattern where dx_aff, dual_rhs, - * etc. are all top-level fields of iteration_data_t). + * Only persistent cone state lives here. Reusable per-iteration workspace sits + * under `scratch`, which keeps the mutating temporary buffers out of the + * persistent NT state. */ template struct cone_data_t { @@ -848,21 +353,21 @@ struct cone_data_t { rmm::device_uvector cone_offsets; // [K+1] prefix sums of cone_dims rmm::device_uvector cone_dims; // [K] dimension q_i of each cone rmm::device_uvector block_offsets; // [K+1] prefix sums of q_i^2 (for dense block build) + rmm::device_uvector block_entry_cone_ids; // [sum q_i^2] owning cone id for each block entry // --- Primal/dual cone iterates (non-owning views, set by caller) --- raft::device_span s; // [m_c] cone slack: s_i in int(Q^{q_i}) raft::device_span lambda; // [m_c] cone dual: lambda_i in int(Q^{q_i}) // --- NT scaling state (recomputed each iteration from s, lambda) --- - rmm::device_uvector eta; // [K] scaling factor eta_i = (||s_i||_J / ||lambda_i||_J)^{1/2} - rmm::device_uvector inv_eta; // [K] cached 1/eta_i + rmm::device_uvector + inv_eta; // [K] 1/eta_i where eta_i = (||s_i||_J / ||lambda_i||_J)^{1/2} rmm::device_uvector inv_1pw0; // [K] cached 1/(1 + wbar_0_i) rmm::device_uvector w_bar; // [m_c] NT scaling direction, unit J-norm, packed by cone rmm::device_uvector omega; // [m_c] scaled variable omega_i = H_i^{-1} s_i, packed by cone rmm::device_uvector rho; // [K] ||omega_i||^2_J = ||s_i||_J * ||lambda_i||_J - rmm::device_uvector small_cone_ids; // [n_small] cone ids with q <= 32 - rmm::device_uvector medium_cone_ids; // [n_medium] cone ids with 32 < q <= 2048 - rmm::device_uvector large_cone_ids; // [n_large] cone ids with q > 2048 + rmm::device_uvector element_cone_ids; // [m_c] owning cone id for each packed entry + cone_scratch_t 
scratch; cone_data_t(i_t K_in, const std::vector& dims, @@ -874,34 +379,31 @@ struct cone_data_t { cone_offsets(K_in + 1, stream), cone_dims(K_in, stream), block_offsets(K_in + 1, stream), + block_entry_cone_ids( + std::accumulate( + dims.begin(), dims.end(), i_t(0), [](i_t acc, i_t q) { return acc + q * q; }), + stream), s(s_in), lambda(lambda_in), - eta(K_in, stream), inv_eta(K_in, stream), inv_1pw0(K_in, stream), w_bar(m_c, stream), omega(m_c, stream), rho(K_in, stream), - small_cone_ids(0, stream), - medium_cone_ids(0, stream), - large_cone_ids(0, stream) + element_cone_ids(m_c, stream), + scratch(K_in, stream) { std::vector offsets(K + 1, 0); std::vector blk_offsets(K + 1, 0); - std::vector small_ids; - std::vector medium_ids; - std::vector large_ids; + std::vector cone_ids(m_c, 0); + std::vector block_cone_ids(block_entry_cone_ids.size(), 0); for (i_t i = 0; i < K; ++i) { offsets[i + 1] = offsets[i] + dims[i]; blk_offsets[i + 1] = blk_offsets[i] + dims[i] * dims[i]; - if (dims[i] <= small_cone_limit) { - small_ids.push_back(i); - } else if (dims[i] <= medium_cone_limit) { - medium_ids.push_back(i); - } else { - large_ids.push_back(i); - } + std::fill(cone_ids.begin() + offsets[i], cone_ids.begin() + offsets[i + 1], i); + std::fill( + block_cone_ids.begin() + blk_offsets[i], block_cone_ids.begin() + blk_offsets[i + 1], i); } auto init_device_vec = [&](auto& d_vec, const auto& h_vec) { @@ -914,86 +416,229 @@ struct cone_data_t { raft::copy(cone_offsets.data(), offsets.data(), K + 1, stream); raft::copy(cone_dims.data(), dims.data(), K, stream); raft::copy(block_offsets.data(), blk_offsets.data(), K + 1, stream); - init_device_vec(small_cone_ids, small_ids); - init_device_vec(medium_cone_ids, medium_ids); - init_device_vec(large_cone_ids, large_ids); + init_device_vec(block_entry_cone_ids, block_cone_ids); + init_device_vec(element_cone_ids, cone_ids); } }; +template +void segmented_sum(InputIt input, + raft::device_span cone_offsets, + i_t K, + 
raft::device_span out, + rmm::device_uvector& workspace, + rmm::cuda_stream_view stream) +{ + if (K == 0) return; + cuopt_assert(static_cast(out.size()) == K, "segmented_sum output must match cone count"); + + std::size_t temp_storage_bytes = 0; + cub::DeviceSegmentedReduce::Sum(nullptr, + temp_storage_bytes, + input, + out.data(), + K, + cone_offsets.data(), + cone_offsets.data() + 1, + stream.value()); + if (workspace.size() < temp_storage_bytes) { workspace.resize(temp_storage_bytes, stream); } + cub::DeviceSegmentedReduce::Sum(workspace.data(), + temp_storage_bytes, + input, + out.data(), + K, + cone_offsets.data(), + cone_offsets.data() + 1, + stream.value()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void segmented_reduce(InputIt input, + raft::device_span cone_offsets, + i_t K, + rmm::device_uvector& out, + rmm::device_uvector& workspace, + ReduceOp reduce_op, + t_t initial_value, + rmm::cuda_stream_view stream) +{ + out.resize(K, stream); + if (K == 0) return; + + std::size_t temp_storage_bytes = 0; + cub::DeviceSegmentedReduce::Reduce(nullptr, + temp_storage_bytes, + input, + out.data(), + K, + cone_offsets.data(), + cone_offsets.data() + 1, + reduce_op, + initial_value, + stream.value()); + if (workspace.size() < temp_storage_bytes) { workspace.resize(temp_storage_bytes, stream); } + cub::DeviceSegmentedReduce::Reduce(workspace.data(), + temp_storage_bytes, + input, + out.data(), + K, + cone_offsets.data(), + cone_offsets.data() + 1, + reduce_op, + initial_value, + stream.value()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + template -void compute_affine_cone_rhs_term(const cone_data_t& cones, - rmm::device_uvector& out, - rmm::cuda_stream_view stream) +void apply_hinv2(raft::device_span v, + raft::device_span out, + raft::device_span w_bar, + raft::device_span inv_eta, + raft::device_span cone_offsets, + raft::device_span element_cone_ids, + raft::device_span tail_dot, + rmm::device_uvector& workspace, + i_t K, + rmm::cuda_stream_view 
stream, + f_t output_scale = f_t(1)) { - out.resize(cones.m_c, stream); - if (cones.K == 0) return; + if (K == 0) return; - apply_Hinv2_kernel - <<>>(cones.s, - cuopt::make_span(out), - cuopt::make_span(cones.w_bar), - cuopt::make_span(cones.inv_eta), - cuopt::make_span(cones.cone_offsets), - cones.K); + auto span_v = v; + auto span_w_bar = w_bar; + auto span_cone_offsets = cone_offsets; + auto span_element_cone_ids = element_cone_ids; + auto tail_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [span_v, span_w_bar, span_cone_offsets, span_element_cone_ids] HD(i_t idx) { + i_t cone = span_element_cone_ids[idx]; + i_t cone_off = span_cone_offsets[cone]; + return (idx == cone_off) ? f_t(0) : span_w_bar[idx] * span_v[idx]; + }); + segmented_sum(tail_terms, cone_offsets, K, tail_dot, workspace, stream); + + i_t grid_dim = (static_cast(out.size()) + flat_block_dim - 1) / flat_block_dim; + apply_hinv2_write_kernel<<>>( + v, out, w_bar, inv_eta, tail_dot, cone_offsets, element_cone_ids, output_scale); RAFT_CUDA_TRY(cudaPeekAtLastError()); } +template +void apply_hinv2(raft::device_span v, + raft::device_span out, + cone_data_t& cones, + rmm::cuda_stream_view stream, + f_t output_scale = f_t(1)) +{ + apply_hinv2(v, + out, + cuopt::make_span(cones.w_bar), + cuopt::make_span(cones.inv_eta), + cuopt::make_span(cones.cone_offsets), + cuopt::make_span(cones.element_cone_ids), + cones.scratch.hinv2_tail_dot(), + cones.scratch.segmented_reduce_workspace, + cones.K, + stream, + output_scale); +} + +template +void compute_affine_cone_rhs_term(cone_data_t& cones, + raft::device_span out, + rmm::cuda_stream_view stream, + f_t output_scale = f_t(1)) +{ + cuopt_assert(static_cast(out.size()) == cones.m_c, "cone rhs span must match cone size"); + if (cones.K == 0) return; + + apply_hinv2(cones.s, out, cones, stream, output_scale); +} + template void compute_combined_cone_rhs_term(raft::device_span dx_aff, - const cone_data_t& cones, + cone_data_t& cones, 
f_t sigma_mu, - rmm::device_uvector& out, - rmm::cuda_stream_view stream) + raft::device_span out, + rmm::cuda_stream_view stream, + f_t output_scale = f_t(1)) { - out.resize(cones.m_c, stream); + cuopt_assert(static_cast(out.size()) == cones.m_c, "cone rhs span must match cone size"); if (cones.K == 0) return; - fused_corrector_kernel - <<>>(dx_aff, - cuopt::make_span(cones.omega), - cuopt::make_span(cones.w_bar), - cuopt::make_span(cones.inv_eta), - cuopt::make_span(cones.inv_1pw0), - cuopt::make_span(cones.rho), - sigma_mu, - cuopt::make_span(out), - cuopt::make_span(cones.cone_offsets), - cones.K); + auto span_dx_aff = dx_aff; + auto span_w_bar = cuopt::make_span(cones.w_bar); + auto span_omega = cuopt::make_span(cones.omega); + auto span_cone_offsets = cuopt::make_span(cones.cone_offsets); + auto span_element_cone_id = cuopt::make_span(cones.element_cone_ids); + + auto raw_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [span_dx_aff, span_w_bar, span_omega, span_cone_offsets, span_element_cone_id] HD(i_t idx) { + i_t cone = span_element_cone_id[idx]; + i_t cone_off = span_cone_offsets[cone]; + if (idx == cone_off) { return corrector_raw_t{f_t(0), f_t(0), f_t(0)}; } + f_t dx_aff_j = span_dx_aff[idx]; + return corrector_raw_t{ + span_w_bar[idx] * dx_aff_j, span_omega[idx] * dx_aff_j, dx_aff_j * dx_aff_j}; + }); + segmented_reduce>(raw_terms, + cuopt::make_span(cones.cone_offsets), + cones.K, + cones.scratch.corrector_raw, + cones.scratch.segmented_reduce_workspace, + corrector_raw_sum_t{}, + corrector_raw_t{f_t(0), f_t(0), f_t(0)}, + stream); + + i_t grid_dim = (cones.m_c + flat_block_dim - 1) / flat_block_dim; + fused_corrector_write_kernel + <<>>(cones.s, + cones.lambda, + dx_aff, + cuopt::make_span(cones.omega), + cuopt::make_span(cones.w_bar), + cuopt::make_span(cones.inv_eta), + cuopt::make_span(cones.inv_1pw0), + cuopt::make_span(cones.rho), + cuopt::make_span(cones.scratch.corrector_raw), + out, + 
cuopt::make_span(cones.cone_offsets), + cuopt::make_span(cones.element_cone_ids), + sigma_mu, + output_scale); RAFT_CUDA_TRY(cudaPeekAtLastError()); } template -void recover_cone_dz(raft::device_span dx, - const cone_data_t& cones, - const rmm::device_uvector& cone_rhs_term, - rmm::device_uvector& hinv2_dx, - raft::device_span dz, - rmm::cuda_stream_view stream) +void recover_cone_dz_from_target(raft::device_span dx, + cone_data_t& cones, + raft::device_span cone_target, + rmm::device_uvector& hinv2_dx, + raft::device_span dz, + rmm::cuda_stream_view stream) { hinv2_dx.resize(cones.m_c, stream); if (cones.K == 0) return; - apply_Hinv2_kernel - <<>>(dx, - cuopt::make_span(hinv2_dx), - cuopt::make_span(cones.w_bar), - cuopt::make_span(cones.inv_eta), - cuopt::make_span(cones.cone_offsets), - cones.K); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + apply_hinv2(dx, cuopt::make_span(hinv2_dx), cones, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - cones.m_c, - [span_rhs = cuopt::make_span(cone_rhs_term), - span_hinv2 = cuopt::make_span(hinv2_dx), - span_dz = dz] __device__(i_t j) { span_dz[j] = -span_rhs[j] - span_hinv2[j]; }); + auto span_target = cone_target; + auto span_hinv2 = cuopt::make_span(hinv2_dx); + auto span_dz = dz; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + cones.m_c, + [span_target, span_hinv2, span_dz] __device__(i_t j) { + span_dz[j] = span_target[j] - span_hinv2[j]; + }); } template void accumulate_cone_hinv2_matvec(raft::device_span x, - const cone_data_t& cones, + cone_data_t& cones, rmm::device_uvector& hinv2_x, raft::device_span out, rmm::cuda_stream_view stream) @@ -1001,21 +646,14 @@ void accumulate_cone_hinv2_matvec(raft::device_span x, hinv2_x.resize(cones.m_c, stream); if (cones.K == 0) return; - apply_Hinv2_kernel - <<>>(x, - cuopt::make_span(hinv2_x), - cuopt::make_span(cones.w_bar), - cuopt::make_span(cones.inv_eta), - 
cuopt::make_span(cones.cone_offsets), - cones.K); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + apply_hinv2(x, cuopt::make_span(hinv2_x), cones, stream); + auto span_hinv2 = cuopt::make_span(hinv2_x); + auto span_out = out; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), cones.m_c, - [span_hinv2 = cuopt::make_span(hinv2_x), span_out = out] __device__(i_t j) { - span_out[j] += span_hinv2[j]; - }); + [span_hinv2, span_out] __device__(i_t j) { span_out[j] += span_hinv2[j]; }); } // --------------------------------------------------------------------------- @@ -1026,11 +664,44 @@ void accumulate_cone_hinv2_matvec(raft::device_span x, // - `csr_indices[e]` gives the destination slot in `augmented_x` // - `q_values[e]` stores any pre-merged Q contribution for that slot // -// For each flat entry we identify its owning cone from `block_offsets`, -// recover local (r, c) coordinates, evaluate H_k^{-2}(r, c), and scatter +// For each flat entry we load its precomputed owning cone id, recover local +// (r, c) coordinates, evaluate H_k^{-2}(r, c), and write // -(H_k^{-2}(r, c) + q_values[e]) // into `augmented_x[csr_indices[e]]`. // --------------------------------------------------------------------------- +template +__global__ void scatter_hinv2_into_augmented_kernel( + raft::device_span augmented_x, + raft::device_span csr_indices, + raft::device_span q_values, + raft::device_span w_bar, + raft::device_span inv_eta, + raft::device_span cone_offsets, + raft::device_span block_offsets, + raft::device_span block_entry_cone_ids) +{ + i_t e = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (e >= static_cast(csr_indices.size())) return; + + i_t cone = block_entry_cone_ids[e]; + i_t off = cone_offsets[cone]; + i_t q = cone_offsets[cone + 1] - off; + i_t blk_off = block_offsets[cone]; + i_t local = e - blk_off; + i_t r = local / q; + i_t c = local % q; + + f_t ie_sq = inv_eta[cone] * inv_eta[cone]; + f_t w0 = w_bar[off]; + f_t u_r = (r == 0) ? 
w0 : -w_bar[off + r]; + f_t u_c = (c == 0) ? w0 : -w_bar[off + c]; + f_t val = f_t(2) * u_r * ie_sq * u_c; + f_t diag_correction = (r == 0) ? -ie_sq : ie_sq; + if (r == c) { val += diag_correction; } + + augmented_x[csr_indices[e]] = -val - q_values[e]; +} + template void scatter_hinv2_into_augmented(const cone_data_t& cones, rmm::device_uvector& augmented_x, @@ -1041,156 +712,246 @@ void scatter_hinv2_into_augmented(const cone_data_t& cones, i_t count = static_cast(csr_indices.size()); if (count == 0) return; - auto values = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [span_w_bar = cuopt::make_span(cones.w_bar), - span_inv_eta = cuopt::make_span(cones.inv_eta), - span_block_offsets = cuopt::make_span(cones.block_offsets), - span_cone_offsets = cuopt::make_span(cones.cone_offsets), - span_q_values = cuopt::make_span(q_values)] __device__(i_t e) -> f_t { - i_t lo = 0; - i_t hi = static_cast(span_block_offsets.size()) - 1; - while (lo + 1 < hi) { - i_t mid = lo + (hi - lo) / 2; - if (span_block_offsets[mid] <= e) { - lo = mid; - } else { - hi = mid; - } - } - i_t cone = lo; - i_t off = span_cone_offsets[cone]; - i_t q = span_cone_offsets[cone + 1] - off; - i_t blk_off = span_block_offsets[cone]; - i_t local = e - blk_off; - i_t r = local / q; - i_t c = local % q; - - f_t ie_sq = span_inv_eta[cone] * span_inv_eta[cone]; - f_t w0 = span_w_bar[off]; - f_t u_r = (r == 0) ? w0 : -span_w_bar[off + r]; - f_t u_c = (c == 0) ? w0 : -span_w_bar[off + c]; - f_t val = f_t(2) * u_r * ie_sq * u_c; - f_t diag_correction = (r == 0) ? 
-ie_sq : ie_sq; - if (r == c) { val += diag_correction; } - - return -val - span_q_values[e]; - }); + cuopt_assert(count == static_cast(cones.block_entry_cone_ids.size()), + "scatter expects one flat entry per cone-block coefficient"); - thrust::scatter( - rmm::exec_policy(stream), values, values + count, csr_indices.begin(), augmented_x.begin()); + i_t grid_dim = (count + flat_block_dim - 1) / flat_block_dim; + scatter_hinv2_into_augmented_kernel + <<>>(cuopt::make_span(augmented_x), + cuopt::make_span(csr_indices), + cuopt::make_span(q_values), + cuopt::make_span(cones.w_bar), + cuopt::make_span(cones.inv_eta), + cuopt::make_span(cones.cone_offsets), + cuopt::make_span(cones.block_offsets), + cuopt::make_span(cones.block_entry_cone_ids)); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } // --------------------------------------------------------------------------- -// Compute the maximum feasible step length for the cone portion of (x, z). -// -// Launches step_length_kernel (one CTA per cone), then reduces the per-cone -// results to a single scalar. Returns min over all cones of the step length -// that keeps both x_K + alpha*dx_K and z_K + alpha*dz_K in their cones. +// Compute per-cone step lengths, then reduce them to the global maximum +// feasible primal/dual step. 
// --------------------------------------------------------------------------- template -f_t compute_cone_step_length(const cone_data_t& cones, - raft::device_span x_K, - raft::device_span dx_K, - raft::device_span z_K, - raft::device_span dz_K, - f_t alpha_max, - rmm::cuda_stream_view stream) +void compute_cone_step_length_per_cone(cone_data_t& cones, + raft::device_span x_K, + raft::device_span dx_K, + raft::device_span z_K, + raft::device_span dz_K, + raft::device_span alpha_primal, + raft::device_span alpha_dual, + f_t alpha_max, + rmm::cuda_stream_view stream) { - if (cones.K == 0) return alpha_max; - - rmm::device_uvector d_alpha(cones.K, stream); - step_length_kernel - <<>>(x_K, - dx_K, - z_K, - dz_K, - cuopt::make_span(d_alpha), - cuopt::make_span(cones.cone_offsets), - cones.K, - alpha_max); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - f_t result = thrust::reduce( - rmm::exec_policy(stream), d_alpha.begin(), d_alpha.end(), alpha_max, thrust::minimum()); - return result; -} - -template -f_t compute_single_cone_step_length(const cone_data_t& cones, - raft::device_span u_K, - raft::device_span du_K, - f_t alpha_max, - rmm::cuda_stream_view stream) -{ - if (cones.K == 0) return alpha_max; + cuopt_assert(static_cast(alpha_primal.size()) == cones.K && + static_cast(alpha_dual.size()) == cones.K, + "step-length outputs must match cone count"); + if (cones.K == 0) return; - rmm::device_uvector d_alpha(cones.K, stream); - step_length_single_kernel<<>>( - u_K, du_K, cuopt::make_span(d_alpha), cuopt::make_span(cones.cone_offsets), cones.K, alpha_max); + auto span_offsets = cuopt::make_span(cones.cone_offsets); + auto span_elem = cuopt::make_span(cones.element_cone_ids); + + auto s_du1_sq = cones.scratch.step_s_du1_sq(); + auto s_u1du1 = cones.scratch.step_s_u1du1(); + auto s_u1_sq = cones.scratch.step_s_u1_sq(); + auto l_du1_sq = cones.scratch.step_l_du1_sq(); + auto l_u1du1 = cones.scratch.step_l_u1du1(); + auto l_u1_sq = cones.scratch.step_l_u1_sq(); + + auto 
span_x_K = x_K; + auto span_dx_K = dx_K; + auto span_z_K = z_K; + auto span_dz_K = dz_K; + + auto s_du1_sq_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [span_dx_K, span_offsets, span_elem] HD(i_t idx) { + i_t cone = span_elem[idx]; + return (idx == span_offsets[cone]) ? f_t(0) : span_dx_K[idx] * span_dx_K[idx]; + }); + segmented_sum(s_du1_sq_terms, + span_offsets, + cones.K, + s_du1_sq, + cones.scratch.segmented_reduce_workspace, + stream); + + auto s_u1du1_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [span_x_K, span_dx_K, span_offsets, span_elem] HD(i_t idx) { + i_t cone = span_elem[idx]; + return (idx == span_offsets[cone]) ? f_t(0) : span_x_K[idx] * span_dx_K[idx]; + }); + segmented_sum(s_u1du1_terms, + span_offsets, + cones.K, + s_u1du1, + cones.scratch.segmented_reduce_workspace, + stream); + + auto s_u1_sq_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [span_x_K, span_offsets, span_elem] HD(i_t idx) { + i_t cone = span_elem[idx]; + return (idx == span_offsets[cone]) ? f_t(0) : span_x_K[idx] * span_x_K[idx]; + }); + segmented_sum(s_u1_sq_terms, + span_offsets, + cones.K, + s_u1_sq, + cones.scratch.segmented_reduce_workspace, + stream); + + auto l_du1_sq_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [span_dz_K, span_offsets, span_elem] HD(i_t idx) { + i_t cone = span_elem[idx]; + return (idx == span_offsets[cone]) ? f_t(0) : span_dz_K[idx] * span_dz_K[idx]; + }); + segmented_sum(l_du1_sq_terms, + span_offsets, + cones.K, + l_du1_sq, + cones.scratch.segmented_reduce_workspace, + stream); + + auto l_u1du1_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [span_z_K, span_dz_K, span_offsets, span_elem] HD(i_t idx) { + i_t cone = span_elem[idx]; + return (idx == span_offsets[cone]) ? 
f_t(0) : span_z_K[idx] * span_dz_K[idx]; + }); + segmented_sum(l_u1du1_terms, + span_offsets, + cones.K, + l_u1du1, + cones.scratch.segmented_reduce_workspace, + stream); + + auto l_u1_sq_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [span_z_K, span_offsets, span_elem] HD(i_t idx) { + i_t cone = span_elem[idx]; + return (idx == span_offsets[cone]) ? f_t(0) : span_z_K[idx] * span_z_K[idx]; + }); + segmented_sum(l_u1_sq_terms, + span_offsets, + cones.K, + l_u1_sq, + cones.scratch.segmented_reduce_workspace, + stream); + + i_t grid_dim = (cones.K + flat_block_dim - 1) / flat_block_dim; + step_length_pair_kernel<<>>(x_K, + dx_K, + z_K, + dz_K, + alpha_primal, + alpha_dual, + s_du1_sq, + s_u1du1, + s_u1_sq, + l_du1_sq, + l_u1du1, + l_u1_sq, + span_offsets, + alpha_max, + cones.K); RAFT_CUDA_TRY(cudaPeekAtLastError()); - - return thrust::reduce( - rmm::exec_policy(stream), d_alpha.begin(), d_alpha.end(), alpha_max, thrust::minimum()); } -// --------------------------------------------------------------------------- -// Shift cone slices of a vector into the strict interior of their cones. -// Operates on a subspan of the global vector (pre-sliced to cone portion). 
-// --------------------------------------------------------------------------- template -void launch_interior_shift(raft::device_span u_K, - const cone_data_t& cones, - rmm::cuda_stream_view stream) +std::pair compute_cone_step_length(cone_data_t& cones, + raft::device_span x_K, + raft::device_span dx_K, + raft::device_span z_K, + raft::device_span dz_K, + f_t alpha_max, + rmm::cuda_stream_view stream) { - if (cones.K == 0) return; - interior_shift_kernel - <<>>(u_K, cuopt::make_span(cones.cone_offsets), cones.K); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + if (cones.K == 0) return {alpha_max, alpha_max}; + + auto alpha_primal = cones.scratch.step_alpha_primal_span(); + auto alpha_dual = cones.scratch.step_alpha_dual_span(); + + compute_cone_step_length_per_cone( + cones, x_K, dx_K, z_K, dz_K, alpha_primal, alpha_dual, alpha_max, stream); + + f_t primal = thrust::reduce(rmm::exec_policy(stream), + alpha_primal.begin(), + alpha_primal.end(), + alpha_max, + thrust::minimum()); + f_t dual = thrust::reduce(rmm::exec_policy(stream), + alpha_dual.begin(), + alpha_dual.end(), + alpha_max, + thrust::minimum()); + return {primal, dual}; } template void launch_nt_scaling(cone_data_t& cones, rmm::cuda_stream_view stream) { - auto launch_streaming_bucket = [&](auto& cone_ids, auto block_dim_ic) { - constexpr int block_dim = std::remove_cvref_t::value; - i_t bucket_size = static_cast(cone_ids.size()); - if (bucket_size == 0) return; + if (cones.K == 0) return; - nt_scaling_kernel - <<>>(cones.s, + auto nt_s1_sq = cones.scratch.nt_s1_sq(); + auto nt_l1_sq = cones.scratch.nt_l1_sq(); + auto nt_sl = cones.scratch.nt_sl(); + + auto span_s = cones.s; + auto span_lambda = cones.lambda; + auto span_offsets = cuopt::make_span(cones.cone_offsets); + auto span_elem = cuopt::make_span(cones.element_cone_ids); + + auto s1_sq_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [span_s, span_offsets, span_elem] HD(i_t idx) { + i_t cone = span_elem[idx]; + return (idx 
== span_offsets[cone]) ? f_t(0) : span_s[idx] * span_s[idx]; + }); + segmented_sum( + s1_sq_terms, span_offsets, cones.K, nt_s1_sq, cones.scratch.segmented_reduce_workspace, stream); + + auto l1_sq_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [span_lambda, span_offsets, span_elem] HD(i_t idx) { + i_t cone = span_elem[idx]; + return (idx == span_offsets[cone]) ? f_t(0) : span_lambda[idx] * span_lambda[idx]; + }); + segmented_sum( + l1_sq_terms, span_offsets, cones.K, nt_l1_sq, cones.scratch.segmented_reduce_workspace, stream); + + auto sl_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [span_s, span_lambda, span_offsets, span_elem] HD(i_t idx) { + i_t cone = span_elem[idx]; + return (idx == span_offsets[cone]) ? f_t(0) : span_s[idx] * span_lambda[idx]; + }); + segmented_sum( + sl_terms, span_offsets, cones.K, nt_sl, cones.scratch.segmented_reduce_workspace, stream); + + i_t scalar_grid_dim = (cones.K + flat_block_dim - 1) / flat_block_dim; + nt_scaling_scalar_kernel + <<>>(cones.s, + cones.lambda, + span_offsets, + nt_s1_sq, + nt_l1_sq, + nt_sl, + cuopt::make_span(cones.inv_eta), + cuopt::make_span(cones.inv_1pw0), + cuopt::make_span(cones.w_bar), + cuopt::make_span(cones.omega), + cuopt::make_span(cones.rho), + cones.K); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + i_t grid_dim = (cones.m_c + flat_block_dim - 1) / flat_block_dim; + nt_scaling_tail_kernel + <<>>(cones.s, cones.lambda, - cuopt::make_span(cones.eta), cuopt::make_span(cones.inv_eta), - cuopt::make_span(cones.inv_1pw0), + cuopt::make_span(cones.rho), cuopt::make_span(cones.w_bar), cuopt::make_span(cones.omega), - cuopt::make_span(cones.rho), - cuopt::make_span(cones.cone_offsets), - cuopt::make_span(cone_ids), - bucket_size); - }; - - i_t small_count = static_cast(cones.small_cone_ids.size()); - if (small_count > 0) { - constexpr int warps_per_block = small_block_dim / 32; - i_t grid_dim = (small_count + warps_per_block - 1) / 
warps_per_block; - nt_scaling_small_kernel - <<>>(cones.s, - cones.lambda, - cuopt::make_span(cones.eta), - cuopt::make_span(cones.inv_eta), - cuopt::make_span(cones.inv_1pw0), - cuopt::make_span(cones.w_bar), - cuopt::make_span(cones.omega), - cuopt::make_span(cones.rho), - cuopt::make_span(cones.cone_offsets), - cuopt::make_span(cones.small_cone_ids), - small_count); - } - - launch_streaming_bucket(cones.medium_cone_ids, std::integral_constant{}); - launch_streaming_bucket(cones.large_cone_ids, std::integral_constant{}); + span_offsets, + span_elem); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp index ab5f990809..90ae6ff74e 100644 --- a/cpp/src/dual_simplex/presolve.cpp +++ b/cpp/src/dual_simplex/presolve.cpp @@ -19,11 +19,16 @@ namespace cuopt::linear_programming::dual_simplex { +template +static i_t linear_var_count(const lp_problem_t& problem) +{ + return problem.second_order_cone_dims.empty() ? 
problem.num_cols : problem.cone_var_start; +} + template i_t remove_empty_cols(lp_problem_t& problem, i_t& num_empty_cols, - presolve_info_t& presolve_info, - const std::vector& is_cone_variable) + presolve_info_t& presolve_info) { constexpr bool verbose = false; if (verbose) { printf("Removing %d empty columns\n", num_empty_cols); } @@ -32,9 +37,10 @@ i_t remove_empty_cols(lp_problem_t& problem, presolve_info.removed_reduced_costs.reserve(num_empty_cols); std::vector has_quadratic_term(problem.num_cols, false); + i_t linear_cols = linear_var_count(problem); if (problem.Q.n > 0) { - for (i_t j = 0; j < problem.num_cols; ++j) { + for (i_t j = 0; j < linear_cols; ++j) { const i_t row_start = problem.Q.row_start[j]; const i_t row_end = problem.Q.row_start[j + 1]; if (row_end - row_start == 0) { continue; } @@ -46,8 +52,8 @@ i_t remove_empty_cols(lp_problem_t& problem, i_t new_cols = 0; for (i_t j = 0; j < problem.num_cols; ++j) { bool remove_var = false; - if ((problem.A.col_start[j + 1] - problem.A.col_start[j]) == 0) { - bool non_removable = has_quadratic_term[j] || is_cone_variable[j]; + if (j < linear_cols && (problem.A.col_start[j + 1] - problem.A.col_start[j]) == 0) { + bool non_removable = has_quadratic_term[j]; if (problem.objective[j] >= 0 && problem.lower[j] > -inf && !non_removable) { presolve_info.removed_values.push_back(problem.lower[j]); problem.obj_constant += problem.objective[j] * problem.lower[j]; @@ -115,6 +121,12 @@ i_t remove_empty_cols(lp_problem_t& problem, problem.Q.check_matrix("After removing empty columns"); } + if (!problem.second_order_cone_dims.empty()) { + i_t new_cone_start = col_old_to_new[problem.cone_var_start]; + assert(new_cone_start != -1); + problem.cone_var_start = new_cone_start; + } + problem.objective = objective; problem.lower = lower; problem.upper = upper; @@ -264,6 +276,113 @@ i_t convert_less_than_to_equal(const user_problem_t& user_problem, // We must convert rows in the form: a_i^T x <= beta // into: a_i^T x + s_i = 
beta, s_i >= 0 + if (!problem.second_order_cone_dims.empty()) { + const i_t old_num_cols = problem.num_cols; + const i_t linear_cols = linear_var_count(problem); + const i_t num_slacks = less_rows; + const i_t num_cols = old_num_cols + num_slacks; + const i_t old_nnz = problem.A.col_start[old_num_cols]; + const i_t nnz = old_nnz + num_slacks; + const i_t new_cone_start = linear_cols + num_slacks; + + auto old_A = problem.A; + csc_matrix_t expanded_A(problem.A.m, num_cols, nnz); + + std::vector objective(num_cols, 0.0); + std::vector lower(num_cols, 0.0); + std::vector upper(num_cols, INFINITY); + std::vector old_to_new(old_num_cols, -1); + + for (i_t j = 0; j < linear_cols; ++j) { + old_to_new[j] = j; + objective[j] = problem.objective[j]; + lower[j] = problem.lower[j]; + upper[j] = problem.upper[j]; + } + for (i_t j = linear_cols; j < old_num_cols; ++j) { + old_to_new[j] = j + num_slacks; + objective[old_to_new[j]] = problem.objective[j]; + lower[old_to_new[j]] = problem.lower[j]; + upper[old_to_new[j]] = problem.upper[j]; + } + + i_t nz = 0; + for (i_t j = 0; j < linear_cols; ++j) { + expanded_A.col_start[j] = nz; + for (i_t p = old_A.col_start[j]; p < old_A.col_start[j + 1]; ++p) { + expanded_A.i[nz] = old_A.i[p]; + expanded_A.x[nz] = old_A.x[p]; + ++nz; + } + } + + i_t slack_col = linear_cols; + for (i_t i = 0; i < problem.num_rows; i++) { + if (row_sense[i] == 'L') { + expanded_A.col_start[slack_col] = nz; + expanded_A.i[nz] = i; + expanded_A.x[nz] = 1.0; + new_slacks.push_back(slack_col); + row_sense[i] = 'E'; + ++slack_col; + ++nz; + --less_rows; + } + } + + for (i_t j = linear_cols; j < old_num_cols; ++j) { + i_t new_j = old_to_new[j]; + expanded_A.col_start[new_j] = nz; + for (i_t p = old_A.col_start[j]; p < old_A.col_start[j + 1]; ++p) { + expanded_A.i[nz] = old_A.i[p]; + expanded_A.x[nz] = old_A.x[p]; + ++nz; + } + } + expanded_A.col_start[num_cols] = nz; + assert(less_rows == 0); + assert(slack_col == new_cone_start); + assert(nz == nnz); + + if 
(problem.Q.n > 0) { + const auto old_Q = problem.Q; + const i_t q_nnz = old_Q.row_start[old_num_cols]; + + problem.Q.row_start.assign(num_cols + 1, 0); + for (i_t row = 0; row < old_num_cols; ++row) { + i_t new_row = old_to_new[row]; + problem.Q.row_start[new_row + 1] = old_Q.row_start[row + 1] - old_Q.row_start[row]; + } + for (i_t row = 0; row < num_cols; ++row) { + problem.Q.row_start[row + 1] += problem.Q.row_start[row]; + } + + problem.Q.j.resize(q_nnz); + problem.Q.x.resize(q_nnz); + auto row_starts = problem.Q.row_start; + for (i_t row = 0; row < old_num_cols; ++row) { + i_t new_row = old_to_new[row]; + for (i_t p = old_Q.row_start[row]; p < old_Q.row_start[row + 1]; ++p) { + problem.Q.j[row_starts[new_row]] = old_to_new[old_Q.j[p]]; + problem.Q.x[row_starts[new_row]] = old_Q.x[p]; + ++row_starts[new_row]; + } + } + problem.Q.m = num_cols; + problem.Q.n = num_cols; + problem.Q.nz_max = q_nnz; + } + + problem.A = expanded_A; + problem.A.n = num_cols; + problem.objective = objective; + problem.lower = lower; + problem.upper = upper; + problem.num_cols = num_cols; + problem.cone_var_start = new_cone_start; + return 0; + } + i_t num_cols = problem.num_cols + less_rows; i_t nnz = problem.A.col_start[problem.num_cols] + less_rows; problem.A.col_start.resize(num_cols + 1); @@ -817,26 +936,10 @@ i_t presolve(const lp_problem_t& original, { problem = original; std::vector row_sense(problem.num_rows, '='); - auto build_is_cone_variable = [](const lp_problem_t& current_problem) { - std::vector is_cone_variable(current_problem.num_cols, false); - if (!current_problem.second_order_cone_dims.empty()) { - i_t cone_end = current_problem.cone_var_start; - for (auto q_k : current_problem.second_order_cone_dims) { - cone_end += q_k; - } - for (i_t j = current_problem.cone_var_start; j < cone_end; ++j) { - is_cone_variable[j] = true; - } - } - return is_cone_variable; - }; - auto is_cone_variable = build_is_cone_variable(problem); // Check for free variables i_t free_variables 
= 0; - for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -inf && problem.upper[j] == inf && !is_cone_variable[j]) { - free_variables++; - } + for (i_t j = 0; j < linear_var_count(problem); j++) { + if (problem.lower[j] == -inf && problem.upper[j] == inf) { free_variables++; } } if (settings.barrier_presolve && free_variables > 0) { @@ -846,8 +949,8 @@ i_t presolve(const lp_problem_t& original, std::vector row_marked(problem.num_rows, 0); current_free_variables.reserve(problem.num_cols); constraints_to_check.reserve(problem.num_rows); - for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -inf && problem.upper[j] == inf && !is_cone_variable[j]) { + for (i_t j = 0; j < linear_var_count(problem); j++) { + if (problem.lower[j] == -inf && problem.upper[j] == inf) { current_free_variables.push_back(j); const i_t col_start = problem.A.col_start[j]; const i_t col_end = problem.A.col_start[j + 1]; @@ -986,10 +1089,8 @@ i_t presolve(const lp_problem_t& original, } i_t new_free_variables = 0; - for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -inf && problem.upper[j] == inf && !is_cone_variable[j]) { - new_free_variables++; - } + for (i_t j = 0; j < linear_var_count(problem); j++) { + if (problem.lower[j] == -inf && problem.upper[j] == inf) { new_free_variables++; } } if (removed_free_variables != 0) { settings.log.printf("Bounded %d free variables\n", removed_free_variables); @@ -997,7 +1098,6 @@ i_t presolve(const lp_problem_t& original, assert(new_free_variables == free_variables - removed_free_variables); free_variables = new_free_variables; } - // The original problem may have a variable without a lower bound // but a finite upper bound // -inf < x_j <= u_j @@ -1142,23 +1242,19 @@ i_t presolve(const lp_problem_t& original, // Check for empty cols i_t num_empty_cols = 0; { - for (i_t j = 0; j < problem.num_cols; ++j) { + for (i_t j = 0; j < linear_var_count(problem); ++j) { if ((problem.A.col_start[j + 1] - 
problem.A.col_start[j]) == 0) { num_empty_cols++; } } } if (num_empty_cols > 0) { settings.log.printf("Presolve attempt to remove %d empty cols\n", num_empty_cols); - remove_empty_cols(problem, num_empty_cols, presolve_info, is_cone_variable); + remove_empty_cols(problem, num_empty_cols, presolve_info); } - is_cone_variable = build_is_cone_variable(problem); - // Check for free variables (exclude cone variables — they are naturally unbounded) free_variables = 0; - for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -inf && problem.upper[j] == inf && !is_cone_variable[j]) { - free_variables++; - } + for (i_t j = 0; j < linear_var_count(problem); j++) { + if (problem.lower[j] == -inf && problem.upper[j] == inf) { free_variables++; } } problem.Q.check_matrix("Before free variable expansion"); @@ -1175,123 +1271,164 @@ i_t presolve(const lp_problem_t& original, // becomes // sum_{k != j} c_k x_k + c_j v - c_j w - std::vector pair_index(problem.num_cols, -1); - i_t num_cols = problem.num_cols + free_variables; - i_t nnz = problem.A.col_start[problem.num_cols]; - for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -inf && problem.upper[j] == inf && !is_cone_variable[j]) { - nnz += (problem.A.col_start[j + 1] - problem.A.col_start[j]); + const i_t old_num_cols = problem.num_cols; + const i_t linear_cols = linear_var_count(problem); + const i_t new_cone_start = + problem.second_order_cone_dims.empty() ? 
0 : linear_cols + free_variables; + const i_t num_cols = old_num_cols + free_variables; + + auto old_A = problem.A; + auto old_Q = problem.Q; + auto old_objective = problem.objective; + auto old_lower = problem.lower; + auto old_upper = problem.upper; + + std::vector partner_index(old_num_cols, -1); + std::vector orig_to_new(old_num_cols, -1); + std::vector is_free(old_num_cols, false); + + i_t next_partner = linear_cols; + for (i_t j = 0; j < linear_cols; ++j) { + orig_to_new[j] = j; + if (old_lower[j] == -inf && old_upper[j] == inf) { + is_free[j] = true; + partner_index[j] = next_partner++; } } + for (i_t j = linear_cols; j < old_num_cols; ++j) { + orig_to_new[j] = j + free_variables; + } + assert(next_partner == new_cone_start || problem.second_order_cone_dims.empty()); - problem.A.col_start.resize(num_cols + 1); - problem.A.i.resize(nnz); - problem.A.x.resize(nnz); - problem.lower.resize(num_cols); - problem.upper.resize(num_cols); - problem.objective.resize(num_cols); + i_t nnz = old_A.col_start[old_num_cols]; + for (i_t j = 0; j < linear_cols; ++j) { + if (is_free[j]) { nnz += old_A.col_start[j + 1] - old_A.col_start[j]; } + } - presolve_info.free_variable_pairs.resize(free_variables * 2); - i_t pair_count = 0; - i_t q = problem.A.col_start[problem.num_cols]; - i_t col = problem.num_cols; - for (i_t j = 0; j < problem.num_cols; j++) { - if (problem.lower[j] == -inf && problem.upper[j] == inf && !is_cone_variable[j]) { - for (i_t p = problem.A.col_start[j]; p < problem.A.col_start[j + 1]; p++) { - i_t i = problem.A.i[p]; - f_t aij = problem.A.x[p]; - problem.A.i[q] = i; - problem.A.x[q] = -aij; - q++; + csc_matrix_t expanded_A(problem.A.m, num_cols, nnz); + i_t nz = 0; + for (i_t j = 0; j < linear_cols; ++j) { + expanded_A.col_start[j] = nz; + for (i_t p = old_A.col_start[j]; p < old_A.col_start[j + 1]; ++p) { + expanded_A.i[nz] = old_A.i[p]; + expanded_A.x[nz] = old_A.x[p]; + ++nz; + } + } + for (i_t j = 0; j < linear_cols; ++j) { + if (partner_index[j] != 
-1) { + expanded_A.col_start[partner_index[j]] = nz; + for (i_t p = old_A.col_start[j]; p < old_A.col_start[j + 1]; ++p) { + expanded_A.i[nz] = old_A.i[p]; + expanded_A.x[nz] = -old_A.x[p]; + ++nz; } - problem.lower[col] = 0.0; - problem.upper[col] = inf; - problem.objective[col] = -problem.objective[j]; - presolve_info.free_variable_pairs[pair_count++] = j; - presolve_info.free_variable_pairs[pair_count++] = col; - pair_index[j] = col; - problem.A.col_start[++col] = q; - problem.lower[j] = 0.0; } } + for (i_t j = linear_cols; j < old_num_cols; ++j) { + i_t new_j = orig_to_new[j]; + expanded_A.col_start[new_j] = nz; + for (i_t p = old_A.col_start[j]; p < old_A.col_start[j + 1]; ++p) { + expanded_A.i[nz] = old_A.i[p]; + expanded_A.x[nz] = old_A.x[p]; + ++nz; + } + } + expanded_A.col_start[num_cols] = nz; + + std::vector objective(num_cols); + std::vector lower(num_cols, -INFINITY); + std::vector upper(num_cols, INFINITY); + presolve_info.free_variable_pairs.clear(); + presolve_info.free_variable_pairs.reserve(free_variables * 2); + + for (i_t j = 0; j < linear_cols; ++j) { + objective[j] = old_objective[j]; + if (is_free[j]) { + lower[j] = 0.0; + upper[j] = inf; + objective[partner_index[j]] = -old_objective[j]; + lower[partner_index[j]] = 0.0; + upper[partner_index[j]] = inf; + presolve_info.free_variable_pairs.push_back(j); + presolve_info.free_variable_pairs.push_back(partner_index[j]); + } else { + lower[j] = old_lower[j]; + upper[j] = old_upper[j]; + } + } + for (i_t j = linear_cols; j < old_num_cols; ++j) { + i_t new_j = orig_to_new[j]; + objective[new_j] = old_objective[j]; + lower[new_j] = old_lower[j]; + upper[new_j] = old_upper[j]; + } - if (problem.Q.n > 0) { + if (old_Q.n > 0) { std::vector row_counts(num_cols, 0); - i_t nz_count = problem.Q.row_start[problem.num_cols]; - for (i_t row = 0; row < problem.Q.n; row++) { - i_t q_start = problem.Q.row_start[row]; - i_t q_end = problem.Q.row_start[row + 1]; - row_counts[row] = q_end - q_start; - for (i_t qj = 
q_start; qj < q_end; qj++) { - i_t col = problem.Q.j[qj]; - if (pair_index[row] != -1 && pair_index[col] != -1) { - assert(pair_index[row] >= problem.num_cols); - assert(pair_index[col] >= problem.num_cols); - row_counts[row]++; - row_counts[pair_index[row]] += 2; - nz_count += 3; - } else if (pair_index[col] != -1) { - assert(pair_index[col] >= problem.num_cols); - row_counts[row]++; + i_t nz_count = 0; + for (i_t row = 0; row < old_Q.n; ++row) { + i_t new_row = orig_to_new[row]; + i_t partner_row = partner_index[row]; + i_t q_start = old_Q.row_start[row]; + i_t q_end = old_Q.row_start[row + 1]; + for (i_t qj = q_start; qj < q_end; ++qj) { + i_t col = old_Q.j[qj]; + i_t partner_col = partner_index[col]; + row_counts[new_row]++; + nz_count++; + if (partner_col != -1) { + row_counts[new_row]++; nz_count++; - } else if (pair_index[row] != -1) { - assert(pair_index[row] >= problem.num_cols); - row_counts[pair_index[row]]++; + } + if (partner_row != -1) { + row_counts[partner_row]++; nz_count++; + if (partner_col != -1) { + row_counts[partner_row]++; + nz_count++; + } } } } std::vector Q_row_start(num_cols + 1); Q_row_start[0] = 0; - for (i_t row = 0; row < num_cols; row++) { + for (i_t row = 0; row < num_cols; ++row) { Q_row_start[row + 1] = Q_row_start[row] + row_counts[row]; } std::vector Q_j(nz_count); std::vector Q_x(nz_count); auto row_starts = Q_row_start; - // First copy the original Q ma - for (i_t row = 0; row < problem.Q.n; row++) { - i_t q_start = problem.Q.row_start[row]; - i_t q_end = problem.Q.row_start[row + 1]; - i_t q_nz = Q_row_start[row]; - for (i_t qj = q_start; qj < q_end; qj++) { - i_t col = problem.Q.j[qj]; - f_t qij = problem.Q.x[qj]; - Q_j[q_nz] = col; - Q_x[q_nz] = qij; - q_nz++; - } - row_starts[row] = q_nz; - } - - // Expand the Q matrix for the free variables - for (i_t row = 0; row < problem.Q.n; row++) { - i_t q_start = problem.Q.row_start[row]; - i_t q_end = problem.Q.row_start[row + 1]; - for (i_t qj = q_start; qj < q_end; qj++) { - 
i_t col = problem.Q.j[qj]; - f_t qij = problem.Q.x[qj]; - if (pair_index[row] != -1 && pair_index[col] != -1) { - Q_j[row_starts[row]] = pair_index[col]; - Q_x[row_starts[row]] = -qij; - row_starts[row]++; - - Q_j[row_starts[pair_index[row]]] = col; - Q_x[row_starts[pair_index[row]]] = -qij; - row_starts[pair_index[row]]++; - - Q_j[row_starts[pair_index[row]]] = pair_index[col]; - Q_x[row_starts[pair_index[row]]] = qij; - row_starts[pair_index[row]]++; - } else if (pair_index[col] != -1) { - Q_j[row_starts[row]] = pair_index[col]; - Q_x[row_starts[row]] = -qij; - row_starts[row]++; - } else if (pair_index[row] != -1) { - Q_j[row_starts[pair_index[row]]] = col; - Q_x[row_starts[pair_index[row]]] = -qij; - row_starts[pair_index[row]]++; + for (i_t row = 0; row < old_Q.n; ++row) { + i_t new_row = orig_to_new[row]; + i_t partner_row = partner_index[row]; + i_t q_start = old_Q.row_start[row]; + i_t q_end = old_Q.row_start[row + 1]; + for (i_t qj = q_start; qj < q_end; ++qj) { + i_t col = old_Q.j[qj]; + f_t qij = old_Q.x[qj]; + i_t new_col = orig_to_new[col]; + i_t partner_col = partner_index[col]; + + Q_j[row_starts[new_row]] = new_col; + Q_x[row_starts[new_row]] = qij; + row_starts[new_row]++; + + if (partner_col != -1) { + Q_j[row_starts[new_row]] = partner_col; + Q_x[row_starts[new_row]] = -qij; + row_starts[new_row]++; + } + if (partner_row != -1) { + Q_j[row_starts[partner_row]] = new_col; + Q_x[row_starts[partner_row]] = -qij; + row_starts[partner_row]++; + if (partner_col != -1) { + Q_j[row_starts[partner_row]] = partner_col; + Q_x[row_starts[partner_row]] = qij; + row_starts[partner_row]++; + } } } } @@ -1304,9 +1441,13 @@ i_t presolve(const lp_problem_t& original, problem.Q.check_matrix("After free variable expansion"); } - // assert(problem.A.p[num_cols] == nnz); - problem.A.n = num_cols; - problem.num_cols = num_cols; + problem.A = expanded_A; + problem.A.n = num_cols; + problem.objective = objective; + problem.lower = lower; + problem.upper = upper; + 
problem.num_cols = num_cols; + if (!problem.second_order_cone_dims.empty()) { problem.cone_var_start = new_cone_start; } } if (settings.barrier_presolve && settings.folding != 0 && problem.Q.n == 0 && @@ -1388,7 +1529,7 @@ void crush_primal_solution(const user_problem_t& user_problem, // including previously added slacks, are reset before writing new values. solution.assign(problem.num_cols, 0.0); for (i_t j = 0; j < user_problem.num_cols; j++) { - solution[j] = user_solution[j]; + solution[user_col_to_problem_col(user_problem, problem, j)] = user_solution[j]; } std::vector primal_residual(problem.num_rows); @@ -1428,7 +1569,7 @@ void crush_primal_solution_with_slack(const user_problem_t& user_probl // Re-crush can be called with a reused output vector; clear stale entries first. solution.assign(problem.num_cols, 0.0); for (i_t j = 0; j < user_problem.num_cols; j++) { - solution[j] = user_solution[j]; + solution[user_col_to_problem_col(user_problem, problem, j)] = user_solution[j]; } std::vector primal_residual(problem.num_rows); @@ -1475,9 +1616,9 @@ f_t crush_dual_solution(const user_problem_t& user_problem, for (i_t i = 0; i < user_problem.num_rows; i++) { y[i] = user_y[i]; } - z.resize(problem.num_cols); + z.assign(problem.num_cols, 0.0); for (i_t j = 0; j < user_problem.num_cols; j++) { - z[j] = user_z[j]; + z[user_col_to_problem_col(user_problem, problem, j)] = user_z[j]; } std::vector is_range_row(problem.num_rows, false); @@ -1544,6 +1685,17 @@ f_t crush_dual_solution(const user_problem_t& user_problem, return dual_res_inf; } +template +static i_t user_col_to_problem_col(const user_problem_t& user_problem, + const lp_problem_t& problem, + i_t user_col) +{ + if (user_problem.second_order_cone_dims.empty()) { return user_col; } + if (problem.cone_var_start <= user_problem.cone_var_start) { return user_col; } + if (user_col < user_problem.cone_var_start) { return user_col; } + return problem.cone_var_start + (user_col - user_problem.cone_var_start); +} + 
template void uncrush_primal_solution(const user_problem_t& user_problem, const lp_problem_t& problem, @@ -1553,9 +1705,9 @@ void uncrush_primal_solution(const user_problem_t& user_problem, user_solution.resize(user_problem.num_cols); assert(problem.num_cols >= user_problem.num_cols); assert(solution.size() >= user_problem.num_cols); - std::copy(solution.begin(), - solution.begin() + std::min((i_t)solution.size(), user_problem.num_cols), - user_solution.data()); + for (i_t j = 0; j < user_problem.num_cols; ++j) { + user_solution[j] = solution[user_col_to_problem_col(user_problem, problem, j)]; + } } template @@ -1658,13 +1810,25 @@ void uncrush_solution(const presolve_info_t& presolve_info, if (num_free_variables > 0) { settings.log.printf("Post-solve: Handling free variables %d\n", num_free_variables); // We added free variables so we need to map the crushed solution back to the original variables + std::vector remove_partner(input_x.size(), false); for (i_t k = 0; k < 2 * num_free_variables; k += 2) { const i_t u = free_variable_pairs[k]; const i_t v = free_variable_pairs[k + 1]; input_x[u] -= input_x[v]; + remove_partner[v] = true; + } + std::vector compact_x; + std::vector compact_z; + compact_x.reserve(input_x.size() - num_free_variables); + compact_z.reserve(input_z.size() - num_free_variables); + for (i_t j = 0; j < static_cast(input_x.size()); ++j) { + if (!remove_partner[j]) { + compact_x.push_back(input_x[j]); + compact_z.push_back(input_z[j]); + } } - input_z.resize(input_z.size() - num_free_variables); - input_x.resize(input_x.size() - num_free_variables); + input_x = compact_x; + input_z = compact_z; } if (presolve_info.removed_variables.size() > 0) { diff --git a/cpp/src/dual_simplex/scaling.cpp b/cpp/src/dual_simplex/scaling.cpp index 54fe5dcf46..65dcb49bfe 100644 --- a/cpp/src/dual_simplex/scaling.cpp +++ b/cpp/src/dual_simplex/scaling.cpp @@ -22,7 +22,7 @@ i_t column_scaling(const lp_problem_t& unscaled, i_t m = scaled.num_rows; i_t n = 
scaled.num_cols; - if (!settings.scale_columns || unscaled.Q.n > 0) { + if (!settings.scale_columns || unscaled.Q.n > 0 || !unscaled.second_order_cone_dims.empty()) { settings.log.printf("Skipping column scaling\n"); column_scaling.resize(n, 1.0); return 0; @@ -30,19 +30,9 @@ i_t column_scaling(const lp_problem_t& unscaled, column_scaling.resize(n); - i_t cone_start = unscaled.cone_var_start; - i_t cone_end = cone_start; - for (auto q_k : unscaled.second_order_cone_dims) { - cone_end += q_k; - } - f_t max = 0; f_t min = std::numeric_limits::max(); for (i_t j = 0; j < n; ++j) { - if (j >= cone_start && j < cone_end) { - column_scaling[j] = 1.0; - continue; - } const i_t col_start = scaled.A.col_start[j]; const i_t col_end = scaled.A.col_start[j + 1]; f_t sum = 0.0; diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index 9bc12bb925..544647d5d0 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -349,8 +349,18 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us if (!user_problem.second_order_cone_dims.empty()) { i_t cone_end = user_problem.cone_var_start; for (auto q_k : user_problem.second_order_cone_dims) { + if (q_k <= 1) { + settings.log.printf( + "Error: second-order cone dimensions must be at least 2; use linear variables instead of " + "Q^1\n"); + return lp_status_t::NUMERICAL_ISSUES; + } cone_end += q_k; } + if (cone_end != user_problem.num_cols) { + settings.log.printf("Error: conic variables must form a trailing block [linear | cone]\n"); + return lp_status_t::NUMERICAL_ISSUES; + } for (i_t j = user_problem.cone_var_start; j < cone_end; ++j) { if (user_problem.lower[j] != 0.0 && user_problem.lower[j] > -1e30) { settings.log.printf("Error: explicit lower bound on conic variable %d is not supported\n", diff --git a/cpp/src/dual_simplex/vector_math.cuh b/cpp/src/dual_simplex/vector_math.cuh index abc7263858..32c75ea366 100644 --- a/cpp/src/dual_simplex/vector_math.cuh +++ 
b/cpp/src/dual_simplex/vector_math.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -9,6 +9,10 @@ #include +#include +#include +#include + #include #include @@ -28,6 +32,7 @@ struct norm_inf_max { template f_t device_custom_vector_norm_inf(InputIteratorT in, i_t size, rmm::cuda_stream_view stream_view) { + if (size == 0) { return 0; } // FIXME: Tmp storage stored in vector_math class. auto d_out = rmm::device_scalar(stream_view); rmm::device_uvector d_temp_storage(0, stream_view); @@ -62,6 +67,12 @@ f_t device_vector_norm_inf(const rmm::device_uvector& in, rmm::cuda_stream_ return device_custom_vector_norm_inf(in.data(), in.size(), stream_view); } +template +f_t device_vector_norm_inf(raft::device_span in, rmm::cuda_stream_view stream_view) +{ + return device_custom_vector_norm_inf(in.data(), in.size(), stream_view); +} + // TMP we should just have a CPU and GPU version to do the comparison // Should never have to norm inf a CPU vector if we are using the GPU template @@ -71,4 +82,12 @@ f_t vector_norm_inf(const std::vector& x, rmm::cuda_stream_view return device_vector_norm_inf(d_x, stream_view); } +template +f_t vector_norm_inf(raft::host_span x, rmm::cuda_stream_view stream_view) +{ + rmm::device_uvector d_x(x.size(), stream_view); + raft::copy(d_x.data(), x.data(), x.size(), stream_view); + return device_vector_norm_inf(d_x, stream_view); +} + } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/pdlp/termination_strategy/infeasibility_information.cu b/cpp/src/pdlp/termination_strategy/infeasibility_information.cu index 37972ba442..0e001b802f 100644 --- a/cpp/src/pdlp/termination_strategy/infeasibility_information.cu +++ 
b/cpp/src/pdlp/termination_strategy/infeasibility_information.cu @@ -24,13 +24,7 @@ #include #include -#include -#include -#include #include -#include -#include -#include namespace cuopt::linear_programming::detail { template diff --git a/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu b/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu index 035fde5ac6..e6f48ac9de 100644 --- a/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu +++ b/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu @@ -326,16 +326,6 @@ auto ref_fused_corrector_single(const std::vector& dx_aff, return ref_apply_hinv_single(corr, w_bar, inv_eta, inv_1pw0); } -template -auto ref_interior_shift_single(std::vector u) -> std::vector -{ - if (u.empty()) { return u; } - - f_t gap = tail_norm(u) - u[0]; - if (gap >= f_t(0)) { u[0] += f_t(1) + gap; } - return u; -} - template auto make_patterned_cone(int q, f_t head, f_t scale) -> std::vector { @@ -401,54 +391,39 @@ class second_order_cone_test : public ::testing::Test { } } - void launch_apply_hinv(const rmm::device_uvector& z, - rmm::device_uvector& out, - const rmm::device_uvector& w_bar, - const rmm::device_uvector& inv_eta, - const rmm::device_uvector& inv_1pw0, - const rmm::device_uvector& cone_offsets, - i_t k) - { - apply_Hinv_kernel<<>>(cuopt::make_span(z), - cuopt::make_span(out), - cuopt::make_span(w_bar), - cuopt::make_span(inv_eta), - cuopt::make_span(inv_1pw0), - cuopt::make_span(cone_offsets), - k); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - sync(); - } - - void launch_step_length(const rmm::device_uvector& s, - const rmm::device_uvector& ds, - const rmm::device_uvector& lambda, - const rmm::device_uvector& dlambda, + void launch_step_length(rmm::device_uvector& s, + rmm::device_uvector& ds, + rmm::device_uvector& lambda, + rmm::device_uvector& dlambda, rmm::device_uvector& alpha, const rmm::device_uvector& cone_offsets, i_t k, f_t alpha_max) { - step_length_kernel<<>>(cuopt::make_span(s), - 
cuopt::make_span(ds), - cuopt::make_span(lambda), - cuopt::make_span(dlambda), - cuopt::make_span(alpha), - cuopt::make_span(cone_offsets), - k, - alpha_max); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - sync(); - } - - void launch_interior_shift(rmm::device_uvector& u, - const rmm::device_uvector& cone_offsets, - i_t k) - { - interior_shift_kernel - <<>>(cuopt::make_span(u), cuopt::make_span(cone_offsets), k); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + auto h_offsets = copy_to_host(cone_offsets); + std::vector dims(k, 0); + for (i_t cone = 0; cone < k; ++cone) { + dims[cone] = h_offsets[cone + 1] - h_offsets[cone]; + } + cone_data_t cones(k, dims, cuopt::make_span(s), cuopt::make_span(lambda), stream_); + rmm::device_uvector alpha_dual(k, stream_); + launch_nt_scaling(cones, stream_); + compute_cone_step_length_per_cone(cones, + cuopt::make_span(s), + cuopt::make_span(ds), + cuopt::make_span(lambda), + cuopt::make_span(dlambda), + cuopt::make_span(alpha), + cuopt::make_span(alpha_dual), + alpha_max, + stream_); sync(); + auto h_primal = copy_to_host(alpha); + auto h_dual = copy_to_host(alpha_dual); + for (i_t i = 0; i < k; ++i) { + h_primal[i] = std::min(h_primal[i], h_dual[i]); + } + copy_to_device(alpha, h_primal); } void launch_apply_hinv2(const rmm::device_uvector& v, @@ -458,77 +433,64 @@ class second_order_cone_test : public ::testing::Test { const rmm::device_uvector& cone_offsets, i_t k) { - apply_Hinv2_kernel<<>>(cuopt::make_span(v), - cuopt::make_span(out), - cuopt::make_span(w_bar), - cuopt::make_span(inv_eta), - cuopt::make_span(cone_offsets), - k); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - sync(); - } - - void launch_jordan_product(const rmm::device_uvector& a, - const rmm::device_uvector& b, - rmm::device_uvector& out, - const rmm::device_uvector& cone_offsets, - i_t k) - { - jordan_product_kernel<<>>(cuopt::make_span(a), - cuopt::make_span(b), - cuopt::make_span(out), - cuopt::make_span(cone_offsets), - k); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - 
sync(); - } - - void launch_inverse_jordan_product(const rmm::device_uvector& omega, - const rmm::device_uvector& r, - const rmm::device_uvector& rho, - rmm::device_uvector& out, - const rmm::device_uvector& cone_offsets, - i_t k) - { - inverse_jordan_product_kernel - <<>>(cuopt::make_span(omega), - cuopt::make_span(r), - cuopt::make_span(rho), - cuopt::make_span(out), - cuopt::make_span(cone_offsets), - k); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + auto h_offsets = copy_to_host(cone_offsets); + std::vector h_element_cone_ids(v.size(), 0); + for (i_t cone = 0; cone < k; ++cone) { + std::fill(h_element_cone_ids.begin() + h_offsets[cone], + h_element_cone_ids.begin() + h_offsets[cone + 1], + cone); + } + auto d_element_cone_ids = make_device_vector(h_element_cone_ids); + rmm::device_uvector tail_dot(k, stream_); + rmm::device_uvector workspace(0, stream_); + apply_hinv2(cuopt::make_span(v), + cuopt::make_span(out), + cuopt::make_span(w_bar), + cuopt::make_span(inv_eta), + cuopt::make_span(cone_offsets), + cuopt::make_span(d_element_cone_ids), + cuopt::make_span(tail_dot), + workspace, + k, + stream_); sync(); } void launch_fused_corrector(const rmm::device_uvector& dx_aff, - const cone_data_t& cones, + cone_data_t& cones, f_t sigma_mu, rmm::device_uvector& out) { - compute_combined_cone_rhs_term(cuopt::make_span(dx_aff), cones, sigma_mu, out, stream_); + out.resize(cones.m_c, stream_); + compute_combined_cone_rhs_term( + cuopt::make_span(dx_aff), cones, sigma_mu, cuopt::make_span(out), stream_); sync(); } - void launch_affine_cone_rhs(const cone_data_t& cones, rmm::device_uvector& out) + void launch_affine_cone_rhs(cone_data_t& cones, rmm::device_uvector& out) { - compute_affine_cone_rhs_term(cones, out, stream_); + out.resize(cones.m_c, stream_); + compute_affine_cone_rhs_term(cones, cuopt::make_span(out), stream_); sync(); } - void launch_recover_cone_dz(const rmm::device_uvector& dx, - const cone_data_t& cones, - const rmm::device_uvector& cone_rhs_term, - 
rmm::device_uvector& hinv2_dx, - rmm::device_uvector& dz) + void launch_recover_cone_dz_from_target(const rmm::device_uvector& dx, + cone_data_t& cones, + const rmm::device_uvector& cone_target, + rmm::device_uvector& hinv2_dx, + rmm::device_uvector& dz) { - recover_cone_dz( - cuopt::make_span(dx), cones, cone_rhs_term, hinv2_dx, cuopt::make_span(dz), stream_); + recover_cone_dz_from_target(cuopt::make_span(dx), + cones, + cuopt::make_span(cone_target), + hinv2_dx, + cuopt::make_span(dz), + stream_); sync(); } void launch_accumulate_cone_hinv2(const rmm::device_uvector& x, - const cone_data_t& cones, + cone_data_t& cones, rmm::device_uvector& hinv2_x, rmm::device_uvector& out) { @@ -547,25 +509,107 @@ class second_order_cone_test : public ::testing::Test { } }; -TEST_F(second_order_cone_test, cone_data_topology_and_bucket_partitioning) +TEST_F(second_order_cone_test, cone_data_topology_and_flat_index_maps) { - std::vector dims{1, 32, 33, 2048, 2049}; + std::vector dims{1, 2, 3, 4}; cone_data_t cones(static_cast(dims.size()), dims, {}, {}, stream_); auto expected_offsets = build_offsets(dims); auto actual_offsets = copy_to_host(cones.cone_offsets); auto actual_dims = copy_to_host(cones.cone_dims); - auto small_ids = copy_to_host(cones.small_cone_ids); - auto medium_ids = copy_to_host(cones.medium_cone_ids); - auto large_ids = copy_to_host(cones.large_cone_ids); + auto element_cone_ids = copy_to_host(cones.element_cone_ids); + auto block_cone_ids = copy_to_host(cones.block_entry_cone_ids); + + std::vector expected_element_cone_ids; + for (i_t cone = 0; cone < static_cast(dims.size()); ++cone) { + expected_element_cone_ids.insert(expected_element_cone_ids.end(), dims[cone], cone); + } + + std::vector expected_block_cone_ids; + for (i_t cone = 0; cone < static_cast(dims.size()); ++cone) { + expected_block_cone_ids.insert(expected_block_cone_ids.end(), dims[cone] * dims[cone], cone); + } EXPECT_EQ(cones.K, static_cast(dims.size())); EXPECT_EQ(cones.m_c, 
expected_offsets.back()); EXPECT_EQ(actual_offsets, expected_offsets); EXPECT_EQ(actual_dims, dims); - EXPECT_EQ(small_ids, std::vector({0, 1})); - EXPECT_EQ(medium_ids, std::vector({2, 3})); - EXPECT_EQ(large_ids, std::vector({4})); + EXPECT_EQ(element_cone_ids, expected_element_cone_ids); + EXPECT_EQ(block_cone_ids, expected_block_cone_ids); +} + +TEST_F(second_order_cone_test, cone_data_reuses_named_scratch_slots) +{ + std::vector> s_cones{{5.0, 1.0, 1.0}, {6.0, 1.0, -0.5, 0.25, 0.1}}; + std::vector> ds_cones{{-0.5, 0.1, 0.1}, {-0.2, 0.05, 0.03, -0.02, 0.01}}; + std::vector> lambda_cones{{5.0, 1.0, 1.0}, {4.0, 0.2, 0.3, -0.1, 0.05}}; + std::vector> dlambda_cones{{-0.5, 0.1, 0.1}, {-0.1, 0.02, -0.03, 0.01, -0.01}}; + std::vector dims{3, 5}; + + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_ds = make_device_vector(pack_cones(ds_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + auto d_dlambda = make_device_vector(pack_cones(dlambda_cones)); + cone_data_t cones(static_cast(dims.size()), + dims, + cuopt::make_span(d_s), + cuopt::make_span(d_lambda), + stream_); + + EXPECT_EQ(cones.scratch.step_s_du1_sq().size(), dims.size()); + EXPECT_EQ(cones.scratch.step_s_u1du1().size(), dims.size()); + EXPECT_EQ(cones.scratch.step_l_du1_sq().size(), dims.size()); + EXPECT_EQ(cones.scratch.step_l_u1du1().size(), dims.size()); + EXPECT_EQ(cones.scratch.hinv2_tail_dot().size(), dims.size()); + EXPECT_EQ(cones.scratch.step_s_u1_sq().size(), dims.size()); + EXPECT_EQ(cones.scratch.step_l_u1_sq().size(), dims.size()); + EXPECT_EQ(cones.scratch.nt_s1_sq().size(), dims.size()); + EXPECT_EQ(cones.scratch.nt_l1_sq().size(), dims.size()); + EXPECT_EQ(cones.scratch.nt_sl().size(), dims.size()); + EXPECT_EQ(cones.scratch.step_alpha_primal_span().size(), dims.size()); + EXPECT_EQ(cones.scratch.step_alpha_dual_span().size(), dims.size()); + + auto s_du1_ptr = cones.scratch.step_s_du1_sq().data(); + auto s_u1du1_ptr = cones.scratch.step_s_u1du1().data(); + 
auto l_du1_ptr = cones.scratch.step_l_du1_sq().data(); + auto l_u1du1_ptr = cones.scratch.step_l_u1du1().data(); + auto hinv2_ptr = cones.scratch.hinv2_tail_dot().data(); + auto s_u1_sq_ptr = cones.scratch.step_s_u1_sq().data(); + auto l_u1_sq_ptr = cones.scratch.step_l_u1_sq().data(); + auto nt_s1_ptr = cones.scratch.nt_s1_sq().data(); + auto nt_l1_ptr = cones.scratch.nt_l1_sq().data(); + auto nt_sl_ptr = cones.scratch.nt_sl().data(); + auto alpha_p_ptr = cones.scratch.step_alpha_primal_span().data(); + auto alpha_d_ptr = cones.scratch.step_alpha_dual_span().data(); + + EXPECT_EQ(hinv2_ptr, s_du1_ptr); + EXPECT_EQ(s_du1_ptr, nt_s1_ptr); + EXPECT_EQ(s_u1du1_ptr, nt_l1_ptr); + EXPECT_EQ(s_u1_sq_ptr, nt_sl_ptr); + + compute_cone_step_length_per_cone(cones, + cuopt::make_span(d_s), + cuopt::make_span(d_ds), + cuopt::make_span(d_lambda), + cuopt::make_span(d_dlambda), + cones.scratch.step_alpha_primal_span(), + cones.scratch.step_alpha_dual_span(), + f_t(10.0), + stream_); + sync(); + + EXPECT_EQ(s_du1_ptr, cones.scratch.step_s_du1_sq().data()); + EXPECT_EQ(s_u1du1_ptr, cones.scratch.step_s_u1du1().data()); + EXPECT_EQ(l_du1_ptr, cones.scratch.step_l_du1_sq().data()); + EXPECT_EQ(l_u1du1_ptr, cones.scratch.step_l_u1du1().data()); + EXPECT_EQ(hinv2_ptr, cones.scratch.hinv2_tail_dot().data()); + EXPECT_EQ(s_u1_sq_ptr, cones.scratch.step_s_u1_sq().data()); + EXPECT_EQ(l_u1_sq_ptr, cones.scratch.step_l_u1_sq().data()); + EXPECT_EQ(nt_s1_ptr, cones.scratch.nt_s1_sq().data()); + EXPECT_EQ(nt_l1_ptr, cones.scratch.nt_l1_sq().data()); + EXPECT_EQ(nt_sl_ptr, cones.scratch.nt_sl().data()); + EXPECT_EQ(alpha_p_ptr, cones.scratch.step_alpha_primal_span().data()); + EXPECT_EQ(alpha_d_ptr, cones.scratch.step_alpha_dual_span().data()); } TEST_F(second_order_cone_test, nt_scaling_matches_reference_for_small_cone) @@ -582,7 +626,6 @@ TEST_F(second_order_cone_test, nt_scaling_matches_reference_for_small_cone) launch_nt_scaling(cones, stream_); - auto eta = copy_to_host(cones.eta); auto 
inv_eta = copy_to_host(cones.inv_eta); auto inv_1pw0 = copy_to_host(cones.inv_1pw0); auto rho = copy_to_host(cones.rho); @@ -591,7 +634,6 @@ TEST_F(second_order_cone_test, nt_scaling_matches_reference_for_small_cone) auto ref = ref_nt_scaling_single(s_cones[0], lambda_cones[0]); - EXPECT_NEAR(eta[0], ref.eta, 1e-12); EXPECT_NEAR(inv_eta[0], ref.inv_eta, 1e-12); EXPECT_NEAR(inv_1pw0[0], ref.inv_1pw0, 1e-12); EXPECT_NEAR(rho[0], ref.rho, 1e-12); @@ -624,7 +666,6 @@ TEST_F(second_order_cone_test, nt_scaling_matches_reference_across_bucket_sizes) launch_nt_scaling(cones, stream_); - auto eta = copy_to_host(cones.eta); auto inv_eta = copy_to_host(cones.inv_eta); auto inv_1pw0 = copy_to_host(cones.inv_1pw0); auto rho = copy_to_host(cones.rho); @@ -634,7 +675,6 @@ TEST_F(second_order_cone_test, nt_scaling_matches_reference_across_bucket_sizes) for (i_t cone = 0; cone < static_cast(dims.size()); ++cone) { auto ref = ref_nt_scaling_single(s_cones[cone], lambda_cones[cone]); - EXPECT_NEAR(eta[cone], ref.eta, 1e-10) << "cone " << cone; EXPECT_NEAR(inv_eta[cone], ref.inv_eta, 1e-10) << "cone " << cone; EXPECT_NEAR(inv_1pw0[cone], ref.inv_1pw0, 1e-10) << "cone " << cone; EXPECT_NEAR(rho[cone], ref.rho, 1e-10) << "cone " << cone; @@ -661,16 +701,60 @@ TEST_F(second_order_cone_test, nt_scaling_omega_equals_H_times_lambda) launch_nt_scaling(cones, stream_); - auto eta = copy_to_host(cones.eta); + auto inv_eta = copy_to_host(cones.inv_eta); auto inv_1pw0 = copy_to_host(cones.inv_1pw0); auto w_bar = copy_to_host(cones.w_bar); auto omega = copy_to_host(cones.omega); // NT symmetry: omega should equal both H^{-1}s and H*lambda. 
- auto H_lambda = ref_apply_H_single(lambda_cones[0], w_bar, eta[0], inv_1pw0[0]); + auto H_lambda = ref_apply_H_single(lambda_cones[0], w_bar, f_t(1) / inv_eta[0], inv_1pw0[0]); expect_vector_near(omega, H_lambda, 1e-10, 1e-8, "omega_vs_H_lambda"); } +TEST_F(second_order_cone_test, nt_scaling_tail_identities_match_heads) +{ + std::vector> s_cones{{5.0, 1.0, -1.0, 0.5, 0.3}}; + std::vector> lambda_cones{{4.0, 0.5, 1.0, -0.3, 0.2}}; + std::vector dims{5}; + + auto d_s = make_device_vector(pack_cones(s_cones)); + auto d_lambda = make_device_vector(pack_cones(lambda_cones)); + cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); + + launch_nt_scaling(cones, stream_); + + auto inv_eta = copy_to_host(cones.inv_eta); + auto rho = copy_to_host(cones.rho); + auto w_bar = copy_to_host(cones.w_bar); + auto omega = copy_to_host(cones.omega); + + f_t s_J = std::sqrt(j_norm_sq(s_cones[0])); + f_t l_J = std::sqrt(j_norm_sq(lambda_cones[0])); + f_t s_dot_raw = s_cones[0][0] * lambda_cones[0][0]; + for (std::size_t j = 1; j < s_cones[0].size(); ++j) { + s_dot_raw += s_cones[0][j] * lambda_cones[0][j]; + } + f_t s_dot_l = s_dot_raw / (s_J * l_J); + f_t gamma = std::sqrt(std::max(f_t(0), (f_t(1) + s_dot_l) * f_t(0.5))); + f_t w0_from_heads = (s_cones[0][0] / s_J + lambda_cones[0][0] / l_J) / (f_t(2) * gamma); + + f_t omega_tail_sq = f_t(0); + f_t w_omega_tail = f_t(0); + for (std::size_t j = 1; j < omega.size(); ++j) { + omega_tail_sq += omega[j] * omega[j]; + w_omega_tail += w_bar[j] * omega[j]; + } + + EXPECT_NEAR(omega_tail_sq, omega[0] * omega[0] - rho[0], 1e-10) + << "||omega_1||^2 should be derived from omega_0 and rho"; + EXPECT_NEAR(w_bar[0], w0_from_heads, 1e-10) + << "w_bar_0 should be derived directly from normalized cone heads"; + + f_t derived_w_omega = f_t(0.5) * (inv_eta[0] * s_cones[0][0] - lambda_cones[0][0] / inv_eta[0]); + EXPECT_NEAR(w_omega_tail, derived_w_omega, 1e-10) + << "w_bar_1^T omega_1 should be derived from cone 
heads"; +} + TEST_F(second_order_cone_test, nt_scaling_near_boundary_is_stable) { // s and lambda barely inside the cone: ||tail||^2 ≈ head^2. @@ -684,62 +768,25 @@ TEST_F(second_order_cone_test, nt_scaling_near_boundary_is_stable) launch_nt_scaling(cones, stream_); - auto eta = copy_to_host(cones.eta); auto inv_eta = copy_to_host(cones.inv_eta); auto inv_1pw0 = copy_to_host(cones.inv_1pw0); auto w_bar = copy_to_host(cones.w_bar); auto omega = copy_to_host(cones.omega); + f_t eta_val = f_t(1) / inv_eta[0]; + EXPECT_NEAR(j_norm_sq(w_bar), f_t(1), 1e-8) << "w_bar J-norm not 1 near boundary"; EXPECT_GT(w_bar[0], tail_norm(w_bar)) << "w_bar not interior near boundary"; // Round-trip: H(omega) should equal s. - auto H_omega = ref_apply_H_single(omega, w_bar, eta[0], inv_1pw0[0]); + auto H_omega = ref_apply_H_single(omega, w_bar, eta_val, inv_1pw0[0]); expect_vector_near(H_omega, pack_cones(s_cones), 1e-8, 1e-6, "H_omega_vs_s_near_boundary"); // Symmetry: omega should also equal H*lambda. - auto H_lambda = ref_apply_H_single(lambda_cones[0], w_bar, eta[0], inv_1pw0[0]); + auto H_lambda = ref_apply_H_single(lambda_cones[0], w_bar, eta_val, inv_1pw0[0]); expect_vector_near(omega, H_lambda, 1e-8, 1e-6, "omega_vs_H_lambda_near_boundary"); } -TEST_F(second_order_cone_test, apply_hinv_matches_reference_for_packed_cones) -{ - std::vector dims{1, 3, 5}; - auto offsets = build_offsets(dims); - - std::vector> z_cones{{3.0}, {2.0, -1.0, 0.5}, {1.0, 0.25, -0.75, 0.5, -0.125}}; - std::vector> w_bar_cones{ - {1.0}, {0.0, 0.15, -0.05}, {0.0, 0.10, -0.20, 0.05, 0.15}}; - std::vector inv_eta_host{0.5, 1.25, 0.75}; - std::vector inv_1pw0_host(inv_eta_host.size(), 0.0); - - for (std::size_t cone = 0; cone < w_bar_cones.size(); ++cone) { - f_t w1_sq = f_t(0); - for (std::size_t j = 1; j < w_bar_cones[cone].size(); ++j) { - w1_sq += w_bar_cones[cone][j] * w_bar_cones[cone][j]; - } - w_bar_cones[cone][0] = std::sqrt(f_t(1) + w1_sq); - inv_1pw0_host[cone] = f_t(1) / (f_t(1) + 
w_bar_cones[cone][0]); - } - - auto z = make_device_vector(pack_cones(z_cones)); - auto w_bar = make_device_vector(pack_cones(w_bar_cones)); - auto inv_eta = make_device_vector(inv_eta_host); - auto inv_1pw0 = make_device_vector(inv_1pw0_host); - auto d_offsets = make_device_vector(offsets); - rmm::device_uvector out(z.size(), stream_); - - launch_apply_hinv(z, out, w_bar, inv_eta, inv_1pw0, d_offsets, static_cast(dims.size())); - - auto actual_out = copy_to_host(out); - auto expected = pack_cones(std::vector>{ - ref_apply_hinv_single(z_cones[0], w_bar_cones[0], inv_eta_host[0], inv_1pw0_host[0]), - ref_apply_hinv_single(z_cones[1], w_bar_cones[1], inv_eta_host[1], inv_1pw0_host[1]), - ref_apply_hinv_single(z_cones[2], w_bar_cones[2], inv_eta_host[2], inv_1pw0_host[2])}); - - expect_vector_near(actual_out, expected, 1e-12, 1e-10, "apply_hinv"); -} - TEST_F(second_order_cone_test, step_length_matches_reference_and_handles_q1) { std::vector dims{1, 3}; @@ -938,33 +985,6 @@ TEST_F(second_order_cone_test, step_length_boundary_tightness) EXPECT_GT(j_norm_sq(l_int), 0.0) << "lambda not interior at (1-eps)*alpha"; } -TEST_F(second_order_cone_test, interior_shift_matches_reference_and_preserves_tail) -{ - std::vector dims{1, 3, 4}; - auto offsets = build_offsets(dims); - - std::vector> cones{{-0.25}, {2.0, 0.3, 0.4}, {0.5, 0.6, 0.8, 0.0}}; - auto packed = pack_cones(cones); - auto expected = pack_cones(std::vector>{ref_interior_shift_single(cones[0]), - ref_interior_shift_single(cones[1]), - ref_interior_shift_single(cones[2])}); - - auto u = make_device_vector(packed); - auto d_offsets = make_device_vector(offsets); - launch_interior_shift(u, d_offsets, static_cast(dims.size())); - - auto actual = copy_to_host(u); - expect_vector_near(actual, expected, 1e-12, 1e-10, "interior_shift"); - - for (std::size_t cone = 0; cone < dims.size(); ++cone) { - auto shifted = slice_cone(actual, offsets, static_cast(cone)); - EXPECT_GT(shifted[0], tail_norm(shifted)); - for (std::size_t 
j = 1; j < shifted.size(); ++j) { - EXPECT_EQ(shifted[j], cones[cone][j]) << "cone " << cone << " tail " << j; - } - } -} - TEST_F(second_order_cone_test, apply_hinv2_matches_reference_for_packed_cones) { std::vector dims{1, 3, 5}; @@ -1000,94 +1020,6 @@ TEST_F(second_order_cone_test, apply_hinv2_matches_reference_for_packed_cones) expect_vector_near(actual, expected, 1e-12, 1e-10, "apply_hinv2"); } -TEST_F(second_order_cone_test, apply_hinv2_equals_double_hinv_with_nt_scaling) -{ - std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; - std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; - std::vector dims{3, 5}; - auto offsets = build_offsets(dims); - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - cone_data_t cones(static_cast(dims.size()), - dims, - cuopt::make_span(d_s), - cuopt::make_span(d_lambda), - stream_); - - launch_nt_scaling(cones, stream_); - - std::vector> v_cones{{1.0, -0.3, 0.2}, {0.5, 0.1, -0.15, 0.25, -0.1}}; - auto d_v = make_device_vector(pack_cones(v_cones)); - auto d_offsets = make_device_vector(offsets); - - // H^{-2} v (single kernel) - rmm::device_uvector d_hinv2(cones.omega.size(), stream_); - launch_apply_hinv2( - d_v, d_hinv2, cones.w_bar, cones.inv_eta, d_offsets, static_cast(dims.size())); - - // H^{-1}(H^{-1} v) (two passes) - rmm::device_uvector d_tmp(cones.omega.size(), stream_); - rmm::device_uvector d_double(cones.omega.size(), stream_); - launch_apply_hinv(d_v, - d_tmp, - cones.w_bar, - cones.inv_eta, - cones.inv_1pw0, - d_offsets, - static_cast(dims.size())); - launch_apply_hinv(d_tmp, - d_double, - cones.w_bar, - cones.inv_eta, - cones.inv_1pw0, - d_offsets, - static_cast(dims.size())); - - auto hinv2_actual = copy_to_host(d_hinv2); - auto double_actual = copy_to_host(d_double); - expect_vector_near(hinv2_actual, double_actual, 1e-10, 1e-8, "hinv2_vs_double_hinv"); -} - -TEST_F(second_order_cone_test, 
apply_hinv2_strided_loop_for_large_cone) -{ - std::vector dims{513}; - auto offsets = build_offsets(dims); - - auto s_cone = make_patterned_cone(dims[0], 5.0, 0.005); - auto lambda_cone = make_patterned_cone(dims[0], 4.0, 0.004); - - auto d_s = make_device_vector(s_cone); - auto d_lambda = make_device_vector(lambda_cone); - cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); - - launch_nt_scaling(cones, stream_); - - auto v_cone = make_patterned_cone(dims[0], 3.0, 0.006); - auto d_v = make_device_vector(v_cone); - auto d_offsets = make_device_vector(offsets); - - // Direct H^{-2} apply - rmm::device_uvector d_hinv2(cones.omega.size(), stream_); - launch_apply_hinv2(d_v, d_hinv2, cones.w_bar, cones.inv_eta, d_offsets, 1); - - // Reference: two H^{-1} passes - rmm::device_uvector d_tmp(cones.omega.size(), stream_); - rmm::device_uvector d_double(cones.omega.size(), stream_); - launch_apply_hinv(d_v, d_tmp, cones.w_bar, cones.inv_eta, cones.inv_1pw0, d_offsets, 1); - launch_apply_hinv(d_tmp, d_double, cones.w_bar, cones.inv_eta, cones.inv_1pw0, d_offsets, 1); - - auto hinv2_actual = copy_to_host(d_hinv2); - auto double_actual = copy_to_host(d_double); - expect_vector_near(hinv2_actual, double_actual, 1e-8, 1e-6, "hinv2_large"); - - // Also check against CPU reference - auto w_bar_host = copy_to_host(cones.w_bar); - auto inv_eta_host = copy_to_host(cones.inv_eta); - auto ref = ref_apply_hinv2_single(v_cone, w_bar_host, inv_eta_host[0]); - expect_vector_near(hinv2_actual, ref, 1e-8, 1e-6, "hinv2_large_ref"); -} - TEST_F(second_order_cone_test, affine_cone_rhs_matches_hinv2_of_primal) { std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; @@ -1310,168 +1242,6 @@ TEST_F(second_order_cone_test, scatter_hinv2_into_augmented_large_cone) } } -TEST_F(second_order_cone_test, jordan_product_matches_reference_for_packed_cones) -{ - std::vector dims{1, 3, 4}; - auto offsets = build_offsets(dims); - - std::vector> 
a_cones{{2.0}, {2.0, 1.0, -0.5}, {3.0, 0.25, -0.75, 0.5}}; - std::vector> b_cones{{4.0}, {1.5, -0.5, 0.25}, {2.0, -0.25, 0.5, 1.0}}; - - auto a = make_device_vector(pack_cones(a_cones)); - auto b = make_device_vector(pack_cones(b_cones)); - auto d_offsets = make_device_vector(offsets); - rmm::device_uvector out(a.size(), stream_); - - launch_jordan_product(a, b, out, d_offsets, static_cast(dims.size())); - - auto actual = copy_to_host(out); - auto expected = - pack_cones(std::vector>{ref_jordan_product_single(a_cones[0], b_cones[0]), - ref_jordan_product_single(a_cones[1], b_cones[1]), - ref_jordan_product_single(a_cones[2], b_cones[2])}); - - expect_vector_near(actual, expected, 1e-12, 1e-10, "jordan_product"); -} - -TEST_F(second_order_cone_test, inverse_jordan_product_matches_reference_and_identity) -{ - std::vector dims{1, 3, 5}; - auto offsets = build_offsets(dims); - - std::vector> omega_cones{ - {2.0}, {2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; - std::vector> r_cones{{4.0}, {1.0, -0.25, 0.5}, {2.0, 0.5, -0.25, 0.25, 0.75}}; - - std::vector rho_host; - for (const auto& w : omega_cones) { - rho_host.push_back(j_norm_sq(w)); - } - - auto d_omega = make_device_vector(pack_cones(omega_cones)); - auto d_r = make_device_vector(pack_cones(r_cones)); - auto d_rho = make_device_vector(rho_host); - auto d_offsets = make_device_vector(offsets); - rmm::device_uvector d_out(d_omega.size(), stream_); - - launch_inverse_jordan_product( - d_omega, d_r, d_rho, d_out, d_offsets, static_cast(dims.size())); - - auto actual = copy_to_host(d_out); - auto expected = pack_cones(std::vector>{ - ref_inverse_jordan_product_single(omega_cones[0], r_cones[0], rho_host[0]), - ref_inverse_jordan_product_single(omega_cones[1], r_cones[1], rho_host[1]), - ref_inverse_jordan_product_single(omega_cones[2], r_cones[2], rho_host[2])}); - - expect_vector_near(actual, expected, 1e-12, 1e-10, "inverse_jordan_product"); - - // Identity check: omega circ (omega \ r) = r - auto d_inv = 
make_device_vector(actual); - rmm::device_uvector d_roundtrip(d_omega.size(), stream_); - launch_jordan_product(d_omega, d_inv, d_roundtrip, d_offsets, static_cast(dims.size())); - - auto roundtrip = copy_to_host(d_roundtrip); - auto r_packed = pack_cones(r_cones); - expect_vector_near(roundtrip, r_packed, 1e-10, 1e-8, "inverse_identity"); -} - -TEST_F(second_order_cone_test, jordan_and_inverse_jordan_strided_loop_for_large_cone) -{ - std::vector dims{513}; - auto offsets = build_offsets(dims); - - auto a_cone = make_patterned_cone(dims[0], 5.0, 0.005); - auto b_cone = make_patterned_cone(dims[0], 4.0, 0.004); - auto omega_cone = make_patterned_cone(dims[0], 6.0, 0.003); - f_t rho_val = j_norm_sq(omega_cone); - ASSERT_GT(rho_val, 0.0); - - std::vector> a_cones{a_cone}; - std::vector> b_cones{b_cone}; - std::vector> omega_cones{omega_cone}; - - auto d_a = make_device_vector(pack_cones(a_cones)); - auto d_b = make_device_vector(pack_cones(b_cones)); - auto d_omega = make_device_vector(pack_cones(omega_cones)); - auto d_rho = make_device_vector(std::vector{rho_val}); - auto d_offsets = make_device_vector(offsets); - - // Jordan product: strided path - rmm::device_uvector d_jp(d_a.size(), stream_); - launch_jordan_product(d_a, d_b, d_jp, d_offsets, 1); - auto jp_actual = copy_to_host(d_jp); - auto jp_expected = ref_jordan_product_single(a_cone, b_cone); - expect_vector_near(jp_actual, jp_expected, 1e-10, 1e-8, "jordan_large"); - - // Inverse Jordan product: strided path + identity - auto r_cone = make_patterned_cone(dims[0], 3.0, 0.006); - std::vector> r_cones{r_cone}; - - auto d_r = make_device_vector(pack_cones(r_cones)); - rmm::device_uvector d_inv(d_omega.size(), stream_); - launch_inverse_jordan_product(d_omega, d_r, d_rho, d_inv, d_offsets, 1); - - auto inv_actual = copy_to_host(d_inv); - auto inv_expected = ref_inverse_jordan_product_single(omega_cone, r_cone, rho_val); - expect_vector_near(inv_actual, inv_expected, 1e-10, 1e-8, "inv_jordan_large"); - - // 
Round-trip identity on the large cone - auto d_inv_vec = make_device_vector(inv_actual); - rmm::device_uvector d_rt(d_omega.size(), stream_); - launch_jordan_product(d_omega, d_inv_vec, d_rt, d_offsets, 1); - auto rt_actual = copy_to_host(d_rt); - expect_vector_near(rt_actual, r_cone, 1e-8, 1e-6, "identity_large"); -} - -TEST_F(second_order_cone_test, inverse_jordan_product_with_nt_scaling_rho) -{ - std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; - std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; - std::vector dims{3, 5}; - auto offsets = build_offsets(dims); - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - cone_data_t cones(static_cast(dims.size()), - dims, - cuopt::make_span(d_s), - cuopt::make_span(d_lambda), - stream_); - - launch_nt_scaling(cones, stream_); - - auto omega_host = copy_to_host(cones.omega); - auto rho_host = copy_to_host(cones.rho); - - // Build an arbitrary r vector, run inverse Jordan with NT-produced rho/omega - std::vector> r_cones{{1.0, -0.3, 0.2}, {0.5, 0.1, -0.15, 0.25, -0.1}}; - auto d_r = make_device_vector(pack_cones(r_cones)); - auto d_offsets = make_device_vector(offsets); - rmm::device_uvector d_out(cones.omega.size(), stream_); - - launch_inverse_jordan_product( - cones.omega, d_r, cones.rho, d_out, d_offsets, static_cast(dims.size())); - - auto inv_actual = copy_to_host(d_out); - - // Verify host reference matches using NT-produced values - for (i_t c = 0; c < static_cast(dims.size()); ++c) { - auto omega_c = slice_cone(omega_host, offsets, c); - auto r_c = r_cones[c]; - auto ref = ref_inverse_jordan_product_single(omega_c, r_c, rho_host[c]); - auto actual = slice_cone(inv_actual, offsets, c); - expect_vector_near(actual, ref, 1e-10, 1e-8, "nt_rho_inv_jordan"); - } - - // Round-trip identity with NT-produced omega - rmm::device_uvector d_rt(cones.omega.size(), stream_); - auto d_inv = 
make_device_vector(inv_actual); - launch_jordan_product(cones.omega, d_inv, d_rt, d_offsets, static_cast(dims.size())); - auto rt_actual = copy_to_host(d_rt); - auto r_packed = pack_cones(r_cones); - expect_vector_near(rt_actual, r_packed, 1e-8, 1e-6, "nt_identity"); -} - TEST_F(second_order_cone_test, fused_corrector_matches_reference_with_nt_scaling) { std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; @@ -1595,7 +1365,7 @@ TEST_F(second_order_cone_test, cone_block_scatter_with_q_overlap) } } -TEST_F(second_order_cone_test, recover_cone_dz_matches_reference) +TEST_F(second_order_cone_test, recover_cone_dz_from_target_matches_reference) { std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; @@ -1615,13 +1385,19 @@ TEST_F(second_order_cone_test, recover_cone_dz_matches_reference) rmm::device_uvector d_rhs(cones.m_c, stream_); launch_affine_cone_rhs(cones, d_rhs); + auto rhs_actual = copy_to_host(d_rhs); + + std::vector target_host(rhs_actual.size(), f_t(0)); + for (std::size_t j = 0; j < rhs_actual.size(); ++j) { + target_host[j] = -rhs_actual[j]; + } + auto d_target = make_device_vector(target_host); rmm::device_uvector d_hinv2_dx(cones.m_c, stream_); rmm::device_uvector d_dz(cones.m_c, stream_); - launch_recover_cone_dz(d_dx, cones, d_rhs, d_hinv2_dx, d_dz); + launch_recover_cone_dz_from_target(d_dx, cones, d_target, d_hinv2_dx, d_dz); auto actual = copy_to_host(d_dz); - auto rhs_actual = copy_to_host(d_rhs); auto w_bar_host = copy_to_host(cones.w_bar); auto inv_eta_h = copy_to_host(cones.inv_eta); @@ -1634,7 +1410,7 @@ TEST_F(second_order_cone_test, recover_cone_dz_matches_reference) for (i_t j = 0; j < static_cast(ref.size()); ++j) { ref[j] = -rhs_c[j] - ref_hinv2[j]; } - expect_vector_near(act, ref, 1e-10, 1e-8, "recover_cone_dz"); + expect_vector_near(act, ref, 1e-10, 1e-8, "recover_cone_dz_from_target"); } } diff --git 
a/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu b/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu index 1bf604bff8..b558b8d50e 100644 --- a/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu +++ b/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu @@ -175,7 +175,7 @@ TEST(barrier, dual_variable_greater_than) EXPECT_NEAR(solution.z[1], 0.0, 1e-5); } -TEST(barrier, cone_metadata_preserved_through_barrier_setup) +TEST(barrier, cone_metadata_reindexed_when_slack_is_inserted_before_cones) { raft::handle_t handle{}; init_handler(&handle); @@ -219,8 +219,11 @@ TEST(barrier, cone_metadata_preserved_through_barrier_setup) lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); + ASSERT_EQ(new_slacks.size(), 1); + EXPECT_EQ(new_slacks[0], 1); + EXPECT_EQ(original_lp.num_cols, 6); EXPECT_EQ(original_lp.second_order_cone_dims, user_problem.second_order_cone_dims); - EXPECT_EQ(original_lp.cone_var_start, user_problem.cone_var_start); + EXPECT_EQ(original_lp.cone_var_start, 2); lp_problem_t barrier_lp(user_problem.handle_ptr, original_lp.num_rows, @@ -230,7 +233,209 @@ TEST(barrier, cone_metadata_preserved_through_barrier_setup) column_scaling(original_lp, settings, barrier_lp, column_scales); EXPECT_EQ(barrier_lp.second_order_cone_dims, user_problem.second_order_cone_dims); - EXPECT_EQ(barrier_lp.cone_var_start, user_problem.cone_var_start); + EXPECT_EQ(barrier_lp.cone_var_start, 2); +} + +TEST(barrier, presolve_reindexes_cone_start_after_empty_column_removal) +{ + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 1; + constexpr int n = 4; + constexpr int nz = 3; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {1.0, 0.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + 
user_problem.A.reallocate(nz); + user_problem.A.col_start = {0, 0, 1, 2, 3}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 0; + user_problem.A.x[1] = -1.0; + user_problem.A.i[2] = 0; + user_problem.A.x[2] = 0.5; + + user_problem.rhs = {1.0}; + user_problem.row_sense = {'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + user_problem.num_range_rows = 0; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.scale_columns = false; + + std::vector new_slacks; + dualize_info_t dualize_info; + lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); + convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); + + presolve_info_t presolve_info; + lp_problem_t presolved_lp(user_problem.handle_ptr, 1, 1, 1); + ASSERT_EQ(presolve(original_lp, settings, presolved_lp, presolve_info), 0); + + EXPECT_EQ(presolved_lp.num_cols, 3); + EXPECT_EQ(presolved_lp.second_order_cone_dims, std::vector({3})); + EXPECT_EQ(presolved_lp.cone_var_start, 0); + + lp_problem_t barrier_lp(user_problem.handle_ptr, + presolved_lp.num_rows, + presolved_lp.num_cols, + presolved_lp.A.col_start[presolved_lp.num_cols]); + std::vector column_scales; + ASSERT_EQ(column_scaling(presolved_lp, settings, barrier_lp, column_scales), 0); + EXPECT_EQ(barrier_lp.cone_var_start, 0); +} + +TEST(barrier, presolve_packs_free_variable_partner_before_cones) +{ + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 1; + constexpr int n = 5; + constexpr int nz = 5; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 0.0, 0.0, 0.0, 0.0}; + + 
user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start = {0, 1, 2, 3, 4, 5}; + for (int j = 0; j < n; ++j) { + user_problem.A.i[j] = 0; + user_problem.A.x[j] = 1.0; + } + + user_problem.rhs = {1.0}; + user_problem.row_sense = {'E'}; + // Two free linear vars ensure the new implied-bound pass cannot fully + // eliminate the free-variable expansion path before the cone block. + user_problem.lower = {-inf, -inf, 0.0, 0.0, 0.0}; + user_problem.upper.assign(n, inf); + user_problem.num_range_rows = 0; + user_problem.cone_var_start = 2; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.scale_columns = false; + + std::vector new_slacks; + dualize_info_t dualize_info; + lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); + convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); + + presolve_info_t presolve_info; + lp_problem_t presolved_lp(user_problem.handle_ptr, 1, 1, 1); + ASSERT_EQ(presolve(original_lp, settings, presolved_lp, presolve_info), 0); + + EXPECT_EQ(presolved_lp.num_cols, 7); + EXPECT_EQ(presolved_lp.cone_var_start, 4); + EXPECT_EQ(presolved_lp.second_order_cone_dims, std::vector({3})); + ASSERT_EQ(presolve_info.free_variable_pairs.size(), 4); + EXPECT_EQ(presolve_info.free_variable_pairs[0], 0); + EXPECT_EQ(presolve_info.free_variable_pairs[1], 2); + EXPECT_EQ(presolve_info.free_variable_pairs[2], 1); + EXPECT_EQ(presolve_info.free_variable_pairs[3], 3); +} + +TEST(barrier, uncrush_solution_removes_non_tail_free_variable_partner) +{ + using namespace cuopt::linear_programming::dual_simplex; + + presolve_info_t presolve_info; + presolve_info.free_variable_pairs = {0, 1}; + + simplex_solver_settings_t settings; + std::vector crushed_x{5.0, 2.0, 9.0, 
8.0}; + std::vector crushed_y{}; + std::vector crushed_z{7.0, 11.0, 13.0, 17.0}; + std::vector uncrushed_x(3); + std::vector uncrushed_y(0); + std::vector uncrushed_z(3); + + uncrush_solution(presolve_info, + settings, + crushed_x, + crushed_y, + crushed_z, + uncrushed_x, + uncrushed_y, + uncrushed_z); + + EXPECT_EQ(uncrushed_x, std::vector({3.0, 9.0, 8.0})); + EXPECT_EQ(uncrushed_z, std::vector({7.0, 13.0, 17.0})); +} + +TEST(barrier, rejects_middle_cone_input_before_barrier) +{ + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 3; + constexpr int n = 5; + constexpr int nz = 3; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {1.0, 0.0, 0.0, 0.0, 1.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start = {0, 1, 1, 2, 2, 3}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 1; + user_problem.A.x[1] = 1.0; + user_problem.A.i[2] = 2; + user_problem.A.x[2] = 1.0; + + user_problem.rhs = {2.0, 1.0, 3.0}; + user_problem.row_sense = {'E', 'E', 'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + user_problem.num_range_rows = 0; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + lp_solution_t solution(m, n); + + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + EXPECT_EQ(status, lp_status_t::NUMERICAL_ISSUES); } TEST(barrier, socp_min_x0_subject_to_norm_constraint) @@ -305,4 +510,497 @@ TEST(barrier, socp_min_x0_subject_to_norm_constraint) EXPECT_NEAR(std::abs(solution.x[2]), 0.0, 1e-4); } +TEST(barrier, 
mixed_linear_and_soc_block) +{ + // Variables ordered as [l | t, u, v], where (t, u, v) \in Q^3. + // + // minimize l + // subject to l - t = 0 + // u = 1 + // (t, u, v) in Q^3 + // + // Optimal: l* = 1, t* = 1, u* = 1, v* = 0, obj* = 1. + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 2; + constexpr int n = 4; + constexpr int nz = 4; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {1.0, 0.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l, t, u, v + user_problem.A.col_start = {0, 1, 2, 3, 3}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 0; + user_problem.A.x[1] = -1.0; + user_problem.A.i[2] = 1; + user_problem.A.x[2] = 1.0; + + user_problem.rhs = {0.0, 1.0}; + user_problem.row_sense = {'E', 'E'}; + + user_problem.lower = {0.0, 0.0, 0.0, 0.0}; + user_problem.upper = {inf, inf, inf, inf}; + + user_problem.num_range_rows = 0; + user_problem.problem_name = "mixed_linear_and_soc_block"; + + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = false; + settings.dualize = 0; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[3]), 0.0, 1e-4); +} + +TEST(barrier, mixed_linear_and_soc_tail_coupling) +{ + // Variables ordered as [l | t, u, v], where (t, u, v) \in Q^3. 
+ // + // minimize t + // subject to l - u = 0 + // l + u = 2 + // (t, u, v) in Q^3 + // + // Optimal: l* = 1, t* = 1, u* = 1, v* = 0, obj* = 1. + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 2; + constexpr int n = 4; + constexpr int nz = 4; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l, t, u, v + user_problem.A.col_start = {0, 2, 2, 4, 4}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 1; + user_problem.A.x[1] = 1.0; + user_problem.A.i[2] = 0; + user_problem.A.x[2] = -1.0; + user_problem.A.i[3] = 1; + user_problem.A.x[3] = 1.0; + + user_problem.rhs = {0.0, 2.0}; + user_problem.row_sense = {'E', 'E'}; + user_problem.lower = {0.0, 0.0, 0.0, 0.0}; + user_problem.upper = {inf, inf, inf, inf}; + + user_problem.num_range_rows = 0; + user_problem.problem_name = "mixed_linear_and_soc_tail_coupling"; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = false; + settings.dualize = 0; + settings.scale_columns = true; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[3]), 0.0, 1e-4); +} + +TEST(barrier, mixed_linear_and_soc_tail_coupling_with_inequality) +{ + // Variables ordered as [l | t, u, v], where (t, u, v) \in Q^3. 
+ // + // minimize t + // subject to l - u = 0 + // l + u >= 2 + // (t, u, v) in Q^3 + // + // Optimal: l* = 1, t* = 1, u* = 1, v* = 0, obj* = 1. + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 2; + constexpr int n = 4; + constexpr int nz = 4; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l, t, u, v + user_problem.A.col_start = {0, 2, 2, 4, 4}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 1; + user_problem.A.x[1] = 1.0; + user_problem.A.i[2] = 0; + user_problem.A.x[2] = -1.0; + user_problem.A.i[3] = 1; + user_problem.A.x[3] = 1.0; + + user_problem.rhs = {0.0, 2.0}; + user_problem.row_sense = {'E', 'G'}; + user_problem.lower = {0.0, 0.0, 0.0, 0.0}; + user_problem.upper = {inf, inf, inf, inf}; + + user_problem.num_range_rows = 0; + user_problem.problem_name = "mixed_linear_and_soc_tail_coupling_with_inequality"; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = false; + settings.dualize = 0; + settings.scale_columns = true; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[3]), 0.0, 1e-4); +} + +TEST(barrier, mixed_linear_and_two_soc_blocks) +{ + // Variables ordered as [l1, l2 | t1, u1, v1 | t2, u2, v2], + // where (t1, 
u1, v1), (t2, u2, v2) \in Q^3. + // + // minimize t1 + t2 + // subject to l1 - u1 = 0 + // l2 - u2 = 0 + // l1 + l2 = 3 + // l1 - l2 = 1 + // + // Optimal: l1* = 2, l2* = 1, t1* = 2, u1* = 2, v1* = 0, + // t2* = 1, u2* = 1, v2* = 0, obj* = 3. + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 4; + constexpr int n = 8; + constexpr int nz = 8; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l1, l2, t1, u1, v1, t2, u2, v2 + user_problem.A.col_start = {0, 3, 6, 6, 7, 7, 7, 8, 8}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 2; + user_problem.A.x[1] = 1.0; + user_problem.A.i[2] = 3; + user_problem.A.x[2] = 1.0; + user_problem.A.i[3] = 1; + user_problem.A.x[3] = 1.0; + user_problem.A.i[4] = 2; + user_problem.A.x[4] = 1.0; + user_problem.A.i[5] = 3; + user_problem.A.x[5] = -1.0; + user_problem.A.i[6] = 0; + user_problem.A.x[6] = -1.0; + user_problem.A.i[7] = 1; + user_problem.A.x[7] = -1.0; + + user_problem.rhs = {0.0, 0.0, 3.0, 1.0}; + user_problem.row_sense = {'E', 'E', 'E', 'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + + user_problem.num_range_rows = 0; + user_problem.problem_name = "mixed_linear_and_two_soc_blocks"; + user_problem.cone_var_start = 2; + user_problem.second_order_cone_dims = {3, 3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = false; + settings.dualize = 0; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + 
EXPECT_NEAR(solution.objective, 3.0, 1e-4); + EXPECT_NEAR(solution.x[0], 2.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 2.0, 1e-4); + EXPECT_NEAR(solution.x[3], 2.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[4]), 0.0, 1e-4); + EXPECT_NEAR(solution.x[5], 1.0, 1e-4); + EXPECT_NEAR(solution.x[6], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[7]), 0.0, 1e-4); +} + +TEST(barrier, mixed_linear_and_two_soc_blocks_with_inequality) +{ + // Variables ordered as [l1, l2 | t1, u1, v1 | t2, u2, v2], + // where (t1, u1, v1), (t2, u2, v2) \in Q^3. + // + // minimize t1 + t2 + // subject to l1 - u1 = 0 + // l2 - u2 = 0 + // l1 + l2 >= 3 + // l1 - l2 = 1 + // + // Optimal: l1* = 2, l2* = 1, t1* = 2, u1* = 2, v1* = 0, + // t2* = 1, u2* = 1, v2* = 0, obj* = 3. + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 4; + constexpr int n = 8; + constexpr int nz = 8; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l1, l2, t1, u1, v1, t2, u2, v2 + user_problem.A.col_start = {0, 3, 6, 6, 7, 7, 7, 8, 8}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 2; + user_problem.A.x[1] = 1.0; + user_problem.A.i[2] = 3; + user_problem.A.x[2] = 1.0; + user_problem.A.i[3] = 1; + user_problem.A.x[3] = 1.0; + user_problem.A.i[4] = 2; + user_problem.A.x[4] = 1.0; + user_problem.A.i[5] = 3; + user_problem.A.x[5] = -1.0; + user_problem.A.i[6] = 0; + user_problem.A.x[6] = -1.0; + user_problem.A.i[7] = 1; + user_problem.A.x[7] = -1.0; + + user_problem.rhs = {0.0, 0.0, 3.0, 1.0}; + user_problem.row_sense = {'E', 'E', 'G', 'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + + 
user_problem.num_range_rows = 0; + user_problem.problem_name = "mixed_linear_and_two_soc_blocks_with_inequality"; + user_problem.cone_var_start = 2; + user_problem.second_order_cone_dims = {3, 3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = false; + settings.dualize = 0; + settings.scale_columns = true; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 3.0, 1e-4); + EXPECT_NEAR(solution.x[0], 2.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 2.0, 1e-4); + EXPECT_NEAR(solution.x[3], 2.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[4]), 0.0, 1e-4); + EXPECT_NEAR(solution.x[5], 1.0, 1e-4); + EXPECT_NEAR(solution.x[6], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[7]), 0.0, 1e-4); +} + +TEST(barrier, free_linear_prefix_is_uncrushed_correctly_with_soc_block) +{ + // Variables ordered as [l | t, u, v], where (t, u, v) \in Q^3 and l is free. + // + // minimize t + // subject to l - u = 0 + // u = 1 + // (t, u, v) in Q^3 + // + // Presolve splits the free linear variable into a partner column before the + // cone block, so the returned user-space solution must uncrush back to + // l* = 1, t* = 1, u* = 1, v* = 0, obj* = 1. 
+ raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 2; + constexpr int n = 4; + constexpr int nz = 3; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l, t, u, v + user_problem.A.col_start = {0, 1, 1, 3, 3}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 0; + user_problem.A.x[1] = -1.0; + user_problem.A.i[2] = 1; + user_problem.A.x[2] = 1.0; + + user_problem.rhs = {0.0, 1.0}; + user_problem.row_sense = {'E', 'E'}; + user_problem.lower = {-inf, 0.0, 0.0, 0.0}; + user_problem.upper = {inf, inf, inf, inf}; + + user_problem.num_range_rows = 0; + user_problem.problem_name = "free_linear_prefix_is_uncrushed_correctly_with_soc_block"; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.dualize = 0; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[3]), 0.0, 1e-4); +} + +TEST(barrier, qp_with_soc_block) +{ + // Variables ordered as [l | t, u, v], where (t, u, v) \in Q^3. + // + // minimize 0.5 l^2 + t + // subject to l + u = 2 + // (t, u, v) in Q^3 + // + // Since t >= |u| and u = 2 - l with l >= 0, the objective becomes + // 0.5 l^2 + |2 - l|, which is minimized at l* = 1, u* = 1, t* = 1, v* = 0. 
+ raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 1; + constexpr int n = 4; + constexpr int nz = 2; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l, t, u, v + user_problem.A.col_start = {0, 1, 1, 2, 2}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 0; + user_problem.A.x[1] = 1.0; + + user_problem.rhs = {2.0}; + user_problem.row_sense = {'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + + user_problem.Q_offsets = {0, 1, 1, 1, 1}; + user_problem.Q_indices = {0}; + user_problem.Q_values = {1.0}; + + user_problem.num_range_rows = 0; + user_problem.problem_name = "qp_with_soc_block"; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.dualize = 0; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.5, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[3]), 0.0, 1e-4); +} + } // namespace cuopt::linear_programming::dual_simplex::test From a5dba68c29e39948b639265a18700d212fc616c0 Mon Sep 17 00:00:00 2001 From: Yan Zaretskiy Date: Tue, 7 Apr 2026 13:12:34 -0700 Subject: [PATCH 05/22] test(dual_simplex): expand SOCP barrier coverage and tighten layout checks Add row-cone and presolve regression cases around the SOCP barrier path, clean up the PR-facing test wording, and align 
the cone layout validation with the shared infinity convention. Signed-off-by: Yan Zaretskiy --- cpp/src/barrier/barrier.cu | 2 +- cpp/src/dual_simplex/presolve.cpp | 67 ++- cpp/src/dual_simplex/solve.cpp | 114 +++-- cpp/src/dual_simplex/user_problem.hpp | 2 + .../unit_tests/second_order_cone_test.cu | 9 +- .../dual_simplex/unit_tests/solve_barrier.cu | 420 +++++++++++++++++- 6 files changed, 561 insertions(+), 53 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 1d937a4c5e..ae1364f027 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -2048,7 +2048,7 @@ int barrier_solver_t::initial_point(iteration_data_t& data) data.v[k] = -c[j] + epsilon; } } - // Now hande the case with no upper bounds (skip cone variables) + // Now handle the case with no upper bounds (skip cone variables) const i_t cone_end = has_cones ? data.cone_var_start_ + data.cones_->m_c : 0; for (i_t j = 0; j < lp.num_cols; j++) { if (has_cones && j >= data.cone_var_start_ && j < cone_end) continue; diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp index 90ae6ff74e..7adee4525a 100644 --- a/cpp/src/dual_simplex/presolve.cpp +++ b/cpp/src/dual_simplex/presolve.cpp @@ -25,6 +25,68 @@ static i_t linear_var_count(const lp_problem_t& problem) return problem.second_order_cone_dims.empty() ? 
problem.num_cols : problem.cone_var_start; } +template +static void lift_second_order_cone_rows(const user_problem_t& user_problem, + std::vector& row_sense, + lp_problem_t& problem) +{ + if (user_problem.second_order_cone_row_dims.empty()) { return; } + + auto& dims = user_problem.second_order_cone_row_dims; + const i_t lifted_row_count = std::accumulate(dims.begin(), dims.end(), i_t{0}); + const i_t cone_row_start = user_problem.cone_row_start; + + const i_t old_num_cols = problem.num_cols; + const i_t new_num_cols = old_num_cols + lifted_row_count; + const i_t old_nnz = problem.A.col_start[old_num_cols]; + const i_t new_nnz = old_nnz + lifted_row_count; + + auto old_A = problem.A; + csc_matrix_t lifted_A(problem.num_rows, new_num_cols, new_nnz); + + i_t nz = 0; + for (i_t j = 0; j < old_num_cols; ++j) { + lifted_A.col_start[j] = nz; + for (i_t p = old_A.col_start[j]; p < old_A.col_start[j + 1]; ++p) { + lifted_A.i[nz] = old_A.i[p]; + lifted_A.x[nz] = old_A.x[p]; + ++nz; + } + } + + for (i_t offset = 0; offset < lifted_row_count; ++offset) { + const i_t j = old_num_cols + offset; + const i_t row = cone_row_start + offset; + lifted_A.col_start[j] = nz; + lifted_A.i[nz] = row; + lifted_A.x[nz] = 1.0; + row_sense[row] = 'E'; + ++nz; + } + lifted_A.col_start[new_num_cols] = nz; + assert(nz == new_nnz); + + std::vector objective(new_num_cols, 0.0); + std::vector lower(new_num_cols, 0.0); + std::vector upper(new_num_cols, inf); + for (i_t j = 0; j < old_num_cols; ++j) { + objective[j] = problem.objective[j]; + lower[j] = problem.lower[j]; + upper[j] = problem.upper[j]; + } + + problem.A = lifted_A; + problem.A.n = new_num_cols; + problem.objective = objective; + problem.lower = lower; + problem.upper = upper; + problem.num_cols = new_num_cols; + if (problem.second_order_cone_dims.empty()) { problem.cone_var_start = old_num_cols; } + problem.second_order_cone_dims.insert(problem.second_order_cone_dims.end(), + user_problem.second_order_cone_row_dims.begin(), + 
user_problem.second_order_cone_row_dims.end()); +} + template i_t remove_empty_cols(lp_problem_t& problem, i_t& num_empty_cols, @@ -697,6 +759,9 @@ void convert_user_problem(const user_problem_t& user_problem, // Make a copy of row_sense so we can modify it std::vector row_sense = user_problem.row_sense; + if (settings.barrier && !user_problem.second_order_cone_row_dims.empty()) { + lift_second_order_cone_rows(user_problem, row_sense, problem); + } // The original problem can have constraints in the form // a_i^T x >= b, a_i^T x <= b, and a_i^T x == b @@ -750,7 +815,7 @@ void convert_user_problem(const user_problem_t& user_problem, settings.log.debug( "equality rows %d less rows %d columns %d\n", equal_rows, less_rows, problem.num_cols); if (settings.barrier && settings.dualize != 0 && user_problem.Q_values.size() == 0 && - user_problem.second_order_cone_dims.empty() && + problem.second_order_cone_dims.empty() && (settings.dualize == 1 || (settings.dualize == -1 && less_rows > 1.2 * problem.num_cols && equal_rows < 2e4))) { settings.log.debug("Dualizing in presolve\n"); diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index 544647d5d0..021e0e8a20 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -37,6 +37,48 @@ namespace cuopt::linear_programming::dual_simplex { namespace { +template +bool validate_second_order_cone_row_metadata(const user_problem_t& user_problem, + const simplex_solver_settings_t& settings) +{ + if (user_problem.second_order_cone_row_dims.empty()) { return true; } + + i_t lifted_row_count = 0; + for (auto q_k : user_problem.second_order_cone_row_dims) { + if (q_k < 0) { + settings.log.printf("Error: second-order cone row dimensions must be nonnegative\n"); + return false; + } + lifted_row_count += q_k; + } + + if (user_problem.cone_row_start < 0) { + settings.log.printf("Error: cone_row_start must be nonnegative\n"); + return false; + } + + const i_t cone_row_end = 
user_problem.cone_row_start + lifted_row_count; + if (cone_row_end > user_problem.num_rows) { + settings.log.printf("Error: second-order cone row block exceeds the number of rows\n"); + return false; + } + + if (user_problem.num_range_rows > static_cast(user_problem.range_rows.size())) { + settings.log.printf("Error: range row metadata is inconsistent\n"); + return false; + } + + for (i_t k = 0; k < user_problem.num_range_rows; ++k) { + const i_t row = user_problem.range_rows[k]; + if (row >= user_problem.cone_row_start && row < cone_row_end) { + settings.log.printf("Error: range rows cannot intersect the second-order cone row block\n"); + return false; + } + } + + return true; +} + template void write_matlab(const std::string& filename, const dual_simplex::lp_problem_t& lp) { @@ -61,6 +103,42 @@ void write_matlab(const std::string& filename, const dual_simplex::lp_problem_t< fclose(fid); } +template +bool validate_barrier_cone_layout(const lp_problem_t& problem, + const simplex_solver_settings_t& settings) +{ + if (problem.second_order_cone_dims.empty()) { return true; } + + i_t cone_end = problem.cone_var_start; + for (auto q_k : problem.second_order_cone_dims) { + if (q_k <= 1) { + settings.log.printf( + "Error: second-order cone dimensions must be at least 2; use linear variables instead of " + "Q^1\n"); + return false; + } + cone_end += q_k; + } + + if (cone_end != problem.num_cols) { + settings.log.printf("Error: conic variables must form a trailing block [linear | cone]\n"); + return false; + } + + for (i_t j = problem.cone_var_start; j < cone_end; ++j) { + if (problem.lower[j] != 0.0 && problem.lower[j] > -inf) { + settings.log.printf("Error: explicit lower bound on conic variable %d is not supported\n", j); + return false; + } + if (problem.upper[j] < inf) { + settings.log.printf("Error: explicit upper bound on conic variable %d is not supported\n", j); + return false; + } + } + + return true; +} + } // namespace template @@ -344,35 +422,12 @@ lp_status_t 
solve_linear_program_with_barrier(const user_problem_t& us simplex_solver_settings_t barrier_settings = settings; barrier_settings.barrier_presolve = true; dualize_info_t dualize_info; + if (!validate_second_order_cone_row_metadata(user_problem, settings)) { + return lp_status_t::NUMERICAL_ISSUES; + } convert_user_problem(user_problem, barrier_settings, original_lp, new_slacks, dualize_info); - - if (!user_problem.second_order_cone_dims.empty()) { - i_t cone_end = user_problem.cone_var_start; - for (auto q_k : user_problem.second_order_cone_dims) { - if (q_k <= 1) { - settings.log.printf( - "Error: second-order cone dimensions must be at least 2; use linear variables instead of " - "Q^1\n"); - return lp_status_t::NUMERICAL_ISSUES; - } - cone_end += q_k; - } - if (cone_end != user_problem.num_cols) { - settings.log.printf("Error: conic variables must form a trailing block [linear | cone]\n"); - return lp_status_t::NUMERICAL_ISSUES; - } - for (i_t j = user_problem.cone_var_start; j < cone_end; ++j) { - if (user_problem.lower[j] != 0.0 && user_problem.lower[j] > -1e30) { - settings.log.printf("Error: explicit lower bound on conic variable %d is not supported\n", - j); - return lp_status_t::NUMERICAL_ISSUES; - } - if (user_problem.upper[j] < 1e30) { - settings.log.printf("Error: explicit upper bound on conic variable %d is not supported\n", - j); - return lp_status_t::NUMERICAL_ISSUES; - } - } + if (!validate_barrier_cone_layout(original_lp, settings)) { + return lp_status_t::NUMERICAL_ISSUES; } lp_solution_t lp_solution(original_lp.num_rows, original_lp.num_cols); @@ -604,7 +659,8 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us uncrush_primal_solution(user_problem, original_lp, lp_solution.x, solution.x); uncrush_dual_solution( user_problem, original_lp, lp_solution.y, lp_solution.z, solution.y, solution.z); - solution.objective = barrier_solution.objective; + solution.objective = + barrier_solution.user_objective / user_problem.obj_scale - 
user_problem.obj_constant; solution.user_objective = barrier_solution.user_objective; solution.l2_primal_residual = barrier_solution.l2_primal_residual; solution.l2_dual_residual = barrier_solution.l2_dual_residual; diff --git a/cpp/src/dual_simplex/user_problem.hpp b/cpp/src/dual_simplex/user_problem.hpp index 8b0588064c..db62891185 100644 --- a/cpp/src/dual_simplex/user_problem.hpp +++ b/cpp/src/dual_simplex/user_problem.hpp @@ -54,6 +54,8 @@ struct user_problem_t { std::vector Q_values; i_t cone_var_start{0}; std::vector second_order_cone_dims; + i_t cone_row_start{0}; + std::vector second_order_cone_row_dims; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu b/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu index e6f48ac9de..b3e8974e8b 100644 --- a/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu +++ b/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu @@ -614,8 +614,7 @@ TEST_F(second_order_cone_test, cone_data_reuses_named_scratch_slots) TEST_F(second_order_cone_test, nt_scaling_matches_reference_for_small_cone) { - // Borrowed from the Clarabel regression input, but checked against our own - // host-side NT formulas. + // Fixed small-cone fixture validated against the host-side NT formulas. std::vector> s_cones{{1.5, 0.3, 0.4}}; std::vector> lambda_cones{{2.0, 0.5, 0.5}}; std::vector dims{3}; @@ -855,15 +854,15 @@ TEST_F(second_order_cone_test, step_length_matches_reference_for_large_cone) EXPECT_LT(actual_alpha[0], alpha_max); } -TEST_F(second_order_cone_test, step_length_boundary_c_zero_matches_clarabel_branch) +TEST_F(second_order_cone_test, step_length_boundary_c_zero_returns_zero) { std::vector dims{3}; auto offsets = build_offsets(dims); // Boundary point: c = u^T J u = 1^2 - 1^2 - 0^2 = 0. // Direction: a = du^T J du = 1^2 - 1^2 - 1^2 = -1 < 0. 
- // Clarabel's c == 0 branch returns 0 in this case because the direction - // leaves the cone immediately. + // The step length is 0 in this case because the direction leaves the cone + // immediately. std::vector> s_cones{{1.0, 1.0, 0.0}}; std::vector> ds_cones{{1.0, 1.0, 1.0}}; std::vector> lambda_cones{{1.0, 1.0, 0.0}}; diff --git a/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu b/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu index b558b8d50e..1b4a866221 100644 --- a/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu +++ b/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu @@ -5,8 +5,6 @@ */ /* clang-format on */ -#include - #include #include @@ -34,6 +32,72 @@ static void init_handler(const raft::handle_t* handle_ptr) handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); } +template +static void populate_basic_qp_socp_problem(user_problem_t& user_problem, + bool explicit_cone_variables) +{ + constexpr i_t num_rows = 9; + constexpr f_t p00 = static_cast(1.4652521089139698); + constexpr f_t p01 = static_cast(0.6137176286085666); + constexpr f_t p02 = static_cast(-1.1527861771130112); + constexpr f_t p11 = static_cast(2.219109946678485); + constexpr f_t p12 = static_cast(-1.4400420548730628); + constexpr f_t p22 = static_cast(1.6014483534926371); + + user_problem.num_rows = num_rows; + user_problem.rhs = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0}; + user_problem.row_sense = {'L', 'L', 'L', 'L', 'L', 'L', 'E', 'E', 'E'}; + user_problem.num_range_rows = 0; + + if (explicit_cone_variables) { + user_problem.num_cols = 6; + user_problem.objective = {0.1, -2.0, 1.0, 0.0, 0.0, 0.0}; + + user_problem.A.m = num_rows; + user_problem.A.n = user_problem.num_cols; + user_problem.A.nz_max = 12; + user_problem.A.reallocate(12); + user_problem.A.col_start = {0, 3, 6, 9, 10, 11, 12}; + user_problem.A.i = {0, 3, 6, 1, 4, 7, 2, 5, 8, 6, 7, 8}; + user_problem.A.x = {2.0, -2.0, 1.0, 2.0, -2.0, 1.0, 2.0, -2.0, 1.0, 1.0, 1.0, 1.0}; + 
+ user_problem.lower = {-inf, -inf, -inf, 0.0, 0.0, 0.0}; + user_problem.upper.assign(user_problem.num_cols, inf); + + user_problem.Q_offsets = {0, 3, 6, 9, 9, 9, 9}; + user_problem.Q_indices = {0, 1, 2, 0, 1, 2, 0, 1, 2}; + user_problem.Q_values = {p00, p01, p02, p01, p11, p12, p02, p12, p22}; + + user_problem.cone_var_start = 3; + user_problem.second_order_cone_dims = {3}; + user_problem.problem_name = "basic_qp_socp_explicit_cone"; + } else { + user_problem.num_cols = 3; + user_problem.objective = {0.1, -2.0, 1.0}; + + user_problem.A.m = num_rows; + user_problem.A.n = user_problem.num_cols; + user_problem.A.nz_max = 9; + user_problem.A.reallocate(9); + user_problem.A.col_start = {0, 3, 6, 9}; + user_problem.A.i = {0, 3, 6, 1, 4, 7, 2, 5, 8}; + user_problem.A.x = {2.0, -2.0, 1.0, 2.0, -2.0, 1.0, 2.0, -2.0, 1.0}; + + user_problem.lower.assign(user_problem.num_cols, -inf); + user_problem.upper.assign(user_problem.num_cols, inf); + + user_problem.Q_offsets = {0, 3, 6, 9}; + user_problem.Q_indices = {0, 1, 2, 0, 1, 2, 0, 1, 2}; + user_problem.Q_values = {p00, p01, p02, p01, p11, p12, p02, p12, p22}; + + user_problem.cone_row_start = 6; + user_problem.second_order_cone_row_dims = {3}; + user_problem.problem_name = "basic_qp_socp_row_cone"; + } + + user_problem.var_types.assign(user_problem.num_cols, variable_type_t::CONTINUOUS); +} + TEST(barrier, chess_set) { namespace dual_simplex = cuopt::linear_programming::dual_simplex; @@ -140,7 +204,6 @@ TEST(barrier, dual_variable_greater_than) user_problem.A.x[nnz++] = 1.0; user_problem.A.i[nnz] = 1; user_problem.A.x[nnz++] = 2.0; - user_problem.A.print_matrix(); EXPECT_EQ(nnz, nz); user_problem.rhs.resize(m); @@ -210,7 +273,7 @@ TEST(barrier, cone_metadata_reindexed_when_slack_is_inserted_before_cones) simplex_solver_settings_t settings; settings.barrier = true; - settings.barrier_presolve = false; + settings.barrier_presolve = true; settings.dualize = 0; settings.scale_columns = false; @@ -236,6 +299,225 @@ TEST(barrier, 
cone_metadata_reindexed_when_slack_is_inserted_before_cones) EXPECT_EQ(barrier_lp.cone_var_start, 2); } +TEST(barrier, row_cone_block_is_lifted_into_trailing_cone_variables) +{ + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 3; + constexpr int n = 2; + constexpr int nz = 4; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start = {0, 2, 4}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 2; + user_problem.A.x[1] = 1.0; + user_problem.A.i[2] = 1; + user_problem.A.x[2] = -1.0; + user_problem.A.i[3] = 2; + user_problem.A.x[3] = 2.0; + + user_problem.rhs = {3.0, 1.0, 4.0}; + user_problem.row_sense = {'E', 'E', 'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + user_problem.num_range_rows = 0; + user_problem.cone_row_start = 0; + user_problem.second_order_cone_row_dims = {3}; + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.scale_columns = false; + + std::vector new_slacks; + dualize_info_t dualize_info; + lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); + convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); + + EXPECT_TRUE(new_slacks.empty()); + EXPECT_EQ(original_lp.num_cols, 5); + EXPECT_EQ(original_lp.cone_var_start, 2); + EXPECT_EQ(original_lp.second_order_cone_dims, std::vector({3})); + + for (int j = 2; j < 5; ++j) { + EXPECT_EQ(original_lp.A.col_start[j + 1] - original_lp.A.col_start[j], 1); + EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[j]], j - 2); + EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[j]], 1.0); + EXPECT_EQ(original_lp.objective[j], 0.0); + 
EXPECT_EQ(original_lp.lower[j], 0.0); + EXPECT_EQ(original_lp.upper[j], inf); + } +} + +TEST(barrier, row_cone_block_and_scalar_inequality_order_as_linear_slack_then_cone) +{ + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 4; + constexpr int n = 1; + constexpr int nz = 2; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {1.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start = {0, 2}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 1; + user_problem.A.x[1] = -1.0; + + user_problem.rhs = {2.0, 0.0, 1.0, 0.0}; + user_problem.row_sense = {'L', 'E', 'E', 'E'}; + user_problem.lower = {0.0}; + user_problem.upper = {inf}; + user_problem.num_range_rows = 0; + user_problem.cone_row_start = 1; + user_problem.second_order_cone_row_dims = {3}; + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.scale_columns = false; + + std::vector new_slacks; + dualize_info_t dualize_info; + lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); + convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); + + ASSERT_EQ(new_slacks.size(), 1); + EXPECT_EQ(new_slacks[0], 1); + EXPECT_EQ(original_lp.num_cols, 5); + EXPECT_EQ(original_lp.cone_var_start, 2); + EXPECT_EQ(original_lp.second_order_cone_dims, std::vector({3})); + + EXPECT_EQ(original_lp.A.col_start[1] - original_lp.A.col_start[0], 2); + EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[0]], 0); + EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[0]], 1.0); + EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[0] + 1], 1); + EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[0] + 1], -1.0); + + EXPECT_EQ(original_lp.A.col_start[2] - 
original_lp.A.col_start[1], 1); + EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[1]], 0); + EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[1]], 1.0); + + for (int j = 2; j < 5; ++j) { + EXPECT_EQ(original_lp.A.col_start[j + 1] - original_lp.A.col_start[j], 1); + EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[j]], j - 1); + EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[j]], 1.0); + EXPECT_EQ(original_lp.objective[j], 0.0); + EXPECT_EQ(original_lp.lower[j], 0.0); + EXPECT_EQ(original_lp.upper[j], inf); + } +} + +TEST(barrier, explicit_and_lifted_cones_stay_contiguous_after_scalar_slack_insertion) +{ + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 4; + constexpr int n = 4; + constexpr int nz = 5; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 0.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start = {0, 2, 3, 4, 5}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 1; + user_problem.A.x[1] = -1.0; + user_problem.A.i[2] = 1; + user_problem.A.x[2] = 1.0; + user_problem.A.i[3] = 2; + user_problem.A.x[3] = 2.0; + user_problem.A.i[4] = 3; + user_problem.A.x[4] = -3.0; + + user_problem.rhs = {2.0, 0.0, 0.0, 0.0}; + user_problem.row_sense = {'L', 'E', 'E', 'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + user_problem.num_range_rows = 0; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.cone_row_start = 1; + user_problem.second_order_cone_row_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.scale_columns = false; + + 
std::vector new_slacks; + dualize_info_t dualize_info; + lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); + convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); + + ASSERT_EQ(new_slacks.size(), 1); + EXPECT_EQ(new_slacks[0], 1); + EXPECT_EQ(original_lp.num_cols, 8); + EXPECT_EQ(original_lp.cone_var_start, 2); + EXPECT_EQ(original_lp.second_order_cone_dims, std::vector({3, 3})); + + EXPECT_EQ(original_lp.A.col_start[2] - original_lp.A.col_start[1], 1); + EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[1]], 0); + EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[1]], 1.0); + + EXPECT_EQ(original_lp.A.col_start[3] - original_lp.A.col_start[2], 1); + EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[2]], 1); + EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[2]], 1.0); + EXPECT_EQ(original_lp.A.col_start[4] - original_lp.A.col_start[3], 1); + EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[3]], 2); + EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[3]], 2.0); + EXPECT_EQ(original_lp.A.col_start[5] - original_lp.A.col_start[4], 1); + EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[4]], 3); + EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[4]], -3.0); + + for (int j = 5; j < 8; ++j) { + EXPECT_EQ(original_lp.A.col_start[j + 1] - original_lp.A.col_start[j], 1); + EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[j]], j - 4); + EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[j]], 1.0); + EXPECT_EQ(original_lp.objective[j], 0.0); + EXPECT_EQ(original_lp.lower[j], 0.0); + EXPECT_EQ(original_lp.upper[j], inf); + } +} + TEST(barrier, presolve_reindexes_cone_start_after_empty_column_removal) { raft::handle_t handle{}; @@ -487,22 +769,9 @@ TEST(barrier, socp_min_x0_subject_to_norm_constraint) settings.barrier = true; settings.barrier_presolve = false; settings.dualize = 0; - settings.set_log(true); - settings.log.log_to_console = false; - settings.log.enable_log_to_file(); - 
settings.log.set_log_file("/tmp/socp_barrier_test.log"); lp_solution_t solution(m, n); - printf("=== Calling solve_linear_program_with_barrier ===\n"); - fflush(stdout); auto status = solve_linear_program_with_barrier(user_problem, settings, solution); - printf("=== status=%d obj=%e x=[%e %e %e] ===\n", - static_cast(status), - solution.objective, - solution.x[0], - solution.x[1], - solution.x[2]); - fflush(stdout); EXPECT_EQ(status, lp_status_t::OPTIMAL); EXPECT_NEAR(solution.objective, 1.0, 1e-4); EXPECT_NEAR(solution.x[0], 1.0, 1e-4); @@ -510,6 +779,123 @@ TEST(barrier, socp_min_x0_subject_to_norm_constraint) EXPECT_NEAR(std::abs(solution.x[2]), 0.0, 1e-4); } +TEST(barrier, socp_min_x_subject_to_row_cone_metadata) +{ + // minimize x + // subject to -x + s_0 = 0 + // s_1 = 1 + // s_2 = 0 + // (s_0, s_1, s_2) in Q^3 + // + // Optimal: x* = 1, obj* = 1. + + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 3; + constexpr int n = 1; + constexpr int nz = 1; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {1.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start = {0, 1}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = -1.0; + + user_problem.rhs = {0.0, 1.0, 0.0}; + user_problem.row_sense = {'E', 'E', 'E'}; + user_problem.lower = {0.0}; + user_problem.upper = {inf}; + user_problem.num_range_rows = 0; + user_problem.problem_name = "socp_row_cone_metadata"; + user_problem.cone_row_start = 0; + user_problem.second_order_cone_row_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = false; + settings.dualize = 0; + settings.scale_columns = false; + + lp_solution_t solution(m, n); + auto status = 
solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); +} + +TEST(barrier, basic_qp_socp_row_cone_matches_reference_solution) +{ + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + populate_basic_qp_socp_problem(user_problem, false); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = false; + settings.dualize = 0; + settings.scale_columns = false; + + lp_solution_t solution(user_problem.num_rows, user_problem.num_cols); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.x[0], -0.5, 1e-3); + EXPECT_NEAR(solution.x[1], 0.435603, 1e-3); + EXPECT_NEAR(solution.x[2], -0.245459, 1e-3); + EXPECT_NEAR(solution.objective, -0.84590, 1e-3); +} + +TEST(barrier, basic_qp_socp_row_cone_matches_explicit_cone_formulation) +{ + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t row_cone_problem(&handle); + user_problem_t explicit_cone_problem(&handle); + populate_basic_qp_socp_problem(row_cone_problem, false); + populate_basic_qp_socp_problem(explicit_cone_problem, true); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = false; + settings.dualize = 0; + settings.scale_columns = false; + + lp_solution_t row_cone_solution(row_cone_problem.num_rows, + row_cone_problem.num_cols); + lp_solution_t explicit_cone_solution(explicit_cone_problem.num_rows, + explicit_cone_problem.num_cols); + + auto row_cone_status = + solve_linear_program_with_barrier(row_cone_problem, settings, row_cone_solution); + auto explicit_status = + solve_linear_program_with_barrier(explicit_cone_problem, 
settings, explicit_cone_solution); + + EXPECT_EQ(row_cone_status, lp_status_t::OPTIMAL); + EXPECT_EQ(explicit_status, lp_status_t::OPTIMAL); + EXPECT_NEAR(row_cone_solution.objective, explicit_cone_solution.objective, 1e-4); + EXPECT_NEAR(row_cone_solution.objective, -0.84590, 1e-3); + for (int j = 0; j < 3; ++j) { + EXPECT_NEAR(row_cone_solution.x[j], explicit_cone_solution.x[j], 1e-4); + } +} + TEST(barrier, mixed_linear_and_soc_block) { // Variables ordered as [l | t, u, v], where (t, u, v) \in Q^3. From f4b9c8ed57a0cbff16ec90ad030c9f373023af4d Mon Sep 17 00:00:00 2001 From: YUWEN Chen Date: Wed, 15 Apr 2026 03:13:44 -0700 Subject: [PATCH 06/22] feat(mps_parser): add QCMATRIX parsing and quadratic-constraint model support --- .../include/mps_parser/mps_data_model.hpp | 39 ++++ .../include/mps_parser/parser.hpp | 2 + cpp/libmps_parser/src/mps_data_model.cpp | 52 ++++++ cpp/libmps_parser/src/mps_parser.cpp | 172 ++++++++++++++++-- cpp/libmps_parser/src/mps_parser.hpp | 18 ++ cpp/libmps_parser/tests/mps_parser_test.cpp | 45 +++++ 6 files changed, 315 insertions(+), 13 deletions(-) diff --git a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp index 6879e15d60..6022639f46 100644 --- a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp +++ b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp @@ -262,6 +262,40 @@ class mps_data_model_t { const i_t* Q_offsets, i_t size_offsets); + /** + * @brief CSR of Q for one quadratic constraint (MPS QCMATRIX). + * + * @c constraint_row_index is the row index in the linear constraint matrix A (0-based), + * matching the order of non-objective rows in the ROWS section. + */ + struct quadratic_constraint_matrix_t { + i_t constraint_row_index{}; + std::vector values; + std::vector indices; + std::vector offsets; + }; + + /** + * @brief Append one quadratic constraint matrix (QCMATRIX) in CSR format. 
+ * + * @param constraint_row_index Row index in A (0-based), matching non-objective ROWS order. + * @param[in] Qc_values Values of the CSR representation; copied into the model. + * @param size_values Size of the Qc_values array. + * @param[in] Qc_indices Indices of the CSR representation; copied into the model. + * @param size_indices Size of the Qc_indices array. + * @param[in] Qc_offsets Offsets of the CSR representation; copied into the model. + * @param size_offsets Size of the Qc_offsets array. + */ + void append_quadratic_constraint_matrix(i_t constraint_row_index, + const f_t* Qc_values, + i_t size_values, + const i_t* Qc_indices, + i_t size_indices, + const i_t* Qc_offsets, + i_t size_offsets); + + const std::vector& get_quadratic_constraint_matrices() const; + i_t get_n_variables() const; i_t get_n_constraints() const; i_t get_nnz() const; @@ -306,6 +340,8 @@ class mps_data_model_t { bool has_quadratic_objective() const noexcept; + bool has_quadratic_constraints() const noexcept; + /** whether to maximize or minimize the objective function */ bool maximize_; /** @@ -361,6 +397,9 @@ class mps_data_model_t { std::vector Q_objective_indices_; std::vector Q_objective_offsets_; + /** One CSR matrix per QCMATRIX block, in order of appearance in the file */ + std::vector quadratic_constraint_matrices_; + }; // class mps_data_model_t } // namespace cuopt::mps_parser diff --git a/cpp/libmps_parser/include/mps_parser/parser.hpp b/cpp/libmps_parser/include/mps_parser/parser.hpp index e8e8c342bd..6578ffb4d5 100644 --- a/cpp/libmps_parser/include/mps_parser/parser.hpp +++ b/cpp/libmps_parser/include/mps_parser/parser.hpp @@ -23,6 +23,8 @@ namespace cuopt::mps_parser { * QPS files (for quadratic programming). 
QPS files are MPS files with additional * sections: * - QUADOBJ: Defines quadratic terms in the objective function + * - QMATRIX: Full symmetric quadratic objective matrix (alternative to QUADOBJ) + * - QCMATRIX: Symmetric quadratic terms for a named constraint row (QCQP) * * Note: Compressed MPS files .mps.gz, .mps.bz2 can only be read if the compression * libraries zlib or libbzip2 are installed, respectively. diff --git a/cpp/libmps_parser/src/mps_data_model.cpp b/cpp/libmps_parser/src/mps_data_model.cpp index 7d0d44a038..34e7d9b3d9 100644 --- a/cpp/libmps_parser/src/mps_data_model.cpp +++ b/cpp/libmps_parser/src/mps_data_model.cpp @@ -9,6 +9,7 @@ #include #include +#include namespace cuopt::mps_parser { @@ -219,6 +220,51 @@ void mps_data_model_t::set_quadratic_objective_matrix(const f_t* Q_val std::copy(Q_offsets, Q_offsets + size_offsets, Q_objective_offsets_.data()); } +template +void mps_data_model_t::append_quadratic_constraint_matrix(i_t constraint_row_index, + const f_t* Qc_values, + i_t size_values, + const i_t* Qc_indices, + i_t size_indices, + const i_t* Qc_offsets, + i_t size_offsets) +{ + if (size_values != 0) { + mps_parser_expects( + Qc_values != nullptr, error_type_t::ValidationError, "Qc_values cannot be null"); + } + if (size_indices != 0) { + mps_parser_expects( + Qc_indices != nullptr, error_type_t::ValidationError, "Qc_indices cannot be null"); + } + mps_parser_expects( + Qc_offsets != nullptr, error_type_t::ValidationError, "Qc_offsets cannot be null"); + mps_parser_expects( + size_offsets > 0, error_type_t::ValidationError, "size_offsets cannot be empty"); + + quadratic_constraint_matrix_t qcm; + qcm.constraint_row_index = constraint_row_index; + qcm.values.resize(size_values); + if (size_values > 0) { + std::copy(Qc_values, Qc_values + size_values, qcm.values.data()); + } + qcm.indices.resize(size_indices); + if (size_indices > 0) { + std::copy(Qc_indices, Qc_indices + size_indices, qcm.indices.data()); + } + 
qcm.offsets.resize(size_offsets); + std::copy(Qc_offsets, Qc_offsets + size_offsets, qcm.offsets.data()); + + quadratic_constraint_matrices_.push_back(std::move(qcm)); +} + +template +auto mps_data_model_t::get_quadratic_constraint_matrices() const + -> const std::vector& +{ + return quadratic_constraint_matrices_; +} + template const std::vector& mps_data_model_t::get_constraint_matrix_values() const { @@ -460,6 +506,12 @@ bool mps_data_model_t::has_quadratic_objective() const noexcept return !Q_objective_values_.empty(); } +template +bool mps_data_model_t::has_quadratic_constraints() const noexcept +{ + return !quadratic_constraint_matrices_.empty(); +} + // NOTE: Explicitly instantiate all types here in order to avoid linker error template class mps_data_model_t; diff --git a/cpp/libmps_parser/src/mps_parser.cpp b/cpp/libmps_parser/src/mps_parser.cpp index 586544331f..9d3c2b45c4 100644 --- a/cpp/libmps_parser/src/mps_parser.cpp +++ b/cpp/libmps_parser/src/mps_parser.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -432,6 +431,8 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) problem.get_constraint_upper_bounds().size()); } + const i_t num_vars_for_quad = static_cast(var_names.size()); + problem.set_problem_name(problem_name); problem.set_objective_name(objective_name); problem.set_variable_names(std::move(var_names)); @@ -439,13 +440,17 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) problem.set_row_names(std::move(row_names)); problem.set_maximize(maximize); - // Helper function to build CSR format using double transpose (O(m+n+nnz) instead of - // O(nnz*log(nnz))) For QUADOBJ: handles upper triangular input by expanding to full symmetric - // matrix + // Helper function to build CSR format using double transpose (O(m+n+nnz) instead of O(nnz*log(nnz))) + // For QUADOBJ: handles upper triangular input by expanding to full symmetric matrix. 
+ // + // @p value_scale: + // QUADOBJ/QMATRIX use 0.5 (MPS ½ xᵀQx vs internal xᵀQx); + // QCMATRIX uses 1.0 (symmetric Q defines xᵀQx directly in the constraint). auto build_csr_via_transpose = [](const std::vector>& entries, i_t num_rows, i_t num_cols, - bool is_quadobj = false) { + bool symmetrize_upper_triangular, + f_t value_scale) { struct CSRResult { std::vector values; std::vector indices; @@ -467,7 +472,7 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) // For QUADOBJ (upper triangular), add both (row,col) and (col,row) if off-diagonal csc_data[col].emplace_back(row, val); - if (is_quadobj && row != col) { csc_data[row].emplace_back(col, val); } + if (symmetrize_upper_triangular && row != col) { csc_data[row].emplace_back(col, val); } } // Second transpose: convert CSC to CSR (entries sorted by row, columns within rows sorted) @@ -485,9 +490,7 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) for (i_t row = 0; row < num_rows; ++row) { for (const auto& [col, val] : csr_data[row]) { - // While the mps format expects to optimize for 0.5 xT Q x, cuopt optimizes for xT Q x - // so we have to multiply the value by 0.5 to get the correct value. 
- result.values.push_back(val * 0.5); + result.values.push_back(val * value_scale); result.indices.push_back(col); } result.offsets.push_back(result.values.size()); @@ -500,8 +503,9 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) if (!quadobj_entries.empty()) { // Convert quadratic objective entries to CSR format using double transpose // QUADOBJ stores upper triangular elements, so we expand to full symmetric matrix - i_t num_vars = static_cast(var_names.size()); - auto csr_result = build_csr_via_transpose(quadobj_entries, num_vars, num_vars, true); + constexpr f_t k_mps_quad_half_scale = f_t(0.5); // MPS ½ xᵀQx vs internal xᵀQx + auto csr_result = + build_csr_via_transpose(quadobj_entries, num_vars_for_quad, num_vars_for_quad, true, k_mps_quad_half_scale); // Use optimized double transpose method - O(m+n+nnz) instead of O(nnz*log(nnz)) problem.set_quadratic_objective_matrix(csr_result.values.data(), @@ -513,8 +517,9 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) } else if (!qmatrix_entries.empty()) { // Convert quadratic objective entries to CSR format using double transpose // QMATRIX stores full symmetric matrix - i_t num_vars = static_cast(var_names.size()); - auto csr_result = build_csr_via_transpose(qmatrix_entries, num_vars, num_vars, false); + constexpr f_t k_mps_quad_half_scale = f_t(0.5); + auto csr_result = + build_csr_via_transpose(qmatrix_entries, num_vars_for_quad, num_vars_for_quad, false, k_mps_quad_half_scale); // Use optimized double transpose method - O(m+n+nnz) instead of O(nnz*log(nnz)) problem.set_quadratic_objective_matrix(csr_result.values.data(), @@ -524,6 +529,20 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) csr_result.offsets.data(), csr_result.offsets.size()); } + + // QCMATRIX: one symmetric Q per constraint row (no extra ½ factor vs file coeffs) + constexpr f_t k_qcmatrix_value_scale = f_t(1); + for (const auto& block : qcmatrix_blocks_) { + auto csr_result = build_csr_via_transpose( + 
block.entries, num_vars_for_quad, num_vars_for_quad, false, k_qcmatrix_value_scale); + problem.append_quadratic_constraint_matrix(block.constraint_row_id, + csr_result.values.data(), + csr_result.values.size(), + csr_result.indices.data(), + csr_result.indices.size(), + csr_result.offsets.data(), + csr_result.offsets.size()); + } } template @@ -599,6 +618,11 @@ void mps_parser_t::parse_string(char* buf) // these lines mark the start of a particular "section" if (line[0] != ' ') { skip_line = false; + // Leaving QCMATRIX: any non-QCMATRIX section header ends the current block + if (inside_qcmatrix_ && line.find("QCMATRIX", 0, 8) != 0) { + flush_qcmatrix_block(); + inside_qcmatrix_ = false; + } if (line.find("NAME", 0, 4) == 0) { encountered_sections.insert("NAME"); auto name_start = line.find_first_not_of(" \t", 4); @@ -709,6 +733,7 @@ void mps_parser_t::parse_string(char* buf) inside_objname_ = false; inside_objsense_ = false; inside_qmatrix_ = false; + inside_qcmatrix_ = false; inside_quadobj_ = true; } else if (line.find("QMATRIX", 0, 7) == 0) { encountered_sections.insert("QMATRIX"); @@ -721,6 +746,21 @@ void mps_parser_t::parse_string(char* buf) inside_objsense_ = false; inside_quadobj_ = false; inside_qmatrix_ = true; + inside_qcmatrix_ = false; + } else if (line.find("QCMATRIX", 0, 8) == 0) { + encountered_sections.insert("QCMATRIX"); + flush_qcmatrix_block(); + inside_rows_ = false; + inside_columns_ = false; + inside_rhs_ = false; + inside_bounds_ = false; + inside_ranges_ = false; + inside_objname_ = false; + inside_objsense_ = false; + inside_quadobj_ = false; + inside_qmatrix_ = false; + inside_qcmatrix_ = true; + parse_qcmatrix_header(line); } else if (line.find("ENDATA", 0, 6) == 0) { encountered_sections.insert("ENDATA"); break; @@ -737,6 +777,7 @@ void mps_parser_t::parse_string(char* buf) inside_objname_ = false; inside_quadobj_ = false; inside_qmatrix_ = false; + inside_qcmatrix_ = false; } else { mps_parser_expects(false, 
error_type_t::ValidationError, @@ -763,6 +804,8 @@ void mps_parser_t::parse_string(char* buf) parse_quad(line, true); } else if (inside_qmatrix_) { parse_quad(line, false); + } else if (inside_qcmatrix_) { + parse_qcmatrix_data(line); } else { mps_parser_expects(false, error_type_t::ValidationError, @@ -1282,6 +1325,109 @@ void mps_parser_t::parse_objname(std::string_view line) } } +template +void mps_parser_t::flush_qcmatrix_block() +{ + if (qcmatrix_active_row_id_ < 0) { return; } + if (qcmatrix_current_entries_.empty()) { + qcmatrix_active_row_id_ = -1; + return; + } + for (const auto& b : qcmatrix_blocks_) { + mps_parser_expects(b.constraint_row_id != qcmatrix_active_row_id_, + error_type_t::ValidationError, + "Duplicate QCMATRIX block for the same constraint row (index %d)", + static_cast(qcmatrix_active_row_id_)); + } + qcmatrix_raw_block_t block; + block.constraint_row_id = qcmatrix_active_row_id_; + block.entries = std::move(qcmatrix_current_entries_); + qcmatrix_blocks_.push_back(std::move(block)); + qcmatrix_active_row_id_ = -1; +} + +template +void mps_parser_t::parse_qcmatrix_header(std::string_view line) +{ + std::string row_name; + if (fixed_mps_format) { + mps_parser_expects(line.size() >= 19, + error_type_t::ValidationError, + "QCMATRIX header line too short! line=%s", + std::string(line).c_str()); + //fixed MPS: constraint name starts in column 12 (1-based) → 0-based index 11, 8 chars + row_name = std::string(trim(line.substr(11, 8))); + } else { + std::stringstream ss{std::string(line)}; + std::string kw; + ss >> kw; + mps_parser_expects(kw == "QCMATRIX", + error_type_t::ValidationError, + "Expected QCMATRIX keyword! line=%s", + std::string(line).c_str()); + ss >> row_name; + mps_parser_expects(!row_name.empty(), + error_type_t::ValidationError, + "QCMATRIX missing constraint row name! 
line=%s", + std::string(line).c_str()); + } + + auto row_it = row_names_map.find(row_name); + mps_parser_expects(row_it != row_names_map.end(), + error_type_t::ValidationError, + "Unknown constraint row name '%s' in QCMATRIX! line=%s", + row_name.c_str(), + std::string(line).c_str()); + + qcmatrix_active_row_id_ = row_it->second; +} + +template +void mps_parser_t::parse_qcmatrix_data(std::string_view line) +{ + mps_parser_expects(qcmatrix_active_row_id_ >= 0, + error_type_t::ValidationError, + "QCMATRIX data line before a valid QCMATRIX header! line=%s", + std::string(line).c_str()); + + std::string var1_name, var2_name; + f_t value; + + if (fixed_mps_format) { + mps_parser_expects(line.size() >= 25, + error_type_t::ValidationError, + "QCMATRIX data line should have at least 3 entities! line=%s", + std::string(line).c_str()); + + var1_name = std::string(trim(line.substr(4, 8))); + var2_name = std::string(trim(line.substr(14, 8))); + if (var1_name[0] == '$' || var2_name[0] == '$') return; + + i_t pos = 24; + value = get_numerical_bound(line, pos); + } else { + std::stringstream ss{std::string(line)}; + ss >> var1_name >> var2_name >> value; + if (var1_name[0] == '$' || var2_name[0] == '$') return; + } + + auto var1_it = var_names_map.find(var1_name); + auto var2_it = var_names_map.find(var2_name); + + mps_parser_expects(var1_it != var_names_map.end(), + error_type_t::ValidationError, + "Variable '%s' not found in QCMATRIX! line=%s", + var1_name.c_str(), + std::string(line).c_str()); + mps_parser_expects(var2_it != var_names_map.end(), + error_type_t::ValidationError, + "Variable '%s' not found in QCMATRIX! 
line=%s", + var2_name.c_str(), + std::string(line).c_str()); + + qcmatrix_current_entries_.emplace_back(var1_it->second, var2_it->second, value); +} + template void mps_parser_t::parse_quad(std::string_view line, bool is_quadobj) { diff --git a/cpp/libmps_parser/src/mps_parser.hpp b/cpp/libmps_parser/src/mps_parser.hpp index facad14c66..d73cfdd8b3 100644 --- a/cpp/libmps_parser/src/mps_parser.hpp +++ b/cpp/libmps_parser/src/mps_parser.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -130,6 +131,18 @@ class mps_parser_t { // QPS-specific parsing states bool inside_quadobj_{false}; bool inside_qmatrix_{false}; + bool inside_qcmatrix_{false}; + + /** (free-format) QCMATRIX: finalized blocks (row id + triples) */ + struct qcmatrix_raw_block_t { + i_t constraint_row_id{}; + std::vector> entries{}; + }; + std::vector qcmatrix_blocks_{}; + /** Triples for the QCMATRIX block currently being read (-1 row id means none) */ + i_t qcmatrix_active_row_id_{-1}; + std::vector> qcmatrix_current_entries_{}; + std::unordered_set encountered_sections{}; std::unordered_map row_names_map{}; std::unordered_map var_names_map{}; @@ -170,6 +183,11 @@ class mps_parser_t { // QPS-specific parsing methods void parse_quad(std::string_view line, bool is_quadobj); + // QCMATRIX-specific parsing methods + void flush_qcmatrix_block(); + void parse_qcmatrix_header(std::string_view line); + void parse_qcmatrix_data(std::string_view line); + }; // class mps_parser_t } // namespace cuopt::mps_parser diff --git a/cpp/libmps_parser/tests/mps_parser_test.cpp b/cpp/libmps_parser/tests/mps_parser_test.cpp index f915fb2df5..e041b42419 100644 --- a/cpp/libmps_parser/tests/mps_parser_test.cpp +++ b/cpp/libmps_parser/tests/mps_parser_test.cpp @@ -855,6 +855,51 @@ TEST(qps_parser, quadratic_objective_basic) EXPECT_EQ(1.0, model.get_quadratic_objective_values()[1]); } +// ================================================================================================ +// QCMATRIX 
Support Tests +// ================================================================================================ + +TEST(qps_parser, qcmatrix_append_api) +{ + using model_t = mps_data_model_t; + model_t model; + + // Validate default-constructed struct shape. + model_t::quadratic_constraint_matrix_t default_qcm; + EXPECT_EQ(0, default_qcm.constraint_row_index); + EXPECT_TRUE(default_qcm.values.empty()); + EXPECT_TRUE(default_qcm.indices.empty()); + EXPECT_TRUE(default_qcm.offsets.empty()); + + // QC0: [[10, 2], [2, 2]] + const std::vector qc0_values = {10.0, 2.0, 2.0, 2.0}; + const std::vector qc0_indices = {0, 1, 0, 1}; + const std::vector qc0_offsets = {0, 2, 4}; + model.append_quadratic_constraint_matrix( + 0, qc0_values.data(), qc0_values.size(), qc0_indices.data(), qc0_indices.size(), qc0_offsets.data(), qc0_offsets.size()); + + // QC1: [[4, 1], [1, 6]] + const std::vector qc1_values = {4.0, 1.0, 1.0, 6.0}; + const std::vector qc1_indices = {0, 1, 0, 1}; + const std::vector qc1_offsets = {0, 2, 4}; + model.append_quadratic_constraint_matrix( + 1, qc1_values.data(), qc1_values.size(), qc1_indices.data(), qc1_indices.size(), qc1_offsets.data(), qc1_offsets.size()); + + ASSERT_TRUE(model.has_quadratic_constraints()); + const auto& qcs = model.get_quadratic_constraint_matrices(); + ASSERT_EQ(2u, qcs.size()); + + EXPECT_EQ(0, qcs[0].constraint_row_index); + EXPECT_EQ(qc0_values, qcs[0].values); + EXPECT_EQ(qc0_indices, qcs[0].indices); + EXPECT_EQ(qc0_offsets, qcs[0].offsets); + + EXPECT_EQ(1, qcs[1].constraint_row_index); + EXPECT_EQ(qc1_values, qcs[1].values); + EXPECT_EQ(qc1_indices, qcs[1].indices); + EXPECT_EQ(qc1_offsets, qcs[1].offsets); +} + // Test actual QPS files from the dataset TEST(qps_parser, test_qps_files) { From 43f09b4c4cf35e695cb44729714b7c9592a2cc81 Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Wed, 15 Apr 2026 07:03:00 -0700 Subject: [PATCH 07/22] fix: address review suggestions (chatbot) --- cpp/libmps_parser/src/mps_data_model.cpp | 3 
++ cpp/libmps_parser/src/mps_parser.cpp | 40 ++++++++++++++++++++---- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/cpp/libmps_parser/src/mps_data_model.cpp b/cpp/libmps_parser/src/mps_data_model.cpp index 34e7d9b3d9..718c8423d3 100644 --- a/cpp/libmps_parser/src/mps_data_model.cpp +++ b/cpp/libmps_parser/src/mps_data_model.cpp @@ -229,6 +229,9 @@ void mps_data_model_t::append_quadratic_constraint_matrix(i_t constrai const i_t* Qc_offsets, i_t size_offsets) { + mps_parser_expects( + constraint_row_index >= 0, error_type_t::ValidationError, "constraint_row_index must be non-negative"); + if (size_values != 0) { mps_parser_expects( Qc_values != nullptr, error_type_t::ValidationError, "Qc_values cannot be null"); diff --git a/cpp/libmps_parser/src/mps_parser.cpp b/cpp/libmps_parser/src/mps_parser.cpp index 9d3c2b45c4..f2ff0089fc 100644 --- a/cpp/libmps_parser/src/mps_parser.cpp +++ b/cpp/libmps_parser/src/mps_parser.cpp @@ -1406,9 +1406,23 @@ void mps_parser_t::parse_qcmatrix_data(std::string_view line) i_t pos = 24; value = get_numerical_bound(line, pos); } else { - std::stringstream ss{std::string(line)}; - ss >> var1_name >> var2_name >> value; - if (var1_name[0] == '$' || var2_name[0] == '$') return; + i_t pos = 0; + i_t end = 0; + const std::string_view var1_sv = get_next_string(line, pos, end); + mps_parser_expects(!var1_sv.empty(), + error_type_t::ValidationError, + "QCMATRIX data line missing first variable name! line=%s", + std::string(line).c_str()); + if (var1_sv[0] == '$') return; + const std::string_view var2_sv = get_next_string(line, pos, end); + mps_parser_expects(!var2_sv.empty(), + error_type_t::ValidationError, + "QCMATRIX data line missing second variable name! 
line=%s", + std::string(line).c_str()); + if (var2_sv[0] == '$') return; + value = get_numerical_bound(line, end); + var1_name = std::string(var1_sv); + var2_name = std::string(var2_sv); } auto var1_it = var_names_map.find(var1_name); @@ -1450,9 +1464,23 @@ void mps_parser_t::parse_quad(std::string_view line, bool is_quadobj) i_t pos = 24; value = get_numerical_bound(line, pos); } else { - std::stringstream ss{std::string(line)}; - ss >> var1_name >> var2_name >> value; - if (var1_name[0] == '$' || var2_name[0] == '$') return; + i_t pos = 0; + i_t end = 0; + const std::string_view var1_sv = get_next_string(line, pos, end); + mps_parser_expects(!var1_sv.empty(), + error_type_t::ValidationError, + "QUADOBJ/QMATRIX data line missing first variable name! line=%s", + std::string(line).c_str()); + if (var1_sv[0] == '$') return; + const std::string_view var2_sv = get_next_string(line, pos, end); + mps_parser_expects(!var2_sv.empty(), + error_type_t::ValidationError, + "QUADOBJ/QMATRIX data line missing second variable name! 
line=%s", + std::string(line).c_str()); + if (var2_sv[0] == '$') return; + value = get_numerical_bound(line, end); + var1_name = std::string(var1_sv); + var2_name = std::string(var2_sv); } // Find variable indices From ac9dd8a8e891d1bd4c1b446028a3a7995c5d6b23 Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Fri, 17 Apr 2026 06:49:34 -0700 Subject: [PATCH 08/22] complete parsder for linear cost, ROW and RHS for QCQP --- .../include/mps_parser/mps_data_model.hpp | 61 ++++--- cpp/libmps_parser/src/mps_data_model.cpp | 89 ++++++--- cpp/libmps_parser/src/mps_parser.cpp | 63 +++++-- cpp/libmps_parser/tests/mps_parser_test.cpp | 169 ++++++++++++++++-- 4 files changed, 296 insertions(+), 86 deletions(-) diff --git a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp index 6022639f46..50f95e2496 100644 --- a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp +++ b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp @@ -263,38 +263,43 @@ class mps_data_model_t { i_t size_offsets); /** - * @brief CSR of Q for one quadratic constraint (MPS QCMATRIX). + * @brief One quadratic constraint as parsed from MPS sections (ROWS, COLUMNS, RHS, QCMATRIX). * - * @c constraint_row_index is the row index in the linear constraint matrix A (0-based), - * matching the order of non-objective rows in the ROWS section. + * This bundles all pieces of a quadratic row: + * - row identity and type (from ROWS), + * - sparse linear coefficients (from COLUMNS), + * - RHS value (from RHS), + * - quadratic matrix Q in CSR (from QCMATRIX). 
*/ - struct quadratic_constraint_matrix_t { + struct quadratic_constraint_t { i_t constraint_row_index{}; - std::vector values; - std::vector indices; - std::vector offsets; + std::string constraint_row_name{}; + char constraint_row_type{}; + std::vector linear_values{}; + std::vector linear_indices{}; + f_t rhs_value{f_t(0)}; + std::vector quadratic_values{}; + std::vector quadratic_indices{}; + std::vector quadratic_offsets{}; }; - /** - * @brief Append one quadratic constraint matrix (QCMATRIX) in CSR format. - * - * @param constraint_row_index Row index in A (0-based), matching non-objective ROWS order. - * @param[in] Qc_values Values of the CSR representation; copied into the model. - * @param size_values Size of the Qc_values array. - * @param[in] Qc_indices Indices of the CSR representation; copied into the model. - * @param size_indices Size of the Qc_indices array. - * @param[in] Qc_offsets Offsets of the CSR representation; copied into the model. - * @param size_offsets Size of the Qc_offsets array. - */ - void append_quadratic_constraint_matrix(i_t constraint_row_index, - const f_t* Qc_values, - i_t size_values, - const i_t* Qc_indices, - i_t size_indices, - const i_t* Qc_offsets, - i_t size_offsets); + /** @brief Append one complete quadratic constraint (row + linear + rhs + quadratic Q). 
*/ + void append_quadratic_constraint(i_t constraint_row_index, + const std::string& constraint_row_name, + char constraint_row_type, + const f_t* linear_values, + i_t linear_nnz, + const i_t* linear_indices, + i_t linear_indices_nnz, + f_t rhs_value, + const f_t* quadratic_values, + i_t quadratic_size_values, + const i_t* quadratic_indices, + i_t quadratic_size_indices, + const i_t* quadratic_offsets, + i_t quadratic_size_offsets); - const std::vector& get_quadratic_constraint_matrices() const; + const std::vector& get_quadratic_constraints() const; i_t get_n_variables() const; i_t get_n_constraints() const; @@ -397,8 +402,8 @@ class mps_data_model_t { std::vector Q_objective_indices_; std::vector Q_objective_offsets_; - /** One CSR matrix per QCMATRIX block, in order of appearance in the file */ - std::vector quadratic_constraint_matrices_; + /** One full quadratic constraint per QCMATRIX block, in order of appearance in the file */ + std::vector quadratic_constraints_; }; // class mps_data_model_t diff --git a/cpp/libmps_parser/src/mps_data_model.cpp b/cpp/libmps_parser/src/mps_data_model.cpp index 718c8423d3..5feb405631 100644 --- a/cpp/libmps_parser/src/mps_data_model.cpp +++ b/cpp/libmps_parser/src/mps_data_model.cpp @@ -221,51 +221,82 @@ void mps_data_model_t::set_quadratic_objective_matrix(const f_t* Q_val } template -void mps_data_model_t::append_quadratic_constraint_matrix(i_t constraint_row_index, - const f_t* Qc_values, - i_t size_values, - const i_t* Qc_indices, - i_t size_indices, - const i_t* Qc_offsets, - i_t size_offsets) +void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_index, + const std::string& constraint_row_name, + char constraint_row_type, + const f_t* linear_values, + i_t linear_nnz, + const i_t* linear_indices, + i_t linear_indices_nnz, + f_t rhs_value, + const f_t* quadratic_values, + i_t quadratic_size_values, + const i_t* quadratic_indices, + i_t quadratic_size_indices, + const i_t* quadratic_offsets, + i_t 
quadratic_size_offsets) { mps_parser_expects( constraint_row_index >= 0, error_type_t::ValidationError, "constraint_row_index must be non-negative"); - - if (size_values != 0) { + + mps_parser_expects(linear_nnz == linear_indices_nnz, + error_type_t::ValidationError, + "linear_values and linear_indices must have the same nnz count"); + if (linear_nnz != 0) { mps_parser_expects( - Qc_values != nullptr, error_type_t::ValidationError, "Qc_values cannot be null"); + linear_values != nullptr && linear_indices != nullptr, + error_type_t::ValidationError, + "linear_values and linear_indices cannot be null when linear_nnz > 0"); } - if (size_indices != 0) { + + if (quadratic_size_values != 0) { mps_parser_expects( - Qc_indices != nullptr, error_type_t::ValidationError, "Qc_indices cannot be null"); + quadratic_values != nullptr, error_type_t::ValidationError, "quadratic_values cannot be null"); } mps_parser_expects( - Qc_offsets != nullptr, error_type_t::ValidationError, "Qc_offsets cannot be null"); + quadratic_offsets != nullptr, error_type_t::ValidationError, "quadratic_offsets cannot be null"); + if (quadratic_size_indices != 0) { + mps_parser_expects( + quadratic_indices != nullptr, error_type_t::ValidationError, "quadratic_indices cannot be null"); + } mps_parser_expects( - size_offsets > 0, error_type_t::ValidationError, "size_offsets cannot be empty"); + quadratic_size_offsets > 0, error_type_t::ValidationError, "quadratic_size_offsets cannot be empty"); + + quadratic_constraint_t qc; + qc.constraint_row_index = constraint_row_index; + qc.constraint_row_name = constraint_row_name; + qc.constraint_row_type = constraint_row_type; + qc.rhs_value = rhs_value; + + qc.linear_values.resize(linear_nnz); + qc.linear_indices.resize(linear_nnz); + if (linear_nnz > 0) { + std::copy(linear_values, linear_values + linear_nnz, qc.linear_values.data()); + std::copy(linear_indices, linear_indices + linear_nnz, qc.linear_indices.data()); + } - quadratic_constraint_matrix_t qcm; - 
qcm.constraint_row_index = constraint_row_index; - qcm.values.resize(size_values); - if (size_values > 0) { - std::copy(Qc_values, Qc_values + size_values, qcm.values.data()); + qc.quadratic_values.resize(quadratic_size_values); + if (quadratic_size_values > 0) { + std::copy( + quadratic_values, quadratic_values + quadratic_size_values, qc.quadratic_values.data()); } - qcm.indices.resize(size_indices); - if (size_indices > 0) { - std::copy(Qc_indices, Qc_indices + size_indices, qcm.indices.data()); + qc.quadratic_indices.resize(quadratic_size_indices); + if (quadratic_size_indices > 0) { + std::copy( + quadratic_indices, quadratic_indices + quadratic_size_indices, qc.quadratic_indices.data()); } - qcm.offsets.resize(size_offsets); - std::copy(Qc_offsets, Qc_offsets + size_offsets, qcm.offsets.data()); + qc.quadratic_offsets.resize(quadratic_size_offsets); + std::copy( + quadratic_offsets, quadratic_offsets + quadratic_size_offsets, qc.quadratic_offsets.data()); - quadratic_constraint_matrices_.push_back(std::move(qcm)); + quadratic_constraints_.push_back(std::move(qc)); } template -auto mps_data_model_t::get_quadratic_constraint_matrices() const - -> const std::vector& +auto mps_data_model_t::get_quadratic_constraints() const + -> const std::vector& { - return quadratic_constraint_matrices_; + return quadratic_constraints_; } template @@ -512,7 +543,7 @@ bool mps_data_model_t::has_quadratic_objective() const noexcept template bool mps_data_model_t::has_quadratic_constraints() const noexcept { - return !quadratic_constraint_matrices_.empty(); + return !quadratic_constraints_.empty(); } // NOTE: Explicitly instantiate all types here in order to avoid linker error diff --git a/cpp/libmps_parser/src/mps_parser.cpp b/cpp/libmps_parser/src/mps_parser.cpp index f2ff0089fc..984fd213a4 100644 --- a/cpp/libmps_parser/src/mps_parser.cpp +++ b/cpp/libmps_parser/src/mps_parser.cpp @@ -271,6 +271,11 @@ ObjSenseType convert_to_obj_sense(const std::string& str) template void 
mps_parser_t::fill_problem(mps_data_model_t& problem) { + std::unordered_set quadratic_row_ids{}; + for (const auto& block : qcmatrix_blocks_) { + quadratic_row_ids.insert(block.constraint_row_id); + } + { std::vector h_offsets{}, h_indices{}; std::vector h_values{}; @@ -278,13 +283,17 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) h_offsets.push_back(0); for (i_t i = 0; i < (i_t)A_indices.size(); ++i) { i_t off = h_offsets.size() > 0 ? h_offsets[h_offsets.size() - 1] : 0; - for (const auto& idx_itr : A_indices[i]) { - h_indices.push_back(idx_itr); - } - for (const auto& val_itr : A_values[i]) { - h_values.push_back(val_itr); + // Keep quadratic-row linear coefficients out of global A; they are stored with each + // quadratic constraint object instead. + if (!quadratic_row_ids.count(i)) { + for (const auto& idx_itr : A_indices[i]) { + h_indices.push_back(idx_itr); + } + for (const auto& val_itr : A_values[i]) { + h_values.push_back(val_itr); + } + off += A_indices[i].size(); } - off += A_indices[i].size(); h_offsets.push_back(off); } @@ -317,6 +326,7 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) "nonzero vector. 
Nonzero has size %zu but the last offset is %d.", h_values.size(), h_offsets[h_offsets.size() - 1]); + } // Set b & c @@ -438,6 +448,13 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) problem.set_variable_names(std::move(var_names)); problem.set_variable_types(std::move(var_types)); problem.set_row_names(std::move(row_names)); + { + std::vector row_types_host(row_types.size()); + for (size_t i = 0; i < row_types.size(); ++i) { + row_types_host[i] = static_cast(row_types[i]); + } + problem.set_row_types(row_types_host.data(), static_cast(row_types_host.size())); + } problem.set_maximize(maximize); // Helper function to build CSR format using double transpose (O(m+n+nnz) instead of O(nnz*log(nnz))) @@ -530,18 +547,36 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) csr_result.offsets.size()); } - // QCMATRIX: one symmetric Q per constraint row (no extra ½ factor vs file coeffs) + // QCMATRIX: one symmetric Q per constraint row (no extra ½ factor vs file coeffs). + // Bundle row metadata, row-linear coefficients (from COLUMNS), rhs, and quadratic part together. constexpr f_t k_qcmatrix_value_scale = f_t(1); for (const auto& block : qcmatrix_blocks_) { auto csr_result = build_csr_via_transpose( block.entries, num_vars_for_quad, num_vars_for_quad, false, k_qcmatrix_value_scale); - problem.append_quadratic_constraint_matrix(block.constraint_row_id, - csr_result.values.data(), - csr_result.values.size(), - csr_result.indices.data(), - csr_result.indices.size(), - csr_result.offsets.data(), - csr_result.offsets.size()); + const i_t row_id = block.constraint_row_id; + mps_parser_expects(row_id >= 0 && row_id < problem.get_n_constraints(), + error_type_t::ValidationError, + "QCMATRIX row index %d is out of range for constraints", + static_cast(row_id)); + const i_t linear_nnz = static_cast(A_indices[row_id].size()); + const f_t* linear_val = linear_nnz > 0 ? A_values[row_id].data() : nullptr; + const i_t* linear_idx = linear_nnz > 0 ? 
A_indices[row_id].data() : nullptr; + + problem.append_quadratic_constraint( + row_id, + problem.get_row_names()[row_id], + problem.get_row_types()[row_id], + linear_val, + linear_nnz, + linear_idx, + linear_nnz, + problem.get_constraint_bounds()[row_id], + csr_result.values.data(), + static_cast(csr_result.values.size()), + csr_result.indices.data(), + static_cast(csr_result.indices.size()), + csr_result.offsets.data(), + static_cast(csr_result.offsets.size())); } } diff --git a/cpp/libmps_parser/tests/mps_parser_test.cpp b/cpp/libmps_parser/tests/mps_parser_test.cpp index e041b42419..dd1376621b 100644 --- a/cpp/libmps_parser/tests/mps_parser_test.cpp +++ b/cpp/libmps_parser/tests/mps_parser_test.cpp @@ -13,8 +13,10 @@ #include +#include #include #include +#include #include #include #include @@ -865,39 +867,176 @@ TEST(qps_parser, qcmatrix_append_api) model_t model; // Validate default-constructed struct shape. - model_t::quadratic_constraint_matrix_t default_qcm; + model_t::quadratic_constraint_t default_qcm; EXPECT_EQ(0, default_qcm.constraint_row_index); - EXPECT_TRUE(default_qcm.values.empty()); - EXPECT_TRUE(default_qcm.indices.empty()); - EXPECT_TRUE(default_qcm.offsets.empty()); + EXPECT_TRUE(default_qcm.quadratic_values.empty()); + EXPECT_TRUE(default_qcm.quadratic_indices.empty()); + EXPECT_TRUE(default_qcm.quadratic_offsets.empty()); + EXPECT_TRUE(default_qcm.linear_values.empty()); + EXPECT_TRUE(default_qcm.linear_indices.empty()); + EXPECT_EQ(0.0, default_qcm.rhs_value); // QC0: [[10, 2], [2, 2]] const std::vector qc0_values = {10.0, 2.0, 2.0, 2.0}; const std::vector qc0_indices = {0, 1, 0, 1}; const std::vector qc0_offsets = {0, 2, 4}; - model.append_quadratic_constraint_matrix( - 0, qc0_values.data(), qc0_values.size(), qc0_indices.data(), qc0_indices.size(), qc0_offsets.data(), qc0_offsets.size()); + const std::vector qc0_linear_values = {1.0, 1.0}; + const std::vector qc0_linear_indices = {0, 1}; + model.append_quadratic_constraint(0, + "QC0", + 
'L', + qc0_linear_values.data(), + qc0_linear_values.size(), + qc0_linear_indices.data(), + qc0_linear_indices.size(), + 5.0, + qc0_values.data(), + qc0_values.size(), + qc0_indices.data(), + qc0_indices.size(), + qc0_offsets.data(), + qc0_offsets.size()); // QC1: [[4, 1], [1, 6]] const std::vector qc1_values = {4.0, 1.0, 1.0, 6.0}; const std::vector qc1_indices = {0, 1, 0, 1}; const std::vector qc1_offsets = {0, 2, 4}; - model.append_quadratic_constraint_matrix( - 1, qc1_values.data(), qc1_values.size(), qc1_indices.data(), qc1_indices.size(), qc1_offsets.data(), qc1_offsets.size()); + const std::vector qc1_linear_values = {3.0, 1.0}; + const std::vector qc1_linear_indices = {0, 1}; + model.append_quadratic_constraint(1, + "QC1", + 'L', + qc1_linear_values.data(), + qc1_linear_values.size(), + qc1_linear_indices.data(), + qc1_linear_indices.size(), + 10.0, + qc1_values.data(), + qc1_values.size(), + qc1_indices.data(), + qc1_indices.size(), + qc1_offsets.data(), + qc1_offsets.size()); ASSERT_TRUE(model.has_quadratic_constraints()); - const auto& qcs = model.get_quadratic_constraint_matrices(); + const auto& qcs = model.get_quadratic_constraints(); ASSERT_EQ(2u, qcs.size()); EXPECT_EQ(0, qcs[0].constraint_row_index); - EXPECT_EQ(qc0_values, qcs[0].values); - EXPECT_EQ(qc0_indices, qcs[0].indices); - EXPECT_EQ(qc0_offsets, qcs[0].offsets); + EXPECT_EQ("QC0", qcs[0].constraint_row_name); + EXPECT_EQ('L', qcs[0].constraint_row_type); + EXPECT_EQ(qc0_linear_values, qcs[0].linear_values); + EXPECT_EQ(qc0_linear_indices, qcs[0].linear_indices); + EXPECT_EQ(5.0, qcs[0].rhs_value); + EXPECT_EQ(qc0_values, qcs[0].quadratic_values); + EXPECT_EQ(qc0_indices, qcs[0].quadratic_indices); + EXPECT_EQ(qc0_offsets, qcs[0].quadratic_offsets); EXPECT_EQ(1, qcs[1].constraint_row_index); - EXPECT_EQ(qc1_values, qcs[1].values); - EXPECT_EQ(qc1_indices, qcs[1].indices); - EXPECT_EQ(qc1_offsets, qcs[1].offsets); + EXPECT_EQ("QC1", qcs[1].constraint_row_name); + EXPECT_EQ('L', 
qcs[1].constraint_row_type); + EXPECT_EQ(qc1_linear_values, qcs[1].linear_values); + EXPECT_EQ(qc1_linear_indices, qcs[1].linear_indices); + EXPECT_EQ(10.0, qcs[1].rhs_value); + EXPECT_EQ(qc1_values, qcs[1].quadratic_values); + EXPECT_EQ(qc1_indices, qcs[1].quadratic_indices); + EXPECT_EQ(qc1_offsets, qcs[1].quadratic_offsets); +} + +// QCQP MPS: each quadratic constraint bundles row + linear + rhs + quadratic. +TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds) +{ + if (!file_exists("qcqp/QC_Test_1.mps")) { + GTEST_SKIP() << "qcqp/QC_Test_1.mps not in dataset root"; + } + const auto model = parse_mps( + cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/QC_Test_1.mps", false); + + ASSERT_TRUE(model.has_quadratic_constraints()); + const auto& qcs = model.get_quadratic_constraints(); + ASSERT_EQ(2u, qcs.size()); + + ASSERT_EQ(3, model.get_n_constraints()); + ASSERT_EQ(3u, model.get_row_names().size()); + EXPECT_EQ("LIN0", model.get_row_names()[0]); + EXPECT_EQ("QC0", model.get_row_names()[1]); + EXPECT_EQ("QC1", model.get_row_names()[2]); + EXPECT_EQ('L', model.get_row_types()[0]); + + // LIN0: 2*x1 + x2 ≤ 15 (linear row only; not duplicated in quadratic_constraints) + EXPECT_DOUBLE_EQ(-std::numeric_limits::infinity(), + model.get_constraint_lower_bounds()[0]); + EXPECT_DOUBLE_EQ(15.0, model.get_constraint_upper_bounds()[0]); + const auto& A_off = model.get_constraint_matrix_offsets(); + const auto& A_val = model.get_constraint_matrix_values(); + const auto& A_idx = model.get_constraint_matrix_indices(); + ASSERT_EQ(2, A_off[1] - A_off[0]); + EXPECT_EQ(2.0, A_val[A_off[0] + 0]); + EXPECT_EQ(1.0, A_val[A_off[0] + 1]); + EXPECT_EQ(0, A_idx[A_off[0] + 0]); + EXPECT_EQ(1, A_idx[A_off[0] + 1]); + + // QC0: x1 + x2 + xᵀQ₀x ≤ 5 (row index 1: OBJ 'N' rows are not counted in row_names) + EXPECT_EQ(1, qcs[0].constraint_row_index); + EXPECT_EQ("QC0", qcs[0].constraint_row_name); + EXPECT_EQ('L', qcs[0].constraint_row_type); + ASSERT_EQ(2u, qcs[0].linear_values.size()); + 
EXPECT_EQ(1.0, qcs[0].linear_values[0]); + EXPECT_EQ(1.0, qcs[0].linear_values[1]); + EXPECT_EQ(0, qcs[0].linear_indices[0]); + EXPECT_EQ(1, qcs[0].linear_indices[1]); + EXPECT_DOUBLE_EQ(5.0, qcs[0].rhs_value); + EXPECT_DOUBLE_EQ(-std::numeric_limits::infinity(), + model.get_constraint_lower_bounds()[qcs[0].constraint_row_index]); + EXPECT_DOUBLE_EQ(5.0, model.get_constraint_upper_bounds()[qcs[0].constraint_row_index]); + EXPECT_FALSE(qcs[0].quadratic_values.empty()); + + // QC1: 3*x1 + x2 + xᵀQ₁x ≤ 10 + EXPECT_EQ(2, qcs[1].constraint_row_index); + EXPECT_EQ("QC1", qcs[1].constraint_row_name); + EXPECT_EQ('L', qcs[1].constraint_row_type); + ASSERT_EQ(2u, qcs[1].linear_values.size()); + EXPECT_EQ(3.0, qcs[1].linear_values[0]); + EXPECT_EQ(1.0, qcs[1].linear_values[1]); + EXPECT_DOUBLE_EQ(10.0, qcs[1].rhs_value); + EXPECT_DOUBLE_EQ(-std::numeric_limits::infinity(), + model.get_constraint_lower_bounds()[qcs[1].constraint_row_index]); + EXPECT_DOUBLE_EQ(10.0, model.get_constraint_upper_bounds()[qcs[1].constraint_row_index]); +} + +TEST(qps_parser, qcqp_p0033_mps_sections) +{ + if (!file_exists("qcqp/p0033_qc1.mps")) { + GTEST_SKIP() << "qcqp/p0033_qc1.mps not in dataset root"; + } + const auto model = parse_mps( + cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps", false); + + EXPECT_EQ(16, model.get_n_constraints()); + EXPECT_EQ(33, model.get_n_variables()); + ASSERT_EQ(16u, model.get_row_types().size()); + ASSERT_EQ(16u, model.get_row_names().size()); + + const auto& rnames = model.get_row_names(); + auto qc1_it = std::find(rnames.begin(), rnames.end(), std::string("QC1")); + ASSERT_NE(qc1_it, rnames.end()); + const int qc1_row = static_cast(qc1_it - rnames.begin()); + + std::vector coeff; + std::vector vars; + const auto& qcs = model.get_quadratic_constraints(); + ASSERT_EQ(4u, qcs.size()); + EXPECT_EQ(qc1_row, qcs[0].constraint_row_index); + ASSERT_EQ(1u, qcs[0].linear_values.size()); + EXPECT_DOUBLE_EQ(1.0, qcs[0].linear_values[0]); + + const auto& 
vnames = model.get_variable_names(); + auto c159_it = std::find(vnames.begin(), vnames.end(), std::string("C159")); + ASSERT_NE(c159_it, vnames.end()); + EXPECT_EQ(static_cast(c159_it - vnames.begin()), qcs[0].linear_indices[0]); + + EXPECT_DOUBLE_EQ(1.0, qcs[0].rhs_value); + EXPECT_FALSE(qcs[0].quadratic_values.empty()); } // Test actual QPS files from the dataset From 0bd6499f4bae32fb9cbfa50021248dfc07d17f6b Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Mon, 20 Apr 2026 06:59:00 -0700 Subject: [PATCH 09/22] Add support for reading and writting quadratical constrains into MPS; note that we separate quadratical constraint from the constraint matrix A and RHS --- .../cpu_optimization_problem.hpp | 19 ++ .../optimization_problem.hpp | 21 ++ .../optimization_problem_interface.hpp | 48 ++++ .../optimization_problem_utils.hpp | 33 +++ .../include/mps_parser/data_model_view.hpp | 29 ++ .../include/mps_parser/mps_data_model.hpp | 30 ++- .../include/mps_parser/utilities/span.hpp | 2 + cpp/libmps_parser/src/data_model_view.cpp | 68 +++++ cpp/libmps_parser/src/mps_data_model.cpp | 50 +++- cpp/libmps_parser/src/mps_parser.cpp | 132 ++++++---- cpp/libmps_parser/src/mps_writer.cpp | 247 ++++++++++++++++-- cpp/libmps_parser/tests/mps_parser_test.cpp | 94 +++++-- cpp/src/pdlp/cpu_optimization_problem.cpp | 96 +++++++ cpp/src/pdlp/optimization_problem.cu | 92 ++++++- 14 files changed, 863 insertions(+), 98 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp b/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp index 009a8ce84e..2865fc877f 100644 --- a/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp +++ b/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp @@ -41,6 +41,8 @@ class mip_solution_interface_t; template class cpu_optimization_problem_t : public optimization_problem_interface_t { public: + using typename optimization_problem_interface_t::mps_quadratic_constraint_t; + 
cpu_optimization_problem_t(); // Setters @@ -113,6 +115,17 @@ class cpu_optimization_problem_t : public optimization_problem_interface_t& get_quadratic_objective_values() const override; bool has_quadratic_objective() const override; + void set_quadratic_constraints(std::vector constraints) override; + bool has_quadratic_constraints() const override; + const std::vector& get_quadratic_constraints() const override; + // Additional methods for MPS export if quadratic constraints are present + void set_linear_constraint_mps_indices(std::vector indices) override; + void set_mps_declaration_constraint_row_count(i_t count) override; + void set_mps_all_constraint_row_names(std::vector names) override; + i_t get_mps_declaration_constraint_row_count() const override; + const std::vector& get_linear_constraint_mps_indices() const override; + const std::vector& get_mps_all_constraint_row_names() const override; + // Host getters - these are the only supported getters for CPU implementation std::vector get_constraint_matrix_values_host() const override; std::vector get_constraint_matrix_indices_host() const override; @@ -185,6 +198,12 @@ class cpu_optimization_problem_t : public optimization_problem_interface_t Q_indices_; std::vector Q_values_; + std::vector quadratic_constraints_{}; + + std::vector linear_constraint_mps_indices_{}; + i_t mps_declaration_constraint_row_count_{0}; + std::vector mps_all_constraint_row_names_{}; + std::vector variable_lower_bounds_; std::vector variable_upper_bounds_; std::vector constraint_lower_bounds_; diff --git a/cpp/include/cuopt/linear_programming/optimization_problem.hpp b/cpp/include/cuopt/linear_programming/optimization_problem.hpp index df78dd17c7..27f5ac1168 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem.hpp @@ -72,6 +72,9 @@ class optimization_problem_t : public optimization_problem_interface_t static_assert(std::is_floating_point::value, 
"'optimization_problem_t' accepts only floating point types for weights"); + // nvcc does not always find base typedefs in derived class scope; inject explicitly. + using typename optimization_problem_interface_t::mps_quadratic_constraint_t; + /** * @brief A device-side view of the `optimization_problem_t` structure with * the RAII stuffs stripped out, to make it easy to work inside kernels @@ -196,6 +199,8 @@ class optimization_problem_t : public optimization_problem_interface_t i_t size_offsets, bool validate_positive_semi_definite = false) override; + void set_quadratic_constraints(std::vector constraints) override; + /** @copydoc optimization_problem_interface_t::set_variable_lower_bounds */ void set_variable_lower_bounds(const f_t* variable_lower_bounds, i_t size) override; /** @copydoc optimization_problem_interface_t::set_variable_upper_bounds */ @@ -259,7 +264,16 @@ class optimization_problem_t : public optimization_problem_interface_t const std::vector& get_quadratic_objective_offsets() const override; const std::vector& get_quadratic_objective_indices() const override; const std::vector& get_quadratic_objective_values() const override; + const std::vector& get_quadratic_constraints() const override; bool has_quadratic_objective() const override; + bool has_quadratic_constraints() const override; + + void set_linear_constraint_mps_indices(std::vector indices) override; + void set_mps_declaration_constraint_row_count(i_t count) override; + void set_mps_all_constraint_row_names(std::vector names) override; + i_t get_mps_declaration_constraint_row_count() const override; + const std::vector& get_linear_constraint_mps_indices() const override; + const std::vector& get_mps_all_constraint_row_names() const override; // ============================================================================ // Host getters @@ -376,6 +390,13 @@ class optimization_problem_t : public optimization_problem_interface_t std::vector Q_indices_; std::vector Q_values_; + /** QCQP: 
quadratic constraints **/ + std::vector quadratic_constraints_{}; + + std::vector linear_constraint_mps_indices_{}; + i_t mps_declaration_constraint_row_count_{0}; + std::vector mps_all_constraint_row_names_{}; + rmm::device_uvector variable_lower_bounds_; rmm::device_uvector variable_upper_bounds_; rmm::device_uvector constraint_lower_bounds_; diff --git a/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp b/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp index 767e62e746..c4ee81d875 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include @@ -56,8 +57,55 @@ class optimization_problem_interface_t { static_assert(std::is_floating_point::value, "'optimization_problem_interface_t' accepts only floating point types for weights"); + /** Quadratic constraints as parsed/stored for MPS QCQP (QCMATRIX rows). */ + using mps_quadratic_constraint_t = + typename mps_parser::mps_data_model_t::quadratic_constraint_t; + virtual ~optimization_problem_interface_t() = default; + /** + * @brief Store quadratic constraints for MPS round-trip (linear + Q parts per QC row). + * @note Default implementation ignores; GPU/CPU implementations persist for write_to_mps. + */ + virtual void set_quadratic_constraints(std::vector constraints) + { + (void)constraints; + } + + /** @brief Whether quadratic constraint metadata is present (for MPS export). */ + virtual bool has_quadratic_constraints() const { return false; } + + /** @brief Quadratic constraints for MPS export (empty if none). */ + virtual const std::vector& get_quadratic_constraints() const + { + static const std::vector k_empty{}; + return k_empty; + } + + /** + * @brief When QCMATRIX rows are omitted from the linear CSR, maps linear CSR row j to the MPS + * ROWS declaration index. Used for MPS export only. 
+ */ + virtual void set_linear_constraint_mps_indices(std::vector indices) { (void)indices; } + + virtual void set_mps_declaration_constraint_row_count(i_t count) { (void)count; } + + virtual void set_mps_all_constraint_row_names(std::vector names) { (void)names; } + + virtual i_t get_mps_declaration_constraint_row_count() const { return 0; } + + virtual const std::vector& get_linear_constraint_mps_indices() const + { + static const std::vector k_empty{}; + return k_empty; + } + + virtual const std::vector& get_mps_all_constraint_row_names() const + { + static const std::vector k_empty{}; + return k_empty; + } + // ============================================================================ // Setters (accept both CPU and GPU pointers) // ============================================================================ diff --git a/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp b/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp index 90e853f530..bd05b3c1b2 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp @@ -109,6 +109,24 @@ void populate_from_mps_data_model(optimization_problem_interface_t* pr q_offsets.data(), n_vars + 1); } + // Handle quadratic constraints if present + if (data_model.has_quadratic_constraints()) { + problem->set_quadratic_constraints( + std::vector::quadratic_constraint_t>( + data_model.get_quadratic_constraints())); + } + + if (data_model.get_mps_declaration_constraint_row_count() > 0) { + problem->set_linear_constraint_mps_indices( + std::vector(data_model.get_linear_constraint_mps_indices())); + problem->set_mps_declaration_constraint_row_count(data_model.get_mps_declaration_constraint_row_count()); + problem->set_mps_all_constraint_row_names( + std::vector(data_model.get_mps_all_constraint_row_names())); + } else { + problem->set_linear_constraint_mps_indices({}); + 
problem->set_mps_declaration_constraint_row_count(0); + problem->set_mps_all_constraint_row_names({}); + } } /** @@ -266,6 +284,21 @@ void populate_from_data_model_view(optimization_problem_interface_t* p if (data_model->get_row_names().size() != 0) { problem->set_row_names(data_model->get_row_names()); } + + if (data_model->get_mps_declaration_constraint_row_count() > 0) { + const auto lmi = data_model->get_linear_constraint_mps_indices(); + if (lmi.size() > 0) { + problem->set_linear_constraint_mps_indices(std::vector( + lmi.data(), lmi.data() + static_cast(lmi.size()))); + } + problem->set_mps_declaration_constraint_row_count(data_model->get_mps_declaration_constraint_row_count()); + problem->set_mps_all_constraint_row_names( + std::vector(data_model->get_mps_all_constraint_row_names())); + } else { + problem->set_linear_constraint_mps_indices({}); + problem->set_mps_declaration_constraint_row_count(0); + problem->set_mps_all_constraint_row_names({}); + } } } // namespace cuopt::linear_programming diff --git a/cpp/libmps_parser/include/mps_parser/data_model_view.hpp b/cpp/libmps_parser/include/mps_parser/data_model_view.hpp index c2a8f84980..9e2c0e112e 100644 --- a/cpp/libmps_parser/include/mps_parser/data_model_view.hpp +++ b/cpp/libmps_parser/include/mps_parser/data_model_view.hpp @@ -7,6 +7,7 @@ #pragma once +#include #include #include @@ -415,6 +416,27 @@ class data_model_view_t { */ bool is_Q_symmetrized() const noexcept; + /** + * @brief Quadratic constraints (MPS QCMATRIX); owned copy for writers when not using spans. 
+ */ + void set_quadratic_constraints( + std::vector::quadratic_constraint_t> constraints); + + bool has_quadratic_constraints() const noexcept; + + const std::vector::quadratic_constraint_t>& + get_quadratic_constraints() const noexcept; + + void set_linear_constraint_mps_indices(const i_t* indices, i_t size); + span get_linear_constraint_mps_indices() const noexcept; + + void set_mps_declaration_constraint_row_count(i_t count); + i_t get_mps_declaration_constraint_row_count() const noexcept; + + void set_mps_all_constraint_row_names(std::vector names); + + const std::vector& get_mps_all_constraint_row_names() const noexcept; + private: bool maximize_{false}; span A_; @@ -444,6 +466,13 @@ class data_model_view_t { span Q_objective_indices_; span Q_objective_offsets_; bool is_Q_symmetrized_{false}; + + std::vector::quadratic_constraint_t> quadratic_constraints_; + + span linear_mps_indices_{}; + std::vector linear_mps_indices_owned_{}; + i_t mps_declaration_constraint_row_count_{0}; + std::vector mps_all_constraint_row_names_{}; }; // class data_model_view_t } // namespace cuopt::mps_parser diff --git a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp index 50f95e2496..2a578c36c5 100644 --- a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp +++ b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp @@ -272,6 +272,7 @@ class mps_data_model_t { * - quadratic matrix Q in CSR (from QCMATRIX). */ struct quadratic_constraint_t { + /** ROWS declaration index (among all constraint rows), not an index into the linear CSR. */ i_t constraint_row_index{}; std::string constraint_row_name{}; char constraint_row_type{}; @@ -347,6 +348,25 @@ class mps_data_model_t { bool has_quadratic_constraints() const noexcept; + /** + * @brief When quadratic constraints are present, CSR rows are linear-only; entry j is the MPS + * ROWS declaration index for linear CSR row j. 
+ */ + void set_linear_constraint_mps_indices(std::vector indices); + + /** + * @brief ROWS names in declaration order (size == declaration row count). Used for MPS export + * when linear CSR excludes quadratic rows. + */ + void set_mps_all_constraint_row_names(std::vector names); + + /** @brief Total ROWS constraint count (linear + quadratic) when QC rows are separated; else 0. */ + void set_mps_declaration_constraint_row_count(i_t count); + + const std::vector& get_linear_constraint_mps_indices() const; + const std::vector& get_mps_all_constraint_row_names() const; + i_t get_mps_declaration_constraint_row_count() const; + /** whether to maximize or minimize the objective function */ bool maximize_; /** @@ -383,7 +403,10 @@ class mps_data_model_t { std::string problem_name_; /** names of each of the variables in the OP */ std::vector var_names_{}; - /** names of each of the rows (aka constraints or objective) in the OP */ + /** + * names of linear constraint rows only when QCMATRIX rows are separated; otherwise all constraint + * rows (same as MPS ROWS order excluding objective). + */ std::vector row_names_{}; /** number of variables */ i_t n_vars_{0}; @@ -405,6 +428,11 @@ class mps_data_model_t { /** One full quadratic constraint per QCMATRIX block, in order of appearance in the file */ std::vector quadratic_constraints_; + /** Maps linear CSR row j -> MPS ROWS declaration index; non-empty iff QC rows are split out. 
*/ + std::vector linear_constraint_mps_indices_{}; + i_t mps_declaration_constraint_row_count_{0}; + std::vector mps_all_constraint_row_names_{}; + }; // class mps_data_model_t } // namespace cuopt::mps_parser diff --git a/cpp/libmps_parser/include/mps_parser/utilities/span.hpp b/cpp/libmps_parser/include/mps_parser/utilities/span.hpp index 02679cd378..7ad4f25d4c 100644 --- a/cpp/libmps_parser/include/mps_parser/utilities/span.hpp +++ b/cpp/libmps_parser/include/mps_parser/utilities/span.hpp @@ -18,6 +18,8 @@ class span { span(T* ptr, std::size_t size) : ptr_(ptr), size_(size) {} std::size_t size() const noexcept { return size_; } const T* data() const noexcept { return ptr_; } + T& operator[](std::size_t i) noexcept { return ptr_[i]; } + T const& operator[](std::size_t i) const noexcept { return ptr_[i]; } private: T* ptr_{nullptr}; diff --git a/cpp/libmps_parser/src/data_model_view.cpp b/cpp/libmps_parser/src/data_model_view.cpp index 62b441aa60..be32eb3d3a 100644 --- a/cpp/libmps_parser/src/data_model_view.cpp +++ b/cpp/libmps_parser/src/data_model_view.cpp @@ -355,6 +355,74 @@ bool data_model_view_t::is_Q_symmetrized() const noexcept return is_Q_symmetrized_; } +template +void data_model_view_t::set_quadratic_constraints( + std::vector::quadratic_constraint_t> constraints) +{ + quadratic_constraints_ = std::move(constraints); +} + +template +bool data_model_view_t::has_quadratic_constraints() const noexcept +{ + return !quadratic_constraints_.empty(); +} + +template +const std::vector::quadratic_constraint_t>& +data_model_view_t::get_quadratic_constraints() const noexcept +{ + return quadratic_constraints_; +} + +template +void data_model_view_t::set_linear_constraint_mps_indices(const i_t* indices, i_t size) +{ + if (size != 0) { + mps_parser_expects( + indices != nullptr, error_type_t::ValidationError, "linear MPS indices cannot be null"); + } + linear_mps_indices_owned_.assign(indices, indices + size); + if (linear_mps_indices_owned_.empty()) { + 
linear_mps_indices_ = span{}; + } else { + linear_mps_indices_ = span(linear_mps_indices_owned_.data(), + static_cast(linear_mps_indices_owned_.size())); + } +} + +template +span data_model_view_t::get_linear_constraint_mps_indices() const noexcept +{ + return linear_mps_indices_; +} + +template +void data_model_view_t::set_mps_declaration_constraint_row_count(i_t count) +{ + mps_declaration_constraint_row_count_ = count; +} + +template +i_t data_model_view_t::get_mps_declaration_constraint_row_count() const noexcept +{ + return mps_declaration_constraint_row_count_; +} + +template +void data_model_view_t::set_mps_all_constraint_row_names(std::vector names) +{ + mps_all_constraint_row_names_ = std::move(names); +} + +template +const std::vector& data_model_view_t::get_mps_all_constraint_row_names() const + noexcept +{ + return mps_all_constraint_row_names_; +} + + // NOTE: Explicitly instantiate all types here in order to avoid linker error template class data_model_view_t; diff --git a/cpp/libmps_parser/src/mps_data_model.cpp b/cpp/libmps_parser/src/mps_data_model.cpp index 5feb405631..523a975447 100644 --- a/cpp/libmps_parser/src/mps_data_model.cpp +++ b/cpp/libmps_parser/src/mps_data_model.cpp @@ -220,6 +220,24 @@ void mps_data_model_t::set_quadratic_objective_matrix(const f_t* Q_val std::copy(Q_offsets, Q_offsets + size_offsets, Q_objective_offsets_.data()); } +template +void mps_data_model_t::set_linear_constraint_mps_indices(std::vector indices) +{ + linear_constraint_mps_indices_ = std::move(indices); +} + +template +void mps_data_model_t::set_mps_all_constraint_row_names(std::vector names) +{ + mps_all_constraint_row_names_ = std::move(names); +} + +template +void mps_data_model_t::set_mps_declaration_constraint_row_count(i_t count) +{ + mps_declaration_constraint_row_count_ = count; +} + template void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_index, const std::string& constraint_row_name, @@ -292,13 +310,6 @@ void 
mps_data_model_t::append_quadratic_constraint(i_t constraint_row_ quadratic_constraints_.push_back(std::move(qc)); } -template -auto mps_data_model_t::get_quadratic_constraints() const - -> const std::vector& -{ - return quadratic_constraints_; -} - template const std::vector& mps_data_model_t::get_constraint_matrix_values() const { @@ -534,6 +545,31 @@ std::vector& mps_data_model_t::get_quadratic_objective_offsets() return Q_objective_offsets_; } +template +const std::vector& mps_data_model_t::get_linear_constraint_mps_indices() const +{ + return linear_constraint_mps_indices_; +} + +template +const std::vector& mps_data_model_t::get_mps_all_constraint_row_names() const +{ + return mps_all_constraint_row_names_; +} + +template +i_t mps_data_model_t::get_mps_declaration_constraint_row_count() const +{ + return mps_declaration_constraint_row_count_; +} + +template +auto mps_data_model_t::get_quadratic_constraints() const + -> const std::vector& +{ + return quadratic_constraints_; +} + template bool mps_data_model_t::has_quadratic_objective() const noexcept { diff --git a/cpp/libmps_parser/src/mps_parser.cpp b/cpp/libmps_parser/src/mps_parser.cpp index 984fd213a4..1d31f2a9d8 100644 --- a/cpp/libmps_parser/src/mps_parser.cpp +++ b/cpp/libmps_parser/src/mps_parser.cpp @@ -271,6 +271,7 @@ ObjSenseType convert_to_obj_sense(const std::string& str) template void mps_parser_t::fill_problem(mps_data_model_t& problem) { + // count the row indices that are quadratic constraints std::unordered_set quadratic_row_ids{}; for (const auto& block : qcmatrix_blocks_) { quadratic_row_ids.insert(block.constraint_row_id); @@ -281,20 +282,19 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) std::vector h_values{}; h_offsets.push_back(0); + i_t num_linear_rows = 0; for (i_t i = 0; i < (i_t)A_indices.size(); ++i) { - i_t off = h_offsets.size() > 0 ? 
h_offsets[h_offsets.size() - 1] : 0; - // Keep quadratic-row linear coefficients out of global A; they are stored with each - // quadratic constraint object instead. - if (!quadratic_row_ids.count(i)) { - for (const auto& idx_itr : A_indices[i]) { - h_indices.push_back(idx_itr); - } - for (const auto& val_itr : A_values[i]) { - h_values.push_back(val_itr); - } - off += A_indices[i].size(); + // Quadratic constraint rows are omitted from the linear CSR; linear pieces live in each + // quadratic_constraint_t bundle. + if (quadratic_row_ids.count(i)) { continue; } + ++num_linear_rows; + for (const auto& idx_itr : A_indices[i]) { + h_indices.push_back(idx_itr); } - h_offsets.push_back(off); + for (const auto& val_itr : A_values[i]) { + h_values.push_back(val_itr); + } + h_offsets.push_back(static_cast(h_indices.size())); } problem.set_csr_constraint_matrix(h_values.data(), @@ -304,12 +304,13 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) h_offsets.data(), h_offsets.size()); - mps_parser_expects(A_indices.size() + 1 == h_offsets.size(), - error_type_t::ValidationError, - "The row indexing vector for the constraint matrix was not constructed " - "successfully. Should be size %zu, but was size %zu", - A_indices.size() + 1, - h_offsets.size()); + mps_parser_expects( + static_cast(num_linear_rows) + 1 == h_offsets.size(), + error_type_t::ValidationError, + "The row indexing vector for the constraint matrix was not constructed " + "successfully. 
Should be size %zu, but was size %zu", + static_cast(num_linear_rows) + 1, + h_offsets.size()); mps_parser_expects( h_indices.size() == h_values.size(), error_type_t::ValidationError, @@ -329,8 +330,13 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) } - // Set b & c - problem.set_constraint_bounds(b_values.data(), b_values.size()); + // Set b & c (RHS entries for quadratic rows are stored only on quadratic_constraint_t) + std::vector b_compacted{}; + b_compacted.reserve(b_values.size()); + for (i_t i = 0; i < (i_t)b_values.size(); ++i) { + if (!quadratic_row_ids.count(i)) { b_compacted.push_back(b_values[i]); } + } + problem.set_constraint_bounds(b_compacted.data(), static_cast(b_compacted.size())); problem.set_objective_coefficients(c_values.data(), c_values.size()); // Set offset and scaling factor of objective function @@ -352,22 +358,24 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) problem.get_variable_lower_bounds().size(), problem.get_variable_upper_bounds().size()); - // Determine the constraint bounds based on row types + // Determine the constraint bounds based on row types (quadratic rows use bundles only, not counted here) { std::vector h_constraint_lower_bounds{}; std::vector h_constraint_upper_bounds{}; for (i_t i = 0; i < (i_t)row_types.size(); ++i) { + if (quadratic_row_ids.count(i)) { continue; } if (row_types[i] == Equality) { h_constraint_lower_bounds.push_back(b_values[i]); h_constraint_upper_bounds.push_back(b_values[i]); + const size_t r = h_constraint_lower_bounds.size() - 1; if (ranges_values.size() > 0 && ranges_values[i] != unset_range_value) // Add range value if specified { - mps_parser_expects(!std::isnan(h_constraint_lower_bounds[i]), + mps_parser_expects(!std::isnan(h_constraint_lower_bounds[r]), error_type_t::ValidationError, "Constraints lower bound %d shouldn't be nan", i); - mps_parser_expects(!std::isnan(h_constraint_upper_bounds[i]), + mps_parser_expects(!std::isnan(h_constraint_upper_bounds[r]), 
error_type_t::ValidationError, "Constraints upper bound %d shouldn't be nan", i); @@ -376,17 +384,18 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) "Equality range value %d shouldn't be nan", i); if (ranges_values[i] < f_t(0)) - h_constraint_lower_bounds[i] = h_constraint_lower_bounds[i] + ranges_values[i]; + h_constraint_lower_bounds[r] = h_constraint_lower_bounds[r] + ranges_values[i]; else // Positive - h_constraint_upper_bounds[i] = h_constraint_upper_bounds[i] + ranges_values[i]; + h_constraint_upper_bounds[r] = h_constraint_upper_bounds[r] + ranges_values[i]; } } else if (row_types[i] == GreaterThanOrEqual) { h_constraint_lower_bounds.push_back(b_values[i]); h_constraint_upper_bounds.push_back(std::numeric_limits::infinity()); + const size_t r = h_constraint_lower_bounds.size() - 1; if (ranges_values.size() > 0 && ranges_values[i] != unset_range_value) // Add range value if specified { - mps_parser_expects(!std::isnan(h_constraint_lower_bounds[i]), + mps_parser_expects(!std::isnan(h_constraint_lower_bounds[r]), error_type_t::ValidationError, "Constraints lower bound %d shouldn't be nan", i); @@ -394,15 +403,16 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) error_type_t::ValidationError, "Greater range value %d shouldn't be nan", i); - h_constraint_upper_bounds[i] = h_constraint_lower_bounds[i] + std::abs(ranges_values[i]); + h_constraint_upper_bounds[r] = h_constraint_lower_bounds[r] + std::abs(ranges_values[i]); } } else if (row_types[i] == LesserThanOrEqual) { h_constraint_lower_bounds.push_back(-std::numeric_limits::infinity()); h_constraint_upper_bounds.push_back(b_values[i]); + const size_t r = h_constraint_lower_bounds.size() - 1; if (ranges_values.size() > 0 && ranges_values[i] != unset_range_value) // Add range value if specified { - mps_parser_expects(!std::isnan(h_constraint_upper_bounds[i]), + mps_parser_expects(!std::isnan(h_constraint_upper_bounds[r]), error_type_t::ValidationError, "Constraints upper bound %d 
shouldn't be nan", i); @@ -410,17 +420,18 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) error_type_t::ValidationError, "Lesser range value %d shouldn't be nan", i); - h_constraint_lower_bounds[i] = h_constraint_upper_bounds[i] - std::abs(ranges_values[i]); + h_constraint_lower_bounds[r] = h_constraint_upper_bounds[r] - std::abs(ranges_values[i]); } } else { mps_parser_expects(false, error_type_t::ValidationError, "Unsupported row type was passed to the Optimization Problem"); } + const size_t r = h_constraint_lower_bounds.size() - 1; mps_parser_expects( - !std::isnan(h_constraint_lower_bounds[i]), error_type_t::ValidationError, "Cannot be nan"); + !std::isnan(h_constraint_lower_bounds[r]), error_type_t::ValidationError, "Cannot be nan"); mps_parser_expects( - !std::isnan(h_constraint_upper_bounds[i]), error_type_t::ValidationError, "Cannot be nan"); + !std::isnan(h_constraint_upper_bounds[r]), error_type_t::ValidationError, "Cannot be nan"); } problem.set_constraint_lower_bounds(h_constraint_lower_bounds.data(), @@ -447,14 +458,6 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) problem.set_objective_name(objective_name); problem.set_variable_names(std::move(var_names)); problem.set_variable_types(std::move(var_types)); - problem.set_row_names(std::move(row_names)); - { - std::vector row_types_host(row_types.size()); - for (size_t i = 0; i < row_types.size(); ++i) { - row_types_host[i] = static_cast(row_types[i]); - } - problem.set_row_types(row_types_host.data(), static_cast(row_types_host.size())); - } problem.set_maximize(maximize); // Helper function to build CSR format using double transpose (O(m+n+nnz) instead of O(nnz*log(nnz))) @@ -554,23 +557,24 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) auto csr_result = build_csr_via_transpose( block.entries, num_vars_for_quad, num_vars_for_quad, false, k_qcmatrix_value_scale); const i_t row_id = block.constraint_row_id; - mps_parser_expects(row_id >= 0 && row_id < 
problem.get_n_constraints(), - error_type_t::ValidationError, - "QCMATRIX row index %d is out of range for constraints", - static_cast(row_id)); + mps_parser_expects( + row_id >= 0 && row_id < static_cast(row_types.size()), + error_type_t::ValidationError, + "QCMATRIX row index %d is out of range for constraints", + static_cast(row_id)); const i_t linear_nnz = static_cast(A_indices[row_id].size()); const f_t* linear_val = linear_nnz > 0 ? A_values[row_id].data() : nullptr; const i_t* linear_idx = linear_nnz > 0 ? A_indices[row_id].data() : nullptr; problem.append_quadratic_constraint( row_id, - problem.get_row_names()[row_id], - problem.get_row_types()[row_id], + row_names[row_id], + static_cast(row_types[row_id]), linear_val, linear_nnz, linear_idx, linear_nnz, - problem.get_constraint_bounds()[row_id], + b_values[row_id], csr_result.values.data(), static_cast(csr_result.values.size()), csr_result.indices.data(), @@ -578,6 +582,42 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) csr_result.offsets.data(), static_cast(csr_result.offsets.size())); } + + std::vector linear_mps_indices{}; + linear_mps_indices.reserve(row_types.size()); + for (i_t i = 0; i < static_cast(row_types.size()); ++i) { + if (!quadratic_row_ids.count(i)) { linear_mps_indices.push_back(i); } + } + + if (!quadratic_row_ids.empty()) { + problem.set_linear_constraint_mps_indices(std::move(linear_mps_indices)); + problem.set_mps_declaration_constraint_row_count(static_cast(row_names.size())); + problem.set_mps_all_constraint_row_names( + std::vector(row_names.begin(), row_names.end())); + + std::vector linear_row_names{}; + std::vector row_types_linear{}; + linear_row_names.reserve(row_names.size()); + row_types_linear.reserve(row_names.size()); + for (size_t i = 0; i < row_names.size(); ++i) { + if (!quadratic_row_ids.count(static_cast(i))) { + linear_row_names.push_back(row_names[i]); + row_types_linear.push_back(static_cast(row_types[i])); + } + } + 
problem.set_row_names(std::move(linear_row_names)); + problem.set_row_types(row_types_linear.data(), static_cast(row_types_linear.size())); + } else { + problem.set_linear_constraint_mps_indices({}); + problem.set_mps_declaration_constraint_row_count(0); + problem.set_mps_all_constraint_row_names({}); + std::vector row_types_host(row_types.size()); + for (size_t i = 0; i < row_types.size(); ++i) { + row_types_host[i] = static_cast(row_types[i]); + } + problem.set_row_names(std::move(row_names)); + problem.set_row_types(row_types_host.data(), static_cast(row_types_host.size())); + } } template diff --git a/cpp/libmps_parser/src/mps_writer.cpp b/cpp/libmps_parser/src/mps_writer.cpp index 3a0997774b..41771d7be6 100644 --- a/cpp/libmps_parser/src/mps_writer.cpp +++ b/cpp/libmps_parser/src/mps_writer.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -19,9 +20,30 @@ #include #include #include +#include +#include namespace cuopt::mps_parser { +namespace { + +template +char linear_row_type_from_bounds(f_t cl, f_t cu) +{ + if (cl == cu) { return 'E'; } + if (std::isinf(cu)) { return 'G'; } + return 'L'; +} + +inline int linear_csr_row_for_mps_decl_index( + std::unordered_map const& linear_csr_row_for_mps_decl, size_t decl_k) +{ + auto const it = linear_csr_row_for_mps_decl.find(decl_k); + return it == linear_csr_row_for_mps_decl.end() ? 
-1 : it->second; +} + +} // namespace + template mps_writer_t::mps_writer_t(const data_model_view_t& problem) : problem_(problem) { @@ -103,6 +125,25 @@ data_model_view_t mps_writer_t::create_view( static_cast(Q_offsets.size())); } + if (model.has_quadratic_constraints()) { + view.set_quadratic_constraints( + std::vector::quadratic_constraint_t>( + model.get_quadratic_constraints())); + } + + if (model.get_mps_declaration_constraint_row_count() > 0) { + view.set_mps_declaration_constraint_row_count(model.get_mps_declaration_constraint_row_count()); + const auto& lmi = model.get_linear_constraint_mps_indices(); + if (!lmi.empty()) { + view.set_linear_constraint_mps_indices(lmi.data(), static_cast(lmi.size())); + } + const auto& all_names = model.get_mps_all_constraint_row_names(); + if (!all_names.empty()) { + view.set_mps_all_constraint_row_names( + std::vector(all_names.begin(), all_names.end())); + } + } + return view; } @@ -130,6 +171,26 @@ void mps_writer_t::write(const std::string& mps_file_path) else n_constraints = problem_.get_constraint_lower_bounds().size(); + const bool qc_rows_separated = + problem_.get_mps_declaration_constraint_row_count() > 0 && + problem_.get_linear_constraint_mps_indices().size() > 0 && + !problem_.get_mps_all_constraint_row_names().empty(); + + i_t n_mps_declaration_rows = n_constraints; + std::unordered_map linear_csr_row_for_mps_decl; + if (qc_rows_separated) { + n_mps_declaration_rows = problem_.get_mps_declaration_constraint_row_count(); + span const ltd = problem_.get_linear_constraint_mps_indices(); + linear_csr_row_for_mps_decl.reserve(static_cast(ltd.size())); + size_t const n_decl_sz = static_cast(n_mps_declaration_rows); + for (size_t j = 0; j < ltd.size(); ++j) { + i_t const decl_row = ltd[j]; + if (decl_row < 0) { continue; } + size_t const d = static_cast(decl_row); + if (d < n_decl_sz) { linear_csr_row_for_mps_decl.try_emplace(d, static_cast(j)); } + } + } + std::vector 
objective_coefficients(problem_.get_objective_coefficients().size()); std::vector constraint_lower_bounds(n_constraints); std::vector constraint_upper_bounds(n_constraints); @@ -206,20 +267,65 @@ void mps_writer_t::write(const std::string& mps_file_path) if (problem_.get_sense()) { mps_file << "OBJSENSE\n MAXIMIZE\n"; } + // sort the quadratic constraints by the constraint row index, useful for both ROWS and RHS sections + using qc_t = typename mps_data_model_t::quadratic_constraint_t; + std::vector qcs_by_decl_row; + if (qc_rows_separated) { + std::vector const& qcs = problem_.get_quadratic_constraints(); + size_t const n_decl = static_cast(n_mps_declaration_rows); + qcs_by_decl_row.reserve(qcs.size()); + for (qc_t const& qc : qcs) { + size_t const idx = static_cast(qc.constraint_row_index); + if (idx < n_decl) { qcs_by_decl_row.push_back(&qc); } + } + std::stable_sort(qcs_by_decl_row.begin(), + qcs_by_decl_row.end(), + [](qc_t const* lhs, qc_t const* rhs) { + return lhs->constraint_row_index < rhs->constraint_row_index; + }); + } + // ROWS section mps_file << "ROWS\n"; mps_file << " N " << (problem_.get_objective_name().empty() ? "OBJ" : problem_.get_objective_name()) << "\n"; - for (size_t i = 0; i < (size_t)n_constraints; i++) { - std::string row_name = - i < problem_.get_row_names().size() ? problem_.get_row_names()[i] : "R" + std::to_string(i); - char type = 'L'; - if (constraint_lower_bounds[i] == constraint_upper_bounds[i]) - type = 'E'; - else if (std::isinf(constraint_upper_bounds[i])) - type = 'G'; - mps_file << " " << type << " " << row_name << "\n"; + if (!qc_rows_separated) { + for (size_t k = 0; k < (size_t)n_mps_declaration_rows; ++k) { + std::string row_name = k < problem_.get_row_names().size() ? 
problem_.get_row_names()[k] + : "R" + std::to_string(k); + char const type = + linear_row_type_from_bounds(constraint_lower_bounds[k], constraint_upper_bounds[k]); + mps_file << " " << type << " " << row_name << "\n"; + } + } else { + size_t const n_decl = static_cast(n_mps_declaration_rows); + const auto& alln = problem_.get_mps_all_constraint_row_names(); + size_t qc_idx = 0; + for (size_t k = 0; k < n_decl; ++k) { + std::string row_name = alln[k]; + char type = 'L'; + // find the quadratic constraint that corresponds to the current row + while (qc_idx < qcs_by_decl_row.size() && + static_cast(qcs_by_decl_row[qc_idx]->constraint_row_index) < k) { + ++qc_idx; + } + qc_t const* const qc_match = + (qc_idx < qcs_by_decl_row.size() && + static_cast(qcs_by_decl_row[qc_idx]->constraint_row_index) == k) + ? qcs_by_decl_row[qc_idx] + : nullptr; + if (qc_match != nullptr) { + type = qc_match->constraint_row_type; + } else { + int const lj = linear_csr_row_for_mps_decl_index(linear_csr_row_for_mps_decl, k); + if (lj >= 0) { + size_t const j = static_cast(lj); + type = linear_row_type_from_bounds(constraint_lower_bounds[j], constraint_upper_bounds[j]); + } + } + mps_file << " " << type << " " << row_name << "\n"; + } } // COLUMNS section @@ -230,9 +336,13 @@ void mps_writer_t::write(const std::string& mps_file_path) std::vector var_in_constraint(n_variables, false); std::map>> integral_col_nnzs; std::map>> continuous_col_nnzs; - for (size_t row_id = 0; row_id < (size_t)n_constraints; row_id++) { - for (size_t k = (size_t)constraint_matrix_offsets[row_id]; - k < (size_t)constraint_matrix_offsets[row_id + 1]; + + // iterate over the constraint matrix and add the nonzeros to the integral and continuous col_nnzs maps + for (size_t csr_row = 0; csr_row < (size_t)n_constraints; csr_row++) { + const i_t row_id = + qc_rows_separated ? 
problem_.get_linear_constraint_mps_indices()[csr_row] : static_cast(csr_row); + for (size_t k = (size_t)constraint_matrix_offsets[csr_row]; + k < (size_t)constraint_matrix_offsets[csr_row + 1]; k++) { size_t var = (size_t)constraint_matrix_indices[k]; if (variable_types[var] == 'I') { @@ -244,6 +354,23 @@ void mps_writer_t::write(const std::string& mps_file_path) } } + // Quadratic constraint rows omit linear coefficients from global A; add them from QC bundles. + if (problem_.has_quadratic_constraints()) { + for (const auto& qc : problem_.get_quadratic_constraints()) { + const size_t row_id = static_cast(qc.constraint_row_index); + for (size_t t = 0; t < qc.linear_indices.size(); ++t) { + size_t var = static_cast(qc.linear_indices[t]); + f_t val = qc.linear_values[t]; + if (variable_types[var] == 'I') { + integral_col_nnzs[var].emplace_back(row_id, val); + } else { + continuous_col_nnzs[var].emplace_back(row_id, val); + } + var_in_constraint[var] = true; + } + } + } + // Record and explicitely declared variables not contained in any constraint. std::vector orphan_continuous_vars; std::vector orphan_integer_vars; @@ -276,9 +403,15 @@ void mps_writer_t::write(const std::string& mps_file_path) ? problem_.get_variable_names()[var_id] : "C" + std::to_string(var_id); for (auto& nnz : nnzs) { - std::string row_name = nnz.first < problem_.get_row_names().size() - ? 
problem_.get_row_names()[nnz.first] - : "R" + std::to_string(nnz.first); + std::string row_name; + if (qc_rows_separated && + static_cast(nnz.first) < problem_.get_mps_all_constraint_row_names().size()) { + row_name = problem_.get_mps_all_constraint_row_names()[nnz.first]; + } else if (static_cast(nnz.first) < problem_.get_row_names().size()) { + row_name = problem_.get_row_names()[nnz.first]; + } else { + row_name = "R" + std::to_string(nnz.first); + } mps_file << " " << col_name << " " << row_name << " " << nnz.second << "\n"; } // Write objective coefficients @@ -293,19 +426,54 @@ void mps_writer_t::write(const std::string& mps_file_path) // RHS section mps_file << "RHS\n"; - for (size_t i = 0; i < (size_t)n_constraints; i++) { - std::string row_name = - i < problem_.get_row_names().size() ? problem_.get_row_names()[i] : "R" + std::to_string(i); - - f_t rhs; - if (constraint_bounds.size() > 0) - rhs = constraint_bounds[i]; - else if (std::isinf(constraint_lower_bounds[i])) { - rhs = constraint_upper_bounds[i]; - } else if (std::isinf(constraint_upper_bounds[i])) { - rhs = constraint_lower_bounds[i]; - } else { // RANGES, encode the lower bound - rhs = constraint_lower_bounds[i]; + size_t qc_idx_rhs = 0; + for (size_t k = 0; k < (size_t)n_mps_declaration_rows; ++k) { + std::string row_name; + f_t rhs{0}; + + if (!qc_rows_separated) { + row_name = k < problem_.get_row_names().size() ? 
problem_.get_row_names()[k] + : "R" + std::to_string(k); + if (constraint_bounds.size() > 0) + rhs = constraint_bounds[k]; + else if (std::isinf(constraint_lower_bounds[k])) { + rhs = constraint_upper_bounds[k]; + } else if (std::isinf(constraint_upper_bounds[k])) { + rhs = constraint_lower_bounds[k]; + } else { + rhs = constraint_lower_bounds[k]; + } + } else { + const auto& alln = problem_.get_mps_all_constraint_row_names(); + row_name = alln[k]; + while (qc_idx_rhs < qcs_by_decl_row.size() && + static_cast(qcs_by_decl_row[qc_idx_rhs]->constraint_row_index) < k) { + ++qc_idx_rhs; + } + qc_t const* const qc_match = + (qc_idx_rhs < qcs_by_decl_row.size() && + static_cast(qcs_by_decl_row[qc_idx_rhs]->constraint_row_index) == k) + ? qcs_by_decl_row[qc_idx_rhs] + : nullptr; + if (qc_match != nullptr) { + rhs = qc_match->rhs_value; + } else { + int const lj = linear_csr_row_for_mps_decl_index(linear_csr_row_for_mps_decl, k); + mps_parser_expects(lj >= 0, + error_type_t::ValidationError, + "RHS row %zu has no linear or quadratic mapping", + k); + const size_t j = static_cast(lj); + if (constraint_bounds.size() > 0) + rhs = constraint_bounds[j]; + else if (std::isinf(constraint_lower_bounds[j])) { + rhs = constraint_upper_bounds[j]; + } else if (std::isinf(constraint_upper_bounds[j])) { + rhs = constraint_lower_bounds[j]; + } else { + rhs = constraint_lower_bounds[j]; + } + } } if (std::isfinite(rhs) && rhs != 0.0) { @@ -427,6 +595,29 @@ void mps_writer_t::write(const std::string& mps_file_path) } } + // QCMATRIX sections for quadratic constraints (QCQP) + if (problem_.has_quadratic_constraints()) { + for (const auto& qc : problem_.get_quadratic_constraints()) { + mps_file << "QCMATRIX " << qc.constraint_row_name << "\n"; + const i_t n_quad_rows = static_cast(qc.quadratic_offsets.size()) - 1; + for (i_t i = 0; i < n_quad_rows; ++i) { + std::string row_var_name = static_cast(i) < problem_.get_variable_names().size() + ? 
problem_.get_variable_names()[i] + : "C" + std::to_string(i); + for (i_t p = qc.quadratic_offsets[i]; p < qc.quadratic_offsets[i + 1]; ++p) { + i_t j = qc.quadratic_indices[p]; + f_t v = qc.quadratic_values[p]; + std::string col_var_name = static_cast(j) < problem_.get_variable_names().size() + ? problem_.get_variable_names()[j] + : "C" + std::to_string(j); + if (v != f_t(0)) { + mps_file << " " << row_var_name << " " << col_var_name << " " << v << "\n"; + } + } + } + } + } + mps_file << "ENDATA\n"; mps_file.close(); } diff --git a/cpp/libmps_parser/tests/mps_parser_test.cpp b/cpp/libmps_parser/tests/mps_parser_test.cpp index dd1376621b..5689260a5b 100644 --- a/cpp/libmps_parser/tests/mps_parser_test.cpp +++ b/cpp/libmps_parser/tests/mps_parser_test.cpp @@ -956,12 +956,17 @@ TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds) const auto& qcs = model.get_quadratic_constraints(); ASSERT_EQ(2u, qcs.size()); - ASSERT_EQ(3, model.get_n_constraints()); - ASSERT_EQ(3u, model.get_row_names().size()); + ASSERT_EQ(1, model.get_n_constraints()); + ASSERT_EQ(1u, model.get_row_names().size()); EXPECT_EQ("LIN0", model.get_row_names()[0]); - EXPECT_EQ("QC0", model.get_row_names()[1]); - EXPECT_EQ("QC1", model.get_row_names()[2]); EXPECT_EQ('L', model.get_row_types()[0]); + ASSERT_EQ(3, model.get_mps_declaration_constraint_row_count()); + ASSERT_EQ(3u, model.get_mps_all_constraint_row_names().size()); + EXPECT_EQ("LIN0", model.get_mps_all_constraint_row_names()[0]); + EXPECT_EQ("QC0", model.get_mps_all_constraint_row_names()[1]); + EXPECT_EQ("QC1", model.get_mps_all_constraint_row_names()[2]); + ASSERT_EQ(1u, model.get_linear_constraint_mps_indices().size()); + EXPECT_EQ(0, model.get_linear_constraint_mps_indices()[0]); // LIN0: 2*x1 + x2 ≤ 15 (linear row only; not duplicated in quadratic_constraints) EXPECT_DOUBLE_EQ(-std::numeric_limits::infinity(), @@ -976,7 +981,7 @@ TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds) EXPECT_EQ(0, A_idx[A_off[0] + 0]); EXPECT_EQ(1, 
A_idx[A_off[0] + 1]); - // QC0: x1 + x2 + xᵀQ₀x ≤ 5 (row index 1: OBJ 'N' rows are not counted in row_names) + // QC0: x1 + x2 + xᵀQ₀x ≤ 5 (MPS ROWS declaration index 1; OBJ 'N' rows are not counted) EXPECT_EQ(1, qcs[0].constraint_row_index); EXPECT_EQ("QC0", qcs[0].constraint_row_name); EXPECT_EQ('L', qcs[0].constraint_row_type); @@ -986,9 +991,6 @@ TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds) EXPECT_EQ(0, qcs[0].linear_indices[0]); EXPECT_EQ(1, qcs[0].linear_indices[1]); EXPECT_DOUBLE_EQ(5.0, qcs[0].rhs_value); - EXPECT_DOUBLE_EQ(-std::numeric_limits::infinity(), - model.get_constraint_lower_bounds()[qcs[0].constraint_row_index]); - EXPECT_DOUBLE_EQ(5.0, model.get_constraint_upper_bounds()[qcs[0].constraint_row_index]); EXPECT_FALSE(qcs[0].quadratic_values.empty()); // QC1: 3*x1 + x2 + xᵀQ₁x ≤ 10 @@ -999,9 +1001,6 @@ TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds) EXPECT_EQ(3.0, qcs[1].linear_values[0]); EXPECT_EQ(1.0, qcs[1].linear_values[1]); EXPECT_DOUBLE_EQ(10.0, qcs[1].rhs_value); - EXPECT_DOUBLE_EQ(-std::numeric_limits::infinity(), - model.get_constraint_lower_bounds()[qcs[1].constraint_row_index]); - EXPECT_DOUBLE_EQ(10.0, model.get_constraint_upper_bounds()[qcs[1].constraint_row_index]); } TEST(qps_parser, qcqp_p0033_mps_sections) @@ -1012,12 +1011,14 @@ TEST(qps_parser, qcqp_p0033_mps_sections) const auto model = parse_mps( cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps", false); - EXPECT_EQ(16, model.get_n_constraints()); + EXPECT_EQ(12, model.get_n_constraints()); EXPECT_EQ(33, model.get_n_variables()); - ASSERT_EQ(16u, model.get_row_types().size()); - ASSERT_EQ(16u, model.get_row_names().size()); + ASSERT_EQ(12u, model.get_row_types().size()); + ASSERT_EQ(12u, model.get_row_names().size()); + ASSERT_EQ(16, model.get_mps_declaration_constraint_row_count()); + ASSERT_EQ(16u, model.get_mps_all_constraint_row_names().size()); - const auto& rnames = model.get_row_names(); + const auto& rnames = 
model.get_mps_all_constraint_row_names(); auto qc1_it = std::find(rnames.begin(), rnames.end(), std::string("QC1")); ASSERT_NE(qc1_it, rnames.end()); const int qc1_row = static_cast(qc1_it - rnames.begin()); @@ -1201,6 +1202,46 @@ void compare_data_models(const mps_data_model_t& original, EXPECT_EQ(orig_Q_off[i], reload_Q_off[i]) << "Q offset mismatch at index " << i; } } + + EXPECT_EQ(original.has_quadratic_constraints(), reloaded.has_quadratic_constraints()); + if (original.has_quadratic_constraints() && reloaded.has_quadratic_constraints()) { + const auto& oqc = original.get_quadratic_constraints(); + const auto& rq = reloaded.get_quadratic_constraints(); + ASSERT_EQ(oqc.size(), rq.size()) << "Quadratic constraint count mismatch"; + for (size_t k = 0; k < oqc.size(); ++k) { + EXPECT_EQ(oqc[k].constraint_row_index, rq[k].constraint_row_index); + EXPECT_EQ(oqc[k].constraint_row_name, rq[k].constraint_row_name); + EXPECT_EQ(oqc[k].constraint_row_type, rq[k].constraint_row_type); + EXPECT_NEAR(oqc[k].rhs_value, rq[k].rhs_value, tol); + ASSERT_EQ(oqc[k].linear_values.size(), rq[k].linear_values.size()); + ASSERT_EQ(oqc[k].linear_indices.size(), rq[k].linear_indices.size()); + for (size_t i = 0; i < oqc[k].linear_values.size(); ++i) { + EXPECT_NEAR(oqc[k].linear_values[i], rq[k].linear_values[i], tol); + EXPECT_EQ(oqc[k].linear_indices[i], rq[k].linear_indices[i]); + } + ASSERT_EQ(oqc[k].quadratic_values.size(), rq[k].quadratic_values.size()); + ASSERT_EQ(oqc[k].quadratic_indices.size(), rq[k].quadratic_indices.size()); + ASSERT_EQ(oqc[k].quadratic_offsets.size(), rq[k].quadratic_offsets.size()); + for (size_t i = 0; i < oqc[k].quadratic_values.size(); ++i) { + EXPECT_NEAR(oqc[k].quadratic_values[i], rq[k].quadratic_values[i], tol); + } + for (size_t i = 0; i < oqc[k].quadratic_indices.size(); ++i) { + EXPECT_EQ(oqc[k].quadratic_indices[i], rq[k].quadratic_indices[i]); + } + for (size_t i = 0; i < oqc[k].quadratic_offsets.size(); ++i) { + 
EXPECT_EQ(oqc[k].quadratic_offsets[i], rq[k].quadratic_offsets[i]); + } + } + } + + EXPECT_EQ(original.get_mps_declaration_constraint_row_count(), + reloaded.get_mps_declaration_constraint_row_count()); + EXPECT_EQ(original.get_linear_constraint_mps_indices(), reloaded.get_linear_constraint_mps_indices()); + ASSERT_EQ(original.get_mps_all_constraint_row_names().size(), + reloaded.get_mps_all_constraint_row_names().size()); + for (size_t i = 0; i < original.get_mps_all_constraint_row_names().size(); ++i) { + EXPECT_EQ(original.get_mps_all_constraint_row_names()[i], reloaded.get_mps_all_constraint_row_names()[i]); + } } TEST(mps_roundtrip, linear_programming_basic) @@ -1311,4 +1352,27 @@ TEST(mps_roundtrip, quadratic_programming_qp_test_2) std::filesystem::remove(temp_file); } +TEST(mps_roundtrip, qcqp_p0033_qc1) +{ + if (!file_exists("qcqp/p0033_qc1.mps")) { + GTEST_SKIP() << "Test file not found"; + } + + std::string input_file = + cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps"; + std::string temp_file = "/tmp/mps_roundtrip_p0033_qc1.mps"; + + auto original = parse_mps(input_file, false); + ASSERT_TRUE(original.has_quadratic_objective()); + ASSERT_TRUE(original.has_quadratic_constraints()); + + mps_writer_t writer(original); + writer.write(temp_file); + + auto reloaded = parse_mps(temp_file, false); + compare_data_models(original, reloaded); + + std::filesystem::remove(temp_file); +} + } // namespace cuopt::mps_parser diff --git a/cpp/src/pdlp/cpu_optimization_problem.cpp b/cpp/src/pdlp/cpu_optimization_problem.cpp index 406b0b6541..4bbe5ef545 100644 --- a/cpp/src/pdlp/cpu_optimization_problem.cpp +++ b/cpp/src/pdlp/cpu_optimization_problem.cpp @@ -133,6 +133,52 @@ void cpu_optimization_problem_t::set_quadratic_objective_matrix( std::copy(Q_offsets, Q_offsets + size_offsets, Q_offsets_.begin()); } +template +void cpu_optimization_problem_t::set_quadratic_constraints( + std::vector::mps_quadratic_constraint_t> + constraints) +{ + 
quadratic_constraints_ = std::move(constraints); +} + +template +void cpu_optimization_problem_t::set_linear_constraint_mps_indices(std::vector indices) +{ + linear_constraint_mps_indices_ = std::move(indices); +} + +template +void cpu_optimization_problem_t::set_mps_declaration_constraint_row_count(i_t count) +{ + mps_declaration_constraint_row_count_ = count; +} + +template +void cpu_optimization_problem_t::set_mps_all_constraint_row_names( + std::vector names) +{ + mps_all_constraint_row_names_ = std::move(names); +} + +template +i_t cpu_optimization_problem_t::get_mps_declaration_constraint_row_count() const +{ + return mps_declaration_constraint_row_count_; +} + +template +const std::vector& cpu_optimization_problem_t::get_linear_constraint_mps_indices() const +{ + return linear_constraint_mps_indices_; +} + +template +const std::vector& cpu_optimization_problem_t::get_mps_all_constraint_row_names() + const +{ + return mps_all_constraint_row_names_; +} + template void cpu_optimization_problem_t::set_variable_lower_bounds( const f_t* variable_lower_bounds, i_t size) @@ -494,6 +540,19 @@ bool cpu_optimization_problem_t::has_quadratic_objective() const return !Q_values_.empty(); } +template +const std::vector::mps_quadratic_constraint_t>& +cpu_optimization_problem_t::get_quadratic_constraints() const +{ + return quadratic_constraints_; +} + +template +bool cpu_optimization_problem_t::has_quadratic_constraints() const +{ + return !quadratic_constraints_.empty(); +} + // ============================================================================== // Host Getters (return references to CPU memory) // ============================================================================== @@ -621,6 +680,20 @@ cpu_optimization_problem_t::to_optimization_problem(raft::handle_t con Q_offsets_.size()); } + if (!quadratic_constraints_.empty()) { + gpu_problem->set_quadratic_constraints( + std::vector::mps_quadratic_constraint_t>( + quadratic_constraints_)); + } + + if 
(mps_declaration_constraint_row_count_ > 0) { + gpu_problem->set_linear_constraint_mps_indices( + std::vector(linear_constraint_mps_indices_)); + gpu_problem->set_mps_declaration_constraint_row_count(mps_declaration_constraint_row_count_); + gpu_problem->set_mps_all_constraint_row_names( + std::vector(mps_all_constraint_row_names_)); + } + // Set variable bounds if (!variable_lower_bounds_.empty()) { gpu_problem->set_variable_lower_bounds(variable_lower_bounds_.data(), @@ -740,6 +813,25 @@ void cpu_optimization_problem_t::write_to_mps(const std::string& mps_f false); } + if (has_quadratic_constraints()) { + data_model_view.set_quadratic_constraints( + std::vector::mps_quadratic_constraint_t>( + get_quadratic_constraints())); + } + + if (get_mps_declaration_constraint_row_count() > 0) { + data_model_view.set_mps_declaration_constraint_row_count(get_mps_declaration_constraint_row_count()); + if (!get_linear_constraint_mps_indices().empty()) { + data_model_view.set_linear_constraint_mps_indices(get_linear_constraint_mps_indices().data(), + static_cast( + get_linear_constraint_mps_indices().size())); + } + if (!get_mps_all_constraint_row_names().empty()) { + data_model_view.set_mps_all_constraint_row_names( + std::vector(get_mps_all_constraint_row_names())); + } + } + cuopt::mps_parser::write_mps(data_model_view, mps_file_path); } @@ -755,6 +847,10 @@ bool cpu_optimization_problem_t::is_equivalent( if (maximize_ != other.get_sense()) return false; if (n_vars_ != other.get_n_variables()) return false; if (n_constraints_ != other.get_n_constraints()) return false; + if (get_mps_declaration_constraint_row_count() != other.get_mps_declaration_constraint_row_count()) + return false; + if (get_linear_constraint_mps_indices() != other.get_linear_constraint_mps_indices()) return false; + if (get_mps_all_constraint_row_names() != other.get_mps_all_constraint_row_names()) return false; if (std::abs(objective_scaling_factor_ - other.get_objective_scaling_factor()) > 1e-9) return 
false; if (std::abs(objective_offset_ - other.get_objective_offset()) > 1e-9) return false; diff --git a/cpp/src/pdlp/optimization_problem.cu b/cpp/src/pdlp/optimization_problem.cu index 87ff9dab08..b3f71bd92c 100644 --- a/cpp/src/pdlp/optimization_problem.cu +++ b/cpp/src/pdlp/optimization_problem.cu @@ -97,7 +97,11 @@ optimization_problem_t::optimization_problem_t( problem_name_{other.get_problem_name()}, problem_category_{other.get_problem_category()}, var_names_{other.get_variable_names()}, - row_names_{other.get_row_names()} + row_names_{other.get_row_names()}, + quadratic_constraints_{other.get_quadratic_constraints()}, + linear_constraint_mps_indices_{other.get_linear_constraint_mps_indices()}, + mps_declaration_constraint_row_count_{other.get_mps_declaration_constraint_row_count()}, + mps_all_constraint_row_names_{other.get_mps_all_constraint_row_names()} { } @@ -197,6 +201,51 @@ void optimization_problem_t::set_quadratic_objective_matrix( // FIX ME:: check for positive semi definite matrix } +template +void optimization_problem_t::set_quadratic_constraints( + std::vector::mps_quadratic_constraint_t> + constraints) +{ + quadratic_constraints_ = std::move(constraints); +} + +template +void optimization_problem_t::set_linear_constraint_mps_indices(std::vector indices) +{ + linear_constraint_mps_indices_ = std::move(indices); +} + +template +void optimization_problem_t::set_mps_declaration_constraint_row_count(i_t count) +{ + mps_declaration_constraint_row_count_ = count; +} + +template +void optimization_problem_t::set_mps_all_constraint_row_names(std::vector names) +{ + mps_all_constraint_row_names_ = std::move(names); +} + +template +i_t optimization_problem_t::get_mps_declaration_constraint_row_count() const +{ + return mps_declaration_constraint_row_count_; +} + +template +const std::vector& optimization_problem_t::get_linear_constraint_mps_indices() const +{ + return linear_constraint_mps_indices_; +} + +template +const std::vector& 
optimization_problem_t::get_mps_all_constraint_row_names() + const +{ + return mps_all_constraint_row_names_; +} + template void optimization_problem_t::set_variable_lower_bounds(const f_t* variable_lower_bounds, i_t size) @@ -548,6 +597,19 @@ bool optimization_problem_t::has_quadratic_objective() const return !Q_values_.empty(); } +template +const std::vector::mps_quadratic_constraint_t>& +optimization_problem_t::get_quadratic_constraints() const +{ + return quadratic_constraints_; +} + +template +bool optimization_problem_t::has_quadratic_constraints() const +{ + return !quadratic_constraints_.empty(); +} + template raft::handle_t const* optimization_problem_t::get_handle_ptr() const noexcept { @@ -820,6 +882,25 @@ void optimization_problem_t::write_to_mps(const std::string& mps_file_ is_symmetrized); } + if (has_quadratic_constraints()) { + data_model_view.set_quadratic_constraints( + std::vector::mps_quadratic_constraint_t>( + get_quadratic_constraints())); + } + + if (get_mps_declaration_constraint_row_count() > 0) { + data_model_view.set_mps_declaration_constraint_row_count(get_mps_declaration_constraint_row_count()); + if (!get_linear_constraint_mps_indices().empty()) { + data_model_view.set_linear_constraint_mps_indices(get_linear_constraint_mps_indices().data(), + static_cast( + get_linear_constraint_mps_indices().size())); + } + if (!get_mps_all_constraint_row_names().empty()) { + data_model_view.set_mps_all_constraint_row_names( + std::vector(get_mps_all_constraint_row_names())); + } + } + cuopt::mps_parser::write_mps(data_model_view, mps_file_path); } @@ -1030,6 +1111,11 @@ bool optimization_problem_t::is_equivalent( if (maximize_ != other.maximize_) { return false; } if (n_vars_ != other.n_vars_) { return false; } if (n_constraints_ != other.n_constraints_) { return false; } + if (linear_constraint_mps_indices_ != other.linear_constraint_mps_indices_) { return false; } + if (mps_declaration_constraint_row_count_ != 
other.mps_declaration_constraint_row_count_) { + return false; + } + if (mps_all_constraint_row_names_ != other.mps_all_constraint_row_names_) { return false; } if (objective_scaling_factor_ != other.objective_scaling_factor_) { return false; } if (objective_offset_ != other.objective_offset_) { return false; } if (problem_category_ != other.problem_category_) { return false; } @@ -1174,6 +1260,10 @@ bool optimization_problem_t::is_equivalent( if (maximize_ != other.get_sense()) return false; if (n_vars_ != other.get_n_variables()) return false; if (n_constraints_ != other.get_n_constraints()) return false; + if (get_mps_declaration_constraint_row_count() != other.get_mps_declaration_constraint_row_count()) + return false; + if (get_linear_constraint_mps_indices() != other.get_linear_constraint_mps_indices()) return false; + if (get_mps_all_constraint_row_names() != other.get_mps_all_constraint_row_names()) return false; if (std::abs(objective_scaling_factor_ - other.get_objective_scaling_factor()) > 1e-9) return false; if (std::abs(objective_offset_ - other.get_objective_offset()) > 1e-9) return false; From 21b0f9a6d77d4982efe46c8d30ec9df02bdca65d Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Mon, 20 Apr 2026 07:31:36 -0700 Subject: [PATCH 10/22] Restrict quadratic constraints to be convex with type 'L' --- cpp/libmps_parser/include/mps_parser/mps_data_model.hpp | 7 ++++++- cpp/libmps_parser/src/mps_data_model.cpp | 6 ++++++ cpp/libmps_parser/src/mps_writer.cpp | 3 ++- cpp/libmps_parser/tests/mps_parser_test.cpp | 1 + 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp index 2a578c36c5..a292bb5002 100644 --- a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp +++ b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp @@ -275,6 +275,7 @@ class mps_data_model_t { /** ROWS declaration index (among all constraint rows), not an index 
into the linear CSR. */ i_t constraint_row_index{}; std::string constraint_row_name{}; + /** MPS ROWS sense for this quadratic row; only 'L' (≤) is supported for convex QCQP at the moment. */ char constraint_row_type{}; std::vector linear_values{}; std::vector linear_indices{}; @@ -284,7 +285,11 @@ class mps_data_model_t { std::vector quadratic_offsets{}; }; - /** @brief Append one complete quadratic constraint (row + linear + rhs + quadratic Q). */ + /** + * @brief Append one complete quadratic constraint (row + linear + rhs + quadratic Q). + * @param constraint_row_type MPS ROWS type; must be 'L'. 'G' and 'E' quadratic rows are not + * supported. + */ void append_quadratic_constraint(i_t constraint_row_index, const std::string& constraint_row_name, char constraint_row_type, diff --git a/cpp/libmps_parser/src/mps_data_model.cpp b/cpp/libmps_parser/src/mps_data_model.cpp index 523a975447..72ae524703 100644 --- a/cpp/libmps_parser/src/mps_data_model.cpp +++ b/cpp/libmps_parser/src/mps_data_model.cpp @@ -257,6 +257,12 @@ void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_ mps_parser_expects( constraint_row_index >= 0, error_type_t::ValidationError, "constraint_row_index must be non-negative"); + mps_parser_expects(constraint_row_type == 'L', + error_type_t::ValidationError, + "Quadratic constraint ROWS type must be 'L' (less-or-equal); got '%c'. " + "Only 'L' is supported for convex quadratic constraints.", + constraint_row_type); + mps_parser_expects(linear_nnz == linear_indices_nnz, error_type_t::ValidationError, "linear_values and linear_indices must have the same nnz count"); diff --git a/cpp/libmps_parser/src/mps_writer.cpp b/cpp/libmps_parser/src/mps_writer.cpp index 41771d7be6..a7d6734027 100644 --- a/cpp/libmps_parser/src/mps_writer.cpp +++ b/cpp/libmps_parser/src/mps_writer.cpp @@ -316,7 +316,8 @@ void mps_writer_t::write(const std::string& mps_file_path) ? 
qcs_by_decl_row[qc_idx] : nullptr; if (qc_match != nullptr) { - type = qc_match->constraint_row_type; + // Quadratic rows are supported only as MPS 'L' (≤); always emit that sense. + type = 'L'; } else { int const lj = linear_csr_row_for_mps_decl_index(linear_csr_row_for_mps_decl, k); if (lj >= 0) { diff --git a/cpp/libmps_parser/tests/mps_parser_test.cpp b/cpp/libmps_parser/tests/mps_parser_test.cpp index 5689260a5b..3704eac8db 100644 --- a/cpp/libmps_parser/tests/mps_parser_test.cpp +++ b/cpp/libmps_parser/tests/mps_parser_test.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include From e6e1f427058750c6716964e5ffa102b69a4989c5 Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Wed, 22 Apr 2026 04:47:45 -0700 Subject: [PATCH 11/22] Clean up for the MPS parser --- .../cpu_optimization_problem.hpp | 19 +- .../optimization_problem.hpp | 19 +- .../optimization_problem_interface.hpp | 67 +++--- .../optimization_problem_utils.hpp | 31 +-- .../include/mps_parser/data_model_view.hpp | 33 +-- .../include/mps_parser/mps_data_model.hpp | 29 +-- cpp/libmps_parser/src/data_model_view.cpp | 48 ----- cpp/libmps_parser/src/mps_data_model.cpp | 36 ---- cpp/libmps_parser/src/mps_parser.cpp | 19 +- cpp/libmps_parser/src/mps_writer.cpp | 196 ++++-------------- cpp/libmps_parser/tests/mps_parser_test.cpp | 26 +-- cpp/src/pdlp/cpu_optimization_problem.cpp | 76 +------ cpp/src/pdlp/optimization_problem.cu | 74 +------ 13 files changed, 125 insertions(+), 548 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp b/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp index 2865fc877f..48d61b9e0c 100644 --- a/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp +++ b/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp @@ -41,7 +41,7 @@ class mip_solution_interface_t; template class cpu_optimization_problem_t : public optimization_problem_interface_t { public: - using typename 
optimization_problem_interface_t::mps_quadratic_constraint_t; + using typename optimization_problem_interface_t::quadratic_constraint_t; cpu_optimization_problem_t(); @@ -115,16 +115,9 @@ class cpu_optimization_problem_t : public optimization_problem_interface_t& get_quadratic_objective_values() const override; bool has_quadratic_objective() const override; - void set_quadratic_constraints(std::vector constraints) override; + void set_quadratic_constraints(std::vector constraints) override; bool has_quadratic_constraints() const override; - const std::vector& get_quadratic_constraints() const override; - // Additional methods for MPS export if quadratic constraints are present - void set_linear_constraint_mps_indices(std::vector indices) override; - void set_mps_declaration_constraint_row_count(i_t count) override; - void set_mps_all_constraint_row_names(std::vector names) override; - i_t get_mps_declaration_constraint_row_count() const override; - const std::vector& get_linear_constraint_mps_indices() const override; - const std::vector& get_mps_all_constraint_row_names() const override; + const std::vector& get_quadratic_constraints() const override; // Host getters - these are the only supported getters for CPU implementation std::vector get_constraint_matrix_values_host() const override; @@ -198,11 +191,7 @@ class cpu_optimization_problem_t : public optimization_problem_interface_t Q_indices_; std::vector Q_values_; - std::vector quadratic_constraints_{}; - - std::vector linear_constraint_mps_indices_{}; - i_t mps_declaration_constraint_row_count_{0}; - std::vector mps_all_constraint_row_names_{}; + std::vector quadratic_constraints_{}; std::vector variable_lower_bounds_; std::vector variable_upper_bounds_; diff --git a/cpp/include/cuopt/linear_programming/optimization_problem.hpp b/cpp/include/cuopt/linear_programming/optimization_problem.hpp index 27f5ac1168..c355fbde94 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem.hpp +++ 
b/cpp/include/cuopt/linear_programming/optimization_problem.hpp @@ -73,7 +73,7 @@ class optimization_problem_t : public optimization_problem_interface_t "'optimization_problem_t' accepts only floating point types for weights"); // nvcc does not always find base typedefs in derived class scope; inject explicitly. - using typename optimization_problem_interface_t::mps_quadratic_constraint_t; + using typename optimization_problem_interface_t::quadratic_constraint_t; /** * @brief A device-side view of the `optimization_problem_t` structure with @@ -199,7 +199,7 @@ class optimization_problem_t : public optimization_problem_interface_t i_t size_offsets, bool validate_positive_semi_definite = false) override; - void set_quadratic_constraints(std::vector constraints) override; + void set_quadratic_constraints(std::vector constraints) override; /** @copydoc optimization_problem_interface_t::set_variable_lower_bounds */ void set_variable_lower_bounds(const f_t* variable_lower_bounds, i_t size) override; @@ -264,17 +264,10 @@ class optimization_problem_t : public optimization_problem_interface_t const std::vector& get_quadratic_objective_offsets() const override; const std::vector& get_quadratic_objective_indices() const override; const std::vector& get_quadratic_objective_values() const override; - const std::vector& get_quadratic_constraints() const override; + const std::vector& get_quadratic_constraints() const override; bool has_quadratic_objective() const override; bool has_quadratic_constraints() const override; - void set_linear_constraint_mps_indices(std::vector indices) override; - void set_mps_declaration_constraint_row_count(i_t count) override; - void set_mps_all_constraint_row_names(std::vector names) override; - i_t get_mps_declaration_constraint_row_count() const override; - const std::vector& get_linear_constraint_mps_indices() const override; - const std::vector& get_mps_all_constraint_row_names() const override; - // 
============================================================================ // Host getters // ============================================================================ @@ -391,11 +384,7 @@ class optimization_problem_t : public optimization_problem_interface_t std::vector Q_values_; /** QCQP: quadratic constraints **/ - std::vector quadratic_constraints_{}; - - std::vector linear_constraint_mps_indices_{}; - i_t mps_declaration_constraint_row_count_{0}; - std::vector mps_all_constraint_row_names_{}; + std::vector quadratic_constraints_{}; rmm::device_uvector variable_lower_bounds_; rmm::device_uvector variable_upper_bounds_; diff --git a/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp b/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp index c4ee81d875..27a106037b 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp @@ -8,7 +8,6 @@ #pragma once #include -#include #include #include @@ -57,9 +56,18 @@ class optimization_problem_interface_t { static_assert(std::is_floating_point::value, "'optimization_problem_interface_t' accepts only floating point types for weights"); - /** Quadratic constraints as parsed/stored for MPS QCQP (QCMATRIX rows). */ - using mps_quadratic_constraint_t = - typename mps_parser::mps_data_model_t::quadratic_constraint_t; + /** Quadratic constraint bundle used by core optimization problem interfaces. 
*/ + struct quadratic_constraint_t { + i_t constraint_row_index{}; + std::string constraint_row_name{}; + char constraint_row_type{}; + std::vector linear_values{}; + std::vector linear_indices{}; + f_t rhs_value{f_t(0)}; + std::vector quadratic_values{}; + std::vector quadratic_indices{}; + std::vector quadratic_offsets{}; + }; virtual ~optimization_problem_interface_t() = default; @@ -67,42 +75,37 @@ class optimization_problem_interface_t { * @brief Store quadratic constraints for MPS round-trip (linear + Q parts per QC row). * @note Default implementation ignores; GPU/CPU implementations persist for write_to_mps. */ - virtual void set_quadratic_constraints(std::vector constraints) + virtual void set_quadratic_constraints(std::vector constraints) { (void)constraints; } - - /** @brief Whether quadratic constraint metadata is present (for MPS export). */ - virtual bool has_quadratic_constraints() const { return false; } - - /** @brief Quadratic constraints for MPS export (empty if none). 
*/ - virtual const std::vector& get_quadratic_constraints() const + template >> + void set_quadratic_constraints(const std::vector& constraints) { - static const std::vector k_empty{}; - return k_empty; + std::vector converted_constraints; + converted_constraints.reserve(constraints.size()); + for (const auto& qc : constraints) { + converted_constraints.push_back( + {static_cast(qc.constraint_row_index), + qc.constraint_row_name, + qc.constraint_row_type, + std::vector(qc.linear_values.begin(), qc.linear_values.end()), + std::vector(qc.linear_indices.begin(), qc.linear_indices.end()), + static_cast(qc.rhs_value), + std::vector(qc.quadratic_values.begin(), qc.quadratic_values.end()), + std::vector(qc.quadratic_indices.begin(), qc.quadratic_indices.end()), + std::vector(qc.quadratic_offsets.begin(), qc.quadratic_offsets.end())}); + } + set_quadratic_constraints(std::move(converted_constraints)); } - /** - * @brief When QCMATRIX rows are omitted from the linear CSR, maps linear CSR row j to the MPS - * ROWS declaration index. Used for MPS export only. - */ - virtual void set_linear_constraint_mps_indices(std::vector indices) { (void)indices; } - - virtual void set_mps_declaration_constraint_row_count(i_t count) { (void)count; } - - virtual void set_mps_all_constraint_row_names(std::vector names) { (void)names; } - - virtual i_t get_mps_declaration_constraint_row_count() const { return 0; } - - virtual const std::vector& get_linear_constraint_mps_indices() const - { - static const std::vector k_empty{}; - return k_empty; - } + /** @brief Whether quadratic constraint metadata is present (for MPS export). */ + virtual bool has_quadratic_constraints() const = 0; - virtual const std::vector& get_mps_all_constraint_row_names() const + /** @brief Quadratic constraints for MPS export (empty if none). 
*/ + virtual const std::vector& get_quadratic_constraints() const { - static const std::vector k_empty{}; + static const std::vector k_empty{}; return k_empty; } diff --git a/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp b/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp index bd05b3c1b2..204556e053 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp @@ -111,22 +111,9 @@ void populate_from_mps_data_model(optimization_problem_interface_t* pr } // Handle quadratic constraints if present if (data_model.has_quadratic_constraints()) { - problem->set_quadratic_constraints( - std::vector::quadratic_constraint_t>( - data_model.get_quadratic_constraints())); + problem->set_quadratic_constraints(data_model.get_quadratic_constraints()); } - if (data_model.get_mps_declaration_constraint_row_count() > 0) { - problem->set_linear_constraint_mps_indices( - std::vector(data_model.get_linear_constraint_mps_indices())); - problem->set_mps_declaration_constraint_row_count(data_model.get_mps_declaration_constraint_row_count()); - problem->set_mps_all_constraint_row_names( - std::vector(data_model.get_mps_all_constraint_row_names())); - } else { - problem->set_linear_constraint_mps_indices({}); - problem->set_mps_declaration_constraint_row_count(0); - problem->set_mps_all_constraint_row_names({}); - } } /** @@ -285,20 +272,10 @@ void populate_from_data_model_view(optimization_problem_interface_t* p problem->set_row_names(data_model->get_row_names()); } - if (data_model->get_mps_declaration_constraint_row_count() > 0) { - const auto lmi = data_model->get_linear_constraint_mps_indices(); - if (lmi.size() > 0) { - problem->set_linear_constraint_mps_indices(std::vector( - lmi.data(), lmi.data() + static_cast(lmi.size()))); - } - problem->set_mps_declaration_constraint_row_count(data_model->get_mps_declaration_constraint_row_count()); - 
problem->set_mps_all_constraint_row_names( - std::vector(data_model->get_mps_all_constraint_row_names())); - } else { - problem->set_linear_constraint_mps_indices({}); - problem->set_mps_declaration_constraint_row_count(0); - problem->set_mps_all_constraint_row_names({}); + if (data_model->has_quadratic_constraints()) { + problem->set_quadratic_constraints(data_model->get_quadratic_constraints()); } + } } // namespace cuopt::linear_programming diff --git a/cpp/libmps_parser/include/mps_parser/data_model_view.hpp b/cpp/libmps_parser/include/mps_parser/data_model_view.hpp index 9e2c0e112e..100492e2b0 100644 --- a/cpp/libmps_parser/include/mps_parser/data_model_view.hpp +++ b/cpp/libmps_parser/include/mps_parser/data_model_view.hpp @@ -421,22 +421,30 @@ class data_model_view_t { */ void set_quadratic_constraints( std::vector::quadratic_constraint_t> constraints); + template + void set_quadratic_constraints(const std::vector& constraints) + { + quadratic_constraints_.clear(); + quadratic_constraints_.reserve(constraints.size()); + for (const auto& qc : constraints) { + quadratic_constraints_.push_back( + {static_cast(qc.constraint_row_index), + qc.constraint_row_name, + qc.constraint_row_type, + std::vector(qc.linear_values.begin(), qc.linear_values.end()), + std::vector(qc.linear_indices.begin(), qc.linear_indices.end()), + static_cast(qc.rhs_value), + std::vector(qc.quadratic_values.begin(), qc.quadratic_values.end()), + std::vector(qc.quadratic_indices.begin(), qc.quadratic_indices.end()), + std::vector(qc.quadratic_offsets.begin(), qc.quadratic_offsets.end())}); + } + } bool has_quadratic_constraints() const noexcept; const std::vector::quadratic_constraint_t>& get_quadratic_constraints() const noexcept; - void set_linear_constraint_mps_indices(const i_t* indices, i_t size); - span get_linear_constraint_mps_indices() const noexcept; - - void set_mps_declaration_constraint_row_count(i_t count); - i_t get_mps_declaration_constraint_row_count() const noexcept; - - 
void set_mps_all_constraint_row_names(std::vector names); - - const std::vector& get_mps_all_constraint_row_names() const noexcept; - private: bool maximize_{false}; span A_; @@ -468,11 +476,6 @@ class data_model_view_t { bool is_Q_symmetrized_{false}; std::vector::quadratic_constraint_t> quadratic_constraints_; - - span linear_mps_indices_{}; - std::vector linear_mps_indices_owned_{}; - i_t mps_declaration_constraint_row_count_{0}; - std::vector mps_all_constraint_row_names_{}; }; // class data_model_view_t } // namespace cuopt::mps_parser diff --git a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp index a292bb5002..69909fb3a6 100644 --- a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp +++ b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp @@ -353,25 +353,6 @@ class mps_data_model_t { bool has_quadratic_constraints() const noexcept; - /** - * @brief When quadratic constraints are present, CSR rows are linear-only; entry j is the MPS - * ROWS declaration index for linear CSR row j. - */ - void set_linear_constraint_mps_indices(std::vector indices); - - /** - * @brief ROWS names in declaration order (size == declaration row count). Used for MPS export - * when linear CSR excludes quadratic rows. - */ - void set_mps_all_constraint_row_names(std::vector names); - - /** @brief Total ROWS constraint count (linear + quadratic) when QC rows are separated; else 0. 
*/ - void set_mps_declaration_constraint_row_count(i_t count); - - const std::vector& get_linear_constraint_mps_indices() const; - const std::vector& get_mps_all_constraint_row_names() const; - i_t get_mps_declaration_constraint_row_count() const; - /** whether to maximize or minimize the objective function */ bool maximize_; /** @@ -408,10 +389,7 @@ class mps_data_model_t { std::string problem_name_; /** names of each of the variables in the OP */ std::vector var_names_{}; - /** - * names of linear constraint rows only when QCMATRIX rows are separated; otherwise all constraint - * rows (same as MPS ROWS order excluding objective). - */ + /** names of linear constraint rows in exported MPS order. */ std::vector row_names_{}; /** number of variables */ i_t n_vars_{0}; @@ -433,11 +411,6 @@ class mps_data_model_t { /** One full quadratic constraint per QCMATRIX block, in order of appearance in the file */ std::vector quadratic_constraints_; - /** Maps linear CSR row j -> MPS ROWS declaration index; non-empty iff QC rows are split out. 
*/ - std::vector linear_constraint_mps_indices_{}; - i_t mps_declaration_constraint_row_count_{0}; - std::vector mps_all_constraint_row_names_{}; - }; // class mps_data_model_t } // namespace cuopt::mps_parser diff --git a/cpp/libmps_parser/src/data_model_view.cpp b/cpp/libmps_parser/src/data_model_view.cpp index be32eb3d3a..aacedfcadb 100644 --- a/cpp/libmps_parser/src/data_model_view.cpp +++ b/cpp/libmps_parser/src/data_model_view.cpp @@ -375,54 +375,6 @@ data_model_view_t::get_quadratic_constraints() const noexcept return quadratic_constraints_; } -template -void data_model_view_t::set_linear_constraint_mps_indices(const i_t* indices, i_t size) -{ - if (size != 0) { - mps_parser_expects( - indices != nullptr, error_type_t::ValidationError, "linear MPS indices cannot be null"); - } - linear_mps_indices_owned_.assign(indices, indices + size); - if (linear_mps_indices_owned_.empty()) { - linear_mps_indices_ = span{}; - } else { - linear_mps_indices_ = span(linear_mps_indices_owned_.data(), - static_cast(linear_mps_indices_owned_.size())); - } -} - -template -span data_model_view_t::get_linear_constraint_mps_indices() const noexcept -{ - return linear_mps_indices_; -} - -template -void data_model_view_t::set_mps_declaration_constraint_row_count(i_t count) -{ - mps_declaration_constraint_row_count_ = count; -} - -template -i_t data_model_view_t::get_mps_declaration_constraint_row_count() const noexcept -{ - return mps_declaration_constraint_row_count_; -} - -template -void data_model_view_t::set_mps_all_constraint_row_names(std::vector names) -{ - mps_all_constraint_row_names_ = std::move(names); -} - -template -const std::vector& data_model_view_t::get_mps_all_constraint_row_names() const - noexcept -{ - return mps_all_constraint_row_names_; -} - - // NOTE: Explicitly instantiate all types here in order to avoid linker error template class data_model_view_t; diff --git a/cpp/libmps_parser/src/mps_data_model.cpp b/cpp/libmps_parser/src/mps_data_model.cpp index 
72ae524703..1ecaf47b2c 100644 --- a/cpp/libmps_parser/src/mps_data_model.cpp +++ b/cpp/libmps_parser/src/mps_data_model.cpp @@ -220,24 +220,6 @@ void mps_data_model_t::set_quadratic_objective_matrix(const f_t* Q_val std::copy(Q_offsets, Q_offsets + size_offsets, Q_objective_offsets_.data()); } -template -void mps_data_model_t::set_linear_constraint_mps_indices(std::vector indices) -{ - linear_constraint_mps_indices_ = std::move(indices); -} - -template -void mps_data_model_t::set_mps_all_constraint_row_names(std::vector names) -{ - mps_all_constraint_row_names_ = std::move(names); -} - -template -void mps_data_model_t::set_mps_declaration_constraint_row_count(i_t count) -{ - mps_declaration_constraint_row_count_ = count; -} - template void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_index, const std::string& constraint_row_name, @@ -551,24 +533,6 @@ std::vector& mps_data_model_t::get_quadratic_objective_offsets() return Q_objective_offsets_; } -template -const std::vector& mps_data_model_t::get_linear_constraint_mps_indices() const -{ - return linear_constraint_mps_indices_; -} - -template -const std::vector& mps_data_model_t::get_mps_all_constraint_row_names() const -{ - return mps_all_constraint_row_names_; -} - -template -i_t mps_data_model_t::get_mps_declaration_constraint_row_count() const -{ - return mps_declaration_constraint_row_count_; -} - template auto mps_data_model_t::get_quadratic_constraints() const -> const std::vector& diff --git a/cpp/libmps_parser/src/mps_parser.cpp b/cpp/libmps_parser/src/mps_parser.cpp index 1d31f2a9d8..e77ed915fb 100644 --- a/cpp/libmps_parser/src/mps_parser.cpp +++ b/cpp/libmps_parser/src/mps_parser.cpp @@ -553,6 +553,8 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) // QCMATRIX: one symmetric Q per constraint row (no extra ½ factor vs file coeffs). // Bundle row metadata, row-linear coefficients (from COLUMNS), rhs, and quadratic part together. 
constexpr f_t k_qcmatrix_value_scale = f_t(1); + const i_t linear_row_count = static_cast(row_types.size() - quadratic_row_ids.size()); + i_t quadratic_row_id = 0; for (const auto& block : qcmatrix_blocks_) { auto csr_result = build_csr_via_transpose( block.entries, num_vars_for_quad, num_vars_for_quad, false, k_qcmatrix_value_scale); @@ -567,7 +569,7 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) const i_t* linear_idx = linear_nnz > 0 ? A_indices[row_id].data() : nullptr; problem.append_quadratic_constraint( - row_id, + linear_row_count + quadratic_row_id, row_names[row_id], static_cast(row_types[row_id]), linear_val, @@ -581,20 +583,10 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) static_cast(csr_result.indices.size()), csr_result.offsets.data(), static_cast(csr_result.offsets.size())); - } - - std::vector linear_mps_indices{}; - linear_mps_indices.reserve(row_types.size()); - for (i_t i = 0; i < static_cast(row_types.size()); ++i) { - if (!quadratic_row_ids.count(i)) { linear_mps_indices.push_back(i); } + ++quadratic_row_id; } if (!quadratic_row_ids.empty()) { - problem.set_linear_constraint_mps_indices(std::move(linear_mps_indices)); - problem.set_mps_declaration_constraint_row_count(static_cast(row_names.size())); - problem.set_mps_all_constraint_row_names( - std::vector(row_names.begin(), row_names.end())); - std::vector linear_row_names{}; std::vector row_types_linear{}; linear_row_names.reserve(row_names.size()); @@ -608,9 +600,6 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) problem.set_row_names(std::move(linear_row_names)); problem.set_row_types(row_types_linear.data(), static_cast(row_types_linear.size())); } else { - problem.set_linear_constraint_mps_indices({}); - problem.set_mps_declaration_constraint_row_count(0); - problem.set_mps_all_constraint_row_names({}); std::vector row_types_host(row_types.size()); for (size_t i = 0; i < row_types.size(); ++i) { row_types_host[i] = static_cast(row_types[i]); diff 
--git a/cpp/libmps_parser/src/mps_writer.cpp b/cpp/libmps_parser/src/mps_writer.cpp index a7d6734027..67346cdcb5 100644 --- a/cpp/libmps_parser/src/mps_writer.cpp +++ b/cpp/libmps_parser/src/mps_writer.cpp @@ -16,11 +16,9 @@ #include #include #include -#include #include #include #include -#include #include namespace cuopt::mps_parser { @@ -35,13 +33,6 @@ char linear_row_type_from_bounds(f_t cl, f_t cu) return 'L'; } -inline int linear_csr_row_for_mps_decl_index( - std::unordered_map const& linear_csr_row_for_mps_decl, size_t decl_k) -{ - auto const it = linear_csr_row_for_mps_decl.find(decl_k); - return it == linear_csr_row_for_mps_decl.end() ? -1 : it->second; -} - } // namespace template @@ -131,19 +122,6 @@ data_model_view_t mps_writer_t::create_view( model.get_quadratic_constraints())); } - if (model.get_mps_declaration_constraint_row_count() > 0) { - view.set_mps_declaration_constraint_row_count(model.get_mps_declaration_constraint_row_count()); - const auto& lmi = model.get_linear_constraint_mps_indices(); - if (!lmi.empty()) { - view.set_linear_constraint_mps_indices(lmi.data(), static_cast(lmi.size())); - } - const auto& all_names = model.get_mps_all_constraint_row_names(); - if (!all_names.empty()) { - view.set_mps_all_constraint_row_names( - std::vector(all_names.begin(), all_names.end())); - } - } - return view; } @@ -170,26 +148,8 @@ void mps_writer_t::write(const std::string& mps_file_path) n_constraints = problem_.get_constraint_bounds().size(); else n_constraints = problem_.get_constraint_lower_bounds().size(); - - const bool qc_rows_separated = - problem_.get_mps_declaration_constraint_row_count() > 0 && - problem_.get_linear_constraint_mps_indices().size() > 0 && - !problem_.get_mps_all_constraint_row_names().empty(); - - i_t n_mps_declaration_rows = n_constraints; - std::unordered_map linear_csr_row_for_mps_decl; - if (qc_rows_separated) { - n_mps_declaration_rows = problem_.get_mps_declaration_constraint_row_count(); - span const ltd = 
problem_.get_linear_constraint_mps_indices(); - linear_csr_row_for_mps_decl.reserve(static_cast(ltd.size())); - size_t const n_decl_sz = static_cast(n_mps_declaration_rows); - for (size_t j = 0; j < ltd.size(); ++j) { - i_t const decl_row = ltd[j]; - if (decl_row < 0) { continue; } - size_t const d = static_cast(decl_row); - if (d < n_decl_sz) { linear_csr_row_for_mps_decl.try_emplace(d, static_cast(j)); } - } - } + const auto& quadratic_constraints = problem_.get_quadratic_constraints(); + const i_t n_quadratic_constraints = static_cast(quadratic_constraints.size()); std::vector objective_coefficients(problem_.get_objective_coefficients().size()); std::vector constraint_lower_bounds(n_constraints); @@ -267,66 +227,23 @@ void mps_writer_t::write(const std::string& mps_file_path) if (problem_.get_sense()) { mps_file << "OBJSENSE\n MAXIMIZE\n"; } - // sort the quadratic constraints by the constraint row index, useful for both ROWS and RHS sections - using qc_t = typename mps_data_model_t::quadratic_constraint_t; - std::vector qcs_by_decl_row; - if (qc_rows_separated) { - std::vector const& qcs = problem_.get_quadratic_constraints(); - size_t const n_decl = static_cast(n_mps_declaration_rows); - qcs_by_decl_row.reserve(qcs.size()); - for (qc_t const& qc : qcs) { - size_t const idx = static_cast(qc.constraint_row_index); - if (idx < n_decl) { qcs_by_decl_row.push_back(&qc); } - } - std::stable_sort(qcs_by_decl_row.begin(), - qcs_by_decl_row.end(), - [](qc_t const* lhs, qc_t const* rhs) { - return lhs->constraint_row_index < rhs->constraint_row_index; - }); - } - // ROWS section mps_file << "ROWS\n"; mps_file << " N " << (problem_.get_objective_name().empty() ? "OBJ" : problem_.get_objective_name()) << "\n"; - if (!qc_rows_separated) { - for (size_t k = 0; k < (size_t)n_mps_declaration_rows; ++k) { - std::string row_name = k < problem_.get_row_names().size() ? 
problem_.get_row_names()[k] - : "R" + std::to_string(k); - char const type = - linear_row_type_from_bounds(constraint_lower_bounds[k], constraint_upper_bounds[k]); - mps_file << " " << type << " " << row_name << "\n"; - } - } else { - size_t const n_decl = static_cast(n_mps_declaration_rows); - const auto& alln = problem_.get_mps_all_constraint_row_names(); - size_t qc_idx = 0; - for (size_t k = 0; k < n_decl; ++k) { - std::string row_name = alln[k]; - char type = 'L'; - // find the quadratic constraint that corresponds to the current row - while (qc_idx < qcs_by_decl_row.size() && - static_cast(qcs_by_decl_row[qc_idx]->constraint_row_index) < k) { - ++qc_idx; - } - qc_t const* const qc_match = - (qc_idx < qcs_by_decl_row.size() && - static_cast(qcs_by_decl_row[qc_idx]->constraint_row_index) == k) - ? qcs_by_decl_row[qc_idx] - : nullptr; - if (qc_match != nullptr) { - // Quadratic rows are supported only as MPS 'L' (≤); always emit that sense. - type = 'L'; - } else { - int const lj = linear_csr_row_for_mps_decl_index(linear_csr_row_for_mps_decl, k); - if (lj >= 0) { - size_t const j = static_cast(lj); - type = linear_row_type_from_bounds(constraint_lower_bounds[j], constraint_upper_bounds[j]); - } - } - mps_file << " " << type << " " << row_name << "\n"; - } + for (size_t k = 0; k < static_cast(n_constraints); ++k) { + std::string row_name = + k < problem_.get_row_names().size() ? problem_.get_row_names()[k] : "R" + std::to_string(k); + char const type = linear_row_type_from_bounds(constraint_lower_bounds[k], constraint_upper_bounds[k]); + mps_file << " " << type << " " << row_name << "\n"; + } + for (size_t q = 0; q < quadratic_constraints.size(); ++q) { + const auto& qc = quadratic_constraints[q]; + std::string row_name = + qc.constraint_row_name.empty() ? "QC" + std::to_string(q) : qc.constraint_row_name; + // Quadratic rows are currently restricted to MPS 'L' (<=). 
+ mps_file << " L " << row_name << "\n"; } // COLUMNS section @@ -340,8 +257,7 @@ void mps_writer_t::write(const std::string& mps_file_path) // iterate over the constraint matrix and add the nonzeros to the integral and continuous col_nnzs maps for (size_t csr_row = 0; csr_row < (size_t)n_constraints; csr_row++) { - const i_t row_id = - qc_rows_separated ? problem_.get_linear_constraint_mps_indices()[csr_row] : static_cast(csr_row); + const i_t row_id = static_cast(csr_row); for (size_t k = (size_t)constraint_matrix_offsets[csr_row]; k < (size_t)constraint_matrix_offsets[csr_row + 1]; k++) { @@ -357,8 +273,9 @@ void mps_writer_t::write(const std::string& mps_file_path) // Quadratic constraint rows omit linear coefficients from global A; add them from QC bundles. if (problem_.has_quadratic_constraints()) { - for (const auto& qc : problem_.get_quadratic_constraints()) { - const size_t row_id = static_cast(qc.constraint_row_index); + for (size_t q = 0; q < quadratic_constraints.size(); ++q) { + const auto& qc = quadratic_constraints[q]; + const size_t row_id = static_cast(n_constraints) + q; for (size_t t = 0; t < qc.linear_indices.size(); ++t) { size_t var = static_cast(qc.linear_indices[t]); f_t val = qc.linear_values[t]; @@ -405,11 +322,14 @@ void mps_writer_t::write(const std::string& mps_file_path) : "C" + std::to_string(var_id); for (auto& nnz : nnzs) { std::string row_name; - if (qc_rows_separated && - static_cast(nnz.first) < problem_.get_mps_all_constraint_row_names().size()) { - row_name = problem_.get_mps_all_constraint_row_names()[nnz.first]; - } else if (static_cast(nnz.first) < problem_.get_row_names().size()) { + if (static_cast(nnz.first) < problem_.get_row_names().size()) { row_name = problem_.get_row_names()[nnz.first]; + } else if (static_cast(nnz.first) < + static_cast(n_constraints) + quadratic_constraints.size()) { + const size_t q = static_cast(nnz.first) - static_cast(n_constraints); + row_name = 
quadratic_constraints[q].constraint_row_name.empty() + ? "QC" + std::to_string(q) + : quadratic_constraints[q].constraint_row_name; } else { row_name = "R" + std::to_string(nnz.first); } @@ -427,56 +347,28 @@ void mps_writer_t::write(const std::string& mps_file_path) // RHS section mps_file << "RHS\n"; - size_t qc_idx_rhs = 0; - for (size_t k = 0; k < (size_t)n_mps_declaration_rows; ++k) { - std::string row_name; + for (size_t k = 0; k < static_cast(n_constraints); ++k) { + std::string row_name = + k < problem_.get_row_names().size() ? problem_.get_row_names()[k] : "R" + std::to_string(k); f_t rhs{0}; - - if (!qc_rows_separated) { - row_name = k < problem_.get_row_names().size() ? problem_.get_row_names()[k] - : "R" + std::to_string(k); - if (constraint_bounds.size() > 0) - rhs = constraint_bounds[k]; - else if (std::isinf(constraint_lower_bounds[k])) { - rhs = constraint_upper_bounds[k]; - } else if (std::isinf(constraint_upper_bounds[k])) { - rhs = constraint_lower_bounds[k]; - } else { - rhs = constraint_lower_bounds[k]; - } + if (constraint_bounds.size() > 0) + rhs = constraint_bounds[k]; + else if (std::isinf(constraint_lower_bounds[k])) { + rhs = constraint_upper_bounds[k]; + } else if (std::isinf(constraint_upper_bounds[k])) { + rhs = constraint_lower_bounds[k]; } else { - const auto& alln = problem_.get_mps_all_constraint_row_names(); - row_name = alln[k]; - while (qc_idx_rhs < qcs_by_decl_row.size() && - static_cast(qcs_by_decl_row[qc_idx_rhs]->constraint_row_index) < k) { - ++qc_idx_rhs; - } - qc_t const* const qc_match = - (qc_idx_rhs < qcs_by_decl_row.size() && - static_cast(qcs_by_decl_row[qc_idx_rhs]->constraint_row_index) == k) - ? 
qcs_by_decl_row[qc_idx_rhs] - : nullptr; - if (qc_match != nullptr) { - rhs = qc_match->rhs_value; - } else { - int const lj = linear_csr_row_for_mps_decl_index(linear_csr_row_for_mps_decl, k); - mps_parser_expects(lj >= 0, - error_type_t::ValidationError, - "RHS row %zu has no linear or quadratic mapping", - k); - const size_t j = static_cast(lj); - if (constraint_bounds.size() > 0) - rhs = constraint_bounds[j]; - else if (std::isinf(constraint_lower_bounds[j])) { - rhs = constraint_upper_bounds[j]; - } else if (std::isinf(constraint_upper_bounds[j])) { - rhs = constraint_lower_bounds[j]; - } else { - rhs = constraint_lower_bounds[j]; - } - } + rhs = constraint_lower_bounds[k]; } - + if (std::isfinite(rhs) && rhs != 0.0) { + mps_file << " RHS1 " << row_name << " " << rhs << "\n"; + } + } + for (size_t q = 0; q < quadratic_constraints.size(); ++q) { + const auto& qc = quadratic_constraints[q]; + std::string row_name = + qc.constraint_row_name.empty() ? "QC" + std::to_string(q) : qc.constraint_row_name; + const f_t rhs = qc.rhs_value; if (std::isfinite(rhs) && rhs != 0.0) { mps_file << " RHS1 " << row_name << " " << rhs << "\n"; } diff --git a/cpp/libmps_parser/tests/mps_parser_test.cpp b/cpp/libmps_parser/tests/mps_parser_test.cpp index 3704eac8db..899aa069f9 100644 --- a/cpp/libmps_parser/tests/mps_parser_test.cpp +++ b/cpp/libmps_parser/tests/mps_parser_test.cpp @@ -961,13 +961,6 @@ TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds) ASSERT_EQ(1u, model.get_row_names().size()); EXPECT_EQ("LIN0", model.get_row_names()[0]); EXPECT_EQ('L', model.get_row_types()[0]); - ASSERT_EQ(3, model.get_mps_declaration_constraint_row_count()); - ASSERT_EQ(3u, model.get_mps_all_constraint_row_names().size()); - EXPECT_EQ("LIN0", model.get_mps_all_constraint_row_names()[0]); - EXPECT_EQ("QC0", model.get_mps_all_constraint_row_names()[1]); - EXPECT_EQ("QC1", model.get_mps_all_constraint_row_names()[2]); - ASSERT_EQ(1u, model.get_linear_constraint_mps_indices().size()); - 
EXPECT_EQ(0, model.get_linear_constraint_mps_indices()[0]); // LIN0: 2*x1 + x2 ≤ 15 (linear row only; not duplicated in quadratic_constraints) EXPECT_DOUBLE_EQ(-std::numeric_limits::infinity(), @@ -1016,19 +1009,10 @@ TEST(qps_parser, qcqp_p0033_mps_sections) EXPECT_EQ(33, model.get_n_variables()); ASSERT_EQ(12u, model.get_row_types().size()); ASSERT_EQ(12u, model.get_row_names().size()); - ASSERT_EQ(16, model.get_mps_declaration_constraint_row_count()); - ASSERT_EQ(16u, model.get_mps_all_constraint_row_names().size()); - const auto& rnames = model.get_mps_all_constraint_row_names(); - auto qc1_it = std::find(rnames.begin(), rnames.end(), std::string("QC1")); - ASSERT_NE(qc1_it, rnames.end()); - const int qc1_row = static_cast(qc1_it - rnames.begin()); - - std::vector coeff; - std::vector vars; const auto& qcs = model.get_quadratic_constraints(); ASSERT_EQ(4u, qcs.size()); - EXPECT_EQ(qc1_row, qcs[0].constraint_row_index); + EXPECT_EQ(12, qcs[0].constraint_row_index); ASSERT_EQ(1u, qcs[0].linear_values.size()); EXPECT_DOUBLE_EQ(1.0, qcs[0].linear_values[0]); @@ -1235,14 +1219,6 @@ void compare_data_models(const mps_data_model_t& original, } } - EXPECT_EQ(original.get_mps_declaration_constraint_row_count(), - reloaded.get_mps_declaration_constraint_row_count()); - EXPECT_EQ(original.get_linear_constraint_mps_indices(), reloaded.get_linear_constraint_mps_indices()); - ASSERT_EQ(original.get_mps_all_constraint_row_names().size(), - reloaded.get_mps_all_constraint_row_names().size()); - for (size_t i = 0; i < original.get_mps_all_constraint_row_names().size(); ++i) { - EXPECT_EQ(original.get_mps_all_constraint_row_names()[i], reloaded.get_mps_all_constraint_row_names()[i]); - } } TEST(mps_roundtrip, linear_programming_basic) diff --git a/cpp/src/pdlp/cpu_optimization_problem.cpp b/cpp/src/pdlp/cpu_optimization_problem.cpp index 4bbe5ef545..e4018a290a 100644 --- a/cpp/src/pdlp/cpu_optimization_problem.cpp +++ b/cpp/src/pdlp/cpu_optimization_problem.cpp @@ -135,50 
+135,11 @@ void cpu_optimization_problem_t::set_quadratic_objective_matrix( template void cpu_optimization_problem_t::set_quadratic_constraints( - std::vector::mps_quadratic_constraint_t> - constraints) + std::vector::quadratic_constraint_t> constraints) { quadratic_constraints_ = std::move(constraints); } -template -void cpu_optimization_problem_t::set_linear_constraint_mps_indices(std::vector indices) -{ - linear_constraint_mps_indices_ = std::move(indices); -} - -template -void cpu_optimization_problem_t::set_mps_declaration_constraint_row_count(i_t count) -{ - mps_declaration_constraint_row_count_ = count; -} - -template -void cpu_optimization_problem_t::set_mps_all_constraint_row_names( - std::vector names) -{ - mps_all_constraint_row_names_ = std::move(names); -} - -template -i_t cpu_optimization_problem_t::get_mps_declaration_constraint_row_count() const -{ - return mps_declaration_constraint_row_count_; -} - -template -const std::vector& cpu_optimization_problem_t::get_linear_constraint_mps_indices() const -{ - return linear_constraint_mps_indices_; -} - -template -const std::vector& cpu_optimization_problem_t::get_mps_all_constraint_row_names() - const -{ - return mps_all_constraint_row_names_; -} - template void cpu_optimization_problem_t::set_variable_lower_bounds( const f_t* variable_lower_bounds, i_t size) @@ -541,7 +502,7 @@ bool cpu_optimization_problem_t::has_quadratic_objective() const } template -const std::vector::mps_quadratic_constraint_t>& +const std::vector::quadratic_constraint_t>& cpu_optimization_problem_t::get_quadratic_constraints() const { return quadratic_constraints_; @@ -682,18 +643,10 @@ cpu_optimization_problem_t::to_optimization_problem(raft::handle_t con if (!quadratic_constraints_.empty()) { gpu_problem->set_quadratic_constraints( - std::vector::mps_quadratic_constraint_t>( + std::vector::quadratic_constraint_t>( quadratic_constraints_)); } - if (mps_declaration_constraint_row_count_ > 0) { - 
gpu_problem->set_linear_constraint_mps_indices( - std::vector(linear_constraint_mps_indices_)); - gpu_problem->set_mps_declaration_constraint_row_count(mps_declaration_constraint_row_count_); - gpu_problem->set_mps_all_constraint_row_names( - std::vector(mps_all_constraint_row_names_)); - } - // Set variable bounds if (!variable_lower_bounds_.empty()) { gpu_problem->set_variable_lower_bounds(variable_lower_bounds_.data(), @@ -813,23 +766,8 @@ void cpu_optimization_problem_t::write_to_mps(const std::string& mps_f false); } - if (has_quadratic_constraints()) { - data_model_view.set_quadratic_constraints( - std::vector::mps_quadratic_constraint_t>( - get_quadratic_constraints())); - } - - if (get_mps_declaration_constraint_row_count() > 0) { - data_model_view.set_mps_declaration_constraint_row_count(get_mps_declaration_constraint_row_count()); - if (!get_linear_constraint_mps_indices().empty()) { - data_model_view.set_linear_constraint_mps_indices(get_linear_constraint_mps_indices().data(), - static_cast( - get_linear_constraint_mps_indices().size())); - } - if (!get_mps_all_constraint_row_names().empty()) { - data_model_view.set_mps_all_constraint_row_names( - std::vector(get_mps_all_constraint_row_names())); - } + if (!quadratic_constraints_.empty()) { + data_model_view.set_quadratic_constraints(quadratic_constraints_); } cuopt::mps_parser::write_mps(data_model_view, mps_file_path); @@ -847,10 +785,6 @@ bool cpu_optimization_problem_t::is_equivalent( if (maximize_ != other.get_sense()) return false; if (n_vars_ != other.get_n_variables()) return false; if (n_constraints_ != other.get_n_constraints()) return false; - if (get_mps_declaration_constraint_row_count() != other.get_mps_declaration_constraint_row_count()) - return false; - if (get_linear_constraint_mps_indices() != other.get_linear_constraint_mps_indices()) return false; - if (get_mps_all_constraint_row_names() != other.get_mps_all_constraint_row_names()) return false; if (std::abs(objective_scaling_factor_ 
- other.get_objective_scaling_factor()) > 1e-9) return false; if (std::abs(objective_offset_ - other.get_objective_offset()) > 1e-9) return false; diff --git a/cpp/src/pdlp/optimization_problem.cu b/cpp/src/pdlp/optimization_problem.cu index b3f71bd92c..634fd86c60 100644 --- a/cpp/src/pdlp/optimization_problem.cu +++ b/cpp/src/pdlp/optimization_problem.cu @@ -98,10 +98,7 @@ optimization_problem_t::optimization_problem_t( problem_category_{other.get_problem_category()}, var_names_{other.get_variable_names()}, row_names_{other.get_row_names()}, - quadratic_constraints_{other.get_quadratic_constraints()}, - linear_constraint_mps_indices_{other.get_linear_constraint_mps_indices()}, - mps_declaration_constraint_row_count_{other.get_mps_declaration_constraint_row_count()}, - mps_all_constraint_row_names_{other.get_mps_all_constraint_row_names()} + quadratic_constraints_{other.get_quadratic_constraints()} { } @@ -203,49 +200,12 @@ void optimization_problem_t::set_quadratic_objective_matrix( template void optimization_problem_t::set_quadratic_constraints( - std::vector::mps_quadratic_constraint_t> + std::vector::quadratic_constraint_t> constraints) { quadratic_constraints_ = std::move(constraints); } -template -void optimization_problem_t::set_linear_constraint_mps_indices(std::vector indices) -{ - linear_constraint_mps_indices_ = std::move(indices); -} - -template -void optimization_problem_t::set_mps_declaration_constraint_row_count(i_t count) -{ - mps_declaration_constraint_row_count_ = count; -} - -template -void optimization_problem_t::set_mps_all_constraint_row_names(std::vector names) -{ - mps_all_constraint_row_names_ = std::move(names); -} - -template -i_t optimization_problem_t::get_mps_declaration_constraint_row_count() const -{ - return mps_declaration_constraint_row_count_; -} - -template -const std::vector& optimization_problem_t::get_linear_constraint_mps_indices() const -{ - return linear_constraint_mps_indices_; -} - -template -const std::vector& 
optimization_problem_t::get_mps_all_constraint_row_names() - const -{ - return mps_all_constraint_row_names_; -} - template void optimization_problem_t::set_variable_lower_bounds(const f_t* variable_lower_bounds, i_t size) @@ -598,7 +558,7 @@ bool optimization_problem_t::has_quadratic_objective() const } template -const std::vector::mps_quadratic_constraint_t>& +const std::vector::quadratic_constraint_t>& optimization_problem_t::get_quadratic_constraints() const { return quadratic_constraints_; @@ -882,23 +842,8 @@ void optimization_problem_t::write_to_mps(const std::string& mps_file_ is_symmetrized); } - if (has_quadratic_constraints()) { - data_model_view.set_quadratic_constraints( - std::vector::mps_quadratic_constraint_t>( - get_quadratic_constraints())); - } - - if (get_mps_declaration_constraint_row_count() > 0) { - data_model_view.set_mps_declaration_constraint_row_count(get_mps_declaration_constraint_row_count()); - if (!get_linear_constraint_mps_indices().empty()) { - data_model_view.set_linear_constraint_mps_indices(get_linear_constraint_mps_indices().data(), - static_cast( - get_linear_constraint_mps_indices().size())); - } - if (!get_mps_all_constraint_row_names().empty()) { - data_model_view.set_mps_all_constraint_row_names( - std::vector(get_mps_all_constraint_row_names())); - } + if (!quadratic_constraints_.empty()) { + data_model_view.set_quadratic_constraints(quadratic_constraints_); } cuopt::mps_parser::write_mps(data_model_view, mps_file_path); @@ -1111,11 +1056,6 @@ bool optimization_problem_t::is_equivalent( if (maximize_ != other.maximize_) { return false; } if (n_vars_ != other.n_vars_) { return false; } if (n_constraints_ != other.n_constraints_) { return false; } - if (linear_constraint_mps_indices_ != other.linear_constraint_mps_indices_) { return false; } - if (mps_declaration_constraint_row_count_ != other.mps_declaration_constraint_row_count_) { - return false; - } - if (mps_all_constraint_row_names_ != 
other.mps_all_constraint_row_names_) { return false; } if (objective_scaling_factor_ != other.objective_scaling_factor_) { return false; } if (objective_offset_ != other.objective_offset_) { return false; } if (problem_category_ != other.problem_category_) { return false; } @@ -1260,10 +1200,6 @@ bool optimization_problem_t::is_equivalent( if (maximize_ != other.get_sense()) return false; if (n_vars_ != other.get_n_variables()) return false; if (n_constraints_ != other.get_n_constraints()) return false; - if (get_mps_declaration_constraint_row_count() != other.get_mps_declaration_constraint_row_count()) - return false; - if (get_linear_constraint_mps_indices() != other.get_linear_constraint_mps_indices()) return false; - if (get_mps_all_constraint_row_names() != other.get_mps_all_constraint_row_names()) return false; if (std::abs(objective_scaling_factor_ - other.get_objective_scaling_factor()) > 1e-9) return false; if (std::abs(objective_offset_ - other.get_objective_offset()) > 1e-9) return false; From 8ff6ce1fb78e1bcd0992ffd7904e32ec880cdf6c Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Wed, 22 Apr 2026 12:23:14 -0700 Subject: [PATCH 12/22] minor adjustment for parser --- cpp/libmps_parser/src/mps_parser.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/libmps_parser/src/mps_parser.cpp b/cpp/libmps_parser/src/mps_parser.cpp index e77ed915fb..2921f9dc09 100644 --- a/cpp/libmps_parser/src/mps_parser.cpp +++ b/cpp/libmps_parser/src/mps_parser.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef MPS_PARSER_WITH_BZIP2 #include @@ -271,11 +272,13 @@ ObjSenseType convert_to_obj_sense(const std::string& str) template void mps_parser_t::fill_problem(mps_data_model_t& problem) { - // count the row indices that are quadratic constraints + // Row indices that have QCMATRIX blocks (quadratic rows follow linear rows in ROWS under + // our MPS section rules; names are not required to be QC0..QCN) std::unordered_set 
quadratic_row_ids{}; for (const auto& block : qcmatrix_blocks_) { quadratic_row_ids.insert(block.constraint_row_id); } + const auto is_quadratic_row = [&quadratic_row_ids](i_t row) { return quadratic_row_ids.count(row); }; { std::vector h_offsets{}, h_indices{}; @@ -286,7 +289,7 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) for (i_t i = 0; i < (i_t)A_indices.size(); ++i) { // Quadratic constraint rows are omitted from the linear CSR; linear pieces live in each // quadratic_constraint_t bundle. - if (quadratic_row_ids.count(i)) { continue; } + if (is_quadratic_row(i)) { continue; } ++num_linear_rows; for (const auto& idx_itr : A_indices[i]) { h_indices.push_back(idx_itr); @@ -334,7 +337,7 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) std::vector b_compacted{}; b_compacted.reserve(b_values.size()); for (i_t i = 0; i < (i_t)b_values.size(); ++i) { - if (!quadratic_row_ids.count(i)) { b_compacted.push_back(b_values[i]); } + if (!is_quadratic_row(i)) { b_compacted.push_back(b_values[i]); } } problem.set_constraint_bounds(b_compacted.data(), static_cast(b_compacted.size())); problem.set_objective_coefficients(c_values.data(), c_values.size()); @@ -363,7 +366,7 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) std::vector h_constraint_lower_bounds{}; std::vector h_constraint_upper_bounds{}; for (i_t i = 0; i < (i_t)row_types.size(); ++i) { - if (is_quadratic_row(i)) { continue; } + if (is_quadratic_row(i)) { continue; } if (row_types[i] == Equality) { h_constraint_lower_bounds.push_back(b_values[i]); h_constraint_upper_bounds.push_back(b_values[i]); @@ -510,6 +513,8 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) for (i_t row = 0; row < num_rows; ++row) { for (const auto& [col, val] : csr_data[row]) { + // While the MPS format expects to optimize 0.5 xT Q x, cuopt optimizes xT Q x, + // so we have to multiply the value by value_scale=0.5 to get the correct value.
result.values.push_back(val * value_scale); result.indices.push_back(col); } @@ -553,8 +558,8 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) // QCMATRIX: one symmetric Q per constraint row (no extra ½ factor vs file coeffs). // Bundle row metadata, row-linear coefficients (from COLUMNS), rhs, and quadratic part together. constexpr f_t k_qcmatrix_value_scale = f_t(1); - const i_t linear_row_count = static_cast(row_types.size() - quadratic_row_ids.size()); - i_t quadratic_row_id = 0; + const i_t linear_row_count = static_cast(row_types.size() - quadratic_row_ids.size()); + i_t quadratic_row_id = 0; for (const auto& block : qcmatrix_blocks_) { auto csr_result = build_csr_via_transpose( block.entries, num_vars_for_quad, num_vars_for_quad, false, k_qcmatrix_value_scale); @@ -592,7 +597,7 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) linear_row_names.reserve(row_names.size()); row_types_linear.reserve(row_names.size()); for (size_t i = 0; i < row_names.size(); ++i) { - if (!quadratic_row_ids.count(static_cast(i))) { + if (!is_quadratic_row(static_cast(i))) { linear_row_names.push_back(row_names[i]); row_types_linear.push_back(static_cast(row_types[i])); } From 6617f94396eb96eb8e938dfb5e073734f560a999 Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Thu, 23 Apr 2026 07:55:21 -0700 Subject: [PATCH 13/22] Add bridging for mps model to user_problem model --- cpp/src/pdlp/solve.cu | 13 ++- cpp/src/pdlp/translate.hpp | 185 ++++++++++++++++++++++++++++++++++++- 2 files changed, 192 insertions(+), 6 deletions(-) diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 29a7f32db6..9087e732b9 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -1378,12 +1378,17 @@ optimization_problem_solution_t solve_lp( // This needs to be called before pdlp is initialized init_handler(op_problem.get_handle_ptr()); - if (op_problem.has_quadratic_objective()) { - CUOPT_LOG_INFO("Problem has a quadratic objective. 
Using Barrier."); + if (op_problem.has_quadratic_objective() || op_problem.has_quadratic_constraints()) { + if (op_problem.has_quadratic_objective()) { + CUOPT_LOG_INFO("Problem has a quadratic objective. Using Barrier."); + } + if (op_problem.has_quadratic_constraints()) { + CUOPT_LOG_INFO("Problem has quadratic constraints. Using Barrier with SOC conversion."); + } settings.method = method_t::Barrier; settings.presolver = presolver_t::None; - // check for sense of the problem - if (op_problem.get_sense()) { + // Quadratic objective support is minimization-only. + if (op_problem.has_quadratic_objective() && op_problem.get_sense()) { CUOPT_LOG_ERROR("Quadratic problems must be minimized"); return optimization_problem_solution_t(pdlp_termination_status_t::NumericalError, op_problem.get_handle_ptr()->get_stream()); diff --git a/cpp/src/pdlp/translate.hpp b/cpp/src/pdlp/translate.hpp index b143a206d4..1628290c5a 100644 --- a/cpp/src/pdlp/translate.hpp +++ b/cpp/src/pdlp/translate.hpp @@ -7,6 +7,7 @@ #pragma once +#include #include #include @@ -34,8 +35,6 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( csr_A.j = std::vector(cuopt::host_copy(model.variables, handle_ptr->get_stream())); csr_A.row_start = std::vector(cuopt::host_copy(model.offsets, handle_ptr->get_stream())); - csr_A.to_compressed_col(user_problem.A); - user_problem.rhs.resize(m); user_problem.row_sense.resize(m); user_problem.range_rows.clear(); @@ -105,6 +104,188 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( user_problem.Q_indices = model.Q_indices; user_problem.Q_values = model.Q_values; + if (model.original_problem_ptr->has_quadratic_constraints()) { + const auto& qcs = model.original_problem_ptr->get_quadratic_constraints(); + cuopt_expects(!qcs.empty(), + error_type_t::ValidationError, + "Quadratic-constraint flag is set, but no constraints were provided"); + + const i_t original_rows = static_cast(user_problem.num_rows); + const f_t tol = 
f_t(1e-16); + + // SOC: Q is n×n diagonal CSR (offsets length n+1). Exactly q_n = nnz on the main diagonal, at + // q_n distinct variable indices: one −1 (head) and (q_n−1) +1 (tails). Lifting: q_n rows, each + // with −1 in one column; the **first** row must be the head (variable with Q = −1); order of + // remaining rows (+1 diagonals) is unconstrained (CSR row scan order). + + const i_t old_nnz = csr_A.row_start[original_rows]; + std::vector row_cone_dims{}; + row_cone_dims.reserve(qcs.size()); + + for (const auto& qc : qcs) { + cuopt_expects(qc.constraint_row_type == 'L', + error_type_t::ValidationError, + "Only <= quadratic constraints are supported for SOC conversion"); + cuopt_expects(qc.linear_values.empty(), + error_type_t::ValidationError, + "SOC conversion currently requires zero linear terms in quadratic constraints"); + cuopt_expects(qc.rhs_value < tol && qc.rhs_value > -tol, + error_type_t::ValidationError, + "SOC conversion currently requires rhs = 0 for quadratic constraints"); + + cuopt_expects(qc.quadratic_offsets.size() >= 2, + error_type_t::ValidationError, + "Quadratic constraint '%s' has invalid CSR offsets (need at least 2 entries)", + qc.constraint_row_name.c_str()); + cuopt_expects( + qc.quadratic_values.size() == qc.quadratic_indices.size(), + error_type_t::ValidationError, + "Quadratic constraint '%s' quadratic_values and quadratic_indices length mismatch for CSR Q", + qc.constraint_row_name.c_str()); + + const i_t q_n = static_cast(qc.quadratic_values.size()); + cuopt_expects(q_n >= 2, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC must have at least 2 diagonal entries in Q (nnz " + "%d)", + qc.constraint_row_name.c_str(), + static_cast(q_n)); + + cuopt_expects( + qc.quadratic_offsets.size() == static_cast(n) + 1, + error_type_t::ValidationError, + "Quadratic constraint '%s' Q must be n×n in CSR: expected %zu CSR row pointers (offsets " + "length n+1), got %zu (n = %d)", + qc.constraint_row_name.c_str(), + 
static_cast(n) + 1, + qc.quadratic_offsets.size(), + static_cast(n)); + cuopt_expects( + qc.quadratic_offsets[static_cast(n)] == q_n, + error_type_t::ValidationError, + "Quadratic constraint '%s' Q last CSR offset %d must equal number of nonzeros (nnz) %d for " + "this diagonal Q", + qc.constraint_row_name.c_str(), + static_cast(qc.quadratic_offsets[static_cast(n)]), + static_cast(q_n)); + cuopt_expects( + qc.quadratic_offsets[0] == 0, + error_type_t::ValidationError, + "Quadratic constraint '%s' Q CSR offsets[0] must be 0", + qc.constraint_row_name.c_str()); + + // Verify Q: n×n CSR, diagonal entries only, Lorentz pattern, then build the lift. + // Scan each row r: empty or one nnz on (r,r) with value -1 (head) or +1 (tail); + // tail order follows this scan; no requirement that diagonal indices be sorted. + i_t head = static_cast(-1); + i_t n_head_m = 0; + std::vector tail_row_vars{}; + tail_row_vars.reserve(static_cast(q_n - 1)); + + for (i_t r = 0; r < n; ++r) { + const i_t p_beg = qc.quadratic_offsets[static_cast(r)]; + const i_t p_end = qc.quadratic_offsets[static_cast(r + 1)]; + + if (p_beg == p_end) { continue; } + + cuopt_expects( + p_beg + 1 == p_end, + error_type_t::ValidationError, + "Quadratic constraint '%s' Q row %d: expected at most one stored entry on the diagonal per " + "row (got end - beg = %d)", + qc.constraint_row_name.c_str(), + static_cast(r), + static_cast(p_end - p_beg)); + + const i_t col = qc.quadratic_indices[static_cast(p_beg)]; + const f_t v = qc.quadratic_values[static_cast(p_beg)]; + cuopt_expects( + col == r, + error_type_t::ValidationError, + "Quadratic constraint '%s' Q row %d: only main diagonal (j,j) entries are allowed; got " + "column %d", + qc.constraint_row_name.c_str(), + static_cast(r), + static_cast(col)); + + if (v > f_t(-1) - tol && v < f_t(-1) + tol) { + ++n_head_m; + head = r; + } else if (v > f_t(1) - tol && v < f_t(1) + tol) { + tail_row_vars.push_back(r); + } else { + cuopt_expects( + false, + 
error_type_t::ValidationError, + "Quadratic constraint '%s' Q row %d: diagonal for SOC must be -1 (head) or +1 (tail); got " + "%g", + qc.constraint_row_name.c_str(), + static_cast(r), + static_cast(v)); + } + } + cuopt_expects( + n_head_m == 1, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: expected exactly one diagonal with value -1 (cone head), " + "found %d", + qc.constraint_row_name.c_str(), + static_cast(n_head_m)); + cuopt_expects( + static_cast(tail_row_vars.size()) == q_n - 1, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: expected %d diagonals with value +1 (tails), found %zu", + qc.constraint_row_name.c_str(), + static_cast(q_n - 1), + tail_row_vars.size()); + cuopt_expects( + head >= 0, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: internal error (head index invalid)", + qc.constraint_row_name.c_str()); + + row_cone_dims.push_back(q_n); + dual_simplex::csr_matrix_t lift_block(q_n, n, q_n); + for (i_t t = 0; t <= q_n; ++t) { + lift_block.row_start[t] = t; + } + + // One lift row per cone component: -1 in column head, then -1 in each tail column + // (order matches tail_row_vars from the Q scan). 
+ lift_block.j[0] = head; + lift_block.x[0] = f_t(-1); + for (i_t t = 0; t < q_n - 1; ++t) { + lift_block.j[static_cast(t) + 1U] = tail_row_vars[static_cast(t)]; + lift_block.x[static_cast(t) + 1U] = f_t(-1); + } + cuopt_expects(csr_A.append_rows(lift_block) == 0, + error_type_t::RuntimeError, + "Internal error while appending SOC lifting rows to CSR A"); + } + + // Update user_problem to include the new SOC rows + const i_t next_row = static_cast(csr_A.m); + const i_t lifted_rows = next_row - original_rows; + const i_t new_nnz = old_nnz + lifted_rows; + cuopt_expects(csr_A.row_start[next_row] == new_nnz, + error_type_t::RuntimeError, + "Internal error while building SOC lifting rows in CSR A"); + + user_problem.rhs.resize(next_row, f_t(0)); + user_problem.row_sense.resize(next_row, 'E'); + if (user_problem.row_names.size() == static_cast(original_rows)) { + for (i_t r = original_rows; r < next_row; ++r) { + user_problem.row_names.push_back("_CUOPT_soc_row_" + std::to_string(r - original_rows)); + } + } + user_problem.num_rows = next_row; + + user_problem.cone_row_start = original_rows; + user_problem.second_order_cone_row_dims = std::move(row_cone_dims); + } + + csr_A.to_compressed_col(user_problem.A); + return user_problem; } From 62380e49ca8f9827ee6020773aaa2e1d3891a243 Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Thu, 23 Apr 2026 09:16:18 -0700 Subject: [PATCH 14/22] Update QCQP dataset for parser check and address minor issues --- .../optimization_problem_interface.hpp | 12 +- .../include/mps_parser/mps_data_model.hpp | 19 +- cpp/libmps_parser/src/mps_data_model.cpp | 65 ++---- cpp/libmps_parser/src/mps_parser.cpp | 19 +- cpp/libmps_parser/tests/mps_parser_test.cpp | 31 +-- datasets/qcqp/QC_Test_1.mps | 30 +++ datasets/qcqp/p0033_qc1.mps | 214 ++++++++++++++++++ 7 files changed, 282 insertions(+), 108 deletions(-) create mode 100644 datasets/qcqp/QC_Test_1.mps create mode 100644 datasets/qcqp/p0033_qc1.mps diff --git 
a/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp b/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp index 27a106037b..eb0850ba44 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp @@ -73,12 +73,8 @@ class optimization_problem_interface_t { /** * @brief Store quadratic constraints for MPS round-trip (linear + Q parts per QC row). - * @note Default implementation ignores; GPU/CPU implementations persist for write_to_mps. */ - virtual void set_quadratic_constraints(std::vector constraints) - { - (void)constraints; - } + virtual void set_quadratic_constraints(std::vector constraints) = 0; template >> void set_quadratic_constraints(const std::vector& constraints) { @@ -103,11 +99,7 @@ class optimization_problem_interface_t { virtual bool has_quadratic_constraints() const = 0; /** @brief Quadratic constraints for MPS export (empty if none). */ - virtual const std::vector& get_quadratic_constraints() const - { - static const std::vector k_empty{}; - return k_empty; - } + virtual const std::vector& get_quadratic_constraints() const = 0; // ============================================================================ // Setters (accept both CPU and GPU pointers) diff --git a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp index 69909fb3a6..c4c8b56e28 100644 --- a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp +++ b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -287,23 +288,21 @@ class mps_data_model_t { /** * @brief Append one complete quadratic constraint (row + linear + rhs + quadratic Q). + * @param linear_values, linear_indices Same nnz; empty spans for a purely quadratic row (rare). 
+ * @param quadratic_values, quadratic_indices CSR nnz; may be empty if Q is empty. + * @param quadratic_offsets CSR row starts; must be non-empty. * @param constraint_row_type MPS ROWS type; must be 'L'. 'G' and 'E' quadratic rows are not * supported. */ void append_quadratic_constraint(i_t constraint_row_index, const std::string& constraint_row_name, char constraint_row_type, - const f_t* linear_values, - i_t linear_nnz, - const i_t* linear_indices, - i_t linear_indices_nnz, + std::span linear_values, + std::span linear_indices, f_t rhs_value, - const f_t* quadratic_values, - i_t quadratic_size_values, - const i_t* quadratic_indices, - i_t quadratic_size_indices, - const i_t* quadratic_offsets, - i_t quadratic_size_offsets); + std::span quadratic_values, + std::span quadratic_indices, + std::span quadratic_offsets); const std::vector& get_quadratic_constraints() const; diff --git a/cpp/libmps_parser/src/mps_data_model.cpp b/cpp/libmps_parser/src/mps_data_model.cpp index 1ecaf47b2c..f5db136a58 100644 --- a/cpp/libmps_parser/src/mps_data_model.cpp +++ b/cpp/libmps_parser/src/mps_data_model.cpp @@ -224,17 +224,12 @@ template void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_index, const std::string& constraint_row_name, char constraint_row_type, - const f_t* linear_values, - i_t linear_nnz, - const i_t* linear_indices, - i_t linear_indices_nnz, + std::span linear_values, + std::span linear_indices, f_t rhs_value, - const f_t* quadratic_values, - i_t quadratic_size_values, - const i_t* quadratic_indices, - i_t quadratic_size_indices, - const i_t* quadratic_offsets, - i_t quadratic_size_offsets) + std::span quadratic_values, + std::span quadratic_indices, + std::span quadratic_offsets) { mps_parser_expects( constraint_row_index >= 0, error_type_t::ValidationError, "constraint_row_index must be non-negative"); @@ -245,28 +240,13 @@ void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_ "Only 'L' is supported for convex quadratic 
constraints.", constraint_row_type); - mps_parser_expects(linear_nnz == linear_indices_nnz, - error_type_t::ValidationError, - "linear_values and linear_indices must have the same nnz count"); - if (linear_nnz != 0) { - mps_parser_expects( - linear_values != nullptr && linear_indices != nullptr, - error_type_t::ValidationError, - "linear_values and linear_indices cannot be null when linear_nnz > 0"); - } - - if (quadratic_size_values != 0) { - mps_parser_expects( - quadratic_values != nullptr, error_type_t::ValidationError, "quadratic_values cannot be null"); - } mps_parser_expects( - quadratic_offsets != nullptr, error_type_t::ValidationError, "quadratic_offsets cannot be null"); - if (quadratic_size_indices != 0) { - mps_parser_expects( - quadratic_indices != nullptr, error_type_t::ValidationError, "quadratic_indices cannot be null"); - } + linear_values.size() == linear_indices.size(), + error_type_t::ValidationError, + "linear_values and linear_indices must have the same length"); + mps_parser_expects( - quadratic_size_offsets > 0, error_type_t::ValidationError, "quadratic_size_offsets cannot be empty"); + !quadratic_offsets.empty(), error_type_t::ValidationError, "quadratic_offsets cannot be empty"); quadratic_constraint_t qc; qc.constraint_row_index = constraint_row_index; @@ -274,26 +254,11 @@ void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_ qc.constraint_row_type = constraint_row_type; qc.rhs_value = rhs_value; - qc.linear_values.resize(linear_nnz); - qc.linear_indices.resize(linear_nnz); - if (linear_nnz > 0) { - std::copy(linear_values, linear_values + linear_nnz, qc.linear_values.data()); - std::copy(linear_indices, linear_indices + linear_nnz, qc.linear_indices.data()); - } - - qc.quadratic_values.resize(quadratic_size_values); - if (quadratic_size_values > 0) { - std::copy( - quadratic_values, quadratic_values + quadratic_size_values, qc.quadratic_values.data()); - } - qc.quadratic_indices.resize(quadratic_size_indices); - if 
(quadratic_size_indices > 0) { - std::copy( - quadratic_indices, quadratic_indices + quadratic_size_indices, qc.quadratic_indices.data()); - } - qc.quadratic_offsets.resize(quadratic_size_offsets); - std::copy( - quadratic_offsets, quadratic_offsets + quadratic_size_offsets, qc.quadratic_offsets.data()); + qc.linear_values.assign(linear_values.begin(), linear_values.end()); + qc.linear_indices.assign(linear_indices.begin(), linear_indices.end()); + qc.quadratic_values.assign(quadratic_values.begin(), quadratic_values.end()); + qc.quadratic_indices.assign(quadratic_indices.begin(), quadratic_indices.end()); + qc.quadratic_offsets.assign(quadratic_offsets.begin(), quadratic_offsets.end()); quadratic_constraints_.push_back(std::move(qc)); } diff --git a/cpp/libmps_parser/src/mps_parser.cpp b/cpp/libmps_parser/src/mps_parser.cpp index 2921f9dc09..b446db5009 100644 --- a/cpp/libmps_parser/src/mps_parser.cpp +++ b/cpp/libmps_parser/src/mps_parser.cpp @@ -569,25 +569,16 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) error_type_t::ValidationError, "QCMATRIX row index %d is out of range for constraints", static_cast(row_id)); - const i_t linear_nnz = static_cast(A_indices[row_id].size()); - const f_t* linear_val = linear_nnz > 0 ? A_values[row_id].data() : nullptr; - const i_t* linear_idx = linear_nnz > 0 ? 
A_indices[row_id].data() : nullptr; - problem.append_quadratic_constraint( linear_row_count + quadratic_row_id, row_names[row_id], static_cast(row_types[row_id]), - linear_val, - linear_nnz, - linear_idx, - linear_nnz, + A_values[row_id], + A_indices[row_id], b_values[row_id], - csr_result.values.data(), - static_cast(csr_result.values.size()), - csr_result.indices.data(), - static_cast(csr_result.indices.size()), - csr_result.offsets.data(), - static_cast(csr_result.offsets.size())); + csr_result.values, + csr_result.indices, + csr_result.offsets); ++quadratic_row_id; } diff --git a/cpp/libmps_parser/tests/mps_parser_test.cpp b/cpp/libmps_parser/tests/mps_parser_test.cpp index 899aa069f9..b8ddc243aa 100644 --- a/cpp/libmps_parser/tests/mps_parser_test.cpp +++ b/cpp/libmps_parser/tests/mps_parser_test.cpp @@ -883,20 +883,8 @@ TEST(qps_parser, qcmatrix_append_api) const std::vector qc0_offsets = {0, 2, 4}; const std::vector qc0_linear_values = {1.0, 1.0}; const std::vector qc0_linear_indices = {0, 1}; - model.append_quadratic_constraint(0, - "QC0", - 'L', - qc0_linear_values.data(), - qc0_linear_values.size(), - qc0_linear_indices.data(), - qc0_linear_indices.size(), - 5.0, - qc0_values.data(), - qc0_values.size(), - qc0_indices.data(), - qc0_indices.size(), - qc0_offsets.data(), - qc0_offsets.size()); + model.append_quadratic_constraint( + 0, "QC0", 'L', qc0_linear_values, qc0_linear_indices, 5.0, qc0_values, qc0_indices, qc0_offsets); // QC1: [[4, 1], [1, 6]] const std::vector qc1_values = {4.0, 1.0, 1.0, 6.0}; @@ -907,17 +895,12 @@ TEST(qps_parser, qcmatrix_append_api) model.append_quadratic_constraint(1, "QC1", 'L', - qc1_linear_values.data(), - qc1_linear_values.size(), - qc1_linear_indices.data(), - qc1_linear_indices.size(), + qc1_linear_values, + qc1_linear_indices, 10.0, - qc1_values.data(), - qc1_values.size(), - qc1_indices.data(), - qc1_indices.size(), - qc1_offsets.data(), - qc1_offsets.size()); + qc1_values, + qc1_indices, + qc1_offsets); 
ASSERT_TRUE(model.has_quadratic_constraints()); const auto& qcs = model.get_quadratic_constraints(); diff --git a/datasets/qcqp/QC_Test_1.mps b/datasets/qcqp/QC_Test_1.mps new file mode 100644 index 0000000000..e791fdadd6 --- /dev/null +++ b/datasets/qcqp/QC_Test_1.mps @@ -0,0 +1,30 @@ +NAME QCTest +ROWS + N OBJ + L LIN0 + L QC0 + L QC1 +COLUMNS + VAR1 OBJ 0 + VAR1 LIN0 2 + VAR1 QC0 1 + VAR1 QC1 3 + VAR2 OBJ 0 + VAR2 LIN0 1 + VAR2 QC0 1 + VAR2 QC1 1 +RHS + RHS1 LIN0 15 + RHS1 QC0 5 + RHS1 QC1 10 +QCMATRIX QC0 + VAR1 VAR1 10 + VAR1 VAR2 2 + VAR2 VAR1 2 + VAR2 VAR2 2 +QCMATRIX QC1 + VAR1 VAR1 4 + VAR1 VAR2 1 + VAR2 VAR1 1 + VAR2 VAR2 6 +ENDATA diff --git a/datasets/qcqp/p0033_qc1.mps b/datasets/qcqp/p0033_qc1.mps new file mode 100644 index 0000000000..a06cec03c5 --- /dev/null +++ b/datasets/qcqp/p0033_qc1.mps @@ -0,0 +1,214 @@ +NAME p0033_qc1 +ROWS + N R100 + L R118 + L R119 + L R120 + L R121 + L R122 + L R123 + L R124 + L R125 + L R126 + L R127 + L R128 + L ZBESTROW + L QC1 + L QC2 + L QC3 + L QC4 +COLUMNS + C157 R100 171 + C157 R122 -300 + C157 R123 -300 + C158 R100 171 + C158 R126 -300 + C158 R127 -300 + C159 R100 171 + C159 R119 300 + C159 R120 -300 + C159 R121 -300 + C159 QC1 1 + C160 R100 171 + C160 R119 300 + C160 R120 -300 + C160 R121 -300 + C161 R100 163 + C161 R119 285 + C161 R120 -285 + C161 R124 -285 + C161 R125 -285 + C162 R100 162 + C162 R119 285 + C162 R120 -285 + C162 R122 -285 + C162 R123 -285 + C163 R100 163 + C163 R128 -285 + C164 R100 69 + C164 R119 265 + C164 R120 -265 + C164 R124 -265 + C164 R125 -265 + C165 R100 69 + C165 R119 265 + C165 R120 -265 + C165 R122 -265 + C165 R123 -265 + C166 R100 183 + C166 R118 -230 + C167 R100 183 + C167 R124 -230 + C167 R125 -230 + C168 R100 183 + C168 R119 230 + C168 R120 -230 + C168 R125 -230 + C169 R100 183 + C169 R119 230 + C169 R120 -230 + C169 R123 -230 + C170 R100 49 + C170 R119 190 + C170 R120 -190 + C170 R122 -190 + C170 R123 -190 + C171 R100 183 + C172 R100 258 + C172 R118 -200 + C173 R100 517 + C173 
R118 -400 + C174 R100 250 + C174 R126 -200 + C174 R127 -200 + C175 R100 500 + C175 R126 -400 + C175 R127 -400 + C176 R100 250 + C176 R127 -200 + C177 R100 500 + C177 R127 -400 + C178 R100 159 + C178 R119 200 + C178 R120 -200 + C178 R124 -200 + C178 R125 -200 + C179 R100 318 + C179 R119 400 + C179 R120 -400 + C179 R124 -400 + C179 R125 -400 + C180 R100 159 + C180 R119 200 + C180 R120 -200 + C180 R125 -200 + C181 R100 318 + C181 R119 400 + C181 R120 -400 + C181 R125 -400 + C182 R100 159 + C182 R119 200 + C182 R120 -200 + C182 R122 -200 + C182 R123 -200 + C183 R100 318 + C183 R119 400 + C183 R120 -400 + C183 R122 -400 + C183 R123 -400 + C184 R100 159 + C184 R119 200 + C184 R120 -200 + C184 R123 -200 + C185 R100 318 + C185 R119 400 + C185 R120 -400 + C185 R123 -400 + C186 R100 114 + C186 R119 200 + C186 R120 -200 + C186 R121 -200 + C187 R100 228 + C187 R119 400 + C187 R120 -400 + C187 R121 -400 + C188 R100 159 + C188 R128 -200 + C189 R100 318 + C189 R128 -400 +RHS + rhs R118 -5 + rhs R119 2700 + rhs R120 -2600 + rhs R121 -100 + rhs R122 -900 + rhs R123 -1656 + rhs R124 -335 + rhs R125 -1026 + rhs R126 -5 + rhs R127 -500 + rhs R128 -270 + rhs QC1 1 + rhs QC2 2 + rhs QC3 1 + rhs QC4 1 +BOUNDS + UP bnd C157 1 + UP bnd C158 1 + UP bnd C159 1 + UP bnd C160 1 + UP bnd C161 1 + UP bnd C162 1 + UP bnd C163 1 + UP bnd C164 1 + UP bnd C165 1 + UP bnd C166 1 + UP bnd C167 1 + UP bnd C168 1 + UP bnd C169 1 + UP bnd C170 1 + UP bnd C171 1 + UP bnd C172 1 + UP bnd C173 1 + UP bnd C174 1 + UP bnd C175 1 + UP bnd C176 1 + UP bnd C177 1 + UP bnd C178 1 + UP bnd C179 1 + UP bnd C180 1 + UP bnd C181 1 + UP bnd C182 1 + UP bnd C183 1 + UP bnd C184 1 + UP bnd C185 1 + UP bnd C186 1 + UP bnd C187 1 + UP bnd C188 1 + UP bnd C189 1 +QMATRIX + C158 C158 1 + C158 C189 0.5 + C189 C158 0.5 + C189 C189 1 +QCMATRIX QC1 + C157 C157 1 + C157 C158 0.5 + C158 C157 0.5 + C158 C158 1 + C159 C159 1 + C160 C160 1 +QCMATRIX QC2 + C161 C161 2 + C162 C162 2 + C163 C163 1 +QCMATRIX QC3 + C164 C164 1 + C165 
C165 1 +QCMATRIX QC4 + C166 C166 1 + C167 C167 1 + C168 C168 1 + C169 C169 1 + C171 C171 1 +ENDATA \ No newline at end of file From 969ff06f0ec37049b78065e1ffd6d307eb9e7b68 Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Thu, 23 Apr 2026 10:06:15 -0700 Subject: [PATCH 15/22] modified test check for the MPS parser --- cpp/libmps_parser/tests/mps_parser_test.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/libmps_parser/tests/mps_parser_test.cpp b/cpp/libmps_parser/tests/mps_parser_test.cpp index b8ddc243aa..81a51b5dda 100644 --- a/cpp/libmps_parser/tests/mps_parser_test.cpp +++ b/cpp/libmps_parser/tests/mps_parser_test.cpp @@ -1320,7 +1320,9 @@ TEST(mps_roundtrip, qcqp_p0033_qc1) std::string input_file = cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps"; - std::string temp_file = "/tmp/mps_roundtrip_p0033_qc1.mps"; + std::string temp_file = "/tmp/mps_roundtrip_p0033_qc1.mps"; + std::string temp_file_2 = "/tmp/mps_roundtrip_p0033_qc1_r2.mps"; + std::string temp_file_3 = "/tmp/mps_roundtrip_p0033_qc1_r3.mps"; auto original = parse_mps(input_file, false); ASSERT_TRUE(original.has_quadratic_objective()); @@ -1330,9 +1332,13 @@ TEST(mps_roundtrip, qcqp_p0033_qc1) writer.write(temp_file); auto reloaded = parse_mps(temp_file, false); - compare_data_models(original, reloaded); + mps_writer_t writer_r2(reloaded); + writer_r2.write(temp_file_2); + auto reloaded_2 = parse_mps(temp_file_2, false); + compare_data_models(reloaded, reloaded_2); std::filesystem::remove(temp_file); + std::filesystem::remove(temp_file_2); } } // namespace cuopt::mps_parser From 926633a03083bbfd9b121288770fd922f0cc839b Mon Sep 17 00:00:00 2001 From: Yuwen Chen <37250191+yuwenchen95@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:35:07 +0200 Subject: [PATCH 16/22] Remove unused temp_file_3 in mps_parser_test Removed unused temporary file variable in test. 
--- cpp/libmps_parser/tests/mps_parser_test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/libmps_parser/tests/mps_parser_test.cpp b/cpp/libmps_parser/tests/mps_parser_test.cpp index 81a51b5dda..48811cb218 100644 --- a/cpp/libmps_parser/tests/mps_parser_test.cpp +++ b/cpp/libmps_parser/tests/mps_parser_test.cpp @@ -1322,7 +1322,6 @@ TEST(mps_roundtrip, qcqp_p0033_qc1) cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps"; std::string temp_file = "/tmp/mps_roundtrip_p0033_qc1.mps"; std::string temp_file_2 = "/tmp/mps_roundtrip_p0033_qc1_r2.mps"; - std::string temp_file_3 = "/tmp/mps_roundtrip_p0033_qc1_r3.mps"; auto original = parse_mps(input_file, false); ASSERT_TRUE(original.has_quadratic_objective()); From 6f2b1fb6654caaf8496ca97bf77b1e66b83fe04c Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Fri, 24 Apr 2026 01:05:40 -0700 Subject: [PATCH 17/22] undo std::span for CI tests --- .../include/mps_parser/mps_data_model.hpp | 20 ++++-- cpp/libmps_parser/src/mps_data_model.cpp | 65 ++++++++++++++----- cpp/libmps_parser/src/mps_parser.cpp | 15 +++-- cpp/libmps_parser/tests/mps_parser_test.cpp | 30 +++++++-- 4 files changed, 97 insertions(+), 33 deletions(-) diff --git a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp index c4c8b56e28..88b3ed62a6 100644 --- a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp +++ b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp @@ -8,7 +8,6 @@ #pragma once #include -#include #include #include #include @@ -288,7 +287,9 @@ class mps_data_model_t { /** * @brief Append one complete quadratic constraint (row + linear + rhs + quadratic Q). - * @param linear_values, linear_indices Same nnz; empty spans for a purely quadratic row (rare). + * @note Pointer+size signature is kept for current CI/toolchain compatibility; `std::span` + * can be revisited later when compatibility constraints are lifted. 
+ * @param linear_values, linear_indices Same nnz; can be empty for a purely quadratic row (rare). * @param quadratic_values, quadratic_indices CSR nnz; may be empty if Q is empty. * @param quadratic_offsets CSR row starts; must be non-empty. * @param constraint_row_type MPS ROWS type; must be 'L'. 'G' and 'E' quadratic rows are not @@ -297,12 +298,17 @@ class mps_data_model_t { void append_quadratic_constraint(i_t constraint_row_index, const std::string& constraint_row_name, char constraint_row_type, - std::span linear_values, - std::span linear_indices, + const f_t* linear_values, + i_t linear_nnz, + const i_t* linear_indices, + i_t linear_indices_nnz, f_t rhs_value, - std::span quadratic_values, - std::span quadratic_indices, - std::span quadratic_offsets); + const f_t* quadratic_values, + i_t quadratic_size_values, + const i_t* quadratic_indices, + i_t quadratic_size_indices, + const i_t* quadratic_offsets, + i_t quadratic_size_offsets); const std::vector& get_quadratic_constraints() const; diff --git a/cpp/libmps_parser/src/mps_data_model.cpp b/cpp/libmps_parser/src/mps_data_model.cpp index f5db136a58..1ecaf47b2c 100644 --- a/cpp/libmps_parser/src/mps_data_model.cpp +++ b/cpp/libmps_parser/src/mps_data_model.cpp @@ -224,12 +224,17 @@ template void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_index, const std::string& constraint_row_name, char constraint_row_type, - std::span linear_values, - std::span linear_indices, + const f_t* linear_values, + i_t linear_nnz, + const i_t* linear_indices, + i_t linear_indices_nnz, f_t rhs_value, - std::span quadratic_values, - std::span quadratic_indices, - std::span quadratic_offsets) + const f_t* quadratic_values, + i_t quadratic_size_values, + const i_t* quadratic_indices, + i_t quadratic_size_indices, + const i_t* quadratic_offsets, + i_t quadratic_size_offsets) { mps_parser_expects( constraint_row_index >= 0, error_type_t::ValidationError, "constraint_row_index must be non-negative"); @@ -240,13 
+245,28 @@ void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_ "Only 'L' is supported for convex quadratic constraints.", constraint_row_type); - mps_parser_expects( - linear_values.size() == linear_indices.size(), - error_type_t::ValidationError, - "linear_values and linear_indices must have the same length"); + mps_parser_expects(linear_nnz == linear_indices_nnz, + error_type_t::ValidationError, + "linear_values and linear_indices must have the same nnz count"); + if (linear_nnz != 0) { + mps_parser_expects( + linear_values != nullptr && linear_indices != nullptr, + error_type_t::ValidationError, + "linear_values and linear_indices cannot be null when linear_nnz > 0"); + } + if (quadratic_size_values != 0) { + mps_parser_expects( + quadratic_values != nullptr, error_type_t::ValidationError, "quadratic_values cannot be null"); + } + mps_parser_expects( + quadratic_offsets != nullptr, error_type_t::ValidationError, "quadratic_offsets cannot be null"); + if (quadratic_size_indices != 0) { + mps_parser_expects( + quadratic_indices != nullptr, error_type_t::ValidationError, "quadratic_indices cannot be null"); + } mps_parser_expects( - !quadratic_offsets.empty(), error_type_t::ValidationError, "quadratic_offsets cannot be empty"); + quadratic_size_offsets > 0, error_type_t::ValidationError, "quadratic_size_offsets cannot be empty"); quadratic_constraint_t qc; qc.constraint_row_index = constraint_row_index; @@ -254,11 +274,26 @@ void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_ qc.constraint_row_type = constraint_row_type; qc.rhs_value = rhs_value; - qc.linear_values.assign(linear_values.begin(), linear_values.end()); - qc.linear_indices.assign(linear_indices.begin(), linear_indices.end()); - qc.quadratic_values.assign(quadratic_values.begin(), quadratic_values.end()); - qc.quadratic_indices.assign(quadratic_indices.begin(), quadratic_indices.end()); - qc.quadratic_offsets.assign(quadratic_offsets.begin(), 
quadratic_offsets.end()); + qc.linear_values.resize(linear_nnz); + qc.linear_indices.resize(linear_nnz); + if (linear_nnz > 0) { + std::copy(linear_values, linear_values + linear_nnz, qc.linear_values.data()); + std::copy(linear_indices, linear_indices + linear_nnz, qc.linear_indices.data()); + } + + qc.quadratic_values.resize(quadratic_size_values); + if (quadratic_size_values > 0) { + std::copy( + quadratic_values, quadratic_values + quadratic_size_values, qc.quadratic_values.data()); + } + qc.quadratic_indices.resize(quadratic_size_indices); + if (quadratic_size_indices > 0) { + std::copy( + quadratic_indices, quadratic_indices + quadratic_size_indices, qc.quadratic_indices.data()); + } + qc.quadratic_offsets.resize(quadratic_size_offsets); + std::copy( + quadratic_offsets, quadratic_offsets + quadratic_size_offsets, qc.quadratic_offsets.data()); quadratic_constraints_.push_back(std::move(qc)); } diff --git a/cpp/libmps_parser/src/mps_parser.cpp b/cpp/libmps_parser/src/mps_parser.cpp index b446db5009..5613e1aeda 100644 --- a/cpp/libmps_parser/src/mps_parser.cpp +++ b/cpp/libmps_parser/src/mps_parser.cpp @@ -573,12 +573,17 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) linear_row_count + quadratic_row_id, row_names[row_id], static_cast(row_types[row_id]), - A_values[row_id], - A_indices[row_id], + A_values[row_id].data(), + static_cast(A_values[row_id].size()), + A_indices[row_id].data(), + static_cast(A_indices[row_id].size()), b_values[row_id], - csr_result.values, - csr_result.indices, - csr_result.offsets); + csr_result.values.data(), + static_cast(csr_result.values.size()), + csr_result.indices.data(), + static_cast(csr_result.indices.size()), + csr_result.offsets.data(), + static_cast(csr_result.offsets.size())); ++quadratic_row_id; } diff --git a/cpp/libmps_parser/tests/mps_parser_test.cpp b/cpp/libmps_parser/tests/mps_parser_test.cpp index 48811cb218..907e0080d3 100644 --- a/cpp/libmps_parser/tests/mps_parser_test.cpp +++ 
b/cpp/libmps_parser/tests/mps_parser_test.cpp @@ -884,7 +884,20 @@ TEST(qps_parser, qcmatrix_append_api) const std::vector qc0_linear_values = {1.0, 1.0}; const std::vector qc0_linear_indices = {0, 1}; model.append_quadratic_constraint( - 0, "QC0", 'L', qc0_linear_values, qc0_linear_indices, 5.0, qc0_values, qc0_indices, qc0_offsets); + 0, + "QC0", + 'L', + qc0_linear_values.data(), + qc0_linear_values.size(), + qc0_linear_indices.data(), + qc0_linear_indices.size(), + 5.0, + qc0_values.data(), + qc0_values.size(), + qc0_indices.data(), + qc0_indices.size(), + qc0_offsets.data(), + qc0_offsets.size()); // QC1: [[4, 1], [1, 6]] const std::vector qc1_values = {4.0, 1.0, 1.0, 6.0}; @@ -895,12 +908,17 @@ TEST(qps_parser, qcmatrix_append_api) model.append_quadratic_constraint(1, "QC1", 'L', - qc1_linear_values, - qc1_linear_indices, + qc1_linear_values.data(), + qc1_linear_values.size(), + qc1_linear_indices.data(), + qc1_linear_indices.size(), 10.0, - qc1_values, - qc1_indices, - qc1_offsets); + qc1_values.data(), + qc1_values.size(), + qc1_indices.data(), + qc1_indices.size(), + qc1_offsets.data(), + qc1_offsets.size()); ASSERT_TRUE(model.has_quadratic_constraints()); const auto& qcs = model.get_quadratic_constraints(); From 9defd60627f9afb4d7e0ecf7249ec42897345c29 Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Fri, 24 Apr 2026 08:57:34 +0000 Subject: [PATCH 18/22] Apply pre-commit fixes Signed-off-by: yuwenchen95 --- .../optimization_problem_interface.hpp | 3 +- .../optimization_problem_utils.hpp | 2 - .../include/mps_parser/mps_data_model.hpp | 5 +- .../include/mps_parser/parser.hpp | 2 +- .../include/mps_parser/utilities/span.hpp | 2 +- cpp/libmps_parser/src/mps_data_model.cpp | 34 ++++--- cpp/libmps_parser/src/mps_parser.cpp | 93 ++++++++++--------- cpp/libmps_parser/src/mps_parser.hpp | 4 +- cpp/libmps_parser/src/mps_writer.cpp | 10 +- cpp/libmps_parser/tests/mps_parser_test.cpp | 51 +++++----- cpp/src/pdlp/cpu_optimization_problem.cpp | 3 +- 11 files changed, 
106 insertions(+), 103 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp b/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp index eb0850ba44..8ffd38578b 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp @@ -75,7 +75,8 @@ class optimization_problem_interface_t { * @brief Store quadratic constraints for MPS round-trip (linear + Q parts per QC row). */ virtual void set_quadratic_constraints(std::vector constraints) = 0; - template >> + template >> void set_quadratic_constraints(const std::vector& constraints) { std::vector converted_constraints; diff --git a/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp b/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp index 204556e053..e37ade9660 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp @@ -113,7 +113,6 @@ void populate_from_mps_data_model(optimization_problem_interface_t* pr if (data_model.has_quadratic_constraints()) { problem->set_quadratic_constraints(data_model.get_quadratic_constraints()); } - } /** @@ -275,7 +274,6 @@ void populate_from_data_model_view(optimization_problem_interface_t* p if (data_model->has_quadratic_constraints()) { problem->set_quadratic_constraints(data_model->get_quadratic_constraints()); } - } } // namespace cuopt::linear_programming diff --git a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp index 88b3ed62a6..4d9c4bd8f3 100644 --- a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp +++ b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -275,7 +275,8 @@ class mps_data_model_t { /** ROWS declaration index (among all constraint rows), not an index into the linear CSR. */ i_t constraint_row_index{}; std::string constraint_row_name{}; - /** MPS ROWS sense for this quadratic row; only 'L' (≤) is supported for convex QCQP at the moment. */ + /** MPS ROWS sense for this quadratic row; only 'L' (≤) is supported for convex QCQP at the + * moment. */ char constraint_row_type{}; std::vector linear_values{}; std::vector linear_indices{}; diff --git a/cpp/libmps_parser/include/mps_parser/parser.hpp b/cpp/libmps_parser/include/mps_parser/parser.hpp index 6578ffb4d5..94230a0d4c 100644 --- a/cpp/libmps_parser/include/mps_parser/parser.hpp +++ b/cpp/libmps_parser/include/mps_parser/parser.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ diff --git a/cpp/libmps_parser/include/mps_parser/utilities/span.hpp b/cpp/libmps_parser/include/mps_parser/utilities/span.hpp index 7ad4f25d4c..24a865de6a 100644 --- a/cpp/libmps_parser/include/mps_parser/utilities/span.hpp +++ b/cpp/libmps_parser/include/mps_parser/utilities/span.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ diff --git a/cpp/libmps_parser/src/mps_data_model.cpp b/cpp/libmps_parser/src/mps_data_model.cpp index 1ecaf47b2c..b9ae16dc03 100644 --- a/cpp/libmps_parser/src/mps_data_model.cpp +++ b/cpp/libmps_parser/src/mps_data_model.cpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -236,8 +236,9 @@ void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_ const i_t* quadratic_offsets, i_t quadratic_size_offsets) { - mps_parser_expects( - constraint_row_index >= 0, error_type_t::ValidationError, "constraint_row_index must be non-negative"); + mps_parser_expects(constraint_row_index >= 0, + error_type_t::ValidationError, + "constraint_row_index must be non-negative"); mps_parser_expects(constraint_row_type == 'L', error_type_t::ValidationError, @@ -249,24 +250,27 @@ void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_ error_type_t::ValidationError, "linear_values and linear_indices must have the same nnz count"); if (linear_nnz != 0) { - mps_parser_expects( - linear_values != nullptr && linear_indices != nullptr, - error_type_t::ValidationError, - "linear_values and linear_indices cannot be null when linear_nnz > 0"); + mps_parser_expects(linear_values != nullptr && linear_indices != nullptr, + error_type_t::ValidationError, + "linear_values and linear_indices cannot be null when linear_nnz > 0"); } if (quadratic_size_values != 0) { - mps_parser_expects( - quadratic_values != nullptr, error_type_t::ValidationError, "quadratic_values cannot be null"); + mps_parser_expects(quadratic_values != nullptr, + error_type_t::ValidationError, + "quadratic_values cannot be null"); } - mps_parser_expects( - quadratic_offsets 
!= nullptr, error_type_t::ValidationError, "quadratic_offsets cannot be null"); + mps_parser_expects(quadratic_offsets != nullptr, + error_type_t::ValidationError, + "quadratic_offsets cannot be null"); if (quadratic_size_indices != 0) { - mps_parser_expects( - quadratic_indices != nullptr, error_type_t::ValidationError, "quadratic_indices cannot be null"); + mps_parser_expects(quadratic_indices != nullptr, + error_type_t::ValidationError, + "quadratic_indices cannot be null"); } - mps_parser_expects( - quadratic_size_offsets > 0, error_type_t::ValidationError, "quadratic_size_offsets cannot be empty"); + mps_parser_expects(quadratic_size_offsets > 0, + error_type_t::ValidationError, + "quadratic_size_offsets cannot be empty"); quadratic_constraint_t qc; qc.constraint_row_index = constraint_row_index; diff --git a/cpp/libmps_parser/src/mps_parser.cpp b/cpp/libmps_parser/src/mps_parser.cpp index 5613e1aeda..7f4d742084 100644 --- a/cpp/libmps_parser/src/mps_parser.cpp +++ b/cpp/libmps_parser/src/mps_parser.cpp @@ -278,7 +278,9 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) for (const auto& block : qcmatrix_blocks_) { quadratic_row_ids.insert(block.constraint_row_id); } - const auto is_quadratic_row = [&quadratic_row_ids](i_t row) { return quadratic_row_ids.count(row); }; + const auto is_quadratic_row = [&quadratic_row_ids](i_t row) { + return quadratic_row_ids.count(row); + }; { std::vector h_offsets{}, h_indices{}; @@ -307,13 +309,12 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) h_offsets.data(), h_offsets.size()); - mps_parser_expects( - static_cast(num_linear_rows) + 1 == h_offsets.size(), - error_type_t::ValidationError, - "The row indexing vector for the constraint matrix was not constructed " - "successfully. 
Should be size %zu, but was size %zu", - static_cast(num_linear_rows) + 1, - h_offsets.size()); + mps_parser_expects(static_cast(num_linear_rows) + 1 == h_offsets.size(), + error_type_t::ValidationError, + "The row indexing vector for the constraint matrix was not constructed " + "successfully. Should be size %zu, but was size %zu", + static_cast(num_linear_rows) + 1, + h_offsets.size()); mps_parser_expects( h_indices.size() == h_values.size(), error_type_t::ValidationError, @@ -330,7 +331,6 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) "nonzero vector. Nonzero has size %zu but the last offset is %d.", h_values.size(), h_offsets[h_offsets.size() - 1]); - } // Set b & c (RHS entries for quadratic rows are stored only on quadratic_constraint_t) @@ -361,7 +361,8 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) problem.get_variable_lower_bounds().size(), problem.get_variable_upper_bounds().size()); - // Determine the constraint bounds based on row types (quadratic rows use bundles only, not counted here) + // Determine the constraint bounds based on row types (quadratic rows use bundles only, not + // counted here) { std::vector h_constraint_lower_bounds{}; std::vector h_constraint_upper_bounds{}; @@ -463,11 +464,12 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) problem.set_variable_types(std::move(var_types)); problem.set_maximize(maximize); - // Helper function to build CSR format using double transpose (O(m+n+nnz) instead of O(nnz*log(nnz))) - // For QUADOBJ: handles upper triangular input by expanding to full symmetric matrix. - // - // @p value_scale: - // QUADOBJ/QMATRIX use 0.5 (MPS ½ xᵀQx vs internal xᵀQx); + // Helper function to build CSR format using double transpose (O(m+n+nnz) instead of + // O(nnz*log(nnz))) For QUADOBJ: handles upper triangular input by expanding to full symmetric + // matrix. 
+ // + // @p value_scale: + // QUADOBJ/QMATRIX use 0.5 (MPS ½ xᵀQx vs internal xᵀQx); // QCMATRIX uses 1.0 (symmetric Q defines xᵀQx directly in the constraint). auto build_csr_via_transpose = [](const std::vector>& entries, i_t num_rows, @@ -513,8 +515,9 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) for (i_t row = 0; row < num_rows; ++row) { for (const auto& [col, val] : csr_data[row]) { - // While the mps format expects to optimize for 0.5 xT Q x, cuopt optimizes for xT Q xExpand commentComment on line L488 - // so we have to multiply the value by value_scale=0.5 to get the correct value. + // While the mps format expects to optimize for 0.5 xT Q x, cuopt optimizes for xT Q xExpand + // commentComment on line L488 so we have to multiply the value by value_scale=0.5 to get + // the correct value. result.values.push_back(val * value_scale); result.indices.push_back(col); } @@ -529,8 +532,8 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) // Convert quadratic objective entries to CSR format using double transpose // QUADOBJ stores upper triangular elements, so we expand to full symmetric matrix constexpr f_t k_mps_quad_half_scale = f_t(0.5); // MPS ½ xᵀQx vs internal xᵀQx - auto csr_result = - build_csr_via_transpose(quadobj_entries, num_vars_for_quad, num_vars_for_quad, true, k_mps_quad_half_scale); + auto csr_result = build_csr_via_transpose( + quadobj_entries, num_vars_for_quad, num_vars_for_quad, true, k_mps_quad_half_scale); // Use optimized double transpose method - O(m+n+nnz) instead of O(nnz*log(nnz)) problem.set_quadratic_objective_matrix(csr_result.values.data(), @@ -543,8 +546,8 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) // Convert quadratic objective entries to CSR format using double transpose // QMATRIX stores full symmetric matrix constexpr f_t k_mps_quad_half_scale = f_t(0.5); - auto csr_result = - build_csr_via_transpose(qmatrix_entries, num_vars_for_quad, num_vars_for_quad, false, 
k_mps_quad_half_scale); + auto csr_result = build_csr_via_transpose( + qmatrix_entries, num_vars_for_quad, num_vars_for_quad, false, k_mps_quad_half_scale); // Use optimized double transpose method - O(m+n+nnz) instead of O(nnz*log(nnz)) problem.set_quadratic_objective_matrix(csr_result.values.data(), @@ -564,26 +567,24 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) auto csr_result = build_csr_via_transpose( block.entries, num_vars_for_quad, num_vars_for_quad, false, k_qcmatrix_value_scale); const i_t row_id = block.constraint_row_id; - mps_parser_expects( - row_id >= 0 && row_id < static_cast(row_types.size()), - error_type_t::ValidationError, - "QCMATRIX row index %d is out of range for constraints", - static_cast(row_id)); - problem.append_quadratic_constraint( - linear_row_count + quadratic_row_id, - row_names[row_id], - static_cast(row_types[row_id]), - A_values[row_id].data(), - static_cast(A_values[row_id].size()), - A_indices[row_id].data(), - static_cast(A_indices[row_id].size()), - b_values[row_id], - csr_result.values.data(), - static_cast(csr_result.values.size()), - csr_result.indices.data(), - static_cast(csr_result.indices.size()), - csr_result.offsets.data(), - static_cast(csr_result.offsets.size())); + mps_parser_expects(row_id >= 0 && row_id < static_cast(row_types.size()), + error_type_t::ValidationError, + "QCMATRIX row index %d is out of range for constraints", + static_cast(row_id)); + problem.append_quadratic_constraint(linear_row_count + quadratic_row_id, + row_names[row_id], + static_cast(row_types[row_id]), + A_values[row_id].data(), + static_cast(A_values[row_id].size()), + A_indices[row_id].data(), + static_cast(A_indices[row_id].size()), + b_values[row_id], + csr_result.values.data(), + static_cast(csr_result.values.size()), + csr_result.indices.data(), + static_cast(csr_result.indices.size()), + csr_result.offsets.data(), + static_cast(csr_result.offsets.size())); ++quadratic_row_id; } @@ -1420,7 +1421,7 @@ void 
mps_parser_t::parse_qcmatrix_header(std::string_view line) error_type_t::ValidationError, "QCMATRIX header line too short! line=%s", std::string(line).c_str()); - //fixed MPS: constraint name starts in column 12 (1-based) → 0-based index 11, 8 chars + // fixed MPS: constraint name starts in column 12 (1-based) → 0-based index 11, 8 chars row_name = std::string(trim(line.substr(11, 8))); } else { std::stringstream ss{std::string(line)}; @@ -1471,8 +1472,8 @@ void mps_parser_t::parse_qcmatrix_data(std::string_view line) i_t pos = 24; value = get_numerical_bound(line, pos); } else { - i_t pos = 0; - i_t end = 0; + i_t pos = 0; + i_t end = 0; const std::string_view var1_sv = get_next_string(line, pos, end); mps_parser_expects(!var1_sv.empty(), error_type_t::ValidationError, @@ -1529,8 +1530,8 @@ void mps_parser_t::parse_quad(std::string_view line, bool is_quadobj) i_t pos = 24; value = get_numerical_bound(line, pos); } else { - i_t pos = 0; - i_t end = 0; + i_t pos = 0; + i_t end = 0; const std::string_view var1_sv = get_next_string(line, pos, end); mps_parser_expects(!var1_sv.empty(), error_type_t::ValidationError, diff --git a/cpp/libmps_parser/src/mps_parser.hpp b/cpp/libmps_parser/src/mps_parser.hpp index d73cfdd8b3..6e56d4bce3 100644 --- a/cpp/libmps_parser/src/mps_parser.hpp +++ b/cpp/libmps_parser/src/mps_parser.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -12,8 +12,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/cpp/libmps_parser/src/mps_writer.cpp b/cpp/libmps_parser/src/mps_writer.cpp index 67346cdcb5..c5cbd4b3f9 100644 --- a/cpp/libmps_parser/src/mps_writer.cpp +++ b/cpp/libmps_parser/src/mps_writer.cpp @@ -235,7 +235,8 @@ void mps_writer_t::write(const std::string& mps_file_path) for (size_t k = 0; k < static_cast(n_constraints); ++k) { std::string row_name = k < problem_.get_row_names().size() ? problem_.get_row_names()[k] : "R" + std::to_string(k); - char const type = linear_row_type_from_bounds(constraint_lower_bounds[k], constraint_upper_bounds[k]); + char const type = + linear_row_type_from_bounds(constraint_lower_bounds[k], constraint_upper_bounds[k]); mps_file << " " << type << " " << row_name << "\n"; } for (size_t q = 0; q < quadratic_constraints.size(); ++q) { @@ -255,7 +256,8 @@ void mps_writer_t::write(const std::string& mps_file_path) std::map>> integral_col_nnzs; std::map>> continuous_col_nnzs; - // iterate over the constraint matrix and add the nonzeros to the integral and continuous col_nnzs maps + // iterate over the constraint matrix and add the nonzeros to the integral and continuous col_nnzs + // maps for (size_t csr_row = 0; csr_row < (size_t)n_constraints; csr_row++) { const i_t row_id = static_cast(csr_row); for (size_t k = (size_t)constraint_matrix_offsets[csr_row]; @@ -498,8 +500,8 @@ void mps_writer_t::write(const std::string& mps_file_path) ? problem_.get_variable_names()[i] : "C" + std::to_string(i); for (i_t p = qc.quadratic_offsets[i]; p < qc.quadratic_offsets[i + 1]; ++p) { - i_t j = qc.quadratic_indices[p]; - f_t v = qc.quadratic_values[p]; + i_t j = qc.quadratic_indices[p]; + f_t v = qc.quadratic_values[p]; std::string col_var_name = static_cast(j) < problem_.get_variable_names().size() ? 
problem_.get_variable_names()[j] : "C" + std::to_string(j); diff --git a/cpp/libmps_parser/tests/mps_parser_test.cpp b/cpp/libmps_parser/tests/mps_parser_test.cpp index 907e0080d3..669a8c5e05 100644 --- a/cpp/libmps_parser/tests/mps_parser_test.cpp +++ b/cpp/libmps_parser/tests/mps_parser_test.cpp @@ -16,8 +16,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -878,31 +878,30 @@ TEST(qps_parser, qcmatrix_append_api) EXPECT_EQ(0.0, default_qcm.rhs_value); // QC0: [[10, 2], [2, 2]] - const std::vector qc0_values = {10.0, 2.0, 2.0, 2.0}; - const std::vector qc0_indices = {0, 1, 0, 1}; - const std::vector qc0_offsets = {0, 2, 4}; + const std::vector qc0_values = {10.0, 2.0, 2.0, 2.0}; + const std::vector qc0_indices = {0, 1, 0, 1}; + const std::vector qc0_offsets = {0, 2, 4}; const std::vector qc0_linear_values = {1.0, 1.0}; const std::vector qc0_linear_indices = {0, 1}; - model.append_quadratic_constraint( - 0, - "QC0", - 'L', - qc0_linear_values.data(), - qc0_linear_values.size(), - qc0_linear_indices.data(), - qc0_linear_indices.size(), - 5.0, - qc0_values.data(), - qc0_values.size(), - qc0_indices.data(), - qc0_indices.size(), - qc0_offsets.data(), - qc0_offsets.size()); + model.append_quadratic_constraint(0, + "QC0", + 'L', + qc0_linear_values.data(), + qc0_linear_values.size(), + qc0_linear_indices.data(), + qc0_linear_indices.size(), + 5.0, + qc0_values.data(), + qc0_values.size(), + qc0_indices.data(), + qc0_indices.size(), + qc0_offsets.data(), + qc0_offsets.size()); // QC1: [[4, 1], [1, 6]] - const std::vector qc1_values = {4.0, 1.0, 1.0, 6.0}; - const std::vector qc1_indices = {0, 1, 0, 1}; - const std::vector qc1_offsets = {0, 2, 4}; + const std::vector qc1_values = {4.0, 1.0, 1.0, 6.0}; + const std::vector qc1_indices = {0, 1, 0, 1}; + const std::vector qc1_offsets = {0, 2, 4}; const std::vector qc1_linear_values = {3.0, 1.0}; const std::vector qc1_linear_indices = {0, 1}; model.append_quadratic_constraint(1, @@ 
-1219,7 +1218,6 @@ void compare_data_models(const mps_data_model_t& original, } } } - } TEST(mps_roundtrip, linear_programming_basic) @@ -1332,12 +1330,9 @@ TEST(mps_roundtrip, quadratic_programming_qp_test_2) TEST(mps_roundtrip, qcqp_p0033_qc1) { - if (!file_exists("qcqp/p0033_qc1.mps")) { - GTEST_SKIP() << "Test file not found"; - } + if (!file_exists("qcqp/p0033_qc1.mps")) { GTEST_SKIP() << "Test file not found"; } - std::string input_file = - cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps"; + std::string input_file = cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps"; std::string temp_file = "/tmp/mps_roundtrip_p0033_qc1.mps"; std::string temp_file_2 = "/tmp/mps_roundtrip_p0033_qc1_r2.mps"; diff --git a/cpp/src/pdlp/cpu_optimization_problem.cpp b/cpp/src/pdlp/cpu_optimization_problem.cpp index e4018a290a..de1f74ed47 100644 --- a/cpp/src/pdlp/cpu_optimization_problem.cpp +++ b/cpp/src/pdlp/cpu_optimization_problem.cpp @@ -135,7 +135,8 @@ void cpu_optimization_problem_t::set_quadratic_objective_matrix( template void cpu_optimization_problem_t::set_quadratic_constraints( - std::vector::quadratic_constraint_t> constraints) + std::vector::quadratic_constraint_t> + constraints) { quadratic_constraints_ = std::move(constraints); } From 0a31a443b6ad600e73149e3ca54b528f8e7f3285 Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Fri, 24 Apr 2026 06:10:41 -0700 Subject: [PATCH 19/22] minior correction for bound check and update dataset for qcqp --- cpp/src/pdlp/translate.hpp | 77 ++++++++++++++++++++------------------ datasets/qcqp/socp1.mps | 23 ++++++++++++ datasets/qcqp/socp2.mps | 26 +++++++++++++ datasets/qcqp/socp3.mps | 22 +++++++++++ datasets/qcqp/socp4.mps | 33 ++++++++++++++++ 5 files changed, 144 insertions(+), 37 deletions(-) create mode 100644 datasets/qcqp/socp1.mps create mode 100644 datasets/qcqp/socp2.mps create mode 100644 datasets/qcqp/socp3.mps create mode 100644 datasets/qcqp/socp4.mps diff --git 
a/cpp/src/pdlp/translate.hpp b/cpp/src/pdlp/translate.hpp index 1628290c5a..4e5c0be991 100644 --- a/cpp/src/pdlp/translate.hpp +++ b/cpp/src/pdlp/translate.hpp @@ -111,7 +111,8 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( "Quadratic-constraint flag is set, but no constraints were provided"); const i_t original_rows = static_cast(user_problem.num_rows); - const f_t tol = f_t(1e-16); + // Use a practical tolerance for text-parsed MPS numeric values. + const f_t tol = std::numeric_limits::epsilon() * 2; // SOC: Q is n×n diagonal CSR (offsets length n+1). Exactly q_n = nnz on the main diagonal, at // q_n distinct variable indices: one −1 (head) and (q_n−1) +1 (tails). Lifting: q_n rows, each @@ -137,11 +138,11 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( error_type_t::ValidationError, "Quadratic constraint '%s' has invalid CSR offsets (need at least 2 entries)", qc.constraint_row_name.c_str()); - cuopt_expects( - qc.quadratic_values.size() == qc.quadratic_indices.size(), - error_type_t::ValidationError, - "Quadratic constraint '%s' quadratic_values and quadratic_indices length mismatch for CSR Q", - qc.constraint_row_name.c_str()); + cuopt_expects(qc.quadratic_values.size() == qc.quadratic_indices.size(), + error_type_t::ValidationError, + "Quadratic constraint '%s' quadratic_values and quadratic_indices length " + "mismatch for CSR Q", + qc.constraint_row_name.c_str()); const i_t q_n = static_cast(qc.quadratic_values.size()); cuopt_expects(q_n >= 2, @@ -168,11 +169,10 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( qc.constraint_row_name.c_str(), static_cast(qc.quadratic_offsets[static_cast(n)]), static_cast(q_n)); - cuopt_expects( - qc.quadratic_offsets[0] == 0, - error_type_t::ValidationError, - "Quadratic constraint '%s' Q CSR offsets[0] must be 0", - qc.constraint_row_name.c_str()); + cuopt_expects(qc.quadratic_offsets[0] == 0, + error_type_t::ValidationError, + "Quadratic constraint 
'%s' Q CSR offsets[0] must be 0", + qc.constraint_row_name.c_str()); // Verify Q: n×n CSR, diagonal entries only, Lorentz pattern, then build the lift. // Scan each row r: empty or one nnz on (r,r) with value -1 (head) or +1 (tail); @@ -188,14 +188,14 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( if (p_beg == p_end) { continue; } - cuopt_expects( - p_beg + 1 == p_end, - error_type_t::ValidationError, - "Quadratic constraint '%s' Q row %d: expected at most one stored entry on the diagonal per " - "row (got end - beg = %d)", - qc.constraint_row_name.c_str(), - static_cast(r), - static_cast(p_end - p_beg)); + cuopt_expects(p_beg + 1 == p_end, + error_type_t::ValidationError, + "Quadratic constraint '%s' Q row %d: expected at most one stored entry on " + "the diagonal per " + "row (got end - beg = %d)", + qc.constraint_row_name.c_str(), + static_cast(r), + static_cast(p_end - p_beg)); const i_t col = qc.quadratic_indices[static_cast(p_beg)]; const f_t v = qc.quadratic_values[static_cast(p_beg)]; @@ -208,20 +208,24 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( static_cast(r), static_cast(col)); - if (v > f_t(-1) - tol && v < f_t(-1) + tol) { + const f_t neg_one_delta = v + f_t(1); + const f_t pos_one_delta = v - f_t(1); + const bool is_neg_one = (neg_one_delta >= -tol && neg_one_delta <= tol); + const bool is_pos_one = (pos_one_delta >= -tol && pos_one_delta <= tol); + if (is_neg_one) { ++n_head_m; head = r; - } else if (v > f_t(1) - tol && v < f_t(1) + tol) { + } else if (is_pos_one) { tail_row_vars.push_back(r); } else { - cuopt_expects( - false, - error_type_t::ValidationError, - "Quadratic constraint '%s' Q row %d: diagonal for SOC must be -1 (head) or +1 (tail); got " - "%g", - qc.constraint_row_name.c_str(), - static_cast(r), - static_cast(v)); + cuopt_expects(false, + error_type_t::ValidationError, + "Quadratic constraint '%s' Q row %d: diagonal for SOC must be -1 (head) or " + "+1 (tail); got " + "%.17g", + 
qc.constraint_row_name.c_str(), + static_cast(r), + static_cast(v)); } } cuopt_expects( @@ -238,11 +242,10 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( qc.constraint_row_name.c_str(), static_cast(q_n - 1), tail_row_vars.size()); - cuopt_expects( - head >= 0, - error_type_t::ValidationError, - "Quadratic constraint '%s' SOC Q: internal error (head index invalid)", - qc.constraint_row_name.c_str()); + cuopt_expects(head >= 0, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: internal error (head index invalid)", + qc.constraint_row_name.c_str()); row_cone_dims.push_back(q_n); dual_simplex::csr_matrix_t lift_block(q_n, n, q_n); @@ -250,8 +253,8 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( lift_block.row_start[t] = t; } - // One lift row per cone component: -1 in column head, then -1 in each tail column - // (order matches tail_row_vars from the Q scan). + // One lift row per cone component: -1 in column head, then -1 in each tail column (since our + // slack variable is done by + s form) (order matches tail_row_vars from the Q scan). 
lift_block.j[0] = head; lift_block.x[0] = f_t(-1); for (i_t t = 0; t < q_n - 1; ++t) { @@ -280,7 +283,7 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( } user_problem.num_rows = next_row; - user_problem.cone_row_start = original_rows; + user_problem.cone_row_start = original_rows; user_problem.second_order_cone_row_dims = std::move(row_cone_dims); } diff --git a/datasets/qcqp/socp1.mps b/datasets/qcqp/socp1.mps new file mode 100644 index 0000000000..3e479054d0 --- /dev/null +++ b/datasets/qcqp/socp1.mps @@ -0,0 +1,23 @@ +NAME +ROWS + N OBJ + L c2 + E c1 +COLUMNS + y OBJ -1 + z OBJ -1 + x c1 1 +RHS + rhs c2 0 + rhs c1 1 +RANGES +BOUNDS + FR bounds y + FR bounds z + LO bounds x 0 + PL bounds x +QCMATRIX c2 + y y 1 + z z 1 + x x -1 +ENDATA diff --git a/datasets/qcqp/socp2.mps b/datasets/qcqp/socp2.mps new file mode 100644 index 0000000000..6542d1d4f8 --- /dev/null +++ b/datasets/qcqp/socp2.mps @@ -0,0 +1,26 @@ +NAME +ROWS + N OBJ + L c2 + G c1 + E c3 +COLUMNS + x OBJ 1 + y c1 1 + tau c3 1 +RHS + rhs c2 0 + rhs c1 0.7071067811865476 + rhs c3 1 +RANGES +BOUNDS + FR bounds x + LO bounds y 0 + PL bounds y + LO bounds tau 0 + PL bounds tau +QCMATRIX c2 + x x 1 + y y 1 + tau tau -1 +ENDATA diff --git a/datasets/qcqp/socp3.mps b/datasets/qcqp/socp3.mps new file mode 100644 index 0000000000..ae92220042 --- /dev/null +++ b/datasets/qcqp/socp3.mps @@ -0,0 +1,22 @@ +NAME +ROWS + N OBJ + L c2 + L c3 + G c1 +COLUMNS + y c1 1 + x c2 1 +RHS + rhs c2 1 + rhs c3 0 + rhs c1 2 +RANGES +BOUNDS + FR bounds y + LO bounds x 0 + PL bounds x +QCMATRIX c3 + y y 1 + x x -1 +ENDATA diff --git a/datasets/qcqp/socp4.mps b/datasets/qcqp/socp4.mps new file mode 100644 index 0000000000..584725963f --- /dev/null +++ b/datasets/qcqp/socp4.mps @@ -0,0 +1,33 @@ +NAME +ROWS + N OBJ + L c2 + E c1a + E c1b + E c1c +COLUMNS + x2 c1b 1 + x2 OBJ -2 + x3 c1c 1 + x3 OBJ -1 + x1 c1a 1 + x4 c1b -1 + x5 c1c -1 +RHS + rhs c2 0 + rhs c1a 1 + rhs c1b 0 + rhs c1c 0 +RANGES +BOUNDS + FR bounds 
x2 + FR bounds x3 + LO bounds x1 0 + PL bounds x1 + FR bounds x4 + FR bounds x5 +QCMATRIX c2 + x1 x1 -1 + x4 x4 1 + x5 x5 1 +ENDATA From d17221348fd4708d4da86d6138fbbef78ae231fc Mon Sep 17 00:00:00 2001 From: Christopher Maes Date: Fri, 24 Apr 2026 11:25:16 -0700 Subject: [PATCH 20/22] Print # of quadratic constraints and cones --- cpp/src/barrier/barrier.cu | 3 +++ cpp/src/pdlp/solve.cu | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 09a7616cdf..cc0b391731 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -3818,6 +3818,9 @@ lp_status_t barrier_solver_t::solve(f_t start_time, if (lp.Q.n > 0) { settings.log.printf("Quadratic objective matrix: %d nonzeros\n", lp.Q.row_start[lp.Q.n]); } + if (lp.second_order_cone_dims.size() > 0) { + settings.log.printf("Second-order cones: %d\n", static_cast(lp.second_order_cone_dims.size())); + } settings.log.printf("\n"); // Compute the number of free variables diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 68dbe4bda8..585bfe4476 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -1383,7 +1383,7 @@ optimization_problem_solution_t solve_lp( CUOPT_LOG_INFO("Problem has a quadratic objective. Using Barrier."); } if (op_problem.has_quadratic_constraints()) { - CUOPT_LOG_INFO("Problem has quadratic constraints. Using Barrier with SOC conversion."); + CUOPT_LOG_INFO("Problem has %d quadratic constraints. 
Using Barrier with SOC conversion.", static_cast(op_problem.get_quadratic_constraints().size())); } settings.method = method_t::Barrier; settings.presolver = presolver_t::None; From f70e45d2ca1d79a6874379d5b8ce8bc3df7b508b Mon Sep 17 00:00:00 2001 From: Yan Zaretskiy Date: Tue, 28 Apr 2026 09:34:26 -0700 Subject: [PATCH 21/22] Refresh SOCP barrier kernels Replace the old second_order_cone wrapper with the new kernel and reduction headers, migrate barrier.cu to the new cone_data_t API, and keep cone topology handling behind barrier-local helpers. Update the dual simplex SOC kernel tests for the refreshed kernels. Signed-off-by: Yan Zaretskiy --- cpp/src/barrier/barrier.cu | 320 ++-- cpp/src/barrier/second_order_cone.cuh | 957 ----------- cpp/src/barrier/second_order_cone_kernels.cuh | 1004 ++++++++++++ .../barrier/second_order_cone_reduction.cuh | 261 +++ cpp/tests/dual_simplex/CMakeLists.txt | 2 +- .../unit_tests/second_order_cone_kernels.cu | 625 ++++++++ .../unit_tests/second_order_cone_test.cu | 1416 ----------------- 7 files changed, 2065 insertions(+), 2520 deletions(-) delete mode 100644 cpp/src/barrier/second_order_cone.cuh create mode 100644 cpp/src/barrier/second_order_cone_kernels.cuh create mode 100644 cpp/src/barrier/second_order_cone_reduction.cuh create mode 100644 cpp/tests/dual_simplex/unit_tests/second_order_cone_kernels.cu delete mode 100644 cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index cc0b391731..475879c8fd 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -36,6 +36,7 @@ #include #include +#include #include #include @@ -276,14 +277,15 @@ class iteration_data_t { "cone variables exceed problem dimension"); cuopt_assert(cone_var_start_ + total_cone_dim == lp.num_cols, "barrier expects [linear | cone] layout"); - 
cones_.emplace(static_cast(lp.second_order_cone_dims.size()), - lp.second_order_cone_dims, - raft::device_span{}, - raft::device_span{}, - stream_view_); + cones_.emplace( + std::span(lp.second_order_cone_dims.data(), lp.second_order_cone_dims.size()), + raft::device_span{}, + raft::device_span{}, + stream_view_); + cuopt_assert(cone_count() > 0, "second-order cone topology must contain at least one cone"); + cuopt_assert(cone_entry_count() == total_cone_dim, "second-order cone entry count mismatch"); } - i_t linear_xz_rhs_size = lp.num_cols; - if (cones_.has_value() && cones_->K > 0) { linear_xz_rhs_size -= cones_->m_c; } + const i_t linear_xz_rhs_size = linear_xz_size(lp.num_cols); d_complementarity_xz_rhs_.resize(linear_xz_rhs_size, stream_view_); // Allocating GPU flag data for Form ADAT @@ -349,7 +351,7 @@ class iteration_data_t { use_augmented = !Q_diagonal; } - if (cones_.has_value() && !use_augmented) { + if (has_cones() && !use_augmented) { n_dense_columns = 0; use_augmented = true; } @@ -451,6 +453,51 @@ class iteration_data_t { } } + bool has_cones() const { return cones_.has_value(); } + + cone_data_t& cones() + { + cuopt_assert(cones_.has_value(), "second-order cone data is not initialized"); + return *cones_; + } + + const cone_data_t& cones() const + { + cuopt_assert(cones_.has_value(), "second-order cone data is not initialized"); + return *cones_; + } + + i_t cone_count() const { return has_cones() ? cones_->n_cones : i_t(0); } + + i_t cone_entry_count() const + { + return has_cones() ? static_cast(cones_->n_cone_entries) : i_t(0); + } + + i_t cone_start() const { return cone_var_start_; } + + i_t cone_end() const { return cone_start() + cone_entry_count(); } + + i_t linear_xz_size(std::size_t full_xz_size) const + { + return has_cones() ? 
cone_start() : static_cast(full_xz_size); + } + + bool is_cone_variable(i_t variable) const + { + return has_cones() && variable >= cone_start() && variable < cone_end(); + } + + f_t complementarity_degree(std::size_t num_primal_variables, i_t num_upper_bounds) const + { + f_t degree = static_cast(num_primal_variables) + static_cast(num_upper_bounds); + if (has_cones()) { + degree -= static_cast(cone_entry_count()); + degree += static_cast(cone_count()); + } + return degree; + } + void form_augmented(bool first_call = false) { i_t n = A.n; @@ -461,21 +508,23 @@ class iteration_data_t { const f_t dual_perturb = 0.0; const f_t primal_perturb = 1e-6; - const bool has_cones = cones_.has_value() && cones_->K > 0; - const i_t m_c = has_cones ? cones_->m_c : 0; - i_t total_block_nnz = 0; - - std::vector cone_offsets_host; - std::vector cone_block_offsets_host; - if (has_cones) { - cone_offsets_host.resize(cones_->K + 1); - cone_block_offsets_host.resize(cones_->K + 1); - raft::copy( - cone_offsets_host.data(), cones_->cone_offsets.data(), cones_->K + 1, stream_view_); - raft::copy( - cone_block_offsets_host.data(), cones_->block_offsets.data(), cones_->K + 1, stream_view_); + const bool has_soc = has_cones(); + const i_t m_c = cone_entry_count(); + i_t total_block_nnz = 0; + + std::vector cone_offsets_host; + std::vector cone_block_offsets_host; + if (has_soc) { + const i_t n_cones = cone_count(); + cone_offsets_host.resize(n_cones + 1); + cone_block_offsets_host.resize(n_cones + 1); + raft::copy(cone_offsets_host.data(), cones().cone_offsets.data(), n_cones + 1, stream_view_); handle_ptr->sync_stream(); - total_block_nnz = cone_block_offsets_host[cones_->K]; + for (i_t k = 0; k < n_cones; ++k) { + const auto q_k = cone_offsets_host[k + 1] - cone_offsets_host[k]; + cone_block_offsets_host[k + 1] = cone_block_offsets_host[k] + q_k * q_k; + } + total_block_nnz = static_cast(cone_block_offsets_host[n_cones]); } if (first_call) { @@ -490,19 +539,21 @@ class iteration_data_t { 
for (i_t i = 0; i < n; i++) { augmented_CSR.row_start[i] = q; - const bool is_cone_row = has_cones && i >= cone_var_start_ && i < cone_var_start_ + m_c; + const bool is_cone_row = is_cone_variable(i); if (is_cone_row) { // Determine which cone this variable belongs to and its local row - i_t local_idx = i - cone_var_start_; + i_t local_idx = i - cone_start(); i_t k = 0; - while (k + 1 < cones_->K && cone_offsets_host[k + 1] <= local_idx) { + while (k + 1 < cone_count() && + cone_offsets_host[k + 1] <= static_cast(local_idx)) { k++; } - i_t local_r = local_idx - cone_offsets_host[k]; - i_t q_k = cone_offsets_host[k + 1] - cone_offsets_host[k]; - i_t cone_col_start = cone_var_start_ + cone_offsets_host[k]; - i_t block_base = cone_block_offsets_host[k] + local_r * q_k; + i_t local_r = + static_cast(static_cast(local_idx) - cone_offsets_host[k]); + i_t q_k = static_cast(cone_offsets_host[k + 1] - cone_offsets_host[k]); + i_t cone_col_start = cone_start() + static_cast(cone_offsets_host[k]); + i_t block_base = static_cast(cone_block_offsets_host[k]) + local_r * q_k; // Merge-join: Q entries (sorted) with dense cone block columns (contiguous) i_t qp = (nnzQ > 0) ? Q.col_start[i] : 0; @@ -518,19 +569,20 @@ class iteration_data_t { // Dense cone block, absorbing any Q entries that fall inside for (i_t c = 0; c < q_k; c++) { - i_t col = cone_col_start + c; - f_t q_val = (c == local_r) ? dual_perturb : f_t(0); + i_t col = cone_col_start + c; + f_t q_contrib = f_t(0); + f_t initial_val = (c == local_r) ? 
f_t(1) : f_t(0); if (qp < q_end && Q.i[qp] == col) { - q_val += Q.x[qp]; + q_contrib = Q.x[qp]; qp++; } cone_csr_indices_host[block_base + c] = q; - cone_Q_values_host[block_base + c] = q_val; + cone_Q_values_host[block_base + c] = q_contrib; if (col == i) { augmented_diagonal_indices[i] = q; } augmented_CSR.j[q] = col; - augmented_CSR.x[q++] = f_t(0); + augmented_CSR.x[q++] = initial_val - q_contrib; } // Q entries after cone block @@ -606,7 +658,7 @@ class iteration_data_t { augmented_diagonal_indices.size(), handle_ptr->get_stream()); - if (has_cones) { + if (has_soc) { d_cone_csr_indices_.resize(total_block_nnz, handle_ptr->get_stream()); raft::copy(d_cone_csr_indices_.data(), cone_csr_indices_host.data(), @@ -658,8 +710,8 @@ class iteration_data_t { }); RAFT_CHECK_CUDA(handle_ptr->get_stream()); - if (has_cones) { - scatter_hinv2_into_augmented(*cones_, + if (has_soc) { + scatter_hinv2_into_augmented(cones(), device_augmented.x, d_cone_csr_indices_, d_cone_Q_values_, @@ -1542,9 +1594,9 @@ class iteration_data_t { f_t beta, rmm::device_uvector& y) { - const i_t m = A.m; - const i_t n = A.n; - const bool has_cones = cones_.has_value() && cones_->K > 0; + const i_t m = A.m; + const i_t n = A.n; + const bool has_soc = has_cones(); rmm::device_uvector d_x1(n, handle_ptr->get_stream()); rmm::device_uvector d_x2(m, handle_ptr->get_stream()); @@ -1563,9 +1615,9 @@ class iteration_data_t { // diag.pairwise_product(x1, r1); // r1 <- D * x_1 pairwise_multiply(d_x1.data(), d_diag_.data(), d_r1.data(), n, stream_view_); - if (has_cones) { - thrust::fill_n( - rmm::exec_policy(stream_view_), d_r1.begin() + cone_var_start_, cones_->m_c, f_t(0)); + if (has_soc) { + const i_t m_c = cone_entry_count(); + thrust::fill_n(rmm::exec_policy(stream_view_), d_r1.begin() + cone_start(), m_c, f_t(0)); } // r1 <- Q x1 + D x1 @@ -1573,13 +1625,12 @@ class iteration_data_t { // matrix_vector_multiply(Q, 1.0, x1, 1.0, r1); cusparse_Q_view_.spmv(1.0, d_x1, 1.0, d_r1); } - if (has_cones) { - 
accumulate_cone_hinv2_matvec( - raft::device_span(d_x1.data() + cone_var_start_, cones_->m_c), - *cones_, - d_cone_hinv2_dx_, - raft::device_span(d_r1.data() + cone_var_start_, cones_->m_c), - stream_view_); + if (has_soc) { + const i_t m_c = cone_entry_count(); + accumulate_cone_hinv2_matvec(raft::device_span(d_x1.data() + cone_start(), m_c), + cones(), + raft::device_span(d_r1.data() + cone_start(), m_c), + stream_view_); RAFT_CHECK_CUDA(stream_view_); } @@ -1892,7 +1943,7 @@ int barrier_solver_t::initial_point(iteration_data_t& data) { raft::common::nvtx::range fun_scope("Barrier: initial_point"); const bool use_augmented = data.use_augmented; - const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; + const bool has_soc = data.has_cones(); // Perform a numerical factorization i_t status; @@ -2052,9 +2103,9 @@ int barrier_solver_t::initial_point(iteration_data_t& data) } } // Now handle the case with no upper bounds (skip cone variables) - const i_t cone_end = has_cones ? data.cone_var_start_ + data.cones_->m_c : 0; + const i_t cone_end = data.cone_end(); for (i_t j = 0; j < lp.num_cols; j++) { - if (has_cones && j >= data.cone_var_start_ && j < cone_end) continue; + if (has_soc && j >= data.cone_start() && j < cone_end) continue; if (lp.upper[j] == inf) { if (c[j] > epsilon_adjust) { data.z[j] = c[j]; @@ -2083,8 +2134,7 @@ int barrier_solver_t::initial_point(iteration_data_t& data) data.v.multiply_scalar(-1.0); data.v.ensure_positive(epsilon_adjust); - data.z.ensure_positive_skip_range( - epsilon_adjust, data.cone_var_start_, has_cones ? 
data.cones_->m_c : 0); + data.z.ensure_positive_skip_range(epsilon_adjust, data.cone_start(), data.cone_entry_count()); } else { // First compute rhs = A*Dinv*c dense_vector_t rhs(lp.num_rows); @@ -2108,8 +2158,7 @@ int barrier_solver_t::initial_point(iteration_data_t& data) data.gather_upper_bounds(data.z, data.v); data.v.multiply_scalar(-1.0); data.v.ensure_positive(epsilon_adjust); - data.z.ensure_positive_skip_range( - epsilon_adjust, data.cone_var_start_, has_cones ? data.cones_->m_c : 0); + data.z.ensure_positive_skip_range(epsilon_adjust, data.cone_start(), data.cone_entry_count()); } // Verify A'*y + z - E*v - Q*x = c @@ -2128,12 +2177,11 @@ int barrier_solver_t::initial_point(iteration_data_t& data) #endif // Make sure (w, x, v, z) > 0; skip cone vars — handled by shift_into_interior below data.w.ensure_positive(epsilon_adjust); - data.x.ensure_positive_skip_range( - epsilon_adjust, data.cone_var_start_, has_cones ? data.cones_->m_c : 0); + data.x.ensure_positive_skip_range(epsilon_adjust, data.cone_start(), data.cone_entry_count()); - if (has_cones) { + if (has_soc) { const auto& dims = lp.second_order_cone_dims; - i_t cs = data.cone_var_start_; + i_t cs = data.cone_start(); i_t off = 0; for (i_t k = 0; k < static_cast(dims.size()); ++k) { @@ -2336,24 +2384,19 @@ void barrier_solver_t::gpu_compute_residual_norms(const rmm::device_uv primal_residual_norm = std::max(device_vector_norm_inf(data.d_primal_residual_, stream_view_), device_vector_norm_inf(data.d_bound_residual_, stream_view_)); - dual_residual_norm = device_vector_norm_inf(data.d_dual_residual_, stream_view_); - const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; - const i_t linear_xz_size = - has_cones ? 
data.cone_var_start_ : static_cast(data.d_complementarity_xz_residual_.size()); + dual_residual_norm = device_vector_norm_inf(data.d_dual_residual_, stream_view_); + const bool has_soc = data.has_cones(); + const i_t linear_xz_size = data.linear_xz_size(data.d_complementarity_xz_residual_.size()); auto linear_xz_span = raft::device_span(data.d_complementarity_xz_residual_.data(), linear_xz_size); complementarity_residual_norm = std::max(device_vector_norm_inf(linear_xz_span, stream_view_), device_vector_norm_inf(data.d_complementarity_wv_residual_, stream_view_)); - if (has_cones) { + if (has_soc) { f_t cone_complementarity_norm = f_t(0); - auto cone_dot = data.cones_->scratch.hinv2_tail_dot(); - segmented_sum(data.d_complementarity_xz_residual_.data() + data.cone_var_start_, - cuopt::make_span(data.cones_->cone_offsets), - data.cones_->K, - cone_dot, - data.cones_->scratch.segmented_reduce_workspace, - stream_view_); + auto cone_dot = data.cones().scratch.template get_slot<0>(); + data.cones().segmented_sum( + data.d_complementarity_xz_residual_.data() + data.cone_start(), cone_dot, stream_view_); cone_complementarity_norm = thrust::reduce(rmm::exec_policy(stream_view_), cone_dot.begin(), cone_dot.end(), @@ -2369,9 +2412,8 @@ f_t barrier_solver_t::gpu_max_step_to_boundary(iteration_data_t& x, const rmm::device_uvector& dx) { - const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; - const bool skip_cone_range = - has_cones && static_cast(x.size()) >= data.cone_var_start_ + data.cones_->m_c; + const bool has_soc = data.has_cones(); + const bool skip_cone_range = has_soc && static_cast(x.size()) >= data.cone_end(); auto ratio_test = [] HD(const thrust::tuple t) { const f_t dx = thrust::get<0>(t); @@ -2381,8 +2423,8 @@ f_t barrier_solver_t::gpu_max_step_to_boundary(iteration_data_tm_c; + i_t cs = data.cone_start(); + i_t mc = data.cone_entry_count(); f_t alpha = f_t(1); if (cs > 0) { @@ -2432,10 +2474,10 @@ i_t 
barrier_solver_t::gpu_compute_search_direction(iteration_data_tK > 0; - const i_t m_c = has_cones ? data.cones_->m_c : 0; - const i_t cone_var_start = data.cone_var_start_; - const i_t linear_size = has_cones ? cone_var_start : lp.num_cols; + const bool has_soc = data.has_cones(); + const i_t m_c = data.cone_entry_count(); + const i_t cone_var_start = data.cone_start(); + const i_t linear_size = data.linear_xz_size(lp.num_cols); auto fill_linear_target = [&](raft::device_span target, raft::device_span xz_rhs, @@ -2506,10 +2548,11 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_ts = raft::device_span(data.d_x_.data() + cone_var_start, m_c); - data.cones_->lambda = raft::device_span(data.d_z_.data() + cone_var_start, m_c); - launch_nt_scaling(*data.cones_, stream_view_); + if (has_soc) { + auto& cones = data.cones(); + cones.x = raft::device_span(data.d_x_.data() + cone_var_start, m_c); + cones.z = raft::device_span(data.d_z_.data() + cone_var_start, m_c); + launch_nt_scaling(cones, stream_view_); cuopt_assert(cone_var_start + m_c == lp.num_cols, "barrier expects [linear | cone] layout"); fill_linear_target( @@ -2522,13 +2565,19 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t(data.d_dx_aff_.data() + cone_var_start, m_c), - *data.cones_, + raft::device_span(data.d_dz_aff_.data() + cone_var_start, m_c), + cones, data.cone_sigma_mu_, cone_target, - stream_view_, - f_t(-1)); + stream_view_); } else { - compute_affine_cone_rhs_term(*data.cones_, cone_target, stream_view_, f_t(-1)); + cub::DeviceTransform::Transform( + cones.z.data(), + cone_target.data(), + m_c, + [] HD(f_t z_val) { return -z_val; }, + stream_view_.value()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } RAFT_CHECK_CUDA(stream_view_); } else { @@ -2558,7 +2607,7 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t(data.d_dx_.data() + cone_var_start, m_c), - *data.cones_, + data.cones(), 
raft::device_span(data.d_complementarity_target_.data() + cone_var_start, m_c), - data.d_cone_hinv2_dx_, raft::device_span(data.d_dz_.data() + cone_var_start, m_c), stream_view_); @@ -3179,8 +3227,7 @@ template void barrier_solver_t::compute_affine_rhs(iteration_data_t& data) { raft::common::nvtx::range fun_scope("Barrier: compute_affine_rhs"); - const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; - const i_t linear_size = has_cones ? data.cone_var_start_ : lp.num_cols; + const i_t linear_size = data.linear_xz_size(lp.num_cols); data.primal_rhs = data.primal_residual; data.bound_rhs = data.bound_residual; @@ -3221,7 +3268,7 @@ void barrier_solver_t::compute_target_mu( iteration_data_t& data, f_t mu, f_t& mu_aff, f_t& sigma, f_t& new_mu) { raft::common::nvtx::range fun_scope("Barrier: compute_target_mu"); - const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; + const bool has_soc = data.has_cones(); f_t complementarity_aff_sum = 0.0; // TMP no copy and data should always be on the GPU @@ -3240,14 +3287,12 @@ void barrier_solver_t::compute_target_mu( f_t step_dual_aff = std::min(gpu_max_step_to_boundary(data, data.d_v_, data.d_dv_aff_), gpu_max_step_to_boundary(data, data.d_z_, data.d_dz_aff_)); - if (has_cones) { - i_t cs = data.cone_var_start_; - i_t mc = data.cones_->m_c; + if (has_soc) { + i_t cs = data.cone_start(); + i_t mc = data.cone_entry_count(); auto [cone_p, cone_d] = - compute_cone_step_length(*data.cones_, - raft::device_span(data.d_x_.data() + cs, mc), + compute_cone_step_length(data.cones(), raft::device_span(data.d_dx_aff_.data() + cs, mc), - raft::device_span(data.d_z_.data() + cs, mc), raft::device_span(data.d_dz_aff_.data() + cs, mc), std::min(step_primal_aff, step_dual_aff), stream_view_); @@ -3256,7 +3301,7 @@ void barrier_solver_t::compute_target_mu( step_dual_aff = step_primal_aff; } - if (data.Q.n > 0 || has_cones) { + if (data.Q.n > 0 || has_soc) { step_primal_aff = step_dual_aff = std::min(step_primal_aff, 
step_dual_aff); } @@ -3309,22 +3354,18 @@ void barrier_solver_t::compute_target_mu( stream_view_); complementarity_aff_sum = complementarity_xz_aff_sum + complementarity_wv_aff_sum; - f_t mu_denom = static_cast(data.x.size()) + static_cast(data.n_upper_bounds); - if (has_cones) { - mu_denom -= static_cast(data.cones_->m_c); - mu_denom += static_cast(data.cones_->K); - } - mu_aff = complementarity_aff_sum / mu_denom; - sigma = std::max(0.0, std::min(1.0, std::pow(mu_aff / mu, 3.0))); - new_mu = sigma * mu; + f_t mu_denom = data.complementarity_degree(data.x.size(), data.n_upper_bounds); + mu_aff = complementarity_aff_sum / mu_denom; + sigma = std::max(0.0, std::min(1.0, std::pow(mu_aff / mu, 3.0))); + new_mu = sigma * mu; } template void barrier_solver_t::compute_cc_rhs(iteration_data_t& data, f_t& new_mu) { raft::common::nvtx::range fun_scope("Barrier: compute_cc_rhs"); - const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; - const i_t linear_size = has_cones ? data.cone_var_start_ : lp.num_cols; + const bool has_soc = data.has_cones(); + const i_t linear_size = data.linear_xz_size(lp.num_cols); auto fill_linear_cc_rhs = [&](raft::device_span out, raft::device_span dx_aff, @@ -3352,8 +3393,8 @@ void barrier_solver_t::compute_cc_rhs(iteration_data_t& data data.primal_rhs.set_scalar(0.0); data.bound_rhs.set_scalar(0.0); data.dual_rhs.set_scalar(0.0); - data.cone_combined_step_ = has_cones; - data.cone_sigma_mu_ = has_cones ? new_mu : f_t(0); + data.cone_combined_step_ = has_soc; + data.cone_sigma_mu_ = has_soc ? 
new_mu : f_t(0); } template @@ -3426,7 +3467,7 @@ void barrier_solver_t::compute_primal_dual_step_length(iteration_data_ f_t& step_dual) { raft::common::nvtx::range fun_scope("Barrier: compute_primal_dual_step_length"); - const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; + const bool has_soc = data.has_cones(); f_t max_step_primal = 0.0; f_t max_step_dual = 0.0; @@ -3435,14 +3476,12 @@ void barrier_solver_t::compute_primal_dual_step_length(iteration_data_ max_step_dual = std::min(gpu_max_step_to_boundary(data, data.d_v_, data.d_dv_), gpu_max_step_to_boundary(data, data.d_z_, data.d_dz_)); - if (has_cones) { - i_t cs = data.cone_var_start_; - i_t mc = data.cones_->m_c; + if (has_soc) { + i_t cs = data.cone_start(); + i_t mc = data.cone_entry_count(); auto [cone_primal, cone_dual] = - compute_cone_step_length(*data.cones_, - raft::device_span(data.d_x_.data() + cs, mc), + compute_cone_step_length(data.cones(), raft::device_span(data.d_dx_.data() + cs, mc), - raft::device_span(data.d_z_.data() + cs, mc), raft::device_span(data.d_dz_.data() + cs, mc), f_t(1), stream_view_); @@ -3453,7 +3492,7 @@ void barrier_solver_t::compute_primal_dual_step_length(iteration_data_ step_primal = step_scale * max_step_primal; step_dual = step_scale * max_step_dual; - if (data.Q.n > 0 || has_cones) { step_primal = step_dual = std::min(step_primal, step_dual); } + if (data.Q.n > 0 || has_soc) { step_primal = step_dual = std::min(step_primal, step_dual); } } template @@ -3543,13 +3582,7 @@ template void barrier_solver_t::compute_mu(iteration_data_t& data, f_t& mu) { raft::common::nvtx::range fun_scope("Barrier: compute_mu"); - const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; - - f_t mu_denom = static_cast(data.x.size()) + static_cast(data.n_upper_bounds); - if (has_cones) { - mu_denom -= static_cast(data.cones_->m_c); - mu_denom += static_cast(data.cones_->K); - } + f_t mu_denom = data.complementarity_degree(data.x.size(), data.n_upper_bounds); mu = 
(data.sum_reduce_helper_.sum(data.d_complementarity_xz_residual_.begin(), data.d_complementarity_xz_residual_.size(), @@ -3819,7 +3852,8 @@ lp_status_t barrier_solver_t::solve(f_t start_time, settings.log.printf("Quadratic objective matrix: %d nonzeros\n", lp.Q.row_start[lp.Q.n]); } if (lp.second_order_cone_dims.size() > 0) { - settings.log.printf("Second-order cones: %d\n", static_cast(lp.second_order_cone_dims.size())); + settings.log.printf("Second-order cones: %d\n", + static_cast(lp.second_order_cone_dims.size())); } settings.log.printf("\n"); @@ -3881,18 +3915,17 @@ lp_status_t barrier_solver_t::solve(f_t start_time, f_t primal_residual_norm = std::max(vector_norm_inf(data.primal_residual, stream_view_), vector_norm_inf(data.bound_residual, stream_view_)); - f_t dual_residual_norm = vector_norm_inf(data.dual_residual, stream_view_); - const bool has_cones = data.cones_.has_value() && data.cones_->K > 0; - const i_t linear_xz_size = - has_cones ? data.cone_var_start_ : static_cast(data.complementarity_xz_residual.size()); + f_t dual_residual_norm = vector_norm_inf(data.dual_residual, stream_view_); + const bool has_soc = data.has_cones(); + const i_t linear_xz_size = data.linear_xz_size(data.complementarity_xz_residual.size()); auto linear_xz_span = raft::host_span(data.complementarity_xz_residual.data(), linear_xz_size); f_t complementarity_residual_norm = std::max(vector_norm_inf(linear_xz_span, stream_view_), vector_norm_inf(data.complementarity_wv_residual, stream_view_)); - if (has_cones) { + if (has_soc) { f_t cone_complementarity_norm = f_t(0); - i_t off = data.cone_var_start_; + i_t off = data.cone_start(); for (auto q_k : lp.second_order_cone_dims) { f_t cone_dot = f_t(0); for (i_t j = 0; j < q_k; ++j) { @@ -3904,11 +3937,7 @@ lp_status_t barrier_solver_t::solve(f_t start_time, complementarity_residual_norm = std::max(complementarity_residual_norm, cone_complementarity_norm); } - f_t mu_denom = static_cast(n) + static_cast(num_upper_bounds); - if 
(data.cones_.has_value() && data.cones_->K > 0) { - mu_denom -= static_cast(data.cones_->m_c); - mu_denom += static_cast(data.cones_->K); - } + f_t mu_denom = data.complementarity_degree(n, num_upper_bounds); f_t mu = (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) / mu_denom; @@ -3960,8 +3989,7 @@ lp_status_t barrier_solver_t::solve(f_t start_time, (duality_gap_abs < settings.barrier_relative_complementarity_tol || duality_gap_rel < settings.barrier_relative_complementarity_tol); - const i_t linear_xz_rhs_size = - has_cones ? data.cone_var_start_ : static_cast(data.complementarity_xz_rhs.size()); + const i_t linear_xz_rhs_size = data.linear_xz_size(data.complementarity_xz_rhs.size()); data.d_complementarity_xz_residual_.resize(data.complementarity_xz_residual.size(), stream_view_); diff --git a/cpp/src/barrier/second_order_cone.cuh b/cpp/src/barrier/second_order_cone.cuh deleted file mode 100644 index 3487707f12..0000000000 --- a/cpp/src/barrier/second_order_cone.cuh +++ /dev/null @@ -1,957 +0,0 @@ -/* clang-format off */ -/* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - */ -/* clang-format on */ - -#pragma once - -#include -#include - -#include - -#include -#include - -#include - -#include - -#include -#include -#include -#include -#include -#include - -namespace cuopt::linear_programming::dual_simplex { - -// --------------------------------------------------------------------------- -// Flat cone kernels: segmented reductions compute per-cone scalars, then a -// single elementwise launch applies the result across all packed cone entries. -// This keeps the cone math vectorized instead of one block per cone. 
-// --------------------------------------------------------------------------- - -constexpr int flat_block_dim = 256; - -template -__global__ void apply_hinv2_write_kernel(raft::device_span v, - raft::device_span out, - raft::device_span w_bar, - raft::device_span inv_eta, - raft::device_span tail_dot, - raft::device_span cone_offsets, - raft::device_span element_cone_ids, - f_t output_scale) -{ - i_t flat_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (flat_idx >= static_cast(out.size())) return; - - i_t cone = element_cone_ids[flat_idx]; - i_t cone_off = cone_offsets[cone]; - i_t local_idx = flat_idx - cone_off; - - f_t ie_sq = inv_eta[cone] * inv_eta[cone]; - f_t u_tv = w_bar[cone_off] * v[cone_off] - tail_dot[cone]; - f_t coeff = f_t(2) * u_tv * ie_sq; - int sign = (local_idx == 0) * 2 - 1; - f_t value = coeff * w_bar[flat_idx] - ie_sq * v[flat_idx]; - out[flat_idx] = output_scale * value * sign; -} - -template -struct corrector_raw_t { - f_t zeta; - f_t xi; - f_t psi; -}; - -template -struct corrector_raw_sum_t { - HD corrector_raw_t operator()(const corrector_raw_t& lhs, - const corrector_raw_t& rhs) const - { - return {lhs.zeta + rhs.zeta, lhs.xi + rhs.xi, lhs.psi + rhs.psi}; - } -}; - -template -struct cone_scratch_t { - i_t K; - rmm::device_uvector> corrector_raw; // [K] {zeta, xi, psi} - rmm::device_uvector scalar_slots; // [6 * K] reusable K-length scalar scratch slots - rmm::device_uvector step_alpha_primal; - rmm::device_uvector step_alpha_dual; - rmm::device_uvector segmented_reduce_workspace; - - cone_scratch_t(i_t K_in, rmm::cuda_stream_view stream) - : K(K_in), - corrector_raw(K_in, stream), - scalar_slots(6 * K_in, stream), - step_alpha_primal(K_in, stream), - step_alpha_dual(K_in, stream), - segmented_reduce_workspace(0, stream) - { - } - - raft::device_span hinv2_tail_dot() { return slot_span(0); } - raft::device_span step_s_du1_sq() { return slot_span(0); } - raft::device_span step_s_u1du1() { return slot_span(1); } - 
raft::device_span step_s_u1_sq() { return slot_span(2); } - raft::device_span step_l_du1_sq() { return slot_span(3); } - raft::device_span step_l_u1du1() { return slot_span(4); } - raft::device_span step_l_u1_sq() { return slot_span(5); } - - raft::device_span nt_s1_sq() { return slot_span(0); } - raft::device_span nt_l1_sq() { return slot_span(1); } - raft::device_span nt_sl() { return slot_span(2); } - - raft::device_span step_alpha_primal_span() { return cuopt::make_span(step_alpha_primal); } - raft::device_span step_alpha_dual_span() { return cuopt::make_span(step_alpha_dual); } - - private: - raft::device_span slot_span(i_t slot) - { - return raft::device_span(scalar_slots.data() + slot * K, K); - } -}; - -template -__global__ void fused_corrector_write_kernel(raft::device_span s, - raft::device_span lambda, - raft::device_span dx_aff, - raft::device_span omega, - raft::device_span w_bar, - raft::device_span inv_eta, - raft::device_span inv_1pw0, - raft::device_span rho, - raft::device_span> raw, - raft::device_span out, - raft::device_span cone_offsets, - raft::device_span element_cone_ids, - f_t sigma_mu, - f_t output_scale) -{ - i_t flat_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (flat_idx >= static_cast(out.size())) return; - - i_t cone = element_cone_ids[flat_idx]; - i_t cone_off = cone_offsets[cone]; - i_t local_idx = flat_idx - cone_off; - f_t ie = inv_eta[cone]; - f_t ipw = inv_1pw0[cone]; - f_t w0 = w_bar[cone_off]; - f_t omega0 = omega[cone_off]; - f_t dx_a0 = dx_aff[cone_off]; - auto raw_vals = raw[cone]; - f_t coeff_a = -dx_a0 + raw_vals.zeta * ipw; - f_t dx0 = (w0 * dx_a0 - raw_vals.zeta) * ie; - f_t dz0 = -omega0 - dx0; - f_t w_sq_sum = max(f_t(0), w0 * w0 - f_t(1)); - f_t w_omega_sum = f_t(0.5) * (ie * s[cone_off] - lambda[cone_off] / ie); - f_t omega_sq_sum = max(f_t(0), omega0 * omega0 - rho[cone]); - f_t omega_dx_sum = ie * (raw_vals.xi + coeff_a * w_omega_sum); - f_t dx_sq_sum = - ie * ie * (raw_vals.psi + f_t(2) * 
coeff_a * raw_vals.zeta + coeff_a * coeff_a * w_sq_sum); - f_t r_K_0 = (omega0 * omega0 + omega_sq_sum) + (dx0 * dz0 - omega_dx_sum - dx_sq_sum) - sigma_mu; - f_t nu = (f_t(2) * omega0 - dx0) * omega_sq_sum - (omega0 + f_t(2) * dx0) * omega_dx_sum; - f_t inv_rho = f_t(1) / rho[cone]; - f_t corr0 = (omega0 * r_K_0 - nu) * inv_rho; - f_t inv_omega0 = f_t(1) / omega0; - f_t c_inv = (nu * inv_omega0 - r_K_0) * inv_rho; - f_t p1 = c_inv + f_t(2) - dx0 * inv_omega0; - f_t p2 = -(f_t(1) + f_t(2) * dx0 * inv_omega0); - f_t w_dx_sum = ie * (raw_vals.zeta + coeff_a * w_sq_sum); - f_t zeta2 = p1 * w_omega_sum + p2 * w_dx_sum; - f_t coeff_c = -corr0 + zeta2 * ipw; - - if (local_idx == 0) { - out[flat_idx] = output_scale * ((w0 * corr0 - zeta2) * ie); - return; - } - - f_t dx_j = (dx_aff[flat_idx] + coeff_a * w_bar[flat_idx]) * ie; - f_t corr_j = p1 * omega[flat_idx] + p2 * dx_j; - out[flat_idx] = output_scale * ((corr_j + coeff_c * w_bar[flat_idx]) * ie); -} - -// --------------------------------------------------------------------------- -// Flattened NT scaling / step-length kernels. -// All follow the same pattern: segmented reduction to per-cone scalars, then -// flat or scalar kernels to write the packed cone outputs. 
-// --------------------------------------------------------------------------- - -template -__global__ void nt_scaling_scalar_kernel(raft::device_span s, - raft::device_span lambda, - raft::device_span cone_offsets, - raft::device_span s1_sq, - raft::device_span l1_sq, - raft::device_span sl, - raft::device_span inv_eta, - raft::device_span inv_1pw0, - raft::device_span w_bar, - raft::device_span omega, - raft::device_span rho, - i_t K) -{ - i_t cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (cone >= K) return; - - i_t off = cone_offsets[cone]; - f_t s0 = s[off]; - f_t l0 = lambda[off]; - - f_t s_J = sqrt(max(f_t(0), s0 * s0 - s1_sq[cone])); - f_t l_J = sqrt(max(f_t(0), l0 * l0 - l1_sq[cone])); - f_t inv_s_J = f_t(1) / s_J; - f_t inv_l_J = f_t(1) / l_J; - f_t rho_val = s_J * l_J; - f_t inv_eta_v = sqrt(l_J / s_J); - f_t scale = sqrt(rho_val); - - f_t s_dot_l = (s0 * l0 + sl[cone]) * inv_s_J * inv_l_J; - f_t gamma = sqrt(max(f_t(0), (f_t(1) + s_dot_l) * f_t(0.5))); - f_t inv_2g = f_t(1) / (f_t(2) * gamma); - f_t sb0 = s0 * inv_s_J; - f_t lb0 = l0 * inv_l_J; - - f_t w0 = (sb0 + lb0) * inv_2g; - inv_eta[cone] = inv_eta_v; - inv_1pw0[cone] = f_t(1) / (f_t(1) + w0); - w_bar[off] = w0; - omega[off] = gamma * scale; - rho[cone] = rho_val; -} - -template -__global__ void nt_scaling_tail_kernel(raft::device_span s, - raft::device_span lambda, - raft::device_span inv_eta, - raft::device_span rho, - raft::device_span w_bar, - raft::device_span omega, - raft::device_span cone_offsets, - raft::device_span element_cone_ids) -{ - i_t flat_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (flat_idx >= static_cast(w_bar.size())) return; - - i_t cone = element_cone_ids[flat_idx]; - i_t cone_off = cone_offsets[cone]; - if (flat_idx == cone_off) return; - - f_t s0 = s[cone_off]; - f_t l0 = lambda[cone_off]; - f_t inv_eta_val = inv_eta[cone]; - f_t rho_val = rho[cone]; - f_t scale = sqrt(rho_val); - - f_t s_J = scale / inv_eta_val; - f_t l_J = scale * 
inv_eta_val; - f_t inv_s_J = f_t(1) / s_J; - f_t inv_l_J = f_t(1) / l_J; - - f_t gamma = omega[cone_off] / scale; - f_t inv_2g = f_t(1) / (f_t(2) * gamma); - f_t sb0 = s0 * inv_s_J; - f_t lb0 = l0 * inv_l_J; - f_t D = sb0 + lb0 + f_t(2) * gamma; - f_t inv_D = f_t(1) / D; - f_t c_s = (gamma + sb0) * inv_D; - f_t c_l = (gamma + lb0) * inv_D; - - f_t w_from_s = inv_2g * inv_s_J; - f_t w_from_lambda = -inv_2g * inv_l_J; - f_t omega_s_coeff = scale * c_l * inv_s_J; - f_t omega_lambda_coeff = scale * c_s * inv_l_J; - - f_t sj = s[flat_idx]; - f_t lj = lambda[flat_idx]; - w_bar[flat_idx] = w_from_s * sj + w_from_lambda * lj; - omega[flat_idx] = omega_s_coeff * sj + omega_lambda_coeff * lj; -} - -template -DI f_t cone_step_length_from_scalars(f_t u0, f_t du0, f_t du1_sq, f_t u1du1, f_t c, f_t alpha_max) -{ - f_t a = du0 * du0 - du1_sq; - f_t b = u0 * du0 - u1du1; - f_t disc = b * b - a * c; - f_t alpha = alpha_max; - - if (du0 < f_t(0)) { alpha = min(alpha, -u0 / du0); } - if ((a > f_t(0) && b > f_t(0)) || disc < f_t(0)) { - return alpha; - } else if (a == f_t(0)) { - if (b < f_t(0)) { alpha = min(alpha, c / (f_t(-2) * b)); } - } else if (c == f_t(0)) { - alpha = (a >= f_t(0)) ? 
alpha : f_t(0); - } else { - f_t t = -(b + copysign(sqrt(disc), b)); - f_t r1 = c / t; - f_t r2 = t / a; - if (r1 < f_t(0)) { r1 = alpha; } - if (r2 < f_t(0)) { r2 = alpha; } - alpha = min(alpha, min(r1, r2)); - } - return alpha; -} - -template -__global__ void step_length_pair_kernel(raft::device_span s, - raft::device_span ds, - raft::device_span lambda, - raft::device_span dlambda, - raft::device_span alpha_primal, - raft::device_span alpha_dual, - raft::device_span s_du1_sq, - raft::device_span s_u1du1, - raft::device_span s_u1_sq, - raft::device_span l_du1_sq, - raft::device_span l_u1du1, - raft::device_span l_u1_sq, - raft::device_span cone_offsets, - f_t alpha_max, - i_t K) -{ - i_t cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (cone >= K) return; - - i_t off = cone_offsets[cone]; - f_t s_c = max(f_t(0), s[off] * s[off] - s_u1_sq[cone]); - f_t l_c = max(f_t(0), lambda[off] * lambda[off] - l_u1_sq[cone]); - - alpha_primal[cone] = cone_step_length_from_scalars( - s[off], ds[off], s_du1_sq[cone], s_u1du1[cone], s_c, alpha_max); - alpha_dual[cone] = cone_step_length_from_scalars( - lambda[off], dlambda[off], l_du1_sq[cone], l_u1du1[cone], l_c, alpha_max); -} - -/** - * Device storage for second-order cone topology, NT scaling, and iterate views. - * - * Flat arrays are packed by cone: elements [cone_offsets[i], cone_offsets[i+1]) - * belong to cone i, which has dimension cone_dims[i]. - * - * Primal/dual iterates (s, lambda) are non-owning spans, pre-sliced by the - * caller to cover the cone portion of the global x/z vectors. The caller - * must keep the underlying memory alive. - * - * Only persistent cone state lives here. Reusable per-iteration workspace sits - * under `scratch`, which keeps the mutating temporary buffers out of the - * persistent NT state. 
- */ -template -struct cone_data_t { - // --- Topology (set once at construction) --- - i_t K; // number of second-order cones - i_t m_c; // total cone dimension = sum of cone_dims - - rmm::device_uvector cone_offsets; // [K+1] prefix sums of cone_dims - rmm::device_uvector cone_dims; // [K] dimension q_i of each cone - rmm::device_uvector block_offsets; // [K+1] prefix sums of q_i^2 (for dense block build) - rmm::device_uvector block_entry_cone_ids; // [sum q_i^2] owning cone id for each block entry - - // --- Primal/dual cone iterates (non-owning views, set by caller) --- - raft::device_span s; // [m_c] cone slack: s_i in int(Q^{q_i}) - raft::device_span lambda; // [m_c] cone dual: lambda_i in int(Q^{q_i}) - - // --- NT scaling state (recomputed each iteration from s, lambda) --- - rmm::device_uvector - inv_eta; // [K] 1/eta_i where eta_i = (||s_i||_J / ||lambda_i||_J)^{1/2} - rmm::device_uvector inv_1pw0; // [K] cached 1/(1 + wbar_0_i) - rmm::device_uvector w_bar; // [m_c] NT scaling direction, unit J-norm, packed by cone - rmm::device_uvector omega; // [m_c] scaled variable omega_i = H_i^{-1} s_i, packed by cone - rmm::device_uvector rho; // [K] ||omega_i||^2_J = ||s_i||_J * ||lambda_i||_J - rmm::device_uvector element_cone_ids; // [m_c] owning cone id for each packed entry - cone_scratch_t scratch; - - cone_data_t(i_t K_in, - const std::vector& dims, - raft::device_span s_in, - raft::device_span lambda_in, - rmm::cuda_stream_view stream) - : K(K_in), - m_c(std::accumulate(dims.begin(), dims.end(), i_t(0))), - cone_offsets(K_in + 1, stream), - cone_dims(K_in, stream), - block_offsets(K_in + 1, stream), - block_entry_cone_ids( - std::accumulate( - dims.begin(), dims.end(), i_t(0), [](i_t acc, i_t q) { return acc + q * q; }), - stream), - s(s_in), - lambda(lambda_in), - inv_eta(K_in, stream), - inv_1pw0(K_in, stream), - w_bar(m_c, stream), - omega(m_c, stream), - rho(K_in, stream), - element_cone_ids(m_c, stream), - scratch(K_in, stream) - { - std::vector 
offsets(K + 1, 0); - std::vector blk_offsets(K + 1, 0); - std::vector cone_ids(m_c, 0); - std::vector block_cone_ids(block_entry_cone_ids.size(), 0); - - for (i_t i = 0; i < K; ++i) { - offsets[i + 1] = offsets[i] + dims[i]; - blk_offsets[i + 1] = blk_offsets[i] + dims[i] * dims[i]; - std::fill(cone_ids.begin() + offsets[i], cone_ids.begin() + offsets[i + 1], i); - std::fill( - block_cone_ids.begin() + blk_offsets[i], block_cone_ids.begin() + blk_offsets[i + 1], i); - } - - auto init_device_vec = [&](auto& d_vec, const auto& h_vec) { - if (!h_vec.empty()) { - d_vec.resize(h_vec.size(), stream); - raft::copy(d_vec.data(), h_vec.data(), h_vec.size(), stream); - } - }; - - raft::copy(cone_offsets.data(), offsets.data(), K + 1, stream); - raft::copy(cone_dims.data(), dims.data(), K, stream); - raft::copy(block_offsets.data(), blk_offsets.data(), K + 1, stream); - init_device_vec(block_entry_cone_ids, block_cone_ids); - init_device_vec(element_cone_ids, cone_ids); - } -}; - -template -void segmented_sum(InputIt input, - raft::device_span cone_offsets, - i_t K, - raft::device_span out, - rmm::device_uvector& workspace, - rmm::cuda_stream_view stream) -{ - if (K == 0) return; - cuopt_assert(static_cast(out.size()) == K, "segmented_sum output must match cone count"); - - std::size_t temp_storage_bytes = 0; - cub::DeviceSegmentedReduce::Sum(nullptr, - temp_storage_bytes, - input, - out.data(), - K, - cone_offsets.data(), - cone_offsets.data() + 1, - stream.value()); - if (workspace.size() < temp_storage_bytes) { workspace.resize(temp_storage_bytes, stream); } - cub::DeviceSegmentedReduce::Sum(workspace.data(), - temp_storage_bytes, - input, - out.data(), - K, - cone_offsets.data(), - cone_offsets.data() + 1, - stream.value()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -template -void segmented_reduce(InputIt input, - raft::device_span cone_offsets, - i_t K, - rmm::device_uvector& out, - rmm::device_uvector& workspace, - ReduceOp reduce_op, - t_t initial_value, - 
rmm::cuda_stream_view stream) -{ - out.resize(K, stream); - if (K == 0) return; - - std::size_t temp_storage_bytes = 0; - cub::DeviceSegmentedReduce::Reduce(nullptr, - temp_storage_bytes, - input, - out.data(), - K, - cone_offsets.data(), - cone_offsets.data() + 1, - reduce_op, - initial_value, - stream.value()); - if (workspace.size() < temp_storage_bytes) { workspace.resize(temp_storage_bytes, stream); } - cub::DeviceSegmentedReduce::Reduce(workspace.data(), - temp_storage_bytes, - input, - out.data(), - K, - cone_offsets.data(), - cone_offsets.data() + 1, - reduce_op, - initial_value, - stream.value()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -template -void apply_hinv2(raft::device_span v, - raft::device_span out, - raft::device_span w_bar, - raft::device_span inv_eta, - raft::device_span cone_offsets, - raft::device_span element_cone_ids, - raft::device_span tail_dot, - rmm::device_uvector& workspace, - i_t K, - rmm::cuda_stream_view stream, - f_t output_scale = f_t(1)) -{ - if (K == 0) return; - - auto span_v = v; - auto span_w_bar = w_bar; - auto span_cone_offsets = cone_offsets; - auto span_element_cone_ids = element_cone_ids; - auto tail_terms = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [span_v, span_w_bar, span_cone_offsets, span_element_cone_ids] HD(i_t idx) { - i_t cone = span_element_cone_ids[idx]; - i_t cone_off = span_cone_offsets[cone]; - return (idx == cone_off) ? 
f_t(0) : span_w_bar[idx] * span_v[idx]; - }); - segmented_sum(tail_terms, cone_offsets, K, tail_dot, workspace, stream); - - i_t grid_dim = (static_cast(out.size()) + flat_block_dim - 1) / flat_block_dim; - apply_hinv2_write_kernel<<>>( - v, out, w_bar, inv_eta, tail_dot, cone_offsets, element_cone_ids, output_scale); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -template -void apply_hinv2(raft::device_span v, - raft::device_span out, - cone_data_t& cones, - rmm::cuda_stream_view stream, - f_t output_scale = f_t(1)) -{ - apply_hinv2(v, - out, - cuopt::make_span(cones.w_bar), - cuopt::make_span(cones.inv_eta), - cuopt::make_span(cones.cone_offsets), - cuopt::make_span(cones.element_cone_ids), - cones.scratch.hinv2_tail_dot(), - cones.scratch.segmented_reduce_workspace, - cones.K, - stream, - output_scale); -} - -template -void compute_affine_cone_rhs_term(cone_data_t& cones, - raft::device_span out, - rmm::cuda_stream_view stream, - f_t output_scale = f_t(1)) -{ - cuopt_assert(static_cast(out.size()) == cones.m_c, "cone rhs span must match cone size"); - if (cones.K == 0) return; - - apply_hinv2(cones.s, out, cones, stream, output_scale); -} - -template -void compute_combined_cone_rhs_term(raft::device_span dx_aff, - cone_data_t& cones, - f_t sigma_mu, - raft::device_span out, - rmm::cuda_stream_view stream, - f_t output_scale = f_t(1)) -{ - cuopt_assert(static_cast(out.size()) == cones.m_c, "cone rhs span must match cone size"); - if (cones.K == 0) return; - - auto span_dx_aff = dx_aff; - auto span_w_bar = cuopt::make_span(cones.w_bar); - auto span_omega = cuopt::make_span(cones.omega); - auto span_cone_offsets = cuopt::make_span(cones.cone_offsets); - auto span_element_cone_id = cuopt::make_span(cones.element_cone_ids); - - auto raw_terms = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [span_dx_aff, span_w_bar, span_omega, span_cone_offsets, span_element_cone_id] HD(i_t idx) { - i_t cone = span_element_cone_id[idx]; - i_t cone_off = 
span_cone_offsets[cone]; - if (idx == cone_off) { return corrector_raw_t{f_t(0), f_t(0), f_t(0)}; } - f_t dx_aff_j = span_dx_aff[idx]; - return corrector_raw_t{ - span_w_bar[idx] * dx_aff_j, span_omega[idx] * dx_aff_j, dx_aff_j * dx_aff_j}; - }); - segmented_reduce>(raw_terms, - cuopt::make_span(cones.cone_offsets), - cones.K, - cones.scratch.corrector_raw, - cones.scratch.segmented_reduce_workspace, - corrector_raw_sum_t{}, - corrector_raw_t{f_t(0), f_t(0), f_t(0)}, - stream); - - i_t grid_dim = (cones.m_c + flat_block_dim - 1) / flat_block_dim; - fused_corrector_write_kernel - <<>>(cones.s, - cones.lambda, - dx_aff, - cuopt::make_span(cones.omega), - cuopt::make_span(cones.w_bar), - cuopt::make_span(cones.inv_eta), - cuopt::make_span(cones.inv_1pw0), - cuopt::make_span(cones.rho), - cuopt::make_span(cones.scratch.corrector_raw), - out, - cuopt::make_span(cones.cone_offsets), - cuopt::make_span(cones.element_cone_ids), - sigma_mu, - output_scale); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -template -void recover_cone_dz_from_target(raft::device_span dx, - cone_data_t& cones, - raft::device_span cone_target, - rmm::device_uvector& hinv2_dx, - raft::device_span dz, - rmm::cuda_stream_view stream) -{ - hinv2_dx.resize(cones.m_c, stream); - if (cones.K == 0) return; - - apply_hinv2(dx, cuopt::make_span(hinv2_dx), cones, stream); - - auto span_target = cone_target; - auto span_hinv2 = cuopt::make_span(hinv2_dx); - auto span_dz = dz; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - cones.m_c, - [span_target, span_hinv2, span_dz] __device__(i_t j) { - span_dz[j] = span_target[j] - span_hinv2[j]; - }); -} - -template -void accumulate_cone_hinv2_matvec(raft::device_span x, - cone_data_t& cones, - rmm::device_uvector& hinv2_x, - raft::device_span out, - rmm::cuda_stream_view stream) -{ - hinv2_x.resize(cones.m_c, stream); - if (cones.K == 0) return; - - apply_hinv2(x, cuopt::make_span(hinv2_x), cones, stream); - - auto span_hinv2 = 
cuopt::make_span(hinv2_x); - auto span_out = out; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - cones.m_c, - [span_hinv2, span_out] __device__(i_t j) { span_out[j] += span_hinv2[j]; }); -} - -// --------------------------------------------------------------------------- -// Compute flat H^{-2} cone-block entries and scatter them into the augmented -// CSR value array. -// -// The caller provides one flat entry per dense cone-block element: -// - `csr_indices[e]` gives the destination slot in `augmented_x` -// - `q_values[e]` stores any pre-merged Q contribution for that slot -// -// For each flat entry we load its precomputed owning cone id, recover local -// (r, c) coordinates, evaluate H_k^{-2}(r, c), and write -// -(H_k^{-2}(r, c) + q_values[e]) -// into `augmented_x[csr_indices[e]]`. -// --------------------------------------------------------------------------- -template -__global__ void scatter_hinv2_into_augmented_kernel( - raft::device_span augmented_x, - raft::device_span csr_indices, - raft::device_span q_values, - raft::device_span w_bar, - raft::device_span inv_eta, - raft::device_span cone_offsets, - raft::device_span block_offsets, - raft::device_span block_entry_cone_ids) -{ - i_t e = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (e >= static_cast(csr_indices.size())) return; - - i_t cone = block_entry_cone_ids[e]; - i_t off = cone_offsets[cone]; - i_t q = cone_offsets[cone + 1] - off; - i_t blk_off = block_offsets[cone]; - i_t local = e - blk_off; - i_t r = local / q; - i_t c = local % q; - - f_t ie_sq = inv_eta[cone] * inv_eta[cone]; - f_t w0 = w_bar[off]; - f_t u_r = (r == 0) ? w0 : -w_bar[off + r]; - f_t u_c = (c == 0) ? w0 : -w_bar[off + c]; - f_t val = f_t(2) * u_r * ie_sq * u_c; - f_t diag_correction = (r == 0) ? 
-ie_sq : ie_sq; - if (r == c) { val += diag_correction; } - - augmented_x[csr_indices[e]] = -val - q_values[e]; -} - -template -void scatter_hinv2_into_augmented(const cone_data_t& cones, - rmm::device_uvector& augmented_x, - const rmm::device_uvector& csr_indices, - const rmm::device_uvector& q_values, - rmm::cuda_stream_view stream) -{ - i_t count = static_cast(csr_indices.size()); - if (count == 0) return; - - cuopt_assert(count == static_cast(cones.block_entry_cone_ids.size()), - "scatter expects one flat entry per cone-block coefficient"); - - i_t grid_dim = (count + flat_block_dim - 1) / flat_block_dim; - scatter_hinv2_into_augmented_kernel - <<>>(cuopt::make_span(augmented_x), - cuopt::make_span(csr_indices), - cuopt::make_span(q_values), - cuopt::make_span(cones.w_bar), - cuopt::make_span(cones.inv_eta), - cuopt::make_span(cones.cone_offsets), - cuopt::make_span(cones.block_offsets), - cuopt::make_span(cones.block_entry_cone_ids)); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -// --------------------------------------------------------------------------- -// Compute per-cone step lengths, then reduce them to the global maximum -// feasible primal/dual step. 
-// --------------------------------------------------------------------------- -template -void compute_cone_step_length_per_cone(cone_data_t& cones, - raft::device_span x_K, - raft::device_span dx_K, - raft::device_span z_K, - raft::device_span dz_K, - raft::device_span alpha_primal, - raft::device_span alpha_dual, - f_t alpha_max, - rmm::cuda_stream_view stream) -{ - cuopt_assert(static_cast(alpha_primal.size()) == cones.K && - static_cast(alpha_dual.size()) == cones.K, - "step-length outputs must match cone count"); - if (cones.K == 0) return; - - auto span_offsets = cuopt::make_span(cones.cone_offsets); - auto span_elem = cuopt::make_span(cones.element_cone_ids); - - auto s_du1_sq = cones.scratch.step_s_du1_sq(); - auto s_u1du1 = cones.scratch.step_s_u1du1(); - auto s_u1_sq = cones.scratch.step_s_u1_sq(); - auto l_du1_sq = cones.scratch.step_l_du1_sq(); - auto l_u1du1 = cones.scratch.step_l_u1du1(); - auto l_u1_sq = cones.scratch.step_l_u1_sq(); - - auto span_x_K = x_K; - auto span_dx_K = dx_K; - auto span_z_K = z_K; - auto span_dz_K = dz_K; - - auto s_du1_sq_terms = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [span_dx_K, span_offsets, span_elem] HD(i_t idx) { - i_t cone = span_elem[idx]; - return (idx == span_offsets[cone]) ? f_t(0) : span_dx_K[idx] * span_dx_K[idx]; - }); - segmented_sum(s_du1_sq_terms, - span_offsets, - cones.K, - s_du1_sq, - cones.scratch.segmented_reduce_workspace, - stream); - - auto s_u1du1_terms = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [span_x_K, span_dx_K, span_offsets, span_elem] HD(i_t idx) { - i_t cone = span_elem[idx]; - return (idx == span_offsets[cone]) ? 
f_t(0) : span_x_K[idx] * span_dx_K[idx]; - }); - segmented_sum(s_u1du1_terms, - span_offsets, - cones.K, - s_u1du1, - cones.scratch.segmented_reduce_workspace, - stream); - - auto s_u1_sq_terms = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [span_x_K, span_offsets, span_elem] HD(i_t idx) { - i_t cone = span_elem[idx]; - return (idx == span_offsets[cone]) ? f_t(0) : span_x_K[idx] * span_x_K[idx]; - }); - segmented_sum(s_u1_sq_terms, - span_offsets, - cones.K, - s_u1_sq, - cones.scratch.segmented_reduce_workspace, - stream); - - auto l_du1_sq_terms = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [span_dz_K, span_offsets, span_elem] HD(i_t idx) { - i_t cone = span_elem[idx]; - return (idx == span_offsets[cone]) ? f_t(0) : span_dz_K[idx] * span_dz_K[idx]; - }); - segmented_sum(l_du1_sq_terms, - span_offsets, - cones.K, - l_du1_sq, - cones.scratch.segmented_reduce_workspace, - stream); - - auto l_u1du1_terms = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [span_z_K, span_dz_K, span_offsets, span_elem] HD(i_t idx) { - i_t cone = span_elem[idx]; - return (idx == span_offsets[cone]) ? f_t(0) : span_z_K[idx] * span_dz_K[idx]; - }); - segmented_sum(l_u1du1_terms, - span_offsets, - cones.K, - l_u1du1, - cones.scratch.segmented_reduce_workspace, - stream); - - auto l_u1_sq_terms = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [span_z_K, span_offsets, span_elem] HD(i_t idx) { - i_t cone = span_elem[idx]; - return (idx == span_offsets[cone]) ? 
f_t(0) : span_z_K[idx] * span_z_K[idx]; - }); - segmented_sum(l_u1_sq_terms, - span_offsets, - cones.K, - l_u1_sq, - cones.scratch.segmented_reduce_workspace, - stream); - - i_t grid_dim = (cones.K + flat_block_dim - 1) / flat_block_dim; - step_length_pair_kernel<<>>(x_K, - dx_K, - z_K, - dz_K, - alpha_primal, - alpha_dual, - s_du1_sq, - s_u1du1, - s_u1_sq, - l_du1_sq, - l_u1du1, - l_u1_sq, - span_offsets, - alpha_max, - cones.K); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -template -std::pair compute_cone_step_length(cone_data_t& cones, - raft::device_span x_K, - raft::device_span dx_K, - raft::device_span z_K, - raft::device_span dz_K, - f_t alpha_max, - rmm::cuda_stream_view stream) -{ - if (cones.K == 0) return {alpha_max, alpha_max}; - - auto alpha_primal = cones.scratch.step_alpha_primal_span(); - auto alpha_dual = cones.scratch.step_alpha_dual_span(); - - compute_cone_step_length_per_cone( - cones, x_K, dx_K, z_K, dz_K, alpha_primal, alpha_dual, alpha_max, stream); - - f_t primal = thrust::reduce(rmm::exec_policy(stream), - alpha_primal.begin(), - alpha_primal.end(), - alpha_max, - thrust::minimum()); - f_t dual = thrust::reduce(rmm::exec_policy(stream), - alpha_dual.begin(), - alpha_dual.end(), - alpha_max, - thrust::minimum()); - return {primal, dual}; -} - -template -void launch_nt_scaling(cone_data_t& cones, rmm::cuda_stream_view stream) -{ - if (cones.K == 0) return; - - auto nt_s1_sq = cones.scratch.nt_s1_sq(); - auto nt_l1_sq = cones.scratch.nt_l1_sq(); - auto nt_sl = cones.scratch.nt_sl(); - - auto span_s = cones.s; - auto span_lambda = cones.lambda; - auto span_offsets = cuopt::make_span(cones.cone_offsets); - auto span_elem = cuopt::make_span(cones.element_cone_ids); - - auto s1_sq_terms = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [span_s, span_offsets, span_elem] HD(i_t idx) { - i_t cone = span_elem[idx]; - return (idx == span_offsets[cone]) ? 
f_t(0) : span_s[idx] * span_s[idx]; - }); - segmented_sum( - s1_sq_terms, span_offsets, cones.K, nt_s1_sq, cones.scratch.segmented_reduce_workspace, stream); - - auto l1_sq_terms = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [span_lambda, span_offsets, span_elem] HD(i_t idx) { - i_t cone = span_elem[idx]; - return (idx == span_offsets[cone]) ? f_t(0) : span_lambda[idx] * span_lambda[idx]; - }); - segmented_sum( - l1_sq_terms, span_offsets, cones.K, nt_l1_sq, cones.scratch.segmented_reduce_workspace, stream); - - auto sl_terms = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [span_s, span_lambda, span_offsets, span_elem] HD(i_t idx) { - i_t cone = span_elem[idx]; - return (idx == span_offsets[cone]) ? f_t(0) : span_s[idx] * span_lambda[idx]; - }); - segmented_sum( - sl_terms, span_offsets, cones.K, nt_sl, cones.scratch.segmented_reduce_workspace, stream); - - i_t scalar_grid_dim = (cones.K + flat_block_dim - 1) / flat_block_dim; - nt_scaling_scalar_kernel - <<>>(cones.s, - cones.lambda, - span_offsets, - nt_s1_sq, - nt_l1_sq, - nt_sl, - cuopt::make_span(cones.inv_eta), - cuopt::make_span(cones.inv_1pw0), - cuopt::make_span(cones.w_bar), - cuopt::make_span(cones.omega), - cuopt::make_span(cones.rho), - cones.K); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - i_t grid_dim = (cones.m_c + flat_block_dim - 1) / flat_block_dim; - nt_scaling_tail_kernel - <<>>(cones.s, - cones.lambda, - cuopt::make_span(cones.inv_eta), - cuopt::make_span(cones.rho), - cuopt::make_span(cones.w_bar), - cuopt::make_span(cones.omega), - span_offsets, - span_elem); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/barrier/second_order_cone_kernels.cuh b/cpp/src/barrier/second_order_cone_kernels.cuh new file mode 100644 index 0000000000..42568fcc9b --- /dev/null +++ b/cpp/src/barrier/second_order_cone_kernels.cuh @@ -0,0 +1,1004 @@ +/* clang-format off */ +/* + * 
SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include + +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// ============================================================================= +// SOC (second-order cone) kernels for the cuOpt barrier solver. +// +// x_soc : cone primal block +// z_soc : cone dual block +// W, W^{-1} : Nesterov-Todd scaling matrix and inverse. W is symmetric for +// SOC, so W^{-T} = W^{-1} +// H : W^{-1} W^{-T} = W^{-2}, the cone KKT block added to the +// primal-reduced system +// eta : sqrt(x_J / z_J), where x_J = sqrt(det_J(x_soc)) +// w : NT scaling direction with det_J(w) = 1 and +// w[head] = sqrt(1 + ||w_tail||^2) +// +// Cone vectors are packed flat: +// entries [cone_offsets[i], cone_offsets[i + 1]) belong to cone i. +// ============================================================================= + +namespace cuopt::linear_programming::dual_simplex { + +inline constexpr int soc_block_size = 256; + +/** + * Reusable device workspace for second-order cone kernels. + * + * The scratch object owns only temporary storage. Kernels may reuse the scalar + * slots and `temp_cone` sequentially inside a higher-level operation, but no + * persistent NT scaling or iterate state is stored here. + */ +template +struct cone_scratch_t { + i_t n_cones; // number of SOC blocks + std::size_t n_cone_entries; // total packed cone dimension + + rmm::device_uvector slots; // [n_slots * n_cones] + + // Per-cone step candidates before the final min reduction. 
+ rmm::device_uvector step_alpha_primal; // [n_cones] + rmm::device_uvector step_alpha_dual; // [n_cones] + + // TODO: Consider moving this out to the barrier layer when we wire it in + rmm::device_uvector temp_cone; // [n_cone_entries] + + cone_scratch_t(i_t n_cones_in, std::size_t n_cone_entries_in, rmm::cuda_stream_view stream) + : n_cones(n_cones_in), + n_cone_entries(n_cone_entries_in), + slots(0, stream), + step_alpha_primal(0, stream), + step_alpha_dual(0, stream), + temp_cone(0, stream) + { + const auto n_cones_size = static_cast(n_cones); + + slots.resize(n_cones_size * static_cast(n_slots), stream); + step_alpha_primal.resize(n_cones_size, stream); + step_alpha_dual.resize(n_cones_size, stream); + temp_cone.resize(n_cone_entries, stream); + } + + template + raft::device_span get_slot() const + { + static_assert(slot_idx >= 0 && slot_idx < n_slots, "scratch slot index out of range"); + const auto n_cones_size = static_cast(n_cones); + const auto begin = static_cast(slot_idx) * n_cones_size; + const auto end = begin + n_cones_size; + return cuopt::make_span(slots, begin, end); + } + + template + raft::device_span get_slot() + { + const auto const_slot = static_cast(*this).template get_slot(); + return raft::device_span(const_cast(const_slot.data()), const_slot.size()); + } +}; + +struct to_size_t_t { + template + HD std::size_t operator()(value_t value) const + { + return value; + } +}; + +template +HD f_t cone_step_length_from_scalars( + f_t u0, f_t du0, f_t du_tail_sq, f_t u_tail_du_tail, f_t u_tail_sq, f_t alpha_max) +{ + const auto a = du0 * du0 - du_tail_sq; + const auto b = u0 * du0 - u_tail_du_tail; + const auto c_raw = u0 * u0 - u_tail_sq; + const auto c = c_raw > 0 ? 
c_raw : 0; + const auto disc = b * b - a * c; + auto alpha = alpha_max; + + if (du0 < 0) { alpha = cuda::std::min(alpha, -u0 / du0); } + + if ((a > 0 && b > 0) || disc < 0) { return alpha; } + + if (a == 0) { + if (b < 0) { alpha = cuda::std::min(alpha, c / (-2 * b)); } + } else if (c == 0) { + alpha = a >= 0 ? alpha : 0; + } else { + const auto t = -(b + copysign(sqrt(disc), b)); + auto r1 = c / t; + auto r2 = t / a; + if (r1 < 0) { r1 = alpha; } + if (r2 < 0) { r2 = alpha; } + alpha = cuda::std::min(alpha, cuda::std::min(r1, r2)); + } + + return alpha; +} + +template +__global__ void __launch_bounds__(soc_block_size) + step_length_single_kernel(raft::device_span u, + raft::device_span du, + raft::device_span alpha, + raft::device_span du_tail_sq, + raft::device_span u_tail_du_tail, + raft::device_span u_tail_sq, + raft::device_span cone_offsets, + f_t alpha_max, + i_t n_cones) +{ + const auto cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (cone >= n_cones) { return; } + + const auto off = cone_offsets[cone]; + alpha[cone] = cone_step_length_from_scalars( + u[off], du[off], du_tail_sq[cone], u_tail_du_tail[cone], u_tail_sq[cone], alpha_max); +} + +/** + * Device storage for second-order cone topology, NT scaling, and iterate views. + * + * Flat arrays are packed by cone: entries + * [cone_offsets[i], cone_offsets[i + 1]) belong to cone i, whose dimension is + * cone_dimensions[i]. + * + * The primal/dual cone vectors are non-owning spans over the SOC slice of the + * solver's global x/z vectors. The caller must keep the underlying storage + * alive for the lifetime of this object. + */ +template +struct cone_data_t { + // Topology. This is immutable after construction. 
+ i_t n_cones; // number of SOC blocks + std::size_t n_cone_entries; // total packed cone dimension = sum(cone_dimensions) + + rmm::device_uvector cone_offsets; // [n_cones + 1], prefix sum of dimensions + rmm::device_uvector cone_dimensions; // [n_cones], dimension q_i of each cone + // Owning cone per entry for upcoming flat per-entry SOC kernels. + rmm::device_uvector element_cone_ids; // [n_cone_entries] + segmented_sum_t segmented_sum; + + // Non-owning iterate views over the cone portion of x/z. + raft::device_span x; // [n_cone_entries], SOC primal block + raft::device_span z; // [n_cone_entries], SOC dual block + + // Persistent Nesterov-Todd scaling state, recomputed from x/z each iteration. + rmm::device_uvector eta; // [n_cones], sqrt(|x|_J / |z|_J) + rmm::device_uvector w; // [n_cone_entries], unit-J-norm NT direction + + cone_scratch_t scratch; + + cone_data_t(std::span cone_dimensions_host, + raft::device_span x_in, + raft::device_span z_in, + rmm::cuda_stream_view stream) + : n_cones(cone_dimensions_host.size()), + n_cone_entries( + std::reduce(cone_dimensions_host.begin(), cone_dimensions_host.end(), std::size_t{0})), + cone_offsets(n_cones + 1, stream), + cone_dimensions(n_cones, stream), + element_cone_ids(n_cone_entries, stream), + segmented_sum(cone_dimensions_host, cuopt::make_span(cone_offsets), stream), + x(x_in), + z(z_in), + eta(n_cones, stream), + w(n_cone_entries, stream), + scratch(n_cones, n_cone_entries, stream) + { + raft::copy(cone_dimensions.data(), cone_dimensions_host.data(), n_cones, stream); + cone_offsets.set_element_to_zero_async(0, stream); + auto policy = rmm::exec_policy(stream); + + auto cone_dimensions_as_offsets = + thrust::make_transform_iterator(cone_dimensions.begin(), to_size_t_t{}); + thrust::inclusive_scan(policy, + cone_dimensions_as_offsets, + cone_dimensions_as_offsets + n_cones, + cone_offsets.begin() + 1, + cuda::std::plus{}); + + thrust::upper_bound(policy, + cone_offsets.begin() + 1, + cone_offsets.end(), + 
thrust::make_counting_iterator(0), + thrust::make_counting_iterator(n_cone_entries), + element_cone_ids.begin()); + segmented_sum.template prepare_workspace(stream); + } +}; + +template +__global__ void __launch_bounds__(soc_block_size) + nt_finalize_scaling_scalars_kernel(raft::device_span x, + raft::device_span z, + raft::device_span x_scale, + raft::device_span z_scale, + raft::device_span eta, + raft::device_span cone_offsets, + i_t n_cones) +{ + const auto cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (cone >= n_cones) { return; } + + const auto off = cone_offsets[cone]; + const auto x_tail_norm = sqrt(x_scale[cone]); + const auto z_tail_norm = sqrt(z_scale[cone]); + const auto x_det = (x[off] - x_tail_norm) * (x[off] + x_tail_norm); + const auto z_det = (z[off] - z_tail_norm) * (z[off] + z_tail_norm); + + x_scale[cone] = sqrt(x_det); + z_scale[cone] = sqrt(z_det); + eta[cone] = sqrt(x_scale[cone] / z_scale[cone]); +} + +template +__global__ void __launch_bounds__(soc_block_size) + nt_finalize_w_scale_kernel(raft::device_span normalized_dot, i_t n_cones) +{ + const auto cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (cone >= n_cones) { return; } + + normalized_dot[cone] = sqrt(2 + 2 * normalized_dot[cone]); +} + +/** + * Write normalized w_tail directly: + * + * w_tail = (x_tail / x_scale - z_tail / z_scale) / w_scale. + * + * The head is zeroed temporarily and overwritten after reducing + * ||w_tail||^2. 
+ */ +template +__global__ void __launch_bounds__(soc_block_size) + nt_write_w_tail_kernel(raft::device_span x, + raft::device_span z, + raft::device_span x_scale, + raft::device_span z_scale, + raft::device_span w_scale, + raft::device_span w, + raft::device_span cone_offsets, + raft::device_span element_cone_ids) +{ + const auto idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= w.size()) { return; } + + const auto cone = element_cone_ids[idx]; + const auto cone_off = cone_offsets[cone]; + if (idx == cone_off) { + w[idx] = 0; + return; + } + + w[idx] = (x[idx] / x_scale[cone] - z[idx] / z_scale[cone]) / w_scale[cone]; +} + +template +__global__ void __launch_bounds__(soc_block_size) + nt_finalize_head_kernel(raft::device_span w, + raft::device_span normalized_tail_sq, + raft::device_span cone_offsets, + i_t n_cones) +{ + const auto cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (cone >= n_cones) { return; } + + w[cone_offsets[cone]] = sqrt(1 + normalized_tail_sq[cone]); +} + +/** + * Build Nesterov-Todd scaling for packed SOC blocks. 
+ * + * Given interior cone primal/dual blocks x and z: + * + * x_scale = sqrt(det_J(x)), z_scale = sqrt(det_J(z)) + * eta = sqrt(x_scale / z_scale) + * w_scale = sqrt(2 + 2 * dot(x / x_scale, z / z_scale)) + * w_tail = (x_tail / x_scale - z_tail / z_scale) / w_scale + * w_0 = sqrt(1 + ||w_tail||^2) to re-impose det_J(w) = 1 + * + * Scratch slots: + * 0: ||x_tail||^2 -> x_scale + * 1: ||z_tail||^2 -> z_scale + * 2: dot(x / x_scale, z / z_scale) -> w_scale -> ||w_tail||^2 + */ +template +void launch_nt_scaling(cone_data_t& cones, rmm::cuda_stream_view stream) +{ + auto x_scale = cones.scratch.template get_slot<0>(); + auto z_scale = cones.scratch.template get_slot<1>(); + auto w_scale = cones.scratch.template get_slot<2>(); + + const auto span_x = cones.x; + const auto span_z = cones.z; + const auto cone_offsets = cuopt::make_span(cones.cone_offsets); + const auto element_cone_ids = cuopt::make_span(cones.element_cone_ids); + + auto x_tail_sq_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [span_x, cone_offsets, element_cone_ids] HD(std::size_t idx) -> f_t { + const auto cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : span_x[idx] * span_x[idx]; + }); + cones.segmented_sum(x_tail_sq_terms, x_scale, stream); + + auto z_tail_sq_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [span_z, cone_offsets, element_cone_ids] HD(std::size_t idx) -> f_t { + const auto cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 
0 : span_z[idx] * span_z[idx]; + }); + cones.segmented_sum(z_tail_sq_terms, z_scale, stream); + + const auto cone_grid_dim = + raft::ceildiv(static_cast(cones.n_cones), soc_block_size); + nt_finalize_scaling_scalars_kernel + <<>>( + cones.x, cones.z, x_scale, z_scale, cuopt::make_span(cones.eta), cone_offsets, cones.n_cones); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + const auto element_grid_dim = raft::ceildiv(cones.n_cone_entries, soc_block_size); + auto normalized_dot_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [span_x, span_z, x_scale, z_scale, element_cone_ids] HD(std::size_t idx) -> f_t { + const auto cone = element_cone_ids[idx]; + return span_x[idx] * span_z[idx] / (x_scale[cone] * z_scale[cone]); + }); + cones.segmented_sum(normalized_dot_terms, w_scale, stream); + + nt_finalize_w_scale_kernel + <<>>(w_scale, cones.n_cones); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + auto w = cuopt::make_span(cones.w); + nt_write_w_tail_kernel<<>>( + cones.x, cones.z, x_scale, z_scale, w_scale, w, cone_offsets, element_cone_ids); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // Reduce ||w_tail||^2 per cone. The head entries are zero, so the same + // flat iterator can feed the segmented reduction. + auto normalized_tail_terms = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [cone_offsets, element_cone_ids, w] HD(std::size_t idx) -> f_t { + const auto cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 
0 : w[idx] * w[idx]; + }); + cones.segmented_sum(normalized_tail_terms, w_scale, stream); + + nt_finalize_head_kernel<<>>( + cuopt::make_span(cones.w), w_scale, cone_offsets, cones.n_cones); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +__global__ void __launch_bounds__(soc_block_size) + apply_w_write_kernel(raft::device_span v, + raft::device_span out, + raft::device_span w, + raft::device_span eta, + raft::device_span tail_dot, + raft::device_span cone_offsets, + raft::device_span element_cone_ids) +{ + const auto idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= out.size()) { return; } + + const auto cone = element_cone_ids[idx]; + const auto cone_off = cone_offsets[cone]; + const auto local_idx = idx - cone_off; + + const auto w0 = w[cone_off]; + const auto zeta = tail_dot[cone]; + const auto inv_1pw0 = 1 / (1 + w0); + const auto v0 = v[cone_off]; + + if (local_idx == 0) { + out[idx] = eta[cone] * (w0 * v0 + zeta); + return; + } + + const auto coeff = v0 + zeta * inv_1pw0; + out[idx] = eta[cone] * (v[idx] + coeff * w[idx]); +} + +template +__global__ void __launch_bounds__(soc_block_size) + apply_w_inv_write_kernel(raft::device_span v, + raft::device_span out, + raft::device_span w, + raft::device_span eta, + raft::device_span tail_dot, + raft::device_span cone_offsets, + raft::device_span element_cone_ids) +{ + const auto idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= out.size()) { return; } + + const auto cone = element_cone_ids[idx]; + const auto cone_off = cone_offsets[cone]; + const auto local_idx = idx - cone_off; + + const auto w0 = w[cone_off]; + const auto zeta = tail_dot[cone]; + const auto inv_1pw0 = 1 / (1 + w0); + const auto v0 = v[cone_off]; + const auto inv_eta = 1 / eta[cone]; + + if (local_idx == 0) { + out[idx] = inv_eta * (w0 * v0 - zeta); + return; + } + + const auto coeff = -v0 + zeta * inv_1pw0; + out[idx] = inv_eta * (v[idx] + coeff * w[idx]); +} + +template +__global__ void 
__launch_bounds__(soc_block_size) + apply_hinv2_write_kernel(raft::device_span v, + raft::device_span out, + raft::device_span w, + raft::device_span eta, + raft::device_span tail_dot, + raft::device_span cone_offsets, + raft::device_span element_cone_ids, + raft::device_span bias, + f_t output_scale, + f_t bias_scale) +{ + const auto idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= out.size()) { return; } + + const auto cone = element_cone_ids[idx]; + const auto cone_off = cone_offsets[cone]; + const auto local_idx = idx - cone_off; + + const auto inv_eta_sq = 1 / (eta[cone] * eta[cone]); + const auto rho = w[cone_off] * v[cone_off] - tail_dot[cone]; + const auto coeff = 2 * rho * inv_eta_sq; + const int sign = (local_idx == 0) * 2 - 1; + const auto value = coeff * w[idx] - inv_eta_sq * v[idx]; + const auto h_value = output_scale * value * sign; + + out[idx] = bias.empty() ? h_value : bias_scale * bias[idx] + h_value; +} + +template +__global__ void __launch_bounds__(soc_block_size) + gather_cone_heads_kernel(raft::device_span values, + raft::device_span heads, + raft::device_span cone_offsets, + i_t n_cones) +{ + const auto cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (cone >= n_cones) { return; } + + heads[cone] = values[cone_offsets[cone]]; +} + +/** + * Build the Mehrotra corrector shift: + * + * d = (W^{-1} dx_aff) o (W dz_aff) - sigma_mu e. + * + * On entry, `scaled_dx` is W^{-1} dx_aff and `scaled_dz` is W dz_aff. The + * cone head uses the full dot product, and tail entries use the SOC Jordan + * product: + * + * d_0 = - sigma_mu + * d_tail = scaled_dx_0 * scaled_dz_tail + scaled_dz_0 * scaled_dx_tail. 
+ */ +template +__global__ void __launch_bounds__(soc_block_size) + combined_cone_shift_write_kernel(raft::device_span shift, + raft::device_span scaled_dx, + raft::device_span scaled_dz, + raft::device_span full_dot, + raft::device_span scaled_dx_head, + raft::device_span scaled_dz_head, + raft::device_span cone_offsets, + raft::device_span element_cone_ids, + f_t sigma_mu) +{ + const auto idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= shift.size()) { return; } + + const auto cone = element_cone_ids[idx]; + const auto cone_off = cone_offsets[cone]; + const auto local_idx = idx - cone_off; + + if (local_idx == 0) { + shift[idx] = full_dot[cone] - sigma_mu; + return; + } + + shift[idx] = scaled_dx_head[cone] * scaled_dz[idx] + scaled_dz_head[cone] * scaled_dx[idx]; +} + +/** + * Per-cone scalar stage for p = lambda \ d: + * + * p_0 = (lambda_0 d_0 - ) / det_J(lambda) + * inv_lambda_0 = 1 / lambda_0. + * + * A second flat kernel writes `-p`, which lets the final W^{-1} call produce + * q = -W^{-1} p without adding an output-scale argument to W^{-1}. 
+ */ +template +__global__ void __launch_bounds__(soc_block_size) + jordan_divide_by_lambda_scalar_kernel(raft::device_span shift, + raft::device_span nt_point, + raft::device_span lambda_tail_dot, + raft::device_span lambda_tail_sq, + raft::device_span p0, + raft::device_span inv_lambda0, + raft::device_span cone_offsets, + i_t n_cones) +{ + const auto cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (cone >= n_cones) { return; } + + const auto cone_off = cone_offsets[cone]; + const auto lambda0 = nt_point[cone_off]; + const auto lambda_tail_norm = sqrt(lambda_tail_sq[cone]); + const auto det_lambda = (lambda0 - lambda_tail_norm) * (lambda0 + lambda_tail_norm); + + p0[cone] = (lambda0 * shift[cone_off] - lambda_tail_dot[cone]) / det_lambda; + inv_lambda0[cone] = 1 / lambda0; +} + +template +__global__ void __launch_bounds__(soc_block_size) + jordan_divide_by_lambda_write_kernel(raft::device_span shift, + raft::device_span nt_point, + raft::device_span p0, + raft::device_span inv_lambda0, + raft::device_span cone_offsets, + raft::device_span element_cone_ids, + raft::device_span out) +{ + const auto idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= out.size()) { return; } + + const auto cone = element_cone_ids[idx]; + const auto cone_off = cone_offsets[cone]; + const auto local_idx = idx - cone_off; + + if (local_idx == 0) { + out[idx] = -p0[cone]; + return; + } + + out[idx] = (p0[cone] * nt_point[idx] - shift[idx]) * inv_lambda0[cone]; +} + +/** + * Apply the Nesterov-Todd scaling matrix: out = W v. 
+ * + * For each cone: + * zeta = + * (Wv)_0 = eta * (w_0 v_0 + zeta) + * (Wv)_tail = eta * (v_tail + (v_0 + zeta / (1 + w_0)) w_tail) + */ +template +void apply_w(raft::device_span v, + raft::device_span out, + cone_data_t& cones, + rmm::cuda_stream_view stream) +{ + auto w = cuopt::make_span(cones.w); + auto eta = cuopt::make_span(cones.eta); + auto cone_offsets = cuopt::make_span(cones.cone_offsets); + auto element_cone_ids = cuopt::make_span(cones.element_cone_ids); + auto tail_dot = cones.scratch.template get_slot<0>(); + + auto tail_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [v, w, cone_offsets, element_cone_ids] HD(std::size_t idx) -> f_t { + const auto cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : w[idx] * v[idx]; + }); + cones.segmented_sum(tail_terms, tail_dot, stream); + + const auto grid_dim = raft::ceildiv(out.size(), soc_block_size); + apply_w_write_kernel<<>>( + v, out, w, eta, tail_dot, cone_offsets, element_cone_ids); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * Apply the inverse Nesterov-Todd scaling matrix: + * out = W^{-1} v. + * + * For each cone, + * zeta = + * (W^{-1}v)_0 = eta^{-1} * (w_0 v_0 - zeta) + * (W^{-1}v)_tail = + * eta^{-1} * (v_tail + (-v_0 + zeta / (1 + w_0)) w_tail) + */ +template +void apply_w_inv(raft::device_span v, + raft::device_span out, + cone_data_t& cones, + rmm::cuda_stream_view stream) +{ + auto w = cuopt::make_span(cones.w); + auto eta = cuopt::make_span(cones.eta); + auto cone_offsets = cuopt::make_span(cones.cone_offsets); + auto element_cone_ids = cuopt::make_span(cones.element_cone_ids); + auto tail_dot = cones.scratch.template get_slot<0>(); + + auto tail_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [v, w, cone_offsets, element_cone_ids] HD(std::size_t idx) -> f_t { + const auto cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 
0 : w[idx] * v[idx]; + }); + cones.segmented_sum(tail_terms, tail_dot, stream); + + const auto grid_dim = raft::ceildiv(out.size(), soc_block_size); + apply_w_inv_write_kernel<<>>( + v, out, w, eta, tail_dot, cone_offsets, element_cone_ids); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * Apply the cone KKT block H = W^{-1} W^{-T} = W^{-2}. + * + * With zeta = and rho = w_0 v_0 - zeta: + * (Hv)_0 = eta^{-2} (2 w_0 rho - v_0) + * (Hv)_tail = eta^{-2} (v_tail - 2 w_tail rho) + */ +template +void apply_hinv2(raft::device_span v, + raft::device_span out, + cone_data_t& cones, + rmm::cuda_stream_view stream, + f_t output_scale = 1, + raft::device_span bias = {}, + f_t bias_scale = 0) +{ + auto w = cuopt::make_span(cones.w); + auto eta = cuopt::make_span(cones.eta); + auto cone_offsets = cuopt::make_span(cones.cone_offsets); + auto element_cone_ids = cuopt::make_span(cones.element_cone_ids); + auto tail_dot = cones.scratch.template get_slot<0>(); + + auto tail_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [v, w, cone_offsets, element_cone_ids] HD(std::size_t idx) -> f_t { + const auto cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : w[idx] * v[idx]; + }); + cones.segmented_sum(tail_terms, tail_dot, stream); + + const auto grid_dim = raft::ceildiv(out.size(), soc_block_size); + apply_hinv2_write_kernel<<>>( + v, out, w, eta, tail_dot, cone_offsets, element_cone_ids, bias, output_scale, bias_scale); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * Recover the SOC dual direction after the reduced KKT solve. + * + * The reduced solve gives `dx`; the cone equation supplies the target RHS. + * This function applies the cone block H = W^{-2} and writes: + * dz = cone_target - H dx. 
+ */ +template +void recover_cone_dz_from_target(raft::device_span dx, + cone_data_t& cones, + raft::device_span cone_target, + raft::device_span dz, + rmm::cuda_stream_view stream) +{ + apply_hinv2(dx, dz, cones, stream, -1, cone_target, 1); +} + +/** + * Accumulate the SOC cone-block matvec into an existing output vector. + * + * Used by matrix-free products with the primal-reduced KKT block: + * out += H x, where H = W^{-2}. + */ +template +void accumulate_cone_hinv2_matvec(raft::device_span x, + cone_data_t& cones, + raft::device_span out, + rmm::cuda_stream_view stream) +{ + auto out_input = raft::device_span(out.data(), out.size()); + apply_hinv2(x, out, cones, stream, 1, out_input, 1); +} + +template +__global__ void __launch_bounds__(soc_block_size) + scatter_hinv2_into_augmented_kernel(raft::device_span augmented_x, + raft::device_span csr_indices, + raft::device_span q_values, + raft::device_span w, + raft::device_span eta, + raft::device_span cone_offsets, + raft::device_span block_offsets, + i_t n_cones) +{ + const auto e = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (e >= csr_indices.size()) { return; } + + i_t lo = 0; + i_t hi = n_cones; + while (lo < hi) { + const i_t mid = lo + (hi - lo) / 2; + if (block_offsets[mid + 1] <= e) { + lo = mid + 1; + } else { + hi = mid; + } + } + + const auto cone = lo; + const auto off = cone_offsets[cone]; + const auto q = cone_offsets[cone + 1] - off; + const auto blk_off = block_offsets[cone]; + const auto local = e - blk_off; + const auto r = local / q; + const auto c = local % q; + + const auto inv_eta_sq = 1 / (eta[cone] * eta[cone]); + const auto w0 = w[off]; + const auto u_r = (r == 0) ? w0 : -w[off + r]; + const auto u_c = (c == 0) ? w0 : -w[off + c]; + auto val = f_t{2} * u_r * inv_eta_sq * u_c; + const auto diag_correction = (r == 0) ? 
-inv_eta_sq : inv_eta_sq;
+  if (r == c) { val += diag_correction; }
+
+  augmented_x[csr_indices[e]] = -val - q_values[e];
+}
+
+template <typename i_t, typename f_t>
+void scatter_hinv2_into_augmented(const cone_data_t<i_t, f_t>& cones,
+                                  rmm::device_uvector<f_t>& augmented_x,
+                                  const rmm::device_uvector<i_t>& csr_indices,
+                                  const rmm::device_uvector<f_t>& q_values,
+                                  rmm::cuda_stream_view stream)
+{
+  const auto count = csr_indices.size();
+  if (count == 0) { return; }
+  cuopt_assert(count == q_values.size(), "cone CSR index and Q-value arrays must match");
+
+  rmm::device_uvector<std::size_t> block_offsets(cones.n_cones + 1, stream);
+  block_offsets.set_element_to_zero_async(0, stream);
+
+  auto block_sizes = thrust::make_transform_iterator(
+    cones.cone_dimensions.begin(),
+    [] HD(i_t q) -> std::size_t { return static_cast<std::size_t>(q) * q; });
+  thrust::inclusive_scan(
+    rmm::exec_policy(stream), block_sizes, block_sizes + cones.n_cones, block_offsets.begin() + 1);
+
+  const auto grid = raft::ceildiv(count, soc_block_size);
+  scatter_hinv2_into_augmented_kernel
+    <<<grid, soc_block_size, 0, stream.value()>>>(cuopt::make_span(augmented_x),
+                                                  cuopt::make_span(csr_indices),
+                                                  cuopt::make_span(q_values),
+                                                  cuopt::make_span(cones.w),
+                                                  cuopt::make_span(cones.eta),
+                                                  cuopt::make_span(cones.cone_offsets),
+                                                  cuopt::make_span(block_offsets),
+                                                  cones.n_cones);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/**
+ * Compute the maximum primal and dual step lengths that keep SOC blocks
+ * feasible:
+ *
+ *   x + alpha dx in Q,   z + alpha dz in Q,   alpha <= alpha_max.
+ *
+ * For one cone u + alpha du, feasibility is
+ *
+ *   u_0 + alpha du_0 >= ||u_tail + alpha du_tail||.
+ *
+ * Squaring gives the quadratic
+ *
+ *   c + 2 b alpha + a alpha^2 >= 0,
+ *
+ * where c = det_J(u), b = u_0 du_0 - <u_tail, du_tail>, and
+ * a = det_J(du). The per-cone kernel below solves for the first boundary
+ * crossing, and the final reductions take the global minimum over cones.
+ */
+template <typename i_t, typename f_t>
+std::pair<f_t, f_t> compute_cone_step_length(cone_data_t<i_t, f_t>& cones,
+                                             raft::device_span<const f_t> dx,
+                                             raft::device_span<const f_t> dz,
+                                             f_t alpha_max,
+                                             rmm::cuda_stream_view stream)
+{
+  auto cone_offsets     = cuopt::make_span(cones.cone_offsets);
+  auto element_cone_ids = cuopt::make_span(cones.element_cone_ids);
+  auto slot_0           = cones.scratch.template get_slot<0>();
+  auto slot_1           = cones.scratch.template get_slot<1>();
+  auto slot_2           = cones.scratch.template get_slot<2>();
+
+  auto run_pass = [&](raft::device_span<const f_t> u,
+                      raft::device_span<const f_t> du,
+                      raft::device_span<f_t> alpha) {
+    auto du_tail_sq_terms = thrust::make_transform_iterator(
+      thrust::make_counting_iterator(0),
+      [du, cone_offsets, element_cone_ids] HD(std::size_t idx) -> f_t {
+        const auto cone = element_cone_ids[idx];
+        return idx == cone_offsets[cone] ? 0 : du[idx] * du[idx];
+      });
+    cones.segmented_sum(du_tail_sq_terms, slot_0, stream);
+
+    auto u_tail_du_tail_terms = thrust::make_transform_iterator(
+      thrust::make_counting_iterator(0),
+      [u, du, cone_offsets, element_cone_ids] HD(std::size_t idx) -> f_t {
+        const auto cone = element_cone_ids[idx];
+        return idx == cone_offsets[cone] ? 0 : u[idx] * du[idx];
+      });
+    cones.segmented_sum(u_tail_du_tail_terms, slot_1, stream);
+
+    auto u_tail_sq_terms = thrust::make_transform_iterator(
+      thrust::make_counting_iterator(0),
+      [u, cone_offsets, element_cone_ids] HD(std::size_t idx) -> f_t {
+        const auto cone = element_cone_ids[idx];
+        return idx == cone_offsets[cone] ? 0 : u[idx] * u[idx];
+      });
+    cones.segmented_sum(u_tail_sq_terms, slot_2, stream);
+
+    const auto grid_dim =
+      raft::ceildiv(static_cast<std::size_t>(cones.n_cones), soc_block_size);
+    step_length_single_kernel<<<grid_dim, soc_block_size, 0, stream.value()>>>(
+      u, du, alpha, slot_0, slot_1, slot_2, cone_offsets, alpha_max, cones.n_cones);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  };
+
+  auto alpha_primal = cuopt::make_span(cones.scratch.step_alpha_primal);
+  auto alpha_dual   = cuopt::make_span(cones.scratch.step_alpha_dual);
+
+  run_pass(cones.x, dx, alpha_primal);
+  run_pass(cones.z, dz, alpha_dual);
+
+  const auto primal = thrust::reduce(rmm::exec_policy(stream),
+                                     alpha_primal.begin(),
+                                     alpha_primal.end(),
+                                     alpha_max,
+                                     thrust::minimum<f_t>());
+  const auto dual   = thrust::reduce(rmm::exec_policy(stream),
+                                     alpha_dual.begin(),
+                                     alpha_dual.end(),
+                                     alpha_max,
+                                     thrust::minimum<f_t>());
+
+  return {primal, dual};
+}
+
+/**
+ * Build the SOC corrector target for the reduced KKT solve.
+ *
+ * Mehrotra's corrector uses affine cone directions to form
+ *
+ *   d = (W^{-1} dx_aff) o (W dz_aff) - sigma_mu e,
+ *
+ * where `o` is the SOC Jordan product and `e = (1, 0, ..., 0)` per cone.
+ * The reduced KKT solve needs the cone target
+ *
+ *   q = -W^{-1} p, where p = lambda \ d and lambda = W z.
+ *
+ * On return, `out` holds `q`. Internally, `out` is reused for `W dz_aff` and
+ * then `d`; `scratch.temp_cone` is reused for `W^{-1} dx_aff`, then `lambda`,
+ * then `-p`.
+ */ +template +void compute_combined_cone_rhs_term(raft::device_span dx_aff, + raft::device_span dz_aff, + cone_data_t& cones, + f_t sigma_mu, + raft::device_span out, + rmm::cuda_stream_view stream) +{ + auto cone_offsets = cuopt::make_span(cones.cone_offsets); + auto element_cone_ids = cuopt::make_span(cones.element_cone_ids); + + auto scratch_cone = cuopt::make_span(cones.scratch.temp_cone); + auto scaled_dx = raft::device_span(scratch_cone.data(), scratch_cone.size()); + auto scaled_dz = raft::device_span(out.data(), out.size()); + auto slot_0 = cones.scratch.template get_slot<0>(); + auto slot_1 = cones.scratch.template get_slot<1>(); + auto slot_2 = cones.scratch.template get_slot<2>(); + + apply_w_inv(dx_aff, scratch_cone, cones, stream); + apply_w(dz_aff, out, cones, stream); + + auto full_product_terms = thrust::make_transform_iterator( + thrust::make_zip_iterator(scaled_dx.begin(), scaled_dz.begin()), + thrust::make_zip_function([] HD(f_t dx, f_t dz) -> f_t { return dx * dz; })); + cones.segmented_sum(full_product_terms, slot_0, stream); + + // `out` currently aliases W dz_aff and is about to be overwritten with d. + // Stage both head vectors first because every tail entry needs them. + const auto cone_grid_dim = + raft::ceildiv(static_cast(cones.n_cones), soc_block_size); + gather_cone_heads_kernel<<>>( + scaled_dx, slot_1, cone_offsets, cones.n_cones); + gather_cone_heads_kernel<<>>( + scaled_dz, slot_2, cone_offsets, cones.n_cones); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + const auto element_grid_dim = raft::ceildiv(cones.n_cone_entries, soc_block_size); + combined_cone_shift_write_kernel + <<>>( + out, scaled_dx, scaled_dz, slot_0, slot_1, slot_2, cone_offsets, element_cone_ids, sigma_mu); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // Form lambda = W z in scratch_cone. At this point W^{-1} dx_aff is dead. 
+ apply_w(cones.z, scratch_cone, cones, stream); + + auto shift = raft::device_span(out.data(), out.size()); + auto nt_point = raft::device_span(scratch_cone.data(), scratch_cone.size()); + + auto lambda_tail_dot_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [shift, nt_point, cone_offsets, element_cone_ids] HD(std::size_t idx) -> f_t { + const auto cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : nt_point[idx] * shift[idx]; + }); + cones.segmented_sum(lambda_tail_dot_terms, slot_0, stream); + + auto lambda_tail_sq_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [nt_point, cone_offsets, element_cone_ids] HD(std::size_t idx) -> f_t { + const auto cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : nt_point[idx] * nt_point[idx]; + }); + cones.segmented_sum(lambda_tail_sq_terms, slot_1, stream); + + jordan_divide_by_lambda_scalar_kernel + <<>>( + shift, nt_point, slot_0, slot_1, slot_0, slot_1, cone_offsets, cones.n_cones); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + jordan_divide_by_lambda_write_kernel + <<>>( + shift, nt_point, slot_0, slot_1, cone_offsets, element_cone_ids, scratch_cone); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + apply_w_inv(scratch_cone, out, cones, stream); +} + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/barrier/second_order_cone_reduction.cuh b/cpp/src/barrier/second_order_cone_reduction.cuh new file mode 100644 index 0000000000..ada9fcfb6a --- /dev/null +++ b/cpp/src/barrier/second_order_cone_reduction.cuh @@ -0,0 +1,261 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +__global__ void __launch_bounds__(warps_per_cta* raft::WarpSize) + warp_per_cone_reduce_kernel(InputIt input, + raft::device_span small_cone_ids, + raft::device_span cone_offsets, + OutputIt output, + value_t init); + +/** + * Segmented-sum dispatcher for packed second-order cone vectors. + * + * Cone dimensions are fixed for a solve, so the constructor partitions cone + * ids once by reduction strategy. Each call then reuses those partitions: + * small cones use one warp per cone, medium cones use CUB DeviceSegmentedReduce, + * and large cones use CUB DeviceReduce one cone at a time. The object owns the + * CUB workspace for those medium/large paths. Call `prepare_workspace` once + * before using a CUB-backed path. + */ +template +struct segmented_sum_t { + static_assert(warp_cone_dim > 0); + static_assert(large_cone_cutoff > warp_cone_dim); + + raft::device_span cone_offsets; + rmm::device_uvector small_cone_ids; // cone dimension <= warp_cone_dim + rmm::device_uvector medium_cone_ids; // warp_cone_dim < cone dimension <= large_cone_cutoff + + std::vector large_cone_offsets; + std::vector large_cone_ids; + std::vector large_cone_dimensions; + + // Maximum CUB temporary storage needed by prepared medium/large reductions. 
+ std::size_t cub_workspace_bytes = 0; + rmm::device_buffer cub_workspace; + + private: + template + void prepare_workspace_for_type(rmm::cuda_stream_view stream) + { + auto input = thrust::make_constant_iterator(value_t{}); + auto output = thrust::make_discard_iterator(); + + if (!medium_cone_ids.is_empty()) { + const auto medium_begin_offsets = + thrust::make_permutation_iterator(cone_offsets.data(), medium_cone_ids.begin()); + const auto medium_end_offsets = + thrust::make_permutation_iterator(cone_offsets.data() + 1, medium_cone_ids.begin()); + + std::size_t temp_storage_bytes = 0; + RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr, + temp_storage_bytes, + input, + output, + medium_cone_ids.size(), + medium_begin_offsets, + medium_end_offsets, + stream.value())); + cub_workspace_bytes = std::max(cub_workspace_bytes, temp_storage_bytes); + } + + for (std::size_t i = 0; i < large_cone_ids.size(); ++i) { + std::size_t temp_storage_bytes = 0; + RAFT_CUDA_TRY(cub::DeviceReduce::Sum(nullptr, + temp_storage_bytes, + input + large_cone_offsets[i], + output + large_cone_ids[i], + large_cone_dimensions[i], + stream.value())); + cub_workspace_bytes = std::max(cub_workspace_bytes, temp_storage_bytes); + } + + if (cub_workspace.size() < cub_workspace_bytes) { + cub_workspace.resize(cub_workspace_bytes, stream); + } + } + + public: + template + void prepare_workspace(rmm::cuda_stream_view stream) + { + prepare_workspace_for_type(stream); + (prepare_workspace_for_type(stream), ...); + } + + template + void operator()(InputIt input, OutputIt output, value_t init, rmm::cuda_stream_view stream) + { + if (!small_cone_ids.is_empty()) { + // Each warp reduces one small cone. `warps_per_cta` only controls how + // many independent cone reductions are packed into one CTA; the default + // of 8 gives a conventional 256-thread block. 
+ const auto n_small = small_cone_ids.size(); + const auto grid = (n_small + warps_per_cta - 1) / warps_per_cta; + warp_per_cone_reduce_kernel + <<>>( + input, cuopt::make_span(small_cone_ids), cone_offsets, output, init); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + if (!medium_cone_ids.is_empty()) { + cuopt_assert(cub_workspace_bytes > 0 && cub_workspace.size() >= cub_workspace_bytes, + "segmented_sum_t::prepare_workspace must be called before reducing medium or " + "large cones"); + + const auto medium_output = thrust::make_permutation_iterator(output, medium_cone_ids.begin()); + const auto medium_begin_offsets = + thrust::make_permutation_iterator(cone_offsets.data(), medium_cone_ids.begin()); + const auto medium_end_offsets = + thrust::make_permutation_iterator(cone_offsets.data() + 1, medium_cone_ids.begin()); + + std::size_t temp_storage_bytes = cub_workspace_bytes; + RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(cub_workspace.data(), + temp_storage_bytes, + input, + medium_output, + medium_cone_ids.size(), + medium_begin_offsets, + medium_end_offsets, + stream.value())); + } + + if (!large_cone_ids.empty()) { + cuopt_assert(cub_workspace_bytes > 0 && cub_workspace.size() >= cub_workspace_bytes, + "segmented_sum_t::prepare_workspace must be called before reducing medium or " + "large cones"); + + for (std::size_t i = 0; i < large_cone_ids.size(); ++i) { + std::size_t temp_storage_bytes = cub_workspace_bytes; + RAFT_CUDA_TRY(cub::DeviceReduce::Sum(cub_workspace.data(), + temp_storage_bytes, + input + large_cone_offsets[i], + output + large_cone_ids[i], + large_cone_dimensions[i], + stream.value())); + } + } + } + + template + void operator()(InputIt input, raft::device_span output, rmm::cuda_stream_view stream) + { + operator()(input, output.data(), f_t{0}, stream); + } + + segmented_sum_t(std::span cone_dimensions_host, + raft::device_span cone_offsets_in, + rmm::cuda_stream_view stream) + : cone_offsets(cone_offsets_in), + small_cone_ids(0, stream), + 
medium_cone_ids(0, stream), + cub_workspace(0, stream) + { + std::vector small_cone_ids_host; + std::vector medium_cone_ids_host; + + std::size_t cone_offset = 0; + i_t cone = 0; + for (const auto cone_dimension : cone_dimensions_host) { + if (cone_dimension <= warp_cone_dim) { + small_cone_ids_host.push_back(cone); + } else if (cone_dimension <= large_cone_cutoff) { + medium_cone_ids_host.push_back(cone); + } else { + large_cone_ids.push_back(cone); + large_cone_offsets.push_back(cone_offset); + large_cone_dimensions.push_back(cone_dimension); + } + cone_offset += cone_dimension; + ++cone; + } + + bool need_sync = false; + if (!small_cone_ids_host.empty()) { + cuopt::device_copy(small_cone_ids, small_cone_ids_host, stream); + need_sync = true; + } + if (!medium_cone_ids_host.empty()) { + cuopt::device_copy(medium_cone_ids, medium_cone_ids_host, stream); + need_sync = true; + } + if (need_sync) { stream.synchronize(); } + } +}; + +template +__global__ void __launch_bounds__(warps_per_cta* raft::WarpSize) + warp_per_cone_reduce_kernel(InputIt input, + raft::device_span small_cone_ids, + raft::device_span cone_offsets, + OutputIt output, + value_t init) +{ + static_assert(warps_per_cta > 0); + static_assert(warps_per_cta * raft::WarpSize <= 1024); + + using warp_reduce_t = cub::WarpReduce; + __shared__ typename warp_reduce_t::TempStorage temp_storage[warps_per_cta]; + + const auto lane_id = raft::laneId(); + const auto warp_idx = threadIdx.x / raft::WarpSize; + const auto slot = blockIdx.x * warps_per_cta + warp_idx; + if (slot >= small_cone_ids.size()) { return; } + + const auto cone = small_cone_ids[slot]; + const auto off = cone_offsets[cone]; + const auto dim = cone_offsets[cone + 1] - off; + + auto sum = init; + for (std::size_t i = lane_id; i < dim; i += raft::WarpSize) { + sum = sum + input[off + i]; + } + + sum = warp_reduce_t(temp_storage[warp_idx]).Sum(sum); + if (lane_id == 0) { output[cone] = sum; } +} + +} // namespace 
cuopt::linear_programming::dual_simplex diff --git a/cpp/tests/dual_simplex/CMakeLists.txt b/cpp/tests/dual_simplex/CMakeLists.txt index 1abeb62ded..f6dff93227 100644 --- a/cpp/tests/dual_simplex/CMakeLists.txt +++ b/cpp/tests/dual_simplex/CMakeLists.txt @@ -6,5 +6,5 @@ ConfigureTest(DUAL_SIMPLEX_TEST ${CMAKE_CURRENT_SOURCE_DIR}/unit_tests/solve.cpp ${CMAKE_CURRENT_SOURCE_DIR}/unit_tests/solve_barrier.cu - ${CMAKE_CURRENT_SOURCE_DIR}/unit_tests/second_order_cone_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/unit_tests/second_order_cone_kernels.cu ) diff --git a/cpp/tests/dual_simplex/unit_tests/second_order_cone_kernels.cu b/cpp/tests/dual_simplex/unit_tests/second_order_cone_kernels.cu new file mode 100644 index 0000000000..b8e6eef487 --- /dev/null +++ b/cpp/tests/dual_simplex/unit_tests/second_order_cone_kernels.cu @@ -0,0 +1,625 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cuopt::linear_programming::dual_simplex::test { + +double host_cone_step_length_from_scalars(double u0, + double du0, + double du_tail_sq, + double u_tail_du_tail, + double u_tail_sq, + double alpha_max) +{ + const auto a = du0 * du0 - du_tail_sq; + const auto b = u0 * du0 - u_tail_du_tail; + const auto c_raw = u0 * u0 - u_tail_sq; + const auto c = c_raw > 0.0 ? c_raw : 0.0; + const auto disc = b * b - a * c; + auto alpha = alpha_max; + + if (du0 < 0.0) { alpha = std::min(alpha, -u0 / du0); } + + if ((a > 0.0 && b > 0.0) || disc < 0.0) { return alpha; } + + if (a == 0.0) { + if (b < 0.0) { alpha = std::min(alpha, c / (-2.0 * b)); } + } else if (c == 0.0) { + alpha = a >= 0.0 ? 
alpha : 0.0; + } else { + const auto t = -(b + std::copysign(std::sqrt(disc), b)); + auto r1 = c / t; + auto r2 = t / a; + if (r1 < 0.0) { r1 = alpha; } + if (r2 < 0.0) { r2 = alpha; } + alpha = std::min(alpha, std::min(r1, r2)); + } + + return alpha; +} + +TEST(second_order_cone_kernels, topology_and_scratch_layout) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 2, 5}; + rmm::device_uvector x(10, stream); + rmm::device_uvector z(10, stream); + + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + + EXPECT_EQ(cones.n_cones, std::size_t{3}); + EXPECT_EQ(cones.n_cone_entries, std::size_t{10}); + EXPECT_EQ(cones.x.data(), x.data()); + EXPECT_EQ(cones.z.data(), z.data()); + + EXPECT_EQ(cuopt::host_copy(cones.cone_offsets, stream), (std::vector{0, 3, 5, 10})); + EXPECT_EQ(cuopt::host_copy(cones.cone_dimensions, stream), cone_dimensions); + EXPECT_EQ(cuopt::host_copy(cones.element_cone_ids, stream), + (std::vector{0, 0, 0, 1, 1, 2, 2, 2, 2, 2})); + EXPECT_EQ(cuopt::host_copy(cones.segmented_sum.small_cone_ids, stream), + (std::vector{0, 1, 2})); + EXPECT_TRUE(cuopt::host_copy(cones.segmented_sum.medium_cone_ids, stream).empty()); + EXPECT_TRUE(cones.segmented_sum.large_cone_ids.empty()); + EXPECT_TRUE(cones.segmented_sum.large_cone_offsets.empty()); + EXPECT_TRUE(cones.segmented_sum.large_cone_dimensions.empty()); + + EXPECT_EQ(cones.eta.size(), 3); + EXPECT_EQ(cones.w.size(), 10); + + auto& scratch = cones.scratch; + EXPECT_EQ(scratch.n_cones, cones.n_cones); + EXPECT_EQ(scratch.n_cone_entries, cones.n_cone_entries); + EXPECT_EQ(scratch.slots.size(), 3 * cone_dimensions.size()); + EXPECT_EQ(scratch.step_alpha_primal.size(), cone_dimensions.size()); + EXPECT_EQ(scratch.step_alpha_dual.size(), cone_dimensions.size()); + EXPECT_EQ(scratch.temp_cone.size(), x.size()); + + EXPECT_EQ(scratch.get_slot<0>().size(), cone_dimensions.size()); + EXPECT_EQ(scratch.get_slot<1>().data(), scratch.get_slot<0>().data() 
+ cones.n_cones); + EXPECT_EQ(scratch.get_slot<2>().data(), scratch.get_slot<1>().data() + cones.n_cones); +} + +TEST(second_order_cone_kernels, segmented_sum_uses_all_cone_size_buckets) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{65, 3, 66, 32769}; + rmm::device_uvector x(32903, stream); + rmm::device_uvector z(32903, stream); + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + + EXPECT_EQ(cuopt::host_copy(cones.segmented_sum.small_cone_ids, stream), (std::vector{1})); + EXPECT_EQ(cuopt::host_copy(cones.segmented_sum.medium_cone_ids, stream), + (std::vector{0, 2})); + EXPECT_EQ(cones.segmented_sum.large_cone_ids, (std::vector{3})); + EXPECT_EQ(cones.segmented_sum.large_cone_offsets, (std::vector{134})); + EXPECT_EQ(cones.segmented_sum.large_cone_dimensions, (std::vector{32769})); + + std::vector values_host(cones.n_cone_entries, 1.0); + rmm::device_uvector values(values_host.size(), stream); + rmm::device_uvector sums(cone_dimensions.size(), stream); + raft::copy(values.data(), values_host.data(), values_host.size(), stream); + + EXPECT_GT(cones.segmented_sum.cub_workspace_bytes, 0); + const auto workspace_size = cones.segmented_sum.cub_workspace.size(); + EXPECT_GT(workspace_size, 0); + + cones.segmented_sum(values.data(), cuopt::make_span(sums), stream); + + EXPECT_EQ(cuopt::host_copy(sums, stream), (std::vector{65.0, 3.0, 66.0, 32769.0})); + EXPECT_EQ(cones.segmented_sum.cub_workspace.size(), workspace_size); +} + +TEST(second_order_cone_kernels, nt_scaling_matches_host_reference) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 65, 32769}; + std::size_t n_cone_entries = 0; + for (const auto dim : cone_dimensions) { + n_cone_entries += static_cast(dim); + } + + std::vector x_host(n_cone_entries); + std::vector z_host(n_cone_entries); + std::size_t offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = 
cone_dimensions[cone]; + x_host[offset] = 100.0 + static_cast(cone); + z_host[offset] = 80.0 + static_cast(cone); + for (int local_idx = 1; local_idx < dim; ++local_idx) { + x_host[offset + local_idx] = 0.001 * static_cast((local_idx % 5) + 1); + z_host[offset + local_idx] = 0.0015 * static_cast((local_idx % 7) + 1); + } + offset += static_cast(dim); + } + + auto x = cuopt::device_copy(x_host, stream); + auto z = cuopt::device_copy(z_host, stream); + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + const auto workspace_size = cones.segmented_sum.cub_workspace.size(); + EXPECT_GT(workspace_size, 0); + + launch_nt_scaling(cones, stream); + EXPECT_EQ(cones.segmented_sum.cub_workspace.size(), workspace_size); + + auto eta_host = cuopt::host_copy(cones.eta, stream); + auto w_host = cuopt::host_copy(cones.w, stream); + + std::vector expected_eta(cone_dimensions.size()); + std::vector expected_w(n_cone_entries); + + offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + + double x_tail_sq = 0.0; + double z_tail_sq = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + x_tail_sq += x_host[idx] * x_host[idx]; + z_tail_sq += z_host[idx] * z_host[idx]; + } + + const auto x_tail_norm = std::sqrt(x_tail_sq); + const auto z_tail_norm = std::sqrt(z_tail_sq); + const auto x_det = (x_host[offset] - x_tail_norm) * (x_host[offset] + x_tail_norm); + const auto z_det = (z_host[offset] - z_tail_norm) * (z_host[offset] + z_tail_norm); + ASSERT_GT(x_det, 0.0) << "cone " << cone; + ASSERT_GT(z_det, 0.0) << "cone " << cone; + + const auto x_scale = std::sqrt(x_det); + const auto z_scale = std::sqrt(z_det); + + expected_eta[cone] = std::sqrt(x_scale / z_scale); + + double normalized_xz_dot = 0.0; + for (int local_idx = 0; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + normalized_xz_dot += x_host[idx] * 
z_host[idx] / (x_scale * z_scale); + } + const auto w_det = 2.0 + 2.0 * normalized_xz_dot; + ASSERT_GT(w_det, 0.0) << "cone " << cone; + const auto w_scale = std::sqrt(w_det); + + expected_w[offset] = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + expected_w[idx] = (x_host[idx] / x_scale - z_host[idx] / z_scale) / w_scale; + } + + double normalized_tail_sq = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + normalized_tail_sq += expected_w[idx] * expected_w[idx]; + } + expected_w[offset] = std::sqrt(1.0 + normalized_tail_sq); + + offset += static_cast(dim); + } + + for (std::size_t i = 0; i < expected_eta.size(); ++i) { + EXPECT_NEAR(eta_host[i], expected_eta[i], 1e-10) << "cone " << i; + } + + for (std::size_t i = 0; i < expected_w.size(); ++i) { + EXPECT_NEAR(w_host[i], expected_w[i], 1e-10) << "entry " << i; + } + + offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + + double tail_sq = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + tail_sq += w_host[idx] * w_host[idx]; + } + + EXPECT_NEAR(w_host[offset] * w_host[offset] - tail_sq, 1.0, 1e-10) << "cone " << cone; + offset += static_cast(dim); + } +} + +TEST(second_order_cone_kernels, cone_step_length_matches_host_reference) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 65, 32769}; + std::size_t n_cone_entries = 0; + for (const auto dim : cone_dimensions) { + n_cone_entries += static_cast(dim); + } + + std::vector x_host(n_cone_entries); + std::vector z_host(n_cone_entries); + std::vector dx_host(n_cone_entries); + std::vector dz_host(n_cone_entries); + + std::size_t offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + + x_host[offset] = 12.0 + static_cast(cone); + 
z_host[offset] = 14.0 + static_cast(cone); + dx_host[offset] = (cone == 0) ? -30.0 : 0.2; + dz_host[offset] = (cone == 1) ? -25.0 : 0.15; + + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + + x_host[idx] = 0.001 * static_cast((local_idx % 5) + 1); + z_host[idx] = 0.0015 * static_cast((local_idx % 7) + 1); + dx_host[idx] = 0.02 * static_cast((local_idx % 5) - 2); + dz_host[idx] = -0.015 * static_cast((local_idx % 7) - 3); + } + + offset += static_cast(dim); + } + + auto compute_expected_step = [&](std::vector const& u, + std::vector const& du, + double alpha_max, + std::vector& per_cone_alpha) { + auto global_alpha = alpha_max; + std::size_t off = 0; + + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + + double du_tail_sq = 0.0; + double u_tail_du_tail = 0.0; + double u_tail_sq = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = off + local_idx; + du_tail_sq += du[idx] * du[idx]; + u_tail_du_tail += u[idx] * du[idx]; + u_tail_sq += u[idx] * u[idx]; + } + + per_cone_alpha[cone] = host_cone_step_length_from_scalars( + u[off], du[off], du_tail_sq, u_tail_du_tail, u_tail_sq, alpha_max); + global_alpha = std::min(global_alpha, per_cone_alpha[cone]); + + off += static_cast(dim); + } + + return global_alpha; + }; + + constexpr double alpha_max = 0.99; + std::vector expected_primal_per_cone(cone_dimensions.size()); + std::vector expected_dual_per_cone(cone_dimensions.size()); + const auto expected_primal = + compute_expected_step(x_host, dx_host, alpha_max, expected_primal_per_cone); + const auto expected_dual = + compute_expected_step(z_host, dz_host, alpha_max, expected_dual_per_cone); + + auto x = cuopt::device_copy(x_host, stream); + auto z = cuopt::device_copy(z_host, stream); + auto dx = cuopt::device_copy(dx_host, stream); + auto dz = cuopt::device_copy(dz_host, stream); + + cone_data_t cones(cone_dimensions, 
cuopt::make_span(x), cuopt::make_span(z), stream); + const auto [step_primal, step_dual] = + compute_cone_step_length(cones, + raft::device_span(dx.data(), dx.size()), + raft::device_span(dz.data(), dz.size()), + alpha_max, + stream); + + EXPECT_LT(expected_primal, alpha_max); + EXPECT_LT(expected_dual, alpha_max); + EXPECT_NEAR(step_primal, expected_primal, 1e-12); + EXPECT_NEAR(step_dual, expected_dual, 1e-12); + + const auto primal_per_cone = cuopt::host_copy(cones.scratch.step_alpha_primal, stream); + const auto dual_per_cone = cuopt::host_copy(cones.scratch.step_alpha_dual, stream); + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + EXPECT_NEAR(primal_per_cone[cone], expected_primal_per_cone[cone], 1e-12) + << "primal cone " << cone; + EXPECT_NEAR(dual_per_cone[cone], expected_dual_per_cone[cone], 1e-12) << "dual cone " << cone; + } +} + +TEST(second_order_cone_kernels, scaling_operators_match_host_reference) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 65, 32769}; + std::size_t n_cone_entries = 0; + for (const auto dim : cone_dimensions) { + n_cone_entries += static_cast(dim); + } + + std::vector x_host(n_cone_entries); + std::vector z_host(n_cone_entries); + std::vector v_host(n_cone_entries); + std::vector cone_target_host(n_cone_entries); + std::vector accum_initial_host(n_cone_entries); + std::size_t offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + x_host[offset] = 100.0 + static_cast(cone); + z_host[offset] = 80.0 + static_cast(cone); + v_host[offset] = 0.75 + 0.1 * static_cast(cone); + cone_target_host[offset] = 0.4 + 0.03 * static_cast(cone); + accum_initial_host[offset] = -0.2 + 0.02 * static_cast(cone); + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + x_host[idx] = 0.001 * static_cast((local_idx % 5) + 1); + z_host[idx] = 0.0015 * static_cast((local_idx % 7) + 1); + 
v_host[idx] = 0.002 * static_cast((local_idx % 11) - 5); + cone_target_host[idx] = 0.003 * static_cast((local_idx % 13) - 6); + accum_initial_host[idx] = 0.004 * static_cast((local_idx % 17) - 8); + } + offset += static_cast(dim); + } + + auto x = cuopt::device_copy(x_host, stream); + auto z = cuopt::device_copy(z_host, stream); + auto v = cuopt::device_copy(v_host, stream); + auto cone_target = cuopt::device_copy(cone_target_host, stream); + auto accum = cuopt::device_copy(accum_initial_host, stream); + rmm::device_uvector w_out(n_cone_entries, stream); + rmm::device_uvector w_inv_out(n_cone_entries, stream); + rmm::device_uvector h_out(n_cone_entries, stream); + rmm::device_uvector w_inv_tmp(n_cone_entries, stream); + rmm::device_uvector h_from_w_inv(n_cone_entries, stream); + rmm::device_uvector recovered_dz(n_cone_entries, stream); + + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + launch_nt_scaling(cones, stream); + + auto v_span = raft::device_span(v.data(), v.size()); + apply_w(v_span, cuopt::make_span(w_out), cones, stream); + apply_w_inv(v_span, cuopt::make_span(w_inv_out), cones, stream); + apply_hinv2(v_span, cuopt::make_span(h_out), cones, stream); + recover_cone_dz_from_target( + v_span, + cones, + raft::device_span(cone_target.data(), cone_target.size()), + cuopt::make_span(recovered_dz), + stream); + accumulate_cone_hinv2_matvec(v_span, cones, cuopt::make_span(accum), stream); + apply_w_inv(v_span, cuopt::make_span(w_inv_tmp), cones, stream); + apply_w_inv(raft::device_span(w_inv_tmp.data(), w_inv_tmp.size()), + cuopt::make_span(h_from_w_inv), + cones, + stream); + + auto eta_host = cuopt::host_copy(cones.eta, stream); + auto w_host = cuopt::host_copy(cones.w, stream); + auto w_out_host = cuopt::host_copy(w_out, stream); + auto w_inv_out_host = cuopt::host_copy(w_inv_out, stream); + auto h_out_host = cuopt::host_copy(h_out, stream); + auto h_identity_host = cuopt::host_copy(h_from_w_inv, stream); + auto 
recovered_dz_host = cuopt::host_copy(recovered_dz, stream); + auto accum_host = cuopt::host_copy(accum, stream); + + std::vector expected_w(n_cone_entries); + std::vector expected_w_inv(n_cone_entries); + std::vector expected_h(n_cone_entries); + std::vector expected_h_unscaled(n_cone_entries); + + offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + const auto w0 = w_host[offset]; + const auto v0 = v_host[offset]; + const auto eta = eta_host[cone]; + + double tail_dot = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + tail_dot += w_host[idx] * v_host[idx]; + } + + expected_w[offset] = eta * (w0 * v0 + tail_dot); + expected_w_inv[offset] = (w0 * v0 - tail_dot) / eta; + + const auto rho = w0 * v0 - tail_dot; + expected_h_unscaled[offset] = (2.0 * w0 * rho - v0) / (eta * eta); + expected_h[offset] = expected_h_unscaled[offset]; + + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + + expected_w[idx] = eta * (v_host[idx] + (v0 + tail_dot / (1.0 + w0)) * w_host[idx]); + expected_w_inv[idx] = (v_host[idx] + (-v0 + tail_dot / (1.0 + w0)) * w_host[idx]) / eta; + expected_h_unscaled[idx] = (v_host[idx] - 2.0 * w_host[idx] * rho) / (eta * eta); + expected_h[idx] = expected_h_unscaled[idx]; + } + + offset += static_cast(dim); + } + + for (std::size_t i = 0; i < n_cone_entries; ++i) { + EXPECT_NEAR(w_out_host[i], expected_w[i], 1e-9) << "W entry " << i; + EXPECT_NEAR(w_inv_out_host[i], expected_w_inv[i], 1e-9) << "W inverse entry " << i; + EXPECT_NEAR(h_out_host[i], expected_h[i], 1e-9) << "H entry " << i; + EXPECT_NEAR(h_out_host[i], h_identity_host[i], 1e-9) << "H identity entry " << i; + EXPECT_NEAR(recovered_dz_host[i], cone_target_host[i] - expected_h_unscaled[i], 1e-9) + << "recovered dz entry " << i; + EXPECT_NEAR(accum_host[i], accum_initial_host[i] + expected_h_unscaled[i], 1e-9) + << 
"accumulated H entry " << i; + } +} + +TEST(second_order_cone_kernels, combined_cone_rhs_matches_host_reference) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 65, 32769}; + std::size_t n_cone_entries = 0; + for (const auto dim : cone_dimensions) { + n_cone_entries += static_cast(dim); + } + + std::vector x_host(n_cone_entries); + std::vector z_host(n_cone_entries); + std::vector dx_aff_host(n_cone_entries); + std::vector dz_aff_host(n_cone_entries); + std::size_t offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + x_host[offset] = 120.0 + static_cast(cone); + z_host[offset] = 90.0 + static_cast(cone); + dx_aff_host[offset] = 0.25 + 0.05 * static_cast(cone); + dz_aff_host[offset] = -0.3 + 0.04 * static_cast(cone); + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + x_host[idx] = 0.001 * static_cast((local_idx % 5) + 1); + z_host[idx] = 0.0015 * static_cast((local_idx % 7) + 1); + dx_aff_host[idx] = 0.002 * static_cast((local_idx % 11) - 5); + dz_aff_host[idx] = 0.001 * static_cast((local_idx % 13) - 6); + } + offset += static_cast(dim); + } + + auto x = cuopt::device_copy(x_host, stream); + auto z = cuopt::device_copy(z_host, stream); + auto dx_aff = cuopt::device_copy(dx_aff_host, stream); + auto dz_aff = cuopt::device_copy(dz_aff_host, stream); + rmm::device_uvector out(n_cone_entries, stream); + + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + launch_nt_scaling(cones, stream); + + constexpr double sigma_mu = 0.37; + compute_combined_cone_rhs_term(raft::device_span(dx_aff.data(), dx_aff.size()), + raft::device_span(dz_aff.data(), dz_aff.size()), + cones, + sigma_mu, + cuopt::make_span(out), + stream); + + auto eta_host = cuopt::host_copy(cones.eta, stream); + auto w_host = cuopt::host_copy(cones.w, stream); + auto out_host = cuopt::host_copy(out, stream); + + auto 
apply_w_ref = [&](std::vector const& v) { + std::vector result(n_cone_entries); + std::size_t off = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + const auto w0 = w_host[off]; + const auto v0 = v[off]; + + double tail_dot = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = off + local_idx; + tail_dot += w_host[idx] * v[idx]; + } + + result[off] = eta_host[cone] * (w0 * v0 + tail_dot); + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = off + local_idx; + result[idx] = eta_host[cone] * (v[idx] + (v0 + tail_dot / (1.0 + w0)) * w_host[idx]); + } + + off += static_cast(dim); + } + return result; + }; + + auto apply_w_inv_ref = [&](std::vector const& v) { + std::vector result(n_cone_entries); + std::size_t off = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + const auto w0 = w_host[off]; + const auto v0 = v[off]; + + double tail_dot = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = off + local_idx; + tail_dot += w_host[idx] * v[idx]; + } + + result[off] = (w0 * v0 - tail_dot) / eta_host[cone]; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = off + local_idx; + result[idx] = (v[idx] + (-v0 + tail_dot / (1.0 + w0)) * w_host[idx]) / eta_host[cone]; + } + + off += static_cast(dim); + } + return result; + }; + + auto scaled_dx = apply_w_inv_ref(dx_aff_host); + auto scaled_dz = apply_w_ref(dz_aff_host); + auto nt_point = apply_w_ref(z_host); + + std::vector shift(n_cone_entries); + offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + + double head_dot = 0.0; + for (int local_idx = 0; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + head_dot += scaled_dx[idx] * scaled_dz[idx]; + } + + shift[offset] = head_dot - sigma_mu; + for 
(int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + shift[idx] = scaled_dx[offset] * scaled_dz[idx] + scaled_dz[offset] * scaled_dx[idx]; + } + + offset += static_cast(dim); + } + + std::vector minus_p(n_cone_entries); + offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + const auto lambda0 = nt_point[offset]; + + double lambda_tail_dot = 0.0; + double lambda_tail_sq = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + lambda_tail_dot += nt_point[idx] * shift[idx]; + lambda_tail_sq += nt_point[idx] * nt_point[idx]; + } + + const auto lambda_tail_norm = std::sqrt(lambda_tail_sq); + const auto det_lambda = (lambda0 - lambda_tail_norm) * (lambda0 + lambda_tail_norm); + ASSERT_GT(lambda0, 0.0) << "cone " << cone; + ASSERT_GT(det_lambda, 0.0) << "cone " << cone; + + const auto p_head = (lambda0 * shift[offset] - lambda_tail_dot) / det_lambda; + minus_p[offset] = -p_head; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + minus_p[idx] = (p_head * nt_point[idx] - shift[idx]) / lambda0; + } + + offset += static_cast(dim); + } + + auto expected = apply_w_inv_ref(minus_p); + for (std::size_t i = 0; i < n_cone_entries; ++i) { + EXPECT_NEAR(out_host[i], expected[i], 1e-8) << "entry " << i; + } +} + +} // namespace cuopt::linear_programming::dual_simplex::test diff --git a/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu b/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu deleted file mode 100644 index b3e8974e8b..0000000000 --- a/cpp/tests/dual_simplex/unit_tests/second_order_cone_test.cu +++ /dev/null @@ -1,1416 +0,0 @@ -/* clang-format off */ -/* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: Apache-2.0 - */ -/* clang-format on */ - -#include - -#include -#include - -#include - -#include -#include -#include -#include - -namespace cuopt::linear_programming::dual_simplex::test { -namespace { - -template -auto build_offsets(const std::vector& dims) -> std::vector -{ - std::vector offsets(dims.size() + 1, 0); - for (std::size_t i = 0; i < dims.size(); ++i) { - offsets[i + 1] = offsets[i] + dims[i]; - } - return offsets; -} - -template -auto pack_cones(const std::vector>& cones) -> std::vector -{ - std::size_t total_size = 0; - for (const auto& cone : cones) { - total_size += cone.size(); - } - - std::vector packed; - packed.reserve(total_size); - for (const auto& cone : cones) { - packed.insert(packed.end(), cone.begin(), cone.end()); - } - return packed; -} - -template -auto slice_cone(const std::vector& packed, const std::vector& offsets, i_t cone) - -> std::vector -{ - auto begin = packed.begin() + offsets[cone]; - auto end = packed.begin() + offsets[cone + 1]; - return std::vector(begin, end); -} - -template -auto j_norm_sq(const std::vector& u) -> f_t -{ - if (u.empty()) { return f_t(0); } - - f_t tail_sq = f_t(0); - for (std::size_t j = 1; j < u.size(); ++j) { - tail_sq += u[j] * u[j]; - } - return u[0] * u[0] - tail_sq; -} - -template -auto tail_norm(const std::vector& u) -> f_t -{ - f_t tail_sq = f_t(0); - for (std::size_t j = 1; j < u.size(); ++j) { - tail_sq += u[j] * u[j]; - } - return std::sqrt(tail_sq); -} - -template -auto ref_apply_hinv_single(const std::vector& z, - const std::vector& w_bar, - f_t inv_eta, - f_t inv_1pw0) -> std::vector -{ - std::vector out(z.size(), f_t(0)); - if (z.empty()) { return out; } - - f_t zeta = f_t(0); - for (std::size_t j = 1; j < z.size(); ++j) { - zeta += w_bar[j] * z[j]; - } - - f_t coeff = -z[0] + zeta * inv_1pw0; - out[0] = (w_bar[0] * z[0] - zeta) * inv_eta; - for (std::size_t j = 1; j < z.size(); ++j) { - out[j] = (z[j] + coeff * w_bar[j]) * inv_eta; - } - return out; -} - 
-template -auto ref_apply_H_single(const std::vector& z, - const std::vector& w_bar, - f_t eta, - f_t inv_1pw0) -> std::vector -{ - std::vector out(z.size(), f_t(0)); - if (z.empty()) { return out; } - - f_t zeta = f_t(0); - for (std::size_t j = 1; j < z.size(); ++j) { - zeta += w_bar[j] * z[j]; - } - - f_t coeff = z[0] + zeta * inv_1pw0; - out[0] = (w_bar[0] * z[0] + zeta) * eta; - for (std::size_t j = 1; j < z.size(); ++j) { - out[j] = (z[j] + coeff * w_bar[j]) * eta; - } - return out; -} - -template -auto ref_build_hinv2_block_single(const std::vector& w_bar, f_t inv_eta) -> std::vector -{ - std::size_t q = w_bar.size(); - std::vector block(q * q, f_t(0)); - f_t ie_sq = inv_eta * inv_eta; - - for (std::size_t r = 0; r < q; ++r) { - f_t u_r = (r == 0) ? w_bar[0] : -w_bar[r]; - for (std::size_t c = 0; c < q; ++c) { - f_t u_c = (c == 0) ? w_bar[0] : -w_bar[c]; - f_t j_rc = (r == c) ? ((r == 0) ? f_t(1) : f_t(-1)) : f_t(0); - block[r * q + c] = ie_sq * (f_t(2) * u_r * u_c - j_rc); - } - } - return block; -} - -template -auto ref_apply_hinv2_single(const std::vector& v, const std::vector& w_bar, f_t inv_eta) - -> std::vector -{ - std::vector out(v.size(), f_t(0)); - if (v.empty()) { return out; } - - f_t uTv = w_bar[0] * v[0]; - for (std::size_t j = 1; j < v.size(); ++j) { - uTv -= w_bar[j] * v[j]; - } - - f_t ie_sq = inv_eta * inv_eta; - out[0] = ie_sq * (f_t(2) * w_bar[0] * uTv - v[0]); - for (std::size_t j = 1; j < v.size(); ++j) { - out[j] = ie_sq * (-f_t(2) * w_bar[j] * uTv + v[j]); - } - return out; -} - -template -struct nt_scaling_reference_t { - f_t eta{}; - f_t inv_eta{}; - f_t inv_1pw0{}; - f_t rho{}; - std::vector w_bar; - std::vector omega; -}; - -template -auto ref_nt_scaling_single(const std::vector& s, const std::vector& lambda) - -> nt_scaling_reference_t -{ - EXPECT_EQ(s.size(), lambda.size()); - EXPECT_FALSE(s.empty()); - - f_t s_j_norm_sq = j_norm_sq(s); - f_t l_j_norm_sq = j_norm_sq(lambda); - EXPECT_GT(s_j_norm_sq, f_t(0)); - 
EXPECT_GT(l_j_norm_sq, f_t(0)); - - f_t s_j_norm = std::sqrt(s_j_norm_sq); - f_t l_j_norm = std::sqrt(l_j_norm_sq); - f_t inv_s_j_norm = f_t(1) / s_j_norm; - f_t inv_l_j_norm = f_t(1) / l_j_norm; - - f_t dot_bar = (s[0] * lambda[0]) * inv_s_j_norm * inv_l_j_norm; - for (std::size_t j = 1; j < s.size(); ++j) { - dot_bar += (s[j] * lambda[j]) * inv_s_j_norm * inv_l_j_norm; - } - - f_t gamma = std::sqrt(std::max(f_t(0), (f_t(1) + dot_bar) * f_t(0.5))); - f_t inv_2g = f_t(1) / (f_t(2) * gamma); - - nt_scaling_reference_t ref{}; - ref.eta = std::sqrt(s_j_norm / l_j_norm); - ref.inv_eta = f_t(1) / ref.eta; - ref.rho = s_j_norm * l_j_norm; - ref.w_bar.assign(s.size(), f_t(0)); - - f_t w1_sq = f_t(0); - for (std::size_t j = 1; j < s.size(); ++j) { - ref.w_bar[j] = inv_2g * (s[j] * inv_s_j_norm - lambda[j] * inv_l_j_norm); - w1_sq += ref.w_bar[j] * ref.w_bar[j]; - } - - // Match the kernel's numerical cleanup path for w_bar[0]. - ref.w_bar[0] = std::sqrt(f_t(1) + w1_sq); - ref.inv_1pw0 = f_t(1) / (f_t(1) + ref.w_bar[0]); - ref.omega = ref_apply_hinv_single(s, ref.w_bar, ref.inv_eta, ref.inv_1pw0); - return ref; -} - -template -auto ref_step_length_single(const std::vector& u, const std::vector& du, f_t alpha_max) - -> f_t -{ - EXPECT_EQ(u.size(), du.size()); - EXPECT_FALSE(u.empty()); - - f_t du1_sq = f_t(0); - f_t u1du1 = f_t(0); - f_t u1_sq = f_t(0); - for (std::size_t j = 1; j < u.size(); ++j) { - du1_sq += du[j] * du[j]; - u1du1 += u[j] * du[j]; - u1_sq += u[j] * u[j]; - } - - f_t a = du[0] * du[0] - du1_sq; - f_t b = u[0] * du[0] - u1du1; - f_t c = std::max(f_t(0), u[0] * u[0] - u1_sq); - f_t disc = b * b - a * c; - - f_t alpha = alpha_max; - if (du[0] < f_t(0)) { alpha = std::min(alpha, -u[0] / du[0]); } - - if ((a > f_t(0) && b > f_t(0)) || disc < f_t(0)) { - // No positive root (parabola stays non-negative for alpha > 0). 
- } else if (a < f_t(0)) { - alpha = std::min(alpha, (b + std::sqrt(std::max(f_t(0), disc))) / (-a)); - } else if (a == f_t(0)) { - if (b < f_t(0)) { alpha = std::min(alpha, c / (f_t(-2) * b)); } - } else if (c == f_t(0)) { - alpha = (a >= f_t(0)) ? alpha : f_t(0); - } else if (b < f_t(0) && disc > f_t(0)) { - alpha = std::min(alpha, (-b - std::sqrt(disc)) / a); - } - - return alpha; -} - -template -auto ref_jordan_product_single(const std::vector& a, const std::vector& b) - -> std::vector -{ - EXPECT_EQ(a.size(), b.size()); - std::vector out(a.size(), f_t(0)); - if (a.empty()) { return out; } - - f_t dot = f_t(0); - for (std::size_t j = 0; j < a.size(); ++j) { - dot += a[j] * b[j]; - } - out[0] = dot; - for (std::size_t j = 1; j < a.size(); ++j) { - out[j] = a[0] * b[j] + b[0] * a[j]; - } - return out; -} - -template -auto ref_inverse_jordan_product_single(const std::vector& omega, - const std::vector& r, - f_t rho_val) -> std::vector -{ - EXPECT_EQ(omega.size(), r.size()); - std::vector out(omega.size(), f_t(0)); - if (omega.empty()) { return out; } - - f_t nu = f_t(0); - for (std::size_t j = 1; j < omega.size(); ++j) { - nu += omega[j] * r[j]; - } - - f_t inv_rho = f_t(1) / rho_val; - f_t omega_0 = omega[0]; - out[0] = (omega_0 * r[0] - nu) * inv_rho; - - f_t c_omega = ((nu / omega_0) - r[0]) * inv_rho; - f_t c_r = f_t(1) / omega_0; - for (std::size_t j = 1; j < omega.size(); ++j) { - out[j] = c_omega * omega[j] + c_r * r[j]; - } - return out; -} - -template -auto ref_fused_corrector_single(const std::vector& dx_aff, - const std::vector& omega, - const std::vector& w_bar, - f_t inv_eta, - f_t inv_1pw0, - f_t rho_val, - f_t sigma_mu) -> std::vector -{ - auto dx = ref_apply_hinv_single(dx_aff, w_bar, inv_eta, inv_1pw0); - - std::vector dz(dx.size()); - for (std::size_t j = 0; j < dx.size(); ++j) { - dz[j] = -omega[j] - dx[j]; - } - - auto r_K_1 = ref_jordan_product_single(omega, omega); - auto r_K_2 = ref_jordan_product_single(dx, dz); - - std::vector 
r_K(dx.size()); - for (std::size_t j = 0; j < dx.size(); ++j) { - r_K[j] = r_K_1[j] + r_K_2[j]; - } - r_K[0] -= sigma_mu; - - auto corr = ref_inverse_jordan_product_single(omega, r_K, rho_val); - return ref_apply_hinv_single(corr, w_bar, inv_eta, inv_1pw0); -} - -template -auto make_patterned_cone(int q, f_t head, f_t scale) -> std::vector -{ - std::vector cone(q, f_t(0)); - cone[0] = head; - for (int j = 1; j < q; ++j) { - f_t sign = (j % 2 == 0) ? f_t(1) : f_t(-1); - cone[j] = sign * scale * static_cast((j % 7) + 1); - } - return cone; -} - -} // namespace - -class second_order_cone_test : public ::testing::Test { - protected: - using i_t = int; - using f_t = double; - static constexpr int dim = 256; - - raft::handle_t handle_; - rmm::cuda_stream_view stream_ = handle_.get_stream(); - - template - auto make_device_vector(const std::vector& host) -> rmm::device_uvector - { - rmm::device_uvector device(host.size(), stream_); - if (!host.empty()) { raft::copy(device.data(), host.data(), host.size(), stream_); } - sync(); - return device; - } - - template - auto copy_to_host(const rmm::device_uvector& device) -> std::vector - { - std::vector host(device.size()); - if (!host.empty()) { raft::copy(host.data(), device.data(), host.size(), stream_); } - sync(); - return host; - } - - template - void copy_to_device(rmm::device_uvector& device, const std::vector& host) - { - ASSERT_EQ(device.size(), host.size()); - if (!host.empty()) { raft::copy(device.data(), host.data(), host.size(), stream_); } - sync(); - } - - void sync() { RAFT_CUDA_TRY(cudaStreamSynchronize(stream_.value())); } - - template - void expect_vector_near(const std::vector& actual, - const std::vector& expected, - t_t atol, - t_t rtol, - const char* label) - { - ASSERT_EQ(actual.size(), expected.size()) << label << " size mismatch"; - for (std::size_t i = 0; i < actual.size(); ++i) { - EXPECT_NEAR(actual[i], expected[i], atol + rtol * std::abs(expected[i])) - << label << "[" << i << "]"; - } - } - - void 
launch_step_length(rmm::device_uvector& s, - rmm::device_uvector& ds, - rmm::device_uvector& lambda, - rmm::device_uvector& dlambda, - rmm::device_uvector& alpha, - const rmm::device_uvector& cone_offsets, - i_t k, - f_t alpha_max) - { - auto h_offsets = copy_to_host(cone_offsets); - std::vector dims(k, 0); - for (i_t cone = 0; cone < k; ++cone) { - dims[cone] = h_offsets[cone + 1] - h_offsets[cone]; - } - cone_data_t cones(k, dims, cuopt::make_span(s), cuopt::make_span(lambda), stream_); - rmm::device_uvector alpha_dual(k, stream_); - launch_nt_scaling(cones, stream_); - compute_cone_step_length_per_cone(cones, - cuopt::make_span(s), - cuopt::make_span(ds), - cuopt::make_span(lambda), - cuopt::make_span(dlambda), - cuopt::make_span(alpha), - cuopt::make_span(alpha_dual), - alpha_max, - stream_); - sync(); - auto h_primal = copy_to_host(alpha); - auto h_dual = copy_to_host(alpha_dual); - for (i_t i = 0; i < k; ++i) { - h_primal[i] = std::min(h_primal[i], h_dual[i]); - } - copy_to_device(alpha, h_primal); - } - - void launch_apply_hinv2(const rmm::device_uvector& v, - rmm::device_uvector& out, - const rmm::device_uvector& w_bar, - const rmm::device_uvector& inv_eta, - const rmm::device_uvector& cone_offsets, - i_t k) - { - auto h_offsets = copy_to_host(cone_offsets); - std::vector h_element_cone_ids(v.size(), 0); - for (i_t cone = 0; cone < k; ++cone) { - std::fill(h_element_cone_ids.begin() + h_offsets[cone], - h_element_cone_ids.begin() + h_offsets[cone + 1], - cone); - } - auto d_element_cone_ids = make_device_vector(h_element_cone_ids); - rmm::device_uvector tail_dot(k, stream_); - rmm::device_uvector workspace(0, stream_); - apply_hinv2(cuopt::make_span(v), - cuopt::make_span(out), - cuopt::make_span(w_bar), - cuopt::make_span(inv_eta), - cuopt::make_span(cone_offsets), - cuopt::make_span(d_element_cone_ids), - cuopt::make_span(tail_dot), - workspace, - k, - stream_); - sync(); - } - - void launch_fused_corrector(const rmm::device_uvector& dx_aff, - 
cone_data_t& cones, - f_t sigma_mu, - rmm::device_uvector& out) - { - out.resize(cones.m_c, stream_); - compute_combined_cone_rhs_term( - cuopt::make_span(dx_aff), cones, sigma_mu, cuopt::make_span(out), stream_); - sync(); - } - - void launch_affine_cone_rhs(cone_data_t& cones, rmm::device_uvector& out) - { - out.resize(cones.m_c, stream_); - compute_affine_cone_rhs_term(cones, cuopt::make_span(out), stream_); - sync(); - } - - void launch_recover_cone_dz_from_target(const rmm::device_uvector& dx, - cone_data_t& cones, - const rmm::device_uvector& cone_target, - rmm::device_uvector& hinv2_dx, - rmm::device_uvector& dz) - { - recover_cone_dz_from_target(cuopt::make_span(dx), - cones, - cuopt::make_span(cone_target), - hinv2_dx, - cuopt::make_span(dz), - stream_); - sync(); - } - - void launch_accumulate_cone_hinv2(const rmm::device_uvector& x, - cone_data_t& cones, - rmm::device_uvector& hinv2_x, - rmm::device_uvector& out) - { - accumulate_cone_hinv2_matvec( - cuopt::make_span(x), cones, hinv2_x, cuopt::make_span(out), stream_); - sync(); - } - - void launch_cone_block_scatter(const cone_data_t& cones, - rmm::device_uvector& aug_x, - const rmm::device_uvector& csr_indices, - const rmm::device_uvector& q_values) - { - scatter_hinv2_into_augmented(cones, aug_x, csr_indices, q_values, stream_); - sync(); - } -}; - -TEST_F(second_order_cone_test, cone_data_topology_and_flat_index_maps) -{ - std::vector dims{1, 2, 3, 4}; - cone_data_t cones(static_cast(dims.size()), dims, {}, {}, stream_); - - auto expected_offsets = build_offsets(dims); - auto actual_offsets = copy_to_host(cones.cone_offsets); - auto actual_dims = copy_to_host(cones.cone_dims); - auto element_cone_ids = copy_to_host(cones.element_cone_ids); - auto block_cone_ids = copy_to_host(cones.block_entry_cone_ids); - - std::vector expected_element_cone_ids; - for (i_t cone = 0; cone < static_cast(dims.size()); ++cone) { - expected_element_cone_ids.insert(expected_element_cone_ids.end(), dims[cone], cone); - } - 
- std::vector expected_block_cone_ids; - for (i_t cone = 0; cone < static_cast(dims.size()); ++cone) { - expected_block_cone_ids.insert(expected_block_cone_ids.end(), dims[cone] * dims[cone], cone); - } - - EXPECT_EQ(cones.K, static_cast(dims.size())); - EXPECT_EQ(cones.m_c, expected_offsets.back()); - EXPECT_EQ(actual_offsets, expected_offsets); - EXPECT_EQ(actual_dims, dims); - EXPECT_EQ(element_cone_ids, expected_element_cone_ids); - EXPECT_EQ(block_cone_ids, expected_block_cone_ids); -} - -TEST_F(second_order_cone_test, cone_data_reuses_named_scratch_slots) -{ - std::vector> s_cones{{5.0, 1.0, 1.0}, {6.0, 1.0, -0.5, 0.25, 0.1}}; - std::vector> ds_cones{{-0.5, 0.1, 0.1}, {-0.2, 0.05, 0.03, -0.02, 0.01}}; - std::vector> lambda_cones{{5.0, 1.0, 1.0}, {4.0, 0.2, 0.3, -0.1, 0.05}}; - std::vector> dlambda_cones{{-0.5, 0.1, 0.1}, {-0.1, 0.02, -0.03, 0.01, -0.01}}; - std::vector dims{3, 5}; - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_ds = make_device_vector(pack_cones(ds_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - auto d_dlambda = make_device_vector(pack_cones(dlambda_cones)); - cone_data_t cones(static_cast(dims.size()), - dims, - cuopt::make_span(d_s), - cuopt::make_span(d_lambda), - stream_); - - EXPECT_EQ(cones.scratch.step_s_du1_sq().size(), dims.size()); - EXPECT_EQ(cones.scratch.step_s_u1du1().size(), dims.size()); - EXPECT_EQ(cones.scratch.step_l_du1_sq().size(), dims.size()); - EXPECT_EQ(cones.scratch.step_l_u1du1().size(), dims.size()); - EXPECT_EQ(cones.scratch.hinv2_tail_dot().size(), dims.size()); - EXPECT_EQ(cones.scratch.step_s_u1_sq().size(), dims.size()); - EXPECT_EQ(cones.scratch.step_l_u1_sq().size(), dims.size()); - EXPECT_EQ(cones.scratch.nt_s1_sq().size(), dims.size()); - EXPECT_EQ(cones.scratch.nt_l1_sq().size(), dims.size()); - EXPECT_EQ(cones.scratch.nt_sl().size(), dims.size()); - EXPECT_EQ(cones.scratch.step_alpha_primal_span().size(), dims.size()); - 
EXPECT_EQ(cones.scratch.step_alpha_dual_span().size(), dims.size()); - - auto s_du1_ptr = cones.scratch.step_s_du1_sq().data(); - auto s_u1du1_ptr = cones.scratch.step_s_u1du1().data(); - auto l_du1_ptr = cones.scratch.step_l_du1_sq().data(); - auto l_u1du1_ptr = cones.scratch.step_l_u1du1().data(); - auto hinv2_ptr = cones.scratch.hinv2_tail_dot().data(); - auto s_u1_sq_ptr = cones.scratch.step_s_u1_sq().data(); - auto l_u1_sq_ptr = cones.scratch.step_l_u1_sq().data(); - auto nt_s1_ptr = cones.scratch.nt_s1_sq().data(); - auto nt_l1_ptr = cones.scratch.nt_l1_sq().data(); - auto nt_sl_ptr = cones.scratch.nt_sl().data(); - auto alpha_p_ptr = cones.scratch.step_alpha_primal_span().data(); - auto alpha_d_ptr = cones.scratch.step_alpha_dual_span().data(); - - EXPECT_EQ(hinv2_ptr, s_du1_ptr); - EXPECT_EQ(s_du1_ptr, nt_s1_ptr); - EXPECT_EQ(s_u1du1_ptr, nt_l1_ptr); - EXPECT_EQ(s_u1_sq_ptr, nt_sl_ptr); - - compute_cone_step_length_per_cone(cones, - cuopt::make_span(d_s), - cuopt::make_span(d_ds), - cuopt::make_span(d_lambda), - cuopt::make_span(d_dlambda), - cones.scratch.step_alpha_primal_span(), - cones.scratch.step_alpha_dual_span(), - f_t(10.0), - stream_); - sync(); - - EXPECT_EQ(s_du1_ptr, cones.scratch.step_s_du1_sq().data()); - EXPECT_EQ(s_u1du1_ptr, cones.scratch.step_s_u1du1().data()); - EXPECT_EQ(l_du1_ptr, cones.scratch.step_l_du1_sq().data()); - EXPECT_EQ(l_u1du1_ptr, cones.scratch.step_l_u1du1().data()); - EXPECT_EQ(hinv2_ptr, cones.scratch.hinv2_tail_dot().data()); - EXPECT_EQ(s_u1_sq_ptr, cones.scratch.step_s_u1_sq().data()); - EXPECT_EQ(l_u1_sq_ptr, cones.scratch.step_l_u1_sq().data()); - EXPECT_EQ(nt_s1_ptr, cones.scratch.nt_s1_sq().data()); - EXPECT_EQ(nt_l1_ptr, cones.scratch.nt_l1_sq().data()); - EXPECT_EQ(nt_sl_ptr, cones.scratch.nt_sl().data()); - EXPECT_EQ(alpha_p_ptr, cones.scratch.step_alpha_primal_span().data()); - EXPECT_EQ(alpha_d_ptr, cones.scratch.step_alpha_dual_span().data()); -} - -TEST_F(second_order_cone_test, 
nt_scaling_matches_reference_for_small_cone) -{ - // Fixed small-cone fixture validated against the host-side NT formulas. - std::vector> s_cones{{1.5, 0.3, 0.4}}; - std::vector> lambda_cones{{2.0, 0.5, 0.5}}; - std::vector dims{3}; - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); - - launch_nt_scaling(cones, stream_); - - auto inv_eta = copy_to_host(cones.inv_eta); - auto inv_1pw0 = copy_to_host(cones.inv_1pw0); - auto rho = copy_to_host(cones.rho); - auto w_bar = copy_to_host(cones.w_bar); - auto omega = copy_to_host(cones.omega); - - auto ref = ref_nt_scaling_single(s_cones[0], lambda_cones[0]); - - EXPECT_NEAR(inv_eta[0], ref.inv_eta, 1e-12); - EXPECT_NEAR(inv_1pw0[0], ref.inv_1pw0, 1e-12); - EXPECT_NEAR(rho[0], ref.rho, 1e-12); - expect_vector_near(w_bar, ref.w_bar, 1e-12, 1e-10, "w_bar"); - expect_vector_near(omega, ref.omega, 1e-12, 1e-10, "omega"); - - EXPECT_NEAR(j_norm_sq(w_bar), f_t(1), 1e-12); - EXPECT_NEAR(j_norm_sq(omega), rho[0], 1e-12); - - auto omega_from_apply_hinv = ref_apply_hinv_single(s_cones[0], w_bar, inv_eta[0], inv_1pw0[0]); - expect_vector_near(omega, omega_from_apply_hinv, 1e-12, 1e-10, "omega_consistency"); -} - -TEST_F(second_order_cone_test, nt_scaling_matches_reference_across_bucket_sizes) -{ - std::vector> s_cones{ - {2.0}, make_patterned_cone(33, 4.0, 0.01), make_patterned_cone(2049, 5.0, 0.001)}; - std::vector> lambda_cones{ - {0.5}, make_patterned_cone(33, 3.0, 0.0075), make_patterned_cone(2049, 4.0, 0.00075)}; - std::vector dims{1, 33, 2049}; - auto offsets = build_offsets(dims); - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - cone_data_t cones(static_cast(dims.size()), - dims, - cuopt::make_span(d_s), - cuopt::make_span(d_lambda), - stream_); - - launch_nt_scaling(cones, stream_); - - auto inv_eta 
= copy_to_host(cones.inv_eta); - auto inv_1pw0 = copy_to_host(cones.inv_1pw0); - auto rho = copy_to_host(cones.rho); - auto w_bar = copy_to_host(cones.w_bar); - auto omega = copy_to_host(cones.omega); - - for (i_t cone = 0; cone < static_cast(dims.size()); ++cone) { - auto ref = ref_nt_scaling_single(s_cones[cone], lambda_cones[cone]); - - EXPECT_NEAR(inv_eta[cone], ref.inv_eta, 1e-10) << "cone " << cone; - EXPECT_NEAR(inv_1pw0[cone], ref.inv_1pw0, 1e-10) << "cone " << cone; - EXPECT_NEAR(rho[cone], ref.rho, 1e-10) << "cone " << cone; - - auto actual_w_bar = slice_cone(w_bar, offsets, cone); - auto actual_omega = slice_cone(omega, offsets, cone); - expect_vector_near(actual_w_bar, ref.w_bar, 1e-10, 1e-8, "w_bar"); - expect_vector_near(actual_omega, ref.omega, 1e-10, 1e-8, "omega"); - - EXPECT_NEAR(j_norm_sq(actual_w_bar), f_t(1), 1e-10) << "cone " << cone; - EXPECT_NEAR(j_norm_sq(actual_omega), rho[cone], 1e-10) << "cone " << cone; - } -} - -TEST_F(second_order_cone_test, nt_scaling_omega_equals_H_times_lambda) -{ - std::vector> s_cones{{5.0, 1.0, -1.0, 0.5, 0.3}}; - std::vector> lambda_cones{{4.0, 0.5, 1.0, -0.3, 0.2}}; - std::vector dims{5}; - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); - - launch_nt_scaling(cones, stream_); - - auto inv_eta = copy_to_host(cones.inv_eta); - auto inv_1pw0 = copy_to_host(cones.inv_1pw0); - auto w_bar = copy_to_host(cones.w_bar); - auto omega = copy_to_host(cones.omega); - - // NT symmetry: omega should equal both H^{-1}s and H*lambda. 
- auto H_lambda = ref_apply_H_single(lambda_cones[0], w_bar, f_t(1) / inv_eta[0], inv_1pw0[0]); - expect_vector_near(omega, H_lambda, 1e-10, 1e-8, "omega_vs_H_lambda"); -} - -TEST_F(second_order_cone_test, nt_scaling_tail_identities_match_heads) -{ - std::vector> s_cones{{5.0, 1.0, -1.0, 0.5, 0.3}}; - std::vector> lambda_cones{{4.0, 0.5, 1.0, -0.3, 0.2}}; - std::vector dims{5}; - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); - - launch_nt_scaling(cones, stream_); - - auto inv_eta = copy_to_host(cones.inv_eta); - auto rho = copy_to_host(cones.rho); - auto w_bar = copy_to_host(cones.w_bar); - auto omega = copy_to_host(cones.omega); - - f_t s_J = std::sqrt(j_norm_sq(s_cones[0])); - f_t l_J = std::sqrt(j_norm_sq(lambda_cones[0])); - f_t s_dot_raw = s_cones[0][0] * lambda_cones[0][0]; - for (std::size_t j = 1; j < s_cones[0].size(); ++j) { - s_dot_raw += s_cones[0][j] * lambda_cones[0][j]; - } - f_t s_dot_l = s_dot_raw / (s_J * l_J); - f_t gamma = std::sqrt(std::max(f_t(0), (f_t(1) + s_dot_l) * f_t(0.5))); - f_t w0_from_heads = (s_cones[0][0] / s_J + lambda_cones[0][0] / l_J) / (f_t(2) * gamma); - - f_t omega_tail_sq = f_t(0); - f_t w_omega_tail = f_t(0); - for (std::size_t j = 1; j < omega.size(); ++j) { - omega_tail_sq += omega[j] * omega[j]; - w_omega_tail += w_bar[j] * omega[j]; - } - - EXPECT_NEAR(omega_tail_sq, omega[0] * omega[0] - rho[0], 1e-10) - << "||omega_1||^2 should be derived from omega_0 and rho"; - EXPECT_NEAR(w_bar[0], w0_from_heads, 1e-10) - << "w_bar_0 should be derived directly from normalized cone heads"; - - f_t derived_w_omega = f_t(0.5) * (inv_eta[0] * s_cones[0][0] - lambda_cones[0][0] / inv_eta[0]); - EXPECT_NEAR(w_omega_tail, derived_w_omega, 1e-10) - << "w_bar_1^T omega_1 should be derived from cone heads"; -} - -TEST_F(second_order_cone_test, nt_scaling_near_boundary_is_stable) -{ 
- // s and lambda barely inside the cone: ||tail||^2 ≈ head^2. - std::vector> s_cones{{1.00002, 0.6, 0.8, 1e-4, -2e-4}}; - std::vector> lambda_cones{{1.000015, 0.8, 0.6, -3e-5, 2e-5}}; - std::vector dims{5}; - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); - - launch_nt_scaling(cones, stream_); - - auto inv_eta = copy_to_host(cones.inv_eta); - auto inv_1pw0 = copy_to_host(cones.inv_1pw0); - auto w_bar = copy_to_host(cones.w_bar); - auto omega = copy_to_host(cones.omega); - - f_t eta_val = f_t(1) / inv_eta[0]; - - EXPECT_NEAR(j_norm_sq(w_bar), f_t(1), 1e-8) << "w_bar J-norm not 1 near boundary"; - EXPECT_GT(w_bar[0], tail_norm(w_bar)) << "w_bar not interior near boundary"; - - // Round-trip: H(omega) should equal s. - auto H_omega = ref_apply_H_single(omega, w_bar, eta_val, inv_1pw0[0]); - expect_vector_near(H_omega, pack_cones(s_cones), 1e-8, 1e-6, "H_omega_vs_s_near_boundary"); - - // Symmetry: omega should also equal H*lambda. 
- auto H_lambda = ref_apply_H_single(lambda_cones[0], w_bar, eta_val, inv_1pw0[0]); - expect_vector_near(omega, H_lambda, 1e-8, 1e-6, "omega_vs_H_lambda_near_boundary"); -} - -TEST_F(second_order_cone_test, step_length_matches_reference_and_handles_q1) -{ - std::vector dims{1, 3}; - auto offsets = build_offsets(dims); - - std::vector> s_cones{{2.0}, {5.0, 1.0, 1.0}}; - std::vector> ds_cones{{-3.0}, {-0.5, 0.1, 0.1}}; - std::vector> lambda_cones{{5.0}, {5.0, 1.0, 1.0}}; - std::vector> dlambda_cones{{1.0}, {-0.5, 0.1, 0.1}}; - f_t alpha_max = 10.0; - - auto s = make_device_vector(pack_cones(s_cones)); - auto ds = make_device_vector(pack_cones(ds_cones)); - auto lambda = make_device_vector(pack_cones(lambda_cones)); - auto dlambda = make_device_vector(pack_cones(dlambda_cones)); - auto d_offsets = make_device_vector(offsets); - rmm::device_uvector alpha(dims.size(), stream_); - - launch_step_length( - s, ds, lambda, dlambda, alpha, d_offsets, static_cast(dims.size()), alpha_max); - - auto actual_alpha = copy_to_host(alpha); - std::vector expected_alpha(dims.size(), alpha_max); - for (std::size_t cone = 0; cone < dims.size(); ++cone) { - expected_alpha[cone] = - std::min(ref_step_length_single(s_cones[cone], ds_cones[cone], alpha_max), - ref_step_length_single(lambda_cones[cone], dlambda_cones[cone], alpha_max)); - } - - expect_vector_near(actual_alpha, expected_alpha, 1e-12, 1e-10, "step_length"); - EXPECT_NEAR(actual_alpha[0], 2.0 / 3.0, 1e-12); - EXPECT_NEAR(actual_alpha[1], 5.5903758157691508, 1e-10); -} - -TEST_F(second_order_cone_test, step_length_matches_reference_for_large_cone) -{ - std::vector dims{513}; - auto offsets = build_offsets(dims); - - std::vector> s_cones{{make_patterned_cone(dims[0], 5.0, 0.01)}}; - std::vector> ds_cones{{make_patterned_cone(dims[0], -0.25, 0.002)}}; - std::vector> lambda_cones{{make_patterned_cone(dims[0], 6.0, 0.009)}}; - std::vector> dlambda_cones{{make_patterned_cone(dims[0], -0.15, 0.0015)}}; - f_t alpha_max = 20.0; - - auto 
s = make_device_vector(pack_cones(s_cones)); - auto ds = make_device_vector(pack_cones(ds_cones)); - auto lambda = make_device_vector(pack_cones(lambda_cones)); - auto dlambda = make_device_vector(pack_cones(dlambda_cones)); - auto d_offsets = make_device_vector(offsets); - rmm::device_uvector alpha(dims.size(), stream_); - - launch_step_length( - s, ds, lambda, dlambda, alpha, d_offsets, static_cast(dims.size()), alpha_max); - - auto actual_alpha = copy_to_host(alpha); - std::vector expected_alpha(dims.size(), alpha_max); - for (std::size_t cone = 0; cone < dims.size(); ++cone) { - expected_alpha[cone] = - std::min(ref_step_length_single(s_cones[cone], ds_cones[cone], alpha_max), - ref_step_length_single(lambda_cones[cone], dlambda_cones[cone], alpha_max)); - } - - expect_vector_near(actual_alpha, expected_alpha, 1e-12, 1e-10, "step_length_large"); - EXPECT_GT(actual_alpha[0], 0.0); - EXPECT_LT(actual_alpha[0], alpha_max); -} - -TEST_F(second_order_cone_test, step_length_boundary_c_zero_returns_zero) -{ - std::vector dims{3}; - auto offsets = build_offsets(dims); - - // Boundary point: c = u^T J u = 1^2 - 1^2 - 0^2 = 0. - // Direction: a = du^T J du = 1^2 - 1^2 - 1^2 = -1 < 0. - // The step length is 0 in this case because the direction leaves the cone - // immediately. 
- std::vector> s_cones{{1.0, 1.0, 0.0}}; - std::vector> ds_cones{{1.0, 1.0, 1.0}}; - std::vector> lambda_cones{{1.0, 1.0, 0.0}}; - std::vector> dlambda_cones{{1.0, 1.0, 1.0}}; - f_t alpha_max = 10.0; - - auto s = make_device_vector(pack_cones(s_cones)); - auto ds = make_device_vector(pack_cones(ds_cones)); - auto lambda = make_device_vector(pack_cones(lambda_cones)); - auto dlambda = make_device_vector(pack_cones(dlambda_cones)); - auto d_offsets = make_device_vector(offsets); - rmm::device_uvector alpha(dims.size(), stream_); - - launch_step_length( - s, ds, lambda, dlambda, alpha, d_offsets, static_cast(dims.size()), alpha_max); - - auto actual_alpha = copy_to_host(alpha); - ASSERT_EQ(actual_alpha.size(), 1); - EXPECT_EQ(actual_alpha[0], 0.0); -} - -TEST_F(second_order_cone_test, step_length_degenerate_a_zero) -{ - std::vector dims{2}; - auto offsets = build_offsets(dims); - - // u=(2,0), du=(-1,1): a = du_0^2 - du_1^2 = 1 - 1 = 0 (degenerate quadratic). - // Linear constraint: alpha <= 2. Degenerate branch: alpha = c/(-2b) = 4/2 = 2. - // But the linear constraint also gives alpha <= 2, so result is min(2, 2) = 2... - // Actually b = u0*du0 - u1*du1 = 2*(-1) - 0 = -2, c = u0^2 - u1^2 = 4. - // Degenerate: alpha = c/(-2b) = 4/4 = 1. And linear: alpha <= -u0/du0 = 2. - // So alpha = 1. 
- std::vector> s_cones{{2.0, 0.0}}; - std::vector> ds_cones{{-1.0, 1.0}}; - std::vector> lambda_cones{{5.0, 0.0}}; - std::vector> dlambda_cones{{0.0, 0.0}}; - - auto s = make_device_vector(pack_cones(s_cones)); - auto ds = make_device_vector(pack_cones(ds_cones)); - auto lambda = make_device_vector(pack_cones(lambda_cones)); - auto dlambda = make_device_vector(pack_cones(dlambda_cones)); - auto d_offsets = make_device_vector(offsets); - rmm::device_uvector alpha(1, stream_); - - launch_step_length(s, ds, lambda, dlambda, alpha, d_offsets, 1, 10.0); - - auto actual = copy_to_host(alpha); - EXPECT_NEAR(actual[0], 1.0, 1e-14); -} - -TEST_F(second_order_cone_test, step_length_safe_direction_returns_alpha_max) -{ - std::vector dims{3}; - auto offsets = build_offsets(dims); - - // Interior point with direction along the identity element — stays in cone forever. - std::vector> s_cones{{10.0, 0.0, 0.0}}; - std::vector> ds_cones{{1.0, 0.0, 0.0}}; - std::vector> lambda_cones{{10.0, 0.0, 0.0}}; - std::vector> dlambda_cones{{0.0, 0.1, 0.0}}; - - auto s = make_device_vector(pack_cones(s_cones)); - auto ds = make_device_vector(pack_cones(ds_cones)); - auto lambda = make_device_vector(pack_cones(lambda_cones)); - auto dlambda = make_device_vector(pack_cones(dlambda_cones)); - auto d_offsets = make_device_vector(offsets); - rmm::device_uvector alpha(1, stream_); - - launch_step_length(s, ds, lambda, dlambda, alpha, d_offsets, 1, 1.0); - - auto actual = copy_to_host(alpha); - EXPECT_DOUBLE_EQ(actual[0], 1.0); -} - -TEST_F(second_order_cone_test, step_length_boundary_tightness) -{ - std::vector dims{5}; - auto offsets = build_offsets(dims); - - std::vector> s_cones{{4.0, 1.0, -1.0, 0.5, 0.3}}; - std::vector> ds_cones{{-2.0, 1.0, 0.5, -0.3, 0.1}}; - std::vector> lambda_cones{{5.0, 0.5, 1.0, -0.3, 0.2}}; - std::vector> dlambda_cones{{-1.0, 2.0, 1.0, -0.5, 0.4}}; - - auto s = make_device_vector(pack_cones(s_cones)); - auto ds = make_device_vector(pack_cones(ds_cones)); - auto lambda = 
make_device_vector(pack_cones(lambda_cones)); - auto dlambda = make_device_vector(pack_cones(dlambda_cones)); - auto d_offsets = make_device_vector(offsets); - rmm::device_uvector alpha(1, stream_); - - launch_step_length(s, ds, lambda, dlambda, alpha, d_offsets, 1, 100.0); - - auto a = copy_to_host(alpha)[0]; - ASSERT_GT(a, 0.0); - - // At alpha, at least one of (s, lambda) should be on the cone boundary. - auto s_bnd = s_cones[0]; - auto l_bnd = lambda_cones[0]; - for (std::size_t j = 0; j < s_bnd.size(); ++j) { - s_bnd[j] += a * ds_cones[0][j]; - l_bnd[j] += a * dlambda_cones[0][j]; - } - f_t res_s = j_norm_sq(s_bnd); - f_t res_l = j_norm_sq(l_bnd); - EXPECT_GE(res_s, -1e-10) << "s left the cone"; - EXPECT_GE(res_l, -1e-10) << "lambda left the cone"; - EXPECT_NEAR(std::min(res_s, res_l), 0.0, 1e-10) << "neither hit the boundary"; - - // At (1 − ε) α, both should be strictly interior. - f_t a_int = a * (1.0 - 1e-8); - auto s_int = s_cones[0]; - auto l_int = lambda_cones[0]; - for (std::size_t j = 0; j < s_int.size(); ++j) { - s_int[j] += a_int * ds_cones[0][j]; - l_int[j] += a_int * dlambda_cones[0][j]; - } - EXPECT_GT(j_norm_sq(s_int), 0.0) << "s not interior at (1-eps)*alpha"; - EXPECT_GT(j_norm_sq(l_int), 0.0) << "lambda not interior at (1-eps)*alpha"; -} - -TEST_F(second_order_cone_test, apply_hinv2_matches_reference_for_packed_cones) -{ - std::vector dims{1, 3, 5}; - auto offsets = build_offsets(dims); - - std::vector> v_cones{{3.0}, {2.0, -1.0, 0.5}, {1.0, 0.25, -0.75, 0.5, -0.125}}; - std::vector> w_bar_cones{ - {1.0}, {0.0, 0.15, -0.05}, {0.0, 0.10, -0.20, 0.05, 0.15}}; - std::vector inv_eta_host{0.5, 1.25, 0.75}; - - for (std::size_t cone = 0; cone < w_bar_cones.size(); ++cone) { - f_t w1_sq = f_t(0); - for (std::size_t j = 1; j < w_bar_cones[cone].size(); ++j) { - w1_sq += w_bar_cones[cone][j] * w_bar_cones[cone][j]; - } - w_bar_cones[cone][0] = std::sqrt(f_t(1) + w1_sq); - } - - auto v = make_device_vector(pack_cones(v_cones)); - auto w_bar = 
make_device_vector(pack_cones(w_bar_cones)); - auto inv_eta = make_device_vector(inv_eta_host); - auto d_offsets = make_device_vector(offsets); - rmm::device_uvector out(v.size(), stream_); - - launch_apply_hinv2(v, out, w_bar, inv_eta, d_offsets, static_cast(dims.size())); - - auto actual = copy_to_host(out); - auto expected = pack_cones(std::vector>{ - ref_apply_hinv2_single(v_cones[0], w_bar_cones[0], inv_eta_host[0]), - ref_apply_hinv2_single(v_cones[1], w_bar_cones[1], inv_eta_host[1]), - ref_apply_hinv2_single(v_cones[2], w_bar_cones[2], inv_eta_host[2])}); - - expect_vector_near(actual, expected, 1e-12, 1e-10, "apply_hinv2"); -} - -TEST_F(second_order_cone_test, affine_cone_rhs_matches_hinv2_of_primal) -{ - std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; - std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; - std::vector dims{3, 5}; - auto offsets = build_offsets(dims); - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - cone_data_t cones(static_cast(dims.size()), - dims, - cuopt::make_span(d_s), - cuopt::make_span(d_lambda), - stream_); - launch_nt_scaling(cones, stream_); - - rmm::device_uvector d_out(cones.m_c, stream_); - launch_affine_cone_rhs(cones, d_out); - - auto actual = copy_to_host(d_out); - auto w_bar_host = copy_to_host(cones.w_bar); - auto inv_eta_h = copy_to_host(cones.inv_eta); - - for (i_t c = 0; c < static_cast(dims.size()); ++c) { - auto ref = ref_apply_hinv2_single(s_cones[c], slice_cone(w_bar_host, offsets, c), inv_eta_h[c]); - auto act = slice_cone(actual, offsets, c); - expect_vector_near(act, ref, 1e-10, 1e-8, "affine_cone_rhs"); - } -} - -TEST_F(second_order_cone_test, accumulate_cone_hinv2_matvec_matches_reference) -{ - std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; - std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; - std::vector> x_cones{{0.3, -0.1, 0.2}, {-0.5, 
0.2, 0.1, -0.3, 0.15}}; - std::vector> base_cones{{1.0, 2.0, 3.0}, {0.5, -0.5, 0.25, -0.25, 0.75}}; - std::vector dims{3, 5}; - auto offsets = build_offsets(dims); - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - auto d_x = make_device_vector(pack_cones(x_cones)); - auto d_out = make_device_vector(pack_cones(base_cones)); - cone_data_t cones(static_cast(dims.size()), - dims, - cuopt::make_span(d_s), - cuopt::make_span(d_lambda), - stream_); - launch_nt_scaling(cones, stream_); - - rmm::device_uvector d_hinv2_x(cones.m_c, stream_); - launch_accumulate_cone_hinv2(d_x, cones, d_hinv2_x, d_out); - - auto actual = copy_to_host(d_out); - auto w_bar_host = copy_to_host(cones.w_bar); - auto inv_eta_h = copy_to_host(cones.inv_eta); - - for (i_t c = 0; c < static_cast(dims.size()); ++c) { - auto ref_hinv2 = - ref_apply_hinv2_single(x_cones[c], slice_cone(w_bar_host, offsets, c), inv_eta_h[c]); - auto actual_c = slice_cone(actual, offsets, c); - std::vector ref(actual_c.size()); - for (i_t j = 0; j < static_cast(ref.size()); ++j) { - ref[j] = base_cones[c][j] + ref_hinv2[j]; - } - expect_vector_near(actual_c, ref, 1e-10, 1e-8, "accumulate_cone_hinv2"); - } -} - -TEST_F(second_order_cone_test, scatter_hinv2_into_augmented_matches_reference_with_nt_scaling) -{ - std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; - std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; - std::vector dims{3, 5}; - auto offsets = build_offsets(dims); - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - cone_data_t cones(static_cast(dims.size()), - dims, - cuopt::make_span(d_s), - cuopt::make_span(d_lambda), - stream_); - - launch_nt_scaling(cones, stream_); - - auto block_offsets_host = copy_to_host(cones.block_offsets); - i_t total_blk = dims[0] * dims[0] + dims[1] * dims[1]; - std::vector q_vals(total_blk, f_t(0)); 
- std::vector csr_indices(total_blk); - constexpr i_t aug_offset = 2; - for (i_t e = 0; e < total_blk; ++e) { - csr_indices[e] = aug_offset + (total_blk - 1 - e); - } - auto d_csr_indices = make_device_vector(csr_indices); - auto d_q_values = make_device_vector(q_vals); - rmm::device_uvector d_aug_x(total_blk + aug_offset, stream_); - RAFT_CUDA_TRY( - cudaMemsetAsync(d_aug_x.data(), 0, sizeof(f_t) * (total_blk + aug_offset), stream_)); - launch_cone_block_scatter(cones, d_aug_x, d_csr_indices, d_q_values); - - auto actual = copy_to_host(d_aug_x); - auto w_bar_host = copy_to_host(cones.w_bar); - auto inv_eta_h = copy_to_host(cones.inv_eta); - - for (i_t e = 0; e < aug_offset; ++e) { - EXPECT_EQ(actual[e], f_t(0)) << "untouched prefix entry " << e; - } - - i_t blk_off = 0; - for (i_t c = 0; c < static_cast(dims.size()); ++c) { - auto w_c = slice_cone(w_bar_host, offsets, c); - auto ref = ref_build_hinv2_block_single(w_c, inv_eta_h[c]); - i_t blk_sz = dims[c] * dims[c]; - for (i_t e = 0; e < blk_sz; ++e) { - EXPECT_NEAR(actual[csr_indices[blk_off + e]], -ref[e], 1e-10 + 1e-8 * std::abs(ref[e])) - << "cone " << c << " entry " << e; - } - blk_off += blk_sz; - } -} - -TEST_F(second_order_cone_test, scatter_hinv2_into_augmented_matvec_matches_apply_hinv2) -{ - std::vector> s_cones{{5.0, 1.0, -1.0, 0.5, 0.3}}; - std::vector> lambda_cones{{4.0, 0.5, 1.0, -0.3, 0.2}}; - std::vector dims{5}; - i_t q = dims[0]; - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); - - launch_nt_scaling(cones, stream_); - - i_t total_blk = q * q; - std::vector csr_indices(total_blk); - std::iota(csr_indices.begin(), csr_indices.end(), 0); - std::vector q_vals(total_blk, f_t(0)); - auto d_csr_indices = make_device_vector(csr_indices); - auto d_q_values = make_device_vector(q_vals); - rmm::device_uvector d_aug_x(total_blk, stream_); - 
RAFT_CUDA_TRY(cudaMemsetAsync(d_aug_x.data(), 0, sizeof(f_t) * total_blk, stream_)); - launch_cone_block_scatter(cones, d_aug_x, d_csr_indices, d_q_values); - - auto scattered = copy_to_host(d_aug_x); - std::vector block(total_blk); - for (i_t e = 0; e < total_blk; ++e) { - block[e] = -scattered[e]; - } - - std::vector> test_vectors{ - {1.0, 0.0, 0.0, 0.0, 0.0}, {0.0, 1.0, 0.0, 0.0, 0.0}, {0.3, -0.1, 0.2, -0.5, 0.15}}; - - auto w_bar_host = copy_to_host(cones.w_bar); - auto inv_eta_h = copy_to_host(cones.inv_eta); - - for (const auto& v : test_vectors) { - // Host mat-vec: y = block * v - std::vector y(q, f_t(0)); - for (i_t r = 0; r < q; ++r) { - for (i_t c = 0; c < q; ++c) { - y[r] += block[r * q + c] * v[c]; - } - } - - auto ref = ref_apply_hinv2_single(v, w_bar_host, inv_eta_h[0]); - expect_vector_near(y, ref, 1e-10, 1e-8, "block_matvec_vs_apply"); - } -} - -TEST_F(second_order_cone_test, scatter_hinv2_into_augmented_large_cone) -{ - std::vector dims{513}; - - auto s_cone = make_patterned_cone(dims[0], 5.0, 0.005); - auto lambda_cone = make_patterned_cone(dims[0], 4.0, 0.004); - - auto d_s = make_device_vector(s_cone); - auto d_lambda = make_device_vector(lambda_cone); - cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); - - launch_nt_scaling(cones, stream_); - - i_t total_blk = dims[0] * dims[0]; - std::vector csr_indices(total_blk); - std::iota(csr_indices.begin(), csr_indices.end(), 0); - std::vector q_vals(total_blk, f_t(0)); - auto d_csr_indices = make_device_vector(csr_indices); - auto d_q_values = make_device_vector(q_vals); - rmm::device_uvector d_aug_x(total_blk, stream_); - RAFT_CUDA_TRY(cudaMemsetAsync(d_aug_x.data(), 0, sizeof(f_t) * total_blk, stream_)); - launch_cone_block_scatter(cones, d_aug_x, d_csr_indices, d_q_values); - - auto scattered = copy_to_host(d_aug_x); - std::vector block(total_blk); - for (i_t e = 0; e < total_blk; ++e) { - block[e] = -scattered[e]; - } - auto w_bar_host = 
copy_to_host(cones.w_bar); - auto inv_eta_h = copy_to_host(cones.inv_eta); - - // Spot-check: block * e_0 should match apply_Hinv2(e_0) - i_t q = dims[0]; - std::vector col0(q); - for (i_t r = 0; r < q; ++r) { - col0[r] = block[r * q]; - } - std::vector e0(q, f_t(0)); - e0[0] = f_t(1); - auto ref = ref_apply_hinv2_single(e0, w_bar_host, inv_eta_h[0]); - expect_vector_near(col0, ref, 1e-8, 1e-6, "hinv2_block_col0_large"); - - // Symmetry check: block[r][c] == block[c][r] - for (i_t r = 0; r < std::min(q, i_t(50)); ++r) { - for (i_t c = r + 1; c < std::min(q, i_t(50)); ++c) { - EXPECT_NEAR(block[r * q + c], block[c * q + r], 1e-10) - << "asymmetry at (" << r << "," << c << ")"; - } - } -} - -TEST_F(second_order_cone_test, fused_corrector_matches_reference_with_nt_scaling) -{ - std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; - std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; - std::vector dims{3, 5}; - auto offsets = build_offsets(dims); - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - cone_data_t cones(static_cast(dims.size()), - dims, - cuopt::make_span(d_s), - cuopt::make_span(d_lambda), - stream_); - - launch_nt_scaling(cones, stream_); - - std::vector> dx_aff_cones{{0.3, -0.1, 0.2}, {-0.5, 0.2, 0.1, -0.3, 0.15}}; - f_t sigma_mu = 0.1; - - auto d_dx_aff = make_device_vector(pack_cones(dx_aff_cones)); - rmm::device_uvector d_out(cones.omega.size(), stream_); - - launch_fused_corrector(d_dx_aff, cones, sigma_mu, d_out); - - auto actual = copy_to_host(d_out); - auto omega_host = copy_to_host(cones.omega); - auto w_bar_host = copy_to_host(cones.w_bar); - auto inv_eta_h = copy_to_host(cones.inv_eta); - auto inv_1pw0_h = copy_to_host(cones.inv_1pw0); - auto rho_h = copy_to_host(cones.rho); - - for (i_t c = 0; c < static_cast(dims.size()); ++c) { - auto ref = ref_fused_corrector_single(dx_aff_cones[c], - slice_cone(omega_host, offsets, c), - 
slice_cone(w_bar_host, offsets, c), - inv_eta_h[c], - inv_1pw0_h[c], - rho_h[c], - sigma_mu); - auto act = slice_cone(actual, offsets, c); - expect_vector_near(act, ref, 1e-10, 1e-8, "fused_corrector"); - } -} - -TEST_F(second_order_cone_test, fused_corrector_strided_loop_for_large_cone) -{ - std::vector dims{513}; - auto offsets = build_offsets(dims); - - auto s_cone = make_patterned_cone(dims[0], 5.0, 0.005); - auto lambda_cone = make_patterned_cone(dims[0], 4.0, 0.004); - - auto d_s = make_device_vector(s_cone); - auto d_lambda = make_device_vector(lambda_cone); - cone_data_t cones(1, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); - - launch_nt_scaling(cones, stream_); - - auto dx_aff_cone = make_patterned_cone(dims[0], 0.5, 0.003); - f_t sigma_mu = 0.25; - - auto d_dx_aff = make_device_vector(dx_aff_cone); - rmm::device_uvector d_out(cones.omega.size(), stream_); - - launch_fused_corrector(d_dx_aff, cones, sigma_mu, d_out); - - auto actual = copy_to_host(d_out); - auto omega_host = copy_to_host(cones.omega); - auto w_bar_host = copy_to_host(cones.w_bar); - auto inv_eta_h = copy_to_host(cones.inv_eta); - auto inv_1pw0_h = copy_to_host(cones.inv_1pw0); - auto rho_h = copy_to_host(cones.rho); - - auto ref = ref_fused_corrector_single( - dx_aff_cone, omega_host, w_bar_host, inv_eta_h[0], inv_1pw0_h[0], rho_h[0], sigma_mu); - expect_vector_near(actual, ref, 1e-8, 1e-6, "fused_corrector_large"); -} - -TEST_F(second_order_cone_test, cone_block_scatter_with_q_overlap) -{ - std::vector> s_cones{{3.0, 0.5, -0.3}}; - std::vector> lambda_cones{{2.0, -0.2, 0.4}}; - std::vector dims{3}; - i_t K = 1; - i_t q_k = 3; - i_t total_block_nnz = q_k * q_k; - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - cone_data_t cones(K, dims, cuopt::make_span(d_s), cuopt::make_span(d_lambda), stream_); - launch_nt_scaling(cones, stream_); - - f_t dual_perturb = 1e-6; - std::vector 
q_vals(total_block_nnz, f_t(0)); - q_vals[0] = 0.5 + dual_perturb; - q_vals[4] = 0.3 + dual_perturb; - q_vals[8] = 0.1 + dual_perturb; - q_vals[1] = 0.05; - q_vals[3] = 0.05; - - std::vector cone_csr_indices(total_block_nnz); - std::iota(cone_csr_indices.begin(), cone_csr_indices.end(), 0); - auto d_cone_csr_indices = make_device_vector(cone_csr_indices); - auto d_cone_Q_values = make_device_vector(q_vals); - - rmm::device_uvector d_aug_x(total_block_nnz, stream_); - RAFT_CUDA_TRY(cudaMemsetAsync(d_aug_x.data(), 0, sizeof(f_t) * total_block_nnz, stream_)); - - launch_cone_block_scatter(cones, d_aug_x, d_cone_csr_indices, d_cone_Q_values); - - auto actual = copy_to_host(d_aug_x); - auto w_bar_h = copy_to_host(cones.w_bar); - auto inv_eta_h = copy_to_host(cones.inv_eta); - auto ref_block = ref_build_hinv2_block_single(w_bar_h, inv_eta_h[0]); - - for (i_t e = 0; e < total_block_nnz; ++e) { - f_t expected = -ref_block[e] - q_vals[e]; - EXPECT_NEAR(actual[e], expected, 1e-10 + 1e-8 * std::abs(expected)) - << "entry " << e << " (Q overlap test)"; - } -} - -TEST_F(second_order_cone_test, recover_cone_dz_from_target_matches_reference) -{ - std::vector> s_cones{{2.0, 0.5, 0.25}, {3.0, 0.25, -0.5, 0.75, -0.25}}; - std::vector> lambda_cones{{1.5, -0.25, 0.1}, {2.5, -0.1, 0.3, -0.2, 0.15}}; - std::vector> dx_cones{{0.3, -0.1, 0.2}, {-0.5, 0.2, 0.1, -0.3, 0.15}}; - std::vector dims{3, 5}; - auto offsets = build_offsets(dims); - - auto d_s = make_device_vector(pack_cones(s_cones)); - auto d_lambda = make_device_vector(pack_cones(lambda_cones)); - auto d_dx = make_device_vector(pack_cones(dx_cones)); - cone_data_t cones(static_cast(dims.size()), - dims, - cuopt::make_span(d_s), - cuopt::make_span(d_lambda), - stream_); - launch_nt_scaling(cones, stream_); - - rmm::device_uvector d_rhs(cones.m_c, stream_); - launch_affine_cone_rhs(cones, d_rhs); - auto rhs_actual = copy_to_host(d_rhs); - - std::vector target_host(rhs_actual.size(), f_t(0)); - for (std::size_t j = 0; j < 
rhs_actual.size(); ++j) { - target_host[j] = -rhs_actual[j]; - } - auto d_target = make_device_vector(target_host); - - rmm::device_uvector d_hinv2_dx(cones.m_c, stream_); - rmm::device_uvector d_dz(cones.m_c, stream_); - launch_recover_cone_dz_from_target(d_dx, cones, d_target, d_hinv2_dx, d_dz); - - auto actual = copy_to_host(d_dz); - auto w_bar_host = copy_to_host(cones.w_bar); - auto inv_eta_h = copy_to_host(cones.inv_eta); - - for (i_t c = 0; c < static_cast(dims.size()); ++c) { - auto ref_hinv2 = - ref_apply_hinv2_single(dx_cones[c], slice_cone(w_bar_host, offsets, c), inv_eta_h[c]); - auto rhs_c = slice_cone(rhs_actual, offsets, c); - auto act = slice_cone(actual, offsets, c); - std::vector ref(act.size()); - for (i_t j = 0; j < static_cast(ref.size()); ++j) { - ref[j] = -rhs_c[j] - ref_hinv2[j]; - } - expect_vector_near(act, ref, 1e-10, 1e-8, "recover_cone_dz_from_target"); - } -} - -} // namespace cuopt::linear_programming::dual_simplex::test From a20ac270a29893f3fdc5051e5d6ada96d6de8b96 Mon Sep 17 00:00:00 2001 From: Yan Zaretskiy Date: Tue, 28 Apr 2026 12:22:58 -0700 Subject: [PATCH 22/22] Support direct MPS SOC conversion --- cpp/src/barrier/barrier.cu | 5 +- cpp/src/dual_simplex/presolve.cpp | 66 --- cpp/src/dual_simplex/solve.cpp | 45 -- cpp/src/dual_simplex/user_problem.hpp | 2 - cpp/src/pdlp/translate.hpp | 184 ++++++-- .../dual_simplex/unit_tests/solve_barrier.cu | 402 ------------------ 6 files changed, 147 insertions(+), 557 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 475879c8fd..bdb571ec32 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -2626,8 +2626,9 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t 0 && data.Q_diagonal) { + // In ADAT mode, diagonal Q is folded into D. In augmented mode, Q is an explicit block in the + // KKT matrix, so adding it here would double count the quadratic objective. 
+ if (!use_augmented && data.Q.n > 0 && data.Q_diagonal) { cub::DeviceTransform::Transform( cuda::std::make_tuple(data.d_Q_diag_.data(), data.d_diag_.data()), data.d_diag_.data(), diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp index 7adee4525a..ca98dc9da5 100644 --- a/cpp/src/dual_simplex/presolve.cpp +++ b/cpp/src/dual_simplex/presolve.cpp @@ -15,7 +15,6 @@ #include #include -#include namespace cuopt::linear_programming::dual_simplex { @@ -25,68 +24,6 @@ static i_t linear_var_count(const lp_problem_t& problem) return problem.second_order_cone_dims.empty() ? problem.num_cols : problem.cone_var_start; } -template -static void lift_second_order_cone_rows(const user_problem_t& user_problem, - std::vector& row_sense, - lp_problem_t& problem) -{ - if (user_problem.second_order_cone_row_dims.empty()) { return; } - - auto& dims = user_problem.second_order_cone_row_dims; - const i_t lifted_row_count = std::accumulate(dims.begin(), dims.end(), i_t{0}); - const i_t cone_row_start = user_problem.cone_row_start; - - const i_t old_num_cols = problem.num_cols; - const i_t new_num_cols = old_num_cols + lifted_row_count; - const i_t old_nnz = problem.A.col_start[old_num_cols]; - const i_t new_nnz = old_nnz + lifted_row_count; - - auto old_A = problem.A; - csc_matrix_t lifted_A(problem.num_rows, new_num_cols, new_nnz); - - i_t nz = 0; - for (i_t j = 0; j < old_num_cols; ++j) { - lifted_A.col_start[j] = nz; - for (i_t p = old_A.col_start[j]; p < old_A.col_start[j + 1]; ++p) { - lifted_A.i[nz] = old_A.i[p]; - lifted_A.x[nz] = old_A.x[p]; - ++nz; - } - } - - for (i_t offset = 0; offset < lifted_row_count; ++offset) { - const i_t j = old_num_cols + offset; - const i_t row = cone_row_start + offset; - lifted_A.col_start[j] = nz; - lifted_A.i[nz] = row; - lifted_A.x[nz] = 1.0; - row_sense[row] = 'E'; - ++nz; - } - lifted_A.col_start[new_num_cols] = nz; - assert(nz == new_nnz); - - std::vector objective(new_num_cols, 0.0); - std::vector 
lower(new_num_cols, 0.0); - std::vector upper(new_num_cols, inf); - for (i_t j = 0; j < old_num_cols; ++j) { - objective[j] = problem.objective[j]; - lower[j] = problem.lower[j]; - upper[j] = problem.upper[j]; - } - - problem.A = lifted_A; - problem.A.n = new_num_cols; - problem.objective = objective; - problem.lower = lower; - problem.upper = upper; - problem.num_cols = new_num_cols; - if (problem.second_order_cone_dims.empty()) { problem.cone_var_start = old_num_cols; } - problem.second_order_cone_dims.insert(problem.second_order_cone_dims.end(), - user_problem.second_order_cone_row_dims.begin(), - user_problem.second_order_cone_row_dims.end()); -} - template i_t remove_empty_cols(lp_problem_t& problem, i_t& num_empty_cols, @@ -759,9 +696,6 @@ void convert_user_problem(const user_problem_t& user_problem, // Make a copy of row_sense so we can modify it std::vector row_sense = user_problem.row_sense; - if (settings.barrier && !user_problem.second_order_cone_row_dims.empty()) { - lift_second_order_cone_rows(user_problem, row_sense, problem); - } // The original problem can have constraints in the form // a_i^T x >= b, a_i^T x <= b, and a_i^T x == b diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index 7d18d192eb..3755931a12 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -37,48 +37,6 @@ namespace cuopt::linear_programming::dual_simplex { namespace { -template -bool validate_second_order_cone_row_metadata(const user_problem_t& user_problem, - const simplex_solver_settings_t& settings) -{ - if (user_problem.second_order_cone_row_dims.empty()) { return true; } - - i_t lifted_row_count = 0; - for (auto q_k : user_problem.second_order_cone_row_dims) { - if (q_k < 0) { - settings.log.printf("Error: second-order cone row dimensions must be nonnegative\n"); - return false; - } - lifted_row_count += q_k; - } - - if (user_problem.cone_row_start < 0) { - settings.log.printf("Error: cone_row_start must be 
nonnegative\n"); - return false; - } - - const i_t cone_row_end = user_problem.cone_row_start + lifted_row_count; - if (cone_row_end > user_problem.num_rows) { - settings.log.printf("Error: second-order cone row block exceeds the number of rows\n"); - return false; - } - - if (user_problem.num_range_rows > static_cast(user_problem.range_rows.size())) { - settings.log.printf("Error: range row metadata is inconsistent\n"); - return false; - } - - for (i_t k = 0; k < user_problem.num_range_rows; ++k) { - const i_t row = user_problem.range_rows[k]; - if (row >= user_problem.cone_row_start && row < cone_row_end) { - settings.log.printf("Error: range rows cannot intersect the second-order cone row block\n"); - return false; - } - } - - return true; -} - template void write_matlab(const std::string& filename, const dual_simplex::lp_problem_t& lp) { @@ -429,9 +387,6 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us simplex_solver_settings_t barrier_settings = settings; barrier_settings.barrier_presolve = true; dualize_info_t dualize_info; - if (!validate_second_order_cone_row_metadata(user_problem, settings)) { - return lp_status_t::NUMERICAL_ISSUES; - } convert_user_problem(user_problem, barrier_settings, original_lp, new_slacks, dualize_info); if (!validate_barrier_cone_layout(original_lp, settings)) { return lp_status_t::NUMERICAL_ISSUES; diff --git a/cpp/src/dual_simplex/user_problem.hpp b/cpp/src/dual_simplex/user_problem.hpp index db62891185..8b0588064c 100644 --- a/cpp/src/dual_simplex/user_problem.hpp +++ b/cpp/src/dual_simplex/user_problem.hpp @@ -54,8 +54,6 @@ struct user_problem_t { std::vector Q_values; i_t cone_var_start{0}; std::vector second_order_cone_dims; - i_t cone_row_start{0}; - std::vector second_order_cone_row_dims; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/pdlp/translate.hpp b/cpp/src/pdlp/translate.hpp index 4e5c0be991..1b39dffcaa 100644 --- a/cpp/src/pdlp/translate.hpp +++ 
b/cpp/src/pdlp/translate.hpp @@ -15,6 +15,10 @@ #include +#include +#include +#include + namespace cuopt::linear_programming { template @@ -110,18 +114,18 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( error_type_t::ValidationError, "Quadratic-constraint flag is set, but no constraints were provided"); - const i_t original_rows = static_cast(user_problem.num_rows); // Use a practical tolerance for text-parsed MPS numeric values. const f_t tol = std::numeric_limits::epsilon() * 2; - // SOC: Q is n×n diagonal CSR (offsets length n+1). Exactly q_n = nnz on the main diagonal, at - // q_n distinct variable indices: one −1 (head) and (q_n−1) +1 (tails). Lifting: q_n rows, each - // with −1 in one column; the **first** row must be the head (variable with Q = −1); order of - // remaining rows (+1 diagonals) is unconstrained (CSR row scan order). - - const i_t old_nnz = csr_A.row_start[original_rows]; - std::vector row_cone_dims{}; - row_cone_dims.reserve(qcs.size()); + // SOC conversion accepts only diagonal Lorentz-form QCMATRIX rows: + // -x_head^2 + sum_i x_tail_i^2 <= 0. + // The barrier consumes SOCs as trailing variable blocks [head, tails...], so we validate all + // QCMATRIX blocks first, then apply a single column permutation to the linear model. 
+ std::vector> cone_vars; + std::vector cone_dims; + std::vector is_cone_var(static_cast(n), 0); + cone_vars.reserve(qcs.size()); + cone_dims.reserve(qcs.size()); for (const auto& qc : qcs) { cuopt_expects(qc.constraint_row_type == 'L', @@ -155,7 +159,7 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( cuopt_expects( qc.quadratic_offsets.size() == static_cast(n) + 1, error_type_t::ValidationError, - "Quadratic constraint '%s' Q must be n×n in CSR: expected %zu CSR row pointers (offsets " + "Quadratic constraint '%s' Q must be n by n in CSR: expected %zu CSR row pointers (offsets " "length n+1), got %zu (n = %d)", qc.constraint_row_name.c_str(), static_cast(n) + 1, @@ -174,7 +178,7 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( "Quadratic constraint '%s' Q CSR offsets[0] must be 0", qc.constraint_row_name.c_str()); - // Verify Q: n×n CSR, diagonal entries only, Lorentz pattern, then build the lift. + // Verify Q: n by n CSR, diagonal entries only, Lorentz pattern. // Scan each row r: empty or one nnz on (r,r) with value -1 (head) or +1 (tail); // tail order follows this scan; no requirement that diagonal indices be sorted. 
i_t head = static_cast(-1); @@ -185,6 +189,13 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( for (i_t r = 0; r < n; ++r) { const i_t p_beg = qc.quadratic_offsets[static_cast(r)]; const i_t p_end = qc.quadratic_offsets[static_cast(r + 1)]; + cuopt_expects(p_beg >= 0 && p_beg <= p_end && p_end <= q_n, + error_type_t::ValidationError, + "Quadratic constraint '%s' Q row %d has invalid CSR offsets [%d, %d)", + qc.constraint_row_name.c_str(), + static_cast(r), + static_cast(p_beg), + static_cast(p_end)); if (p_beg == p_end) { continue; } @@ -247,44 +258,137 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( "Quadratic constraint '%s' SOC Q: internal error (head index invalid)", qc.constraint_row_name.c_str()); - row_cone_dims.push_back(q_n); - dual_simplex::csr_matrix_t lift_block(q_n, n, q_n); - for (i_t t = 0; t <= q_n; ++t) { - lift_block.row_start[t] = t; + std::vector cone; + cone.reserve(static_cast(q_n)); + cone.push_back(head); + cone.insert(cone.end(), tail_row_vars.begin(), tail_row_vars.end()); + for (const i_t var : cone) { + cuopt_expects(!is_cone_var[static_cast(var)], + error_type_t::ValidationError, + "Variable %d appears in more than one SOC QCMATRIX block; overlapping cones " + "are not supported", + static_cast(var)); + is_cone_var[static_cast(var)] = 1; } + cone_dims.push_back(q_n); + cone_vars.push_back(std::move(cone)); + } - // One lift row per cone component: -1 in column head, then -1 in each tail column (since our - // slack variable is done by + s form) (order matches tail_row_vars from the Q scan). 
- lift_block.j[0] = head; - lift_block.x[0] = f_t(-1); - for (i_t t = 0; t < q_n - 1; ++t) { - lift_block.j[static_cast(t) + 1U] = tail_row_vars[static_cast(t)]; - lift_block.x[static_cast(t) + 1U] = f_t(-1); + std::vector old_to_new(static_cast(n), i_t{-1}); + std::vector new_to_old; + new_to_old.reserve(static_cast(n)); + for (i_t j = 0; j < n; ++j) { + if (is_cone_var[static_cast(j)]) { continue; } + old_to_new[static_cast(j)] = static_cast(new_to_old.size()); + new_to_old.push_back(j); + } + const i_t cone_var_start = static_cast(new_to_old.size()); + for (const auto& cone : cone_vars) { + for (const i_t old_j : cone) { + old_to_new[static_cast(old_j)] = static_cast(new_to_old.size()); + new_to_old.push_back(old_j); } - cuopt_expects(csr_A.append_rows(lift_block) == 0, - error_type_t::RuntimeError, - "Internal error while appending SOC lifting rows to CSR A"); } - - // Update user_problem to include the new SOC rows - const i_t next_row = static_cast(csr_A.m); - const i_t lifted_rows = next_row - original_rows; - const i_t new_nnz = old_nnz + lifted_rows; - cuopt_expects(csr_A.row_start[next_row] == new_nnz, + cuopt_expects(static_cast(new_to_old.size()) == n, error_type_t::RuntimeError, - "Internal error while building SOC lifting rows in CSR A"); + "Internal error while building SOC variable permutation"); + + for (i_t row = 0; row < csr_A.m; ++row) { + for (i_t p = csr_A.row_start[static_cast(row)]; + p < csr_A.row_start[static_cast(row + 1)]; + ++p) { + const i_t old_j = csr_A.j[static_cast(p)]; + cuopt_expects(old_j >= 0 && old_j < n, + error_type_t::ValidationError, + "Linear constraint matrix column index %d is outside [0, %d)", + static_cast(old_j), + static_cast(n)); + csr_A.j[static_cast(p)] = old_to_new[static_cast(old_j)]; + } + } - user_problem.rhs.resize(next_row, f_t(0)); - user_problem.row_sense.resize(next_row, 'E'); - if (user_problem.row_names.size() == static_cast(original_rows)) { - for (i_t r = original_rows; r < next_row; ++r) { - 
user_problem.row_names.push_back("_CUOPT_soc_row_" + std::to_string(r - original_rows)); + auto permute_dense_by_old_to_new = [&](auto& values, const char* name) { + if (values.empty()) { return; } + using value_t = typename std::decay_t::value_type; + cuopt_expects(values.size() == static_cast(n), + error_type_t::ValidationError, + "%s length %zu does not match number of variables %d", + name, + values.size(), + static_cast(n)); + std::vector permuted(values.size()); + for (i_t old_j = 0; old_j < n; ++old_j) { + permuted[static_cast(old_to_new[static_cast(old_j)])] = + std::move(values[static_cast(old_j)]); } + values = std::move(permuted); + }; + + permute_dense_by_old_to_new(user_problem.objective, "objective"); + permute_dense_by_old_to_new(user_problem.lower, "lower bounds"); + permute_dense_by_old_to_new(user_problem.upper, "upper bounds"); + permute_dense_by_old_to_new(user_problem.var_types, "variable types"); + permute_dense_by_old_to_new(user_problem.col_names, "column names"); + + if (!user_problem.Q_values.empty()) { + cuopt_expects(user_problem.Q_indices.size() == user_problem.Q_values.size(), + error_type_t::ValidationError, + "Quadratic objective indices and values length mismatch"); + cuopt_expects(user_problem.Q_offsets.size() == static_cast(n) + 1, + error_type_t::ValidationError, + "Quadratic objective CSR offsets length must be n+1 when SOC QCMATRIX " + "conversion permutes variables"); + cuopt_expects(user_problem.Q_offsets[0] == 0, + error_type_t::ValidationError, + "Quadratic objective CSR offsets[0] must be 0"); + cuopt_expects(user_problem.Q_offsets[static_cast(n)] == + static_cast(user_problem.Q_values.size()), + error_type_t::ValidationError, + "Quadratic objective CSR last offset must equal number of nonzeros"); + + std::vector q_offsets(static_cast(n) + 1, 0); + for (i_t old_row = 0; old_row < n; ++old_row) { + const i_t p_beg = user_problem.Q_offsets[static_cast(old_row)]; + const i_t p_end = user_problem.Q_offsets[static_cast(old_row 
+ 1)]; + cuopt_expects( + p_beg >= 0 && p_beg <= p_end && p_end <= static_cast(user_problem.Q_values.size()), + error_type_t::ValidationError, + "Quadratic objective CSR offsets are invalid at row %d", + static_cast(old_row)); + const i_t new_row = old_to_new[static_cast(old_row)]; + q_offsets[static_cast(new_row + 1)] = p_end - p_beg; + } + for (i_t row = 0; row < n; ++row) { + q_offsets[static_cast(row + 1)] += q_offsets[static_cast(row)]; + } + + std::vector q_indices(user_problem.Q_indices.size()); + std::vector q_values(user_problem.Q_values.size()); + auto q_write = q_offsets; + for (i_t old_row = 0; old_row < n; ++old_row) { + const i_t new_row = old_to_new[static_cast(old_row)]; + for (i_t p = user_problem.Q_offsets[static_cast(old_row)]; + p < user_problem.Q_offsets[static_cast(old_row + 1)]; + ++p) { + const i_t old_col = user_problem.Q_indices[static_cast(p)]; + cuopt_expects(old_col >= 0 && old_col < n, + error_type_t::ValidationError, + "Quadratic objective column index %d is outside [0, %d)", + static_cast(old_col), + static_cast(n)); + const i_t dst = q_write[static_cast(new_row)]++; + q_indices[static_cast(dst)] = old_to_new[static_cast(old_col)]; + q_values[static_cast(dst)] = user_problem.Q_values[static_cast(p)]; + } + } + + user_problem.Q_offsets = std::move(q_offsets); + user_problem.Q_indices = std::move(q_indices); + user_problem.Q_values = std::move(q_values); } - user_problem.num_rows = next_row; - user_problem.cone_row_start = original_rows; - user_problem.second_order_cone_row_dims = std::move(row_cone_dims); + user_problem.cone_var_start = cone_var_start; + user_problem.second_order_cone_dims = std::move(cone_dims); } csr_A.to_compressed_col(user_problem.A); diff --git a/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu b/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu index 1b4a866221..d630bfac18 100644 --- a/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu +++ b/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu @@ -32,72 +32,6 
@@ static void init_handler(const raft::handle_t* handle_ptr) handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); } -template -static void populate_basic_qp_socp_problem(user_problem_t& user_problem, - bool explicit_cone_variables) -{ - constexpr i_t num_rows = 9; - constexpr f_t p00 = static_cast(1.4652521089139698); - constexpr f_t p01 = static_cast(0.6137176286085666); - constexpr f_t p02 = static_cast(-1.1527861771130112); - constexpr f_t p11 = static_cast(2.219109946678485); - constexpr f_t p12 = static_cast(-1.4400420548730628); - constexpr f_t p22 = static_cast(1.6014483534926371); - - user_problem.num_rows = num_rows; - user_problem.rhs = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0}; - user_problem.row_sense = {'L', 'L', 'L', 'L', 'L', 'L', 'E', 'E', 'E'}; - user_problem.num_range_rows = 0; - - if (explicit_cone_variables) { - user_problem.num_cols = 6; - user_problem.objective = {0.1, -2.0, 1.0, 0.0, 0.0, 0.0}; - - user_problem.A.m = num_rows; - user_problem.A.n = user_problem.num_cols; - user_problem.A.nz_max = 12; - user_problem.A.reallocate(12); - user_problem.A.col_start = {0, 3, 6, 9, 10, 11, 12}; - user_problem.A.i = {0, 3, 6, 1, 4, 7, 2, 5, 8, 6, 7, 8}; - user_problem.A.x = {2.0, -2.0, 1.0, 2.0, -2.0, 1.0, 2.0, -2.0, 1.0, 1.0, 1.0, 1.0}; - - user_problem.lower = {-inf, -inf, -inf, 0.0, 0.0, 0.0}; - user_problem.upper.assign(user_problem.num_cols, inf); - - user_problem.Q_offsets = {0, 3, 6, 9, 9, 9, 9}; - user_problem.Q_indices = {0, 1, 2, 0, 1, 2, 0, 1, 2}; - user_problem.Q_values = {p00, p01, p02, p01, p11, p12, p02, p12, p22}; - - user_problem.cone_var_start = 3; - user_problem.second_order_cone_dims = {3}; - user_problem.problem_name = "basic_qp_socp_explicit_cone"; - } else { - user_problem.num_cols = 3; - user_problem.objective = {0.1, -2.0, 1.0}; - - user_problem.A.m = num_rows; - user_problem.A.n = user_problem.num_cols; - user_problem.A.nz_max = 9; - user_problem.A.reallocate(9); - 
user_problem.A.col_start = {0, 3, 6, 9}; - user_problem.A.i = {0, 3, 6, 1, 4, 7, 2, 5, 8}; - user_problem.A.x = {2.0, -2.0, 1.0, 2.0, -2.0, 1.0, 2.0, -2.0, 1.0}; - - user_problem.lower.assign(user_problem.num_cols, -inf); - user_problem.upper.assign(user_problem.num_cols, inf); - - user_problem.Q_offsets = {0, 3, 6, 9}; - user_problem.Q_indices = {0, 1, 2, 0, 1, 2, 0, 1, 2}; - user_problem.Q_values = {p00, p01, p02, p01, p11, p12, p02, p12, p22}; - - user_problem.cone_row_start = 6; - user_problem.second_order_cone_row_dims = {3}; - user_problem.problem_name = "basic_qp_socp_row_cone"; - } - - user_problem.var_types.assign(user_problem.num_cols, variable_type_t::CONTINUOUS); -} - TEST(barrier, chess_set) { namespace dual_simplex = cuopt::linear_programming::dual_simplex; @@ -299,225 +233,6 @@ TEST(barrier, cone_metadata_reindexed_when_slack_is_inserted_before_cones) EXPECT_EQ(barrier_lp.cone_var_start, 2); } -TEST(barrier, row_cone_block_is_lifted_into_trailing_cone_variables) -{ - raft::handle_t handle{}; - init_handler(&handle); - - using namespace cuopt::linear_programming::dual_simplex; - user_problem_t user_problem(&handle); - - constexpr int m = 3; - constexpr int n = 2; - constexpr int nz = 4; - - user_problem.num_rows = m; - user_problem.num_cols = n; - user_problem.objective = {0.0, 0.0}; - - user_problem.A.m = m; - user_problem.A.n = n; - user_problem.A.nz_max = nz; - user_problem.A.reallocate(nz); - user_problem.A.col_start = {0, 2, 4}; - user_problem.A.i[0] = 0; - user_problem.A.x[0] = 1.0; - user_problem.A.i[1] = 2; - user_problem.A.x[1] = 1.0; - user_problem.A.i[2] = 1; - user_problem.A.x[2] = -1.0; - user_problem.A.i[3] = 2; - user_problem.A.x[3] = 2.0; - - user_problem.rhs = {3.0, 1.0, 4.0}; - user_problem.row_sense = {'E', 'E', 'E'}; - user_problem.lower.assign(n, 0.0); - user_problem.upper.assign(n, inf); - user_problem.num_range_rows = 0; - user_problem.cone_row_start = 0; - user_problem.second_order_cone_row_dims = {3}; - - 
simplex_solver_settings_t settings; - settings.barrier = true; - settings.barrier_presolve = true; - settings.dualize = 0; - settings.scale_columns = false; - - std::vector new_slacks; - dualize_info_t dualize_info; - lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); - convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); - - EXPECT_TRUE(new_slacks.empty()); - EXPECT_EQ(original_lp.num_cols, 5); - EXPECT_EQ(original_lp.cone_var_start, 2); - EXPECT_EQ(original_lp.second_order_cone_dims, std::vector({3})); - - for (int j = 2; j < 5; ++j) { - EXPECT_EQ(original_lp.A.col_start[j + 1] - original_lp.A.col_start[j], 1); - EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[j]], j - 2); - EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[j]], 1.0); - EXPECT_EQ(original_lp.objective[j], 0.0); - EXPECT_EQ(original_lp.lower[j], 0.0); - EXPECT_EQ(original_lp.upper[j], inf); - } -} - -TEST(barrier, row_cone_block_and_scalar_inequality_order_as_linear_slack_then_cone) -{ - raft::handle_t handle{}; - init_handler(&handle); - - using namespace cuopt::linear_programming::dual_simplex; - user_problem_t user_problem(&handle); - - constexpr int m = 4; - constexpr int n = 1; - constexpr int nz = 2; - - user_problem.num_rows = m; - user_problem.num_cols = n; - user_problem.objective = {1.0}; - - user_problem.A.m = m; - user_problem.A.n = n; - user_problem.A.nz_max = nz; - user_problem.A.reallocate(nz); - user_problem.A.col_start = {0, 2}; - user_problem.A.i[0] = 0; - user_problem.A.x[0] = 1.0; - user_problem.A.i[1] = 1; - user_problem.A.x[1] = -1.0; - - user_problem.rhs = {2.0, 0.0, 1.0, 0.0}; - user_problem.row_sense = {'L', 'E', 'E', 'E'}; - user_problem.lower = {0.0}; - user_problem.upper = {inf}; - user_problem.num_range_rows = 0; - user_problem.cone_row_start = 1; - user_problem.second_order_cone_row_dims = {3}; - - simplex_solver_settings_t settings; - settings.barrier = true; - settings.barrier_presolve = true; - settings.dualize = 0; - 
settings.scale_columns = false; - - std::vector new_slacks; - dualize_info_t dualize_info; - lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); - convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); - - ASSERT_EQ(new_slacks.size(), 1); - EXPECT_EQ(new_slacks[0], 1); - EXPECT_EQ(original_lp.num_cols, 5); - EXPECT_EQ(original_lp.cone_var_start, 2); - EXPECT_EQ(original_lp.second_order_cone_dims, std::vector({3})); - - EXPECT_EQ(original_lp.A.col_start[1] - original_lp.A.col_start[0], 2); - EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[0]], 0); - EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[0]], 1.0); - EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[0] + 1], 1); - EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[0] + 1], -1.0); - - EXPECT_EQ(original_lp.A.col_start[2] - original_lp.A.col_start[1], 1); - EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[1]], 0); - EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[1]], 1.0); - - for (int j = 2; j < 5; ++j) { - EXPECT_EQ(original_lp.A.col_start[j + 1] - original_lp.A.col_start[j], 1); - EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[j]], j - 1); - EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[j]], 1.0); - EXPECT_EQ(original_lp.objective[j], 0.0); - EXPECT_EQ(original_lp.lower[j], 0.0); - EXPECT_EQ(original_lp.upper[j], inf); - } -} - -TEST(barrier, explicit_and_lifted_cones_stay_contiguous_after_scalar_slack_insertion) -{ - raft::handle_t handle{}; - init_handler(&handle); - - using namespace cuopt::linear_programming::dual_simplex; - user_problem_t user_problem(&handle); - - constexpr int m = 4; - constexpr int n = 4; - constexpr int nz = 5; - - user_problem.num_rows = m; - user_problem.num_cols = n; - user_problem.objective = {0.0, 0.0, 0.0, 0.0}; - - user_problem.A.m = m; - user_problem.A.n = n; - user_problem.A.nz_max = nz; - user_problem.A.reallocate(nz); - user_problem.A.col_start = {0, 2, 3, 4, 5}; - user_problem.A.i[0] = 0; - user_problem.A.x[0] = 1.0; 
- user_problem.A.i[1] = 1; - user_problem.A.x[1] = -1.0; - user_problem.A.i[2] = 1; - user_problem.A.x[2] = 1.0; - user_problem.A.i[3] = 2; - user_problem.A.x[3] = 2.0; - user_problem.A.i[4] = 3; - user_problem.A.x[4] = -3.0; - - user_problem.rhs = {2.0, 0.0, 0.0, 0.0}; - user_problem.row_sense = {'L', 'E', 'E', 'E'}; - user_problem.lower.assign(n, 0.0); - user_problem.upper.assign(n, inf); - user_problem.num_range_rows = 0; - user_problem.cone_var_start = 1; - user_problem.second_order_cone_dims = {3}; - user_problem.cone_row_start = 1; - user_problem.second_order_cone_row_dims = {3}; - user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); - - simplex_solver_settings_t settings; - settings.barrier = true; - settings.barrier_presolve = true; - settings.dualize = 0; - settings.scale_columns = false; - - std::vector new_slacks; - dualize_info_t dualize_info; - lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); - convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); - - ASSERT_EQ(new_slacks.size(), 1); - EXPECT_EQ(new_slacks[0], 1); - EXPECT_EQ(original_lp.num_cols, 8); - EXPECT_EQ(original_lp.cone_var_start, 2); - EXPECT_EQ(original_lp.second_order_cone_dims, std::vector({3, 3})); - - EXPECT_EQ(original_lp.A.col_start[2] - original_lp.A.col_start[1], 1); - EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[1]], 0); - EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[1]], 1.0); - - EXPECT_EQ(original_lp.A.col_start[3] - original_lp.A.col_start[2], 1); - EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[2]], 1); - EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[2]], 1.0); - EXPECT_EQ(original_lp.A.col_start[4] - original_lp.A.col_start[3], 1); - EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[3]], 2); - EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[3]], 2.0); - EXPECT_EQ(original_lp.A.col_start[5] - original_lp.A.col_start[4], 1); - EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[4]], 3); - 
EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[4]], -3.0); - - for (int j = 5; j < 8; ++j) { - EXPECT_EQ(original_lp.A.col_start[j + 1] - original_lp.A.col_start[j], 1); - EXPECT_EQ(original_lp.A.i[original_lp.A.col_start[j]], j - 4); - EXPECT_EQ(original_lp.A.x[original_lp.A.col_start[j]], 1.0); - EXPECT_EQ(original_lp.objective[j], 0.0); - EXPECT_EQ(original_lp.lower[j], 0.0); - EXPECT_EQ(original_lp.upper[j], inf); - } -} - TEST(barrier, presolve_reindexes_cone_start_after_empty_column_removal) { raft::handle_t handle{}; @@ -779,123 +494,6 @@ TEST(barrier, socp_min_x0_subject_to_norm_constraint) EXPECT_NEAR(std::abs(solution.x[2]), 0.0, 1e-4); } -TEST(barrier, socp_min_x_subject_to_row_cone_metadata) -{ - // minimize x - // subject to -x + s_0 = 0 - // s_1 = 1 - // s_2 = 0 - // (s_0, s_1, s_2) in Q^3 - // - // Optimal: x* = 1, obj* = 1. - - raft::handle_t handle{}; - init_handler(&handle); - - using namespace cuopt::linear_programming::dual_simplex; - user_problem_t user_problem(&handle); - - constexpr int m = 3; - constexpr int n = 1; - constexpr int nz = 1; - - user_problem.num_rows = m; - user_problem.num_cols = n; - user_problem.objective = {1.0}; - - user_problem.A.m = m; - user_problem.A.n = n; - user_problem.A.nz_max = nz; - user_problem.A.reallocate(nz); - user_problem.A.col_start = {0, 1}; - user_problem.A.i[0] = 0; - user_problem.A.x[0] = -1.0; - - user_problem.rhs = {0.0, 1.0, 0.0}; - user_problem.row_sense = {'E', 'E', 'E'}; - user_problem.lower = {0.0}; - user_problem.upper = {inf}; - user_problem.num_range_rows = 0; - user_problem.problem_name = "socp_row_cone_metadata"; - user_problem.cone_row_start = 0; - user_problem.second_order_cone_row_dims = {3}; - user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); - - simplex_solver_settings_t settings; - settings.barrier = true; - settings.barrier_presolve = false; - settings.dualize = 0; - settings.scale_columns = false; - - lp_solution_t solution(m, n); - auto status = 
solve_linear_program_with_barrier(user_problem, settings, solution); - - EXPECT_EQ(status, lp_status_t::OPTIMAL); - EXPECT_NEAR(solution.objective, 1.0, 1e-4); - EXPECT_NEAR(solution.x[0], 1.0, 1e-4); -} - -TEST(barrier, basic_qp_socp_row_cone_matches_reference_solution) -{ - raft::handle_t handle{}; - init_handler(&handle); - - using namespace cuopt::linear_programming::dual_simplex; - user_problem_t user_problem(&handle); - populate_basic_qp_socp_problem(user_problem, false); - - simplex_solver_settings_t settings; - settings.barrier = true; - settings.barrier_presolve = false; - settings.dualize = 0; - settings.scale_columns = false; - - lp_solution_t solution(user_problem.num_rows, user_problem.num_cols); - auto status = solve_linear_program_with_barrier(user_problem, settings, solution); - - EXPECT_EQ(status, lp_status_t::OPTIMAL); - EXPECT_NEAR(solution.x[0], -0.5, 1e-3); - EXPECT_NEAR(solution.x[1], 0.435603, 1e-3); - EXPECT_NEAR(solution.x[2], -0.245459, 1e-3); - EXPECT_NEAR(solution.objective, -0.84590, 1e-3); -} - -TEST(barrier, basic_qp_socp_row_cone_matches_explicit_cone_formulation) -{ - raft::handle_t handle{}; - init_handler(&handle); - - using namespace cuopt::linear_programming::dual_simplex; - user_problem_t row_cone_problem(&handle); - user_problem_t explicit_cone_problem(&handle); - populate_basic_qp_socp_problem(row_cone_problem, false); - populate_basic_qp_socp_problem(explicit_cone_problem, true); - - simplex_solver_settings_t settings; - settings.barrier = true; - settings.barrier_presolve = false; - settings.dualize = 0; - settings.scale_columns = false; - - lp_solution_t row_cone_solution(row_cone_problem.num_rows, - row_cone_problem.num_cols); - lp_solution_t explicit_cone_solution(explicit_cone_problem.num_rows, - explicit_cone_problem.num_cols); - - auto row_cone_status = - solve_linear_program_with_barrier(row_cone_problem, settings, row_cone_solution); - auto explicit_status = - solve_linear_program_with_barrier(explicit_cone_problem, 
settings, explicit_cone_solution); - - EXPECT_EQ(row_cone_status, lp_status_t::OPTIMAL); - EXPECT_EQ(explicit_status, lp_status_t::OPTIMAL); - EXPECT_NEAR(row_cone_solution.objective, explicit_cone_solution.objective, 1e-4); - EXPECT_NEAR(row_cone_solution.objective, -0.84590, 1e-3); - for (int j = 0; j < 3; ++j) { - EXPECT_NEAR(row_cone_solution.x[j], explicit_cone_solution.x[j], 1e-4); - } -} - TEST(barrier, mixed_linear_and_soc_block) { // Variables ordered as [l | t, u, v], where (t, u, v) \in Q^3.