Latency reductions for concurrent mode (#1083)

srib · web-flow · commit 3f0d27ff44ac · 2026-04-16T22:04:21.000Z
These changes reduce concurrent-halt latency in the LP solve path by checking the halt flag at more points before long synchronous work and by moving expensive cleanup off the return path when we exit with `CONCURRENT_LIMIT`.

- Added earlier concurrent-halt checks in barrier and dual simplex around expensive non-interruptible steps, including barrier matrix/factorization setup, phase 2 initialization, and basis refactorization/transposes.
- Changed `basis_update_mpf_t` and `barrier_solver_t` from stack-owned temporaries to `std::unique_ptr` in the affected solve paths so their destruction can be deferred on `CONCURRENT_LIMIT`.
- On `CONCURRENT_LIMIT`, detached cleanup threads now take ownership of large temporary solver state so the main solve path can return sooner instead of blocking on teardown.
- Preserved solver progress metadata on early exit where applicable.

The intended behavior is unchanged aside from returning more quickly when a concurrent halt is requested, particularly in paths that previously spent significant time in setup or destruction before exiting.
(Description co-authored with Codex)

## Results

| case | Presolve without | Presolve with | Delta presolve | Solve without | Solve with | Delta solve | Overhead without | Overhead with | Delta overhead | Improvement % | Total without | Total with | Delta total |
| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
| Dual2_5000 | 10.11 | 10.27 | 0.16 | 24.57 | 24.52 | -0.05 | 9.81 | 4.61 | -5.2 | -53% | 34.37 | 29.12 | -5.25 |
| L2CTA3D | 4.45 | 4.02 | -0.43 | 6.19 | 5.08 | -1.11 | 4.68 | 2.9 | -1.79 | -38% | 10.87 | 7.98 | -2.9 |
| a2864 | 1.97 | 1.33 | -0.64 | 2.09 | 1.45 | -0.64 | 0.21 | 0.12 | -0.09 | -43% | 2.3 | 1.57 | -0.73 |
| square41 | 0.9 | 0.67 | -0.23 | 6.22 | 5.64 | -0.58 | 0 | 0 | 0 | 0% | 6.22 | 5.64 | -0.58 |
| scpm1 | 0.35 | 0.35 | 0 | 1.65 | 1.41 | -0.24 | 0.55 | 0.61 | 0.06 | 11% | 2.2 | 2.02 | -0.18 |
| woodlands09 | 0.31 | 0.31 | 0 | 0.44 | 0.45 | 0.01 | 0.51 | 0.44 | -0.08 | -16% | 0.95 | 0.88 | -0.07 |
| graph40-40 | 0.15 | 0.15 | 0 | 0.22 | 0.21 | 0 | 0.54 | 0.48 | -0.06 | -11% | 0.76 | 0.69 | -0.07 |
| savsched1 | 0.21 | 0.22 | 0.01 | 0.4 | 0.4 | 0 | 0.72 | 0.66 | -0.05 | -7% | 1.12 | 1.07 | -0.05 |
| datt256_lp | 0.14 | 0.15 | 0.01 | 0.3 | 0.31 | 0.01 | 0.22 | 0.16 | -0.06 | -27% | 0.52 | 0.47 | -0.04 |
| neos-3025225 | 0.52 | 0.53 | 0.01 | 2.31 | 2.35 | 0.04 | 0.03 | 0.03 | 0 | 0% | 2.35 | 2.39 | 0.04 |
| ex10 | 0.11 | 0.11 | 0 | 0.26 | 0.24 | -0.02 | 0.16 | 0.19 | 0.03 | 19% | 0.42 | 0.43 | 0.02 |
| neos-5251015 | 0.18 | 0.17 | -0.01 | 0.52 | 0.51 | -0.01 | 0.67 | 0.73 | 0.07 | 10% | 1.18 | 1.24 | 0.06 |
| set-cover-model | 1.23 | 1.3 | 0.07 | 7.38 | 7.57 | 0.19 | 0.19 | 0.77 | 0.57 | 300% | 7.57 | 8.33 | 0.77 |
| dlr1 | 2.32 | 2.37 | 0.05 | 12.02 | 12.67 | 0.65 | 0.05 | 0.06 | 0.01 | 20% | 12.07 | 12.72 | 0.66 |
| thk_48 | 3.39 | 3.77 | 0.38 | 15.96 | 17.22 | 1.26 | 0.72 | 0.61 | -0.11 | -15% | 16.68 | 17.82 | 1.15 |
| thk_63 | 3.64 | 4.68 | 1.04 | 12.12 | 13.55 | 1.42 | 0.7 | 0.73 | 0.04 | 6% | 12.82 | 14.28 | 1.46 |

@rg20 wanted me to note this here: With regard to moving the destructor to a separate thread, we think there is an underlying issue that we probably want to understand first and fix any bug. The changes for moving the destructor to a detached thread have been removed. (Please see comments below)

cc @chris-maes

## Issue

Authors:
- Sri Kainkaryam (https://github.com/srib)

Approvers:
- Chris Maes (https://github.com/chris-maes)
- Rajesh Gandham (https://github.com/rg20)

URL: #1083
diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu
@@ -1094,6 +1094,7 @@ class iteration_data_t {
     std::sort(column_nz_permutation.begin(),
               column_nz_permutation.end(),
               [&column_nz](i_t i, i_t j) { return column_nz[i] < column_nz[j]; });
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
 
     // We then compute the exact sparsity pattern for columns of A whose where
     // the number of nonzeros is less than a threshold. This part can be done
@@ -1124,6 +1125,7 @@ class iteration_data_t {
     // The best way to do that is to have A stored in CSR format.
     csr_matrix_t<i_t, f_t> A_row(0, 0, 0);
     A.to_compressed_row(A_row);
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
 
     std::vector<i_t> histogram(m + 1, 0);
     for (i_t j = 0; j < n; j++) {
@@ -1253,6 +1255,7 @@ class iteration_data_t {
     std::sort(permutation.begin(), permutation.end(), [&delta_nz](i_t i, i_t j) {
       return delta_nz[i] < delta_nz[j];
     });
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
 
     // Now we make a forward pass and compute the number of nonzeros in C
     // assuming we had included column j
@@ -2297,6 +2300,12 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
     if (use_augmented) {
       RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
       data.form_augmented();
+      // Check halt after form_augmented (synchronous) and before factorize (~1s).
+      // If halt was set while form_augmented ran, we catch it here and skip the
+      // expensive factorization entirely.
+      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+        return CONCURRENT_HALT_RETURN;
+      }
       status = data.chol->factorize(data.device_augmented);
 
 #ifdef CHOLESKY_DEBUG_CHECK
@@ -2305,6 +2314,12 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
     } else {
       // compute ADAT = A Dinv * A^T
       data.form_adat();
+      // Check halt after form_adat (synchronous) and before factorize (~1s).
+      // If halt was set while form_adat ran, we catch it here and skip the
+      // expensive Cholesky factorization entirely.
+      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+        return CONCURRENT_HALT_RETURN;
+      }
       // factorize
       status = data.chol->factorize(data.device_ADAT);
     }
diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp
@@ -2431,7 +2431,22 @@ int basis_update_mpf_t<i_t, f_t>::refactor_basis(
   assert(q.size() == A.m);
   reorder_basic_list(q, basic_list);  // We no longer need q after reordering the basic list
   work_estimate_ += 3 * q.size();
-  reset();
+
+  // Check halt before the transpose operations: these can take hundreds of ms
+  // on large problems (L0 and U0 each have O(fill-in) nonzeros) and have no
+  // internal halt checks.  Catching the flag here avoids the dead zone.
+  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    return CONCURRENT_HALT_RETURN;
+  }
+  // Inline reset() so we can check halt between the two transposes.
+  clear();
+  L0_.transpose(L0_transpose_);
+  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    return CONCURRENT_HALT_RETURN;
+  }
+  U0_.transpose(U0_transpose_);
+  work_estimate_ += 6 * L0_.col_start[L0_.n] + 6 * U0_.col_start[U0_.n];
+  reset_stats();
   return 0;
 }
 
diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp
@@ -2488,7 +2488,6 @@ dual::status_t dual_phase2(i_t phase,
   const i_t n = lp.num_cols;
   std::vector<i_t> basic_list(m);
   std::vector<i_t> nonbasic_list;
-  std::vector<i_t> superbasic_list;
   basis_update_mpf_t<i_t, f_t> ft(m, settings.refactor_frequency);
   const bool initialize_basis = true;
   return dual_phase2_with_advanced_basis(phase,
@@ -2688,6 +2687,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                         vector_norm2<i_t, f_t>(delta_y_steepest_edge));
   }
 
+  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    return dual::status_t::CONCURRENT_LIMIT;
+  }
+
   if (phase == 2) {
     settings.log.printf(" Iter     Objective           Num Inf.  Sum Inf.     Perturb  Time\n");
   }
@@ -2735,10 +2738,18 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
   phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0);
 #endif
 
+  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    return dual::status_t::CONCURRENT_LIMIT;
+  }
+
   csc_matrix_t<i_t, f_t> A_transpose(1, 1, 0);
   lp.A.transpose(A_transpose);
   phase2_work_estimate += 2 * lp.A.col_start[lp.A.n];
 
+  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    return dual::status_t::CONCURRENT_LIMIT;
+  }
+
   f_t obj = compute_objective(lp, x);
   phase2_work_estimate += 2 * n;
 
@@ -2908,6 +2919,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
       phase2::compute_delta_y(ft, basic_leaving_index, direction, delta_y_sparse, UTsol_sparse);
     }
     timers.btran_time += timers.stop_timer();
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      return dual::status_t::CONCURRENT_LIMIT;
+    }
 
     const f_t steepest_edge_norm_check = delta_y_sparse.norm2_squared();
     phase2_work_estimate += 2 * delta_y_sparse.i.size();
@@ -2966,6 +2980,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
       }
     }
     timers.delta_z_time += timers.stop_timer();
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      return dual::status_t::CONCURRENT_LIMIT;
+    }
 
 #ifdef COMPUTE_DUAL_RESIDUAL
     std::vector<f_t> dual_residual;
@@ -3301,6 +3318,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     }
 
     timers.ftran_time += timers.stop_timer();
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      return dual::status_t::CONCURRENT_LIMIT;
+    }
 
 #ifdef CHECK_PRIMAL_STEP
     std::vector<f_t> residual(m);
@@ -3331,6 +3351,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 #endif
     assert(steepest_edge_status == 0);
     timers.se_norms_time += timers.stop_timer();
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      return dual::status_t::CONCURRENT_LIMIT;
+    }
 
     timers.start_timer();
     // x <- x + delta_x
diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp
@@ -120,16 +120,17 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t<i_t, f_t>& original
   std::vector<i_t> basic_list(m);
   std::vector<i_t> nonbasic_list;
   basis_update_mpf_t<i_t, f_t> ft(m, settings.refactor_frequency);
-  return solve_linear_program_with_advanced_basis(original_lp,
-                                                  start_time,
-                                                  settings,
-                                                  original_solution,
-                                                  ft,
-                                                  basic_list,
-                                                  nonbasic_list,
-                                                  vstatus,
-                                                  edge_norms,
-                                                  work_unit_context);
+  lp_status_t result = solve_linear_program_with_advanced_basis(original_lp,
+                                                                start_time,
+                                                                settings,
+                                                                original_solution,
+                                                                ft,
+                                                                basic_list,
+                                                                nonbasic_list,
+                                                                vstatus,
+                                                                edge_norms,
+                                                                work_unit_context);
+  return result;
 }
 
 template <typename i_t, typename f_t>
@@ -222,7 +223,10 @@ lp_status_t solve_linear_program_with_advanced_basis(
   if (phase1_status == dual::status_t::TIME_LIMIT) { return lp_status_t::TIME_LIMIT; }
   if (phase1_status == dual::status_t::WORK_LIMIT) { return lp_status_t::WORK_LIMIT; }
   if (phase1_status == dual::status_t::ITERATION_LIMIT) { return lp_status_t::ITERATION_LIMIT; }
-  if (phase1_status == dual::status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; }
+  if (phase1_status == dual::status_t::CONCURRENT_LIMIT) {
+    original_solution.iterations = iter;
+    return lp_status_t::CONCURRENT_LIMIT;
+  }
   phase1_obj = phase1_solution.objective;
   if (phase1_obj > -settings.primal_tol) {
     settings.log.printf("Dual feasible solution found.\n");
@@ -309,7 +313,10 @@ lp_status_t solve_linear_program_with_advanced_basis(
     if (status == dual::status_t::TIME_LIMIT) { lp_status = lp_status_t::TIME_LIMIT; }
     if (status == dual::status_t::WORK_LIMIT) { lp_status = lp_status_t::WORK_LIMIT; }
     if (status == dual::status_t::ITERATION_LIMIT) { lp_status = lp_status_t::ITERATION_LIMIT; }
-    if (status == dual::status_t::CONCURRENT_LIMIT) { lp_status = lp_status_t::CONCURRENT_LIMIT; }
+    if (status == dual::status_t::CONCURRENT_LIMIT) {
+      original_solution.iterations = iter;
+      return lp_status_t::CONCURRENT_LIMIT;
+    }
     if (status == dual::status_t::NUMERICAL) { lp_status = lp_status_t::NUMERICAL_ISSUES; }
     if (status == dual::status_t::CUTOFF) { lp_status = lp_status_t::CUTOFF; }
     original_solution.iterations = iter;
@@ -581,6 +588,8 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t<i_t, f_t>& us
     solution.iterations         = barrier_solution.iterations;
   }
 
+  if (barrier_status == lp_status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; }
+
   // If we aren't doing crossover, we're done
   if (!settings.crossover || barrier_lp.Q.n > 0) { return barrier_status; }
 
@@ -681,6 +690,10 @@ lp_status_t solve_linear_program(const user_problem_t<i_t, f_t>& user_problem,
   std::vector<f_t> edge_norms;
   lp_status_t status = solve_linear_program_advanced(
     original_lp, start_time, settings, lp_solution, vstatus, edge_norms);
+  if (status == lp_status_t::CONCURRENT_LIMIT) {
+    solution.iterations = lp_solution.iterations;
+    return lp_status_t::CONCURRENT_LIMIT;
+  }
   uncrush_primal_solution(user_problem, original_lp, lp_solution.x, solution.x);
   uncrush_dual_solution(
     user_problem, original_lp, lp_solution.y, lp_solution.z, solution.y, solution.z);