Skip to content

Commit 3f0d27f

Browse files
authored
Latency reductions for concurrent mode (#1083)
These changes reduce concurrent-halt latency in the LP solve path by checking the halt flag at more points before long synchronous work and by moving expensive cleanup off the return path when we exit with `CONCURRENT_LIMIT`.

- Added earlier concurrent-halt checks in barrier and dual simplex around expensive non-interruptible steps, including barrier matrix/factorization setup, phase 2 initialization, and basis refactorization/transposes.
- Changed `basis_update_mpf_t` and `barrier_solver_t` from stack-owned temporaries to `std::unique_ptr` in the affected solve paths so their destruction can be deferred on `CONCURRENT_LIMIT`.
- On `CONCURRENT_LIMIT`, detached cleanup threads now take ownership of large temporary solver state so the main solve path can return sooner instead of blocking on teardown.
- Preserved solver progress metadata on early exit where applicable.

The intended behavior is unchanged aside from returning more quickly when a concurrent halt is requested, particularly in paths that previously spent significant time in setup or destruction before exiting.
(Description co-authored with Codex)

## Results

| case | Presolve without | Presolve with | Delta presolve | Solve without | Solve with | Delta solve | Overhead without | Overhead with | Delta overhead | Improvement % | Total without | Total with | Delta total |
| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
| Dual2_5000 | 10.11 | 10.27 | 0.16 | 24.57 | 24.52 | -0.05 | 9.81 | 4.61 | -5.2 | -53% | 34.37 | 29.12 | -5.25 |
| L2CTA3D | 4.45 | 4.02 | -0.43 | 6.19 | 5.08 | -1.11 | 4.68 | 2.9 | -1.79 | -38% | 10.87 | 7.98 | -2.9 |
| a2864 | 1.97 | 1.33 | -0.64 | 2.09 | 1.45 | -0.64 | 0.21 | 0.12 | -0.09 | -43% | 2.3 | 1.57 | -0.73 |
| square41 | 0.9 | 0.67 | -0.23 | 6.22 | 5.64 | -0.58 | 0 | 0 | 0 | 0% | 6.22 | 5.64 | -0.58 |
| scpm1 | 0.35 | 0.35 | 0 | 1.65 | 1.41 | -0.24 | 0.55 | 0.61 | 0.06 | 11% | 2.2 | 2.02 | -0.18 |
| woodlands09 | 0.31 | 0.31 | 0 | 0.44 | 0.45 | 0.01 | 0.51 | 0.44 | -0.08 | -16% | 0.95 | 0.88 | -0.07 |
| graph40-40 | 0.15 | 0.15 | 0 | 0.22 | 0.21 | 0 | 0.54 | 0.48 | -0.06 | -11% | 0.76 | 0.69 | -0.07 |
| savsched1 | 0.21 | 0.22 | 0.01 | 0.4 | 0.4 | 0 | 0.72 | 0.66 | -0.05 | -7% | 1.12 | 1.07 | -0.05 |
| datt256_lp | 0.14 | 0.15 | 0.01 | 0.3 | 0.31 | 0.01 | 0.22 | 0.16 | -0.06 | -27% | 0.52 | 0.47 | -0.04 |
| neos-3025225 | 0.52 | 0.53 | 0.01 | 2.31 | 2.35 | 0.04 | 0.03 | 0.03 | 0 | 0% | 2.35 | 2.39 | 0.04 |
| ex10 | 0.11 | 0.11 | 0 | 0.26 | 0.24 | -0.02 | 0.16 | 0.19 | 0.03 | 19% | 0.42 | 0.43 | 0.02 |
| neos-5251015 | 0.18 | 0.17 | -0.01 | 0.52 | 0.51 | -0.01 | 0.67 | 0.73 | 0.07 | 10% | 1.18 | 1.24 | 0.06 |
| set-cover-model | 1.23 | 1.3 | 0.07 | 7.38 | 7.57 | 0.19 | 0.19 | 0.77 | 0.57 | 300% | 7.57 | 8.33 | 0.77 |
| dlr1 | 2.32 | 2.37 | 0.05 | 12.02 | 12.67 | 0.65 | 0.05 | 0.06 | 0.01 | 20% | 12.07 | 12.72 | 0.66 |
| thk_48 | 3.39 | 3.77 | 0.38 | 15.96 | 17.22 | 1.26 | 0.72 | 0.61 | -0.11 | -15% | 16.68 | 17.82 | 1.15 |
| thk_63 | 3.64 | 4.68 | 1.04 | 12.12 | 13.55 | 1.42 | 0.7 | 0.73 | 0.04 | 6% | 12.82 | 14.28 | 1.46 |

@rg20 wanted me to note this here: With regard to moving the destructor to a separate thread, we think there is an underlying issue that we probably want to understand first so that we can fix any bug. The changes for moving the destructor to a detached thread have been removed. (Please see comments below.)

cc @chris-maes

## Issue

Authors:
- Sri Kainkaryam (https://github.com/srib)

Approvers:
- Chris Maes (https://github.com/chris-maes)
- Rajesh Gandham (https://github.com/rg20)

URL: #1083
1 parent 46b809a commit 3f0d27f

4 files changed

Lines changed: 80 additions & 14 deletions

File tree

cpp/src/barrier/barrier.cu

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1094,6 +1094,7 @@ class iteration_data_t {
10941094
std::sort(column_nz_permutation.begin(),
10951095
column_nz_permutation.end(),
10961096
[&column_nz](i_t i, i_t j) { return column_nz[i] < column_nz[j]; });
1097+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
10971098

10981099
// We then compute the exact sparsity pattern for columns of A where
10991100
// the number of nonzeros is less than a threshold. This part can be done
@@ -1124,6 +1125,7 @@ class iteration_data_t {
11241125
// The best way to do that is to have A stored in CSR format.
11251126
csr_matrix_t<i_t, f_t> A_row(0, 0, 0);
11261127
A.to_compressed_row(A_row);
1128+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
11271129

11281130
std::vector<i_t> histogram(m + 1, 0);
11291131
for (i_t j = 0; j < n; j++) {
@@ -1253,6 +1255,7 @@ class iteration_data_t {
12531255
std::sort(permutation.begin(), permutation.end(), [&delta_nz](i_t i, i_t j) {
12541256
return delta_nz[i] < delta_nz[j];
12551257
});
1258+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
12561259

12571260
// Now we make a forward pass and compute the number of nonzeros in C
12581261
// assuming we had included column j
@@ -2297,6 +2300,12 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
22972300
if (use_augmented) {
22982301
RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
22992302
data.form_augmented();
2303+
// Check halt after form_augmented (synchronous) and before factorize (~1s).
2304+
// If halt was set while form_augmented ran, we catch it here and skip the
2305+
// expensive factorization entirely.
2306+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
2307+
return CONCURRENT_HALT_RETURN;
2308+
}
23002309
status = data.chol->factorize(data.device_augmented);
23012310

23022311
#ifdef CHOLESKY_DEBUG_CHECK
@@ -2305,6 +2314,12 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
23052314
} else {
23062315
// compute ADAT = A Dinv * A^T
23072316
data.form_adat();
2317+
// Check halt after form_adat (synchronous) and before factorize (~1s).
2318+
// If halt was set while form_adat ran, we catch it here and skip the
2319+
// expensive Cholesky factorization entirely.
2320+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
2321+
return CONCURRENT_HALT_RETURN;
2322+
}
23082323
// factorize
23092324
status = data.chol->factorize(data.device_ADAT);
23102325
}

cpp/src/dual_simplex/basis_updates.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2431,7 +2431,22 @@ int basis_update_mpf_t<i_t, f_t>::refactor_basis(
24312431
assert(q.size() == A.m);
24322432
reorder_basic_list(q, basic_list); // We no longer need q after reordering the basic list
24332433
work_estimate_ += 3 * q.size();
2434-
reset();
2434+
2435+
// Check halt before the transpose operations: these can take hundreds of ms
2436+
// on large problems (L0 and U0 each have O(fill-in) nonzeros) and have no
2437+
// internal halt checks. Catching the flag here avoids the dead zone.
2438+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
2439+
return CONCURRENT_HALT_RETURN;
2440+
}
2441+
// Inline reset() so we can check halt between the two transposes.
2442+
clear();
2443+
L0_.transpose(L0_transpose_);
2444+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
2445+
return CONCURRENT_HALT_RETURN;
2446+
}
2447+
U0_.transpose(U0_transpose_);
2448+
work_estimate_ += 6 * L0_.col_start[L0_.n] + 6 * U0_.col_start[U0_.n];
2449+
reset_stats();
24352450
return 0;
24362451
}
24372452

cpp/src/dual_simplex/phase2.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2488,7 +2488,6 @@ dual::status_t dual_phase2(i_t phase,
24882488
const i_t n = lp.num_cols;
24892489
std::vector<i_t> basic_list(m);
24902490
std::vector<i_t> nonbasic_list;
2491-
std::vector<i_t> superbasic_list;
24922491
basis_update_mpf_t<i_t, f_t> ft(m, settings.refactor_frequency);
24932492
const bool initialize_basis = true;
24942493
return dual_phase2_with_advanced_basis(phase,
@@ -2688,6 +2687,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
26882687
vector_norm2<i_t, f_t>(delta_y_steepest_edge));
26892688
}
26902689

2690+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
2691+
return dual::status_t::CONCURRENT_LIMIT;
2692+
}
2693+
26912694
if (phase == 2) {
26922695
settings.log.printf(" Iter Objective Num Inf. Sum Inf. Perturb Time\n");
26932696
}
@@ -2735,10 +2738,18 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
27352738
phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0);
27362739
#endif
27372740

2741+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
2742+
return dual::status_t::CONCURRENT_LIMIT;
2743+
}
2744+
27382745
csc_matrix_t<i_t, f_t> A_transpose(1, 1, 0);
27392746
lp.A.transpose(A_transpose);
27402747
phase2_work_estimate += 2 * lp.A.col_start[lp.A.n];
27412748

2749+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
2750+
return dual::status_t::CONCURRENT_LIMIT;
2751+
}
2752+
27422753
f_t obj = compute_objective(lp, x);
27432754
phase2_work_estimate += 2 * n;
27442755

@@ -2908,6 +2919,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
29082919
phase2::compute_delta_y(ft, basic_leaving_index, direction, delta_y_sparse, UTsol_sparse);
29092920
}
29102921
timers.btran_time += timers.stop_timer();
2922+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
2923+
return dual::status_t::CONCURRENT_LIMIT;
2924+
}
29112925

29122926
const f_t steepest_edge_norm_check = delta_y_sparse.norm2_squared();
29132927
phase2_work_estimate += 2 * delta_y_sparse.i.size();
@@ -2966,6 +2980,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
29662980
}
29672981
}
29682982
timers.delta_z_time += timers.stop_timer();
2983+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
2984+
return dual::status_t::CONCURRENT_LIMIT;
2985+
}
29692986

29702987
#ifdef COMPUTE_DUAL_RESIDUAL
29712988
std::vector<f_t> dual_residual;
@@ -3301,6 +3318,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
33013318
}
33023319

33033320
timers.ftran_time += timers.stop_timer();
3321+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
3322+
return dual::status_t::CONCURRENT_LIMIT;
3323+
}
33043324

33053325
#ifdef CHECK_PRIMAL_STEP
33063326
std::vector<f_t> residual(m);
@@ -3331,6 +3351,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
33313351
#endif
33323352
assert(steepest_edge_status == 0);
33333353
timers.se_norms_time += timers.stop_timer();
3354+
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
3355+
return dual::status_t::CONCURRENT_LIMIT;
3356+
}
33343357

33353358
timers.start_timer();
33363359
// x <- x + delta_x

cpp/src/dual_simplex/solve.cpp

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -120,16 +120,17 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t<i_t, f_t>& original
120120
std::vector<i_t> basic_list(m);
121121
std::vector<i_t> nonbasic_list;
122122
basis_update_mpf_t<i_t, f_t> ft(m, settings.refactor_frequency);
123-
return solve_linear_program_with_advanced_basis(original_lp,
124-
start_time,
125-
settings,
126-
original_solution,
127-
ft,
128-
basic_list,
129-
nonbasic_list,
130-
vstatus,
131-
edge_norms,
132-
work_unit_context);
123+
lp_status_t result = solve_linear_program_with_advanced_basis(original_lp,
124+
start_time,
125+
settings,
126+
original_solution,
127+
ft,
128+
basic_list,
129+
nonbasic_list,
130+
vstatus,
131+
edge_norms,
132+
work_unit_context);
133+
return result;
133134
}
134135

135136
template <typename i_t, typename f_t>
@@ -222,7 +223,10 @@ lp_status_t solve_linear_program_with_advanced_basis(
222223
if (phase1_status == dual::status_t::TIME_LIMIT) { return lp_status_t::TIME_LIMIT; }
223224
if (phase1_status == dual::status_t::WORK_LIMIT) { return lp_status_t::WORK_LIMIT; }
224225
if (phase1_status == dual::status_t::ITERATION_LIMIT) { return lp_status_t::ITERATION_LIMIT; }
225-
if (phase1_status == dual::status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; }
226+
if (phase1_status == dual::status_t::CONCURRENT_LIMIT) {
227+
original_solution.iterations = iter;
228+
return lp_status_t::CONCURRENT_LIMIT;
229+
}
226230
phase1_obj = phase1_solution.objective;
227231
if (phase1_obj > -settings.primal_tol) {
228232
settings.log.printf("Dual feasible solution found.\n");
@@ -309,7 +313,10 @@ lp_status_t solve_linear_program_with_advanced_basis(
309313
if (status == dual::status_t::TIME_LIMIT) { lp_status = lp_status_t::TIME_LIMIT; }
310314
if (status == dual::status_t::WORK_LIMIT) { lp_status = lp_status_t::WORK_LIMIT; }
311315
if (status == dual::status_t::ITERATION_LIMIT) { lp_status = lp_status_t::ITERATION_LIMIT; }
312-
if (status == dual::status_t::CONCURRENT_LIMIT) { lp_status = lp_status_t::CONCURRENT_LIMIT; }
316+
if (status == dual::status_t::CONCURRENT_LIMIT) {
317+
original_solution.iterations = iter;
318+
return lp_status_t::CONCURRENT_LIMIT;
319+
}
313320
if (status == dual::status_t::NUMERICAL) { lp_status = lp_status_t::NUMERICAL_ISSUES; }
314321
if (status == dual::status_t::CUTOFF) { lp_status = lp_status_t::CUTOFF; }
315322
original_solution.iterations = iter;
@@ -581,6 +588,8 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t<i_t, f_t>& us
581588
solution.iterations = barrier_solution.iterations;
582589
}
583590

591+
if (barrier_status == lp_status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; }
592+
584593
// If we aren't doing crossover, we're done
585594
if (!settings.crossover || barrier_lp.Q.n > 0) { return barrier_status; }
586595

@@ -681,6 +690,10 @@ lp_status_t solve_linear_program(const user_problem_t<i_t, f_t>& user_problem,
681690
std::vector<f_t> edge_norms;
682691
lp_status_t status = solve_linear_program_advanced(
683692
original_lp, start_time, settings, lp_solution, vstatus, edge_norms);
693+
if (status == lp_status_t::CONCURRENT_LIMIT) {
694+
solution.iterations = lp_solution.iterations;
695+
return lp_status_t::CONCURRENT_LIMIT;
696+
}
684697
uncrush_primal_solution(user_problem, original_lp, lp_solution.x, solution.x);
685698
uncrush_dual_solution(
686699
user_problem, original_lp, lp_solution.y, lp_solution.z, solution.y, solution.z);

0 commit comments

Comments
 (0)