Skip to content

Commit a6e640d

Browse files
committed
skip computeAvg for GPU
1 parent 19ceb12 commit a6e640d

1 file changed

Lines changed: 97 additions & 58 deletions

File tree

highs/pdlp/hipdlp/pdhg.cc

Lines changed: 97 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -340,9 +340,22 @@ void PDLPSolver::preprocessLp() {
340340
scaling_.passLp(&processed_lp);
341341
unscaled_processed_lp_ = processed_lp;
342342

343-
// 8. Compute and store norms of unscaled cost and rhs
343+
// 8. Compute cuPDLPx-style norms of the working problem data.
344+
//
345+
// Using ||row_lower|| alone is not compatible with the residual
346+
// normalization used by cuPDLPx after our slack-variable reformulation:
347+
// equalities should contribute once, one-sided constraints should contribute
348+
// their finite side, and transformed slack equalities contribute zero. The
349+
// PID restart controller is sensitive to this scaling.
344350
unscaled_c_norm_ = linalg::vectorNorm(processed_lp.col_cost_);
345-
unscaled_rhs_norm_ = linalg::vectorNorm(processed_lp.row_lower_);
351+
double rhs_norm_sq = 0.0;
352+
for (HighsInt i = 0; i < processed_lp.num_row_; ++i) {
353+
const double lower = processed_lp.row_lower_[i];
354+
const double upper = processed_lp.row_upper_[i];
355+
if (std::isfinite(lower) && lower != upper) rhs_norm_sq += lower * lower;
356+
if (std::isfinite(upper)) rhs_norm_sq += upper * upper;
357+
}
358+
unscaled_rhs_norm_ = std::sqrt(rhs_norm_sq);
346359

347360
logger_.detailed("Preprocessing complete. New dimensions: " +
348361
std::to_string(processed_lp.num_row_) + " rows, " +
@@ -502,6 +515,14 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
502515
setupGpu();
503516
#endif
504517
initializeStepSizes();
518+
#ifdef CUPDLP_GPU
519+
CUDA_CHECK(cudaMemcpyAsync(d_primal_step_size_, &stepsize_.primal_step,
520+
sizeof(double), cudaMemcpyHostToDevice,
521+
gpu_stream_));
522+
CUDA_CHECK(cudaMemcpyAsync(d_dual_step_size_, &stepsize_.dual_step,
523+
sizeof(double), cudaMemcpyHostToDevice,
524+
gpu_stream_));
525+
#endif
505526
initialize(); // Resets vectors, caches, and sets initial x_current,
506527
// y_current
507528
best_primal_weight_ = primal_weight_;
@@ -519,27 +540,29 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
519540
x_anchor_ = x_current_;
520541
y_anchor_ = y_current_;
521542
halpern_iteration_ = 0;
522-
#ifdef CUPDLP_GPU
523-
CUDA_CHECK(cudaMemcpy(d_x_anchor_, d_x_current_,
524-
lp_.num_col_ * sizeof(double),
525-
cudaMemcpyDeviceToDevice));
526-
CUDA_CHECK(cudaMemcpy(d_y_anchor_, d_y_current_,
527-
lp_.num_row_ * sizeof(double),
528-
cudaMemcpyDeviceToDevice));
529-
#endif
530543
}
531544

532545
// Pre-calculate ax and aTy for current iterate
533546
#ifdef CUPDLP_GPU
534-
CUDA_CHECK(cudaMemset(d_x_sum_, 0, lp_.num_col_ * sizeof(double)));
535-
CUDA_CHECK(cudaMemset(d_y_sum_, 0, lp_.num_row_ * sizeof(double)));
536-
CUDA_CHECK(cudaMemset(d_x_avg_, 0, lp_.num_col_ * sizeof(double)));
537-
CUDA_CHECK(cudaMemset(d_y_avg_, 0, lp_.num_row_ * sizeof(double)));
547+
if (!params_.use_halpern_restart) {
548+
CUDA_CHECK(cudaMemset(d_x_sum_, 0, lp_.num_col_ * sizeof(double)));
549+
CUDA_CHECK(cudaMemset(d_y_sum_, 0, lp_.num_row_ * sizeof(double)));
550+
CUDA_CHECK(cudaMemset(d_x_avg_, 0, lp_.num_col_ * sizeof(double)));
551+
CUDA_CHECK(cudaMemset(d_y_avg_, 0, lp_.num_row_ * sizeof(double)));
552+
}
538553
sum_weights_gpu_ = 0.0;
539554
CUDA_CHECK(cudaMemcpy(d_x_current_, x.data(), lp_.num_col_ * sizeof(double),
540555
cudaMemcpyHostToDevice));
541556
CUDA_CHECK(cudaMemcpy(d_y_current_, y.data(), lp_.num_row_ * sizeof(double),
542557
cudaMemcpyHostToDevice));
558+
if (params_.use_halpern_restart) {
559+
CUDA_CHECK(cudaMemcpy(d_x_anchor_, d_x_current_,
560+
lp_.num_col_ * sizeof(double),
561+
cudaMemcpyDeviceToDevice));
562+
CUDA_CHECK(cudaMemcpy(d_y_anchor_, d_y_current_,
563+
lp_.num_row_ * sizeof(double),
564+
cudaMemcpyDeviceToDevice));
565+
}
543566
linalgGpuAx(d_x_current_, d_ax_current_);
544567
linalgGpuATy(d_y_current_, d_aty_current_);
545568
#else
@@ -580,12 +603,6 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
580603

581604
// -- Step 1 (Major, isolated for restart-FPE check) --
582605
#ifdef CUPDLP_GPU
583-
CUDA_CHECK(cudaMemcpyAsync(d_primal_step_size_, &stepsize_.primal_step,
584-
sizeof(double), cudaMemcpyHostToDevice,
585-
gpu_stream_));
586-
CUDA_CHECK(cudaMemcpyAsync(d_dual_step_size_, &stepsize_.dual_step,
587-
sizeof(double), cudaMemcpyHostToDevice,
588-
gpu_stream_));
589606
CUDA_CHECK(cudaMemcpyAsync(d_halpern_iteration_, &halpern_iteration_,
590607
sizeof(int), cudaMemcpyHostToDevice,
591608
gpu_stream_));
@@ -685,6 +702,12 @@ if (DEBUG_MODE){
685702
CUDA_CHECK(cudaMemcpyAsync(d_y_current_, d_pdhg_dual_,
686703
lp_.num_row_ * sizeof(double),
687704
cudaMemcpyDeviceToDevice, gpu_stream_));
705+
CUDA_CHECK(cudaMemcpyAsync(d_primal_step_size_, &stepsize_.primal_step,
706+
sizeof(double), cudaMemcpyHostToDevice,
707+
gpu_stream_));
708+
CUDA_CHECK(cudaMemcpyAsync(d_dual_step_size_, &stepsize_.dual_step,
709+
sizeof(double), cudaMemcpyHostToDevice,
710+
gpu_stream_));
688711
#else
689712
x_anchor_ = x_next_;
690713
y_anchor_ = y_next_;
@@ -740,7 +763,7 @@ double PDLPSolver::computeFixedPointError() {
740763
}
741764

742765
double movement =
743-
primal_norm_sq * params_.omega + dual_norm_sq / params_.omega;
766+
primal_norm_sq * primal_weight_ + dual_norm_sq / primal_weight_;
744767
double interaction = 2.0 * params_.eta * cross_term;
745768

746769
return std::sqrt(std::max(0.0, movement + interaction));
@@ -802,7 +825,8 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector<double>& output_x,
802825
TerminationStatus& status) {
803826
// 1. Compute Average Iterate (GPU or CPU)
804827
#ifdef CUPDLP_GPU
805-
computeAverageIterateGpu();
828+
const bool halpern_gpu_current_only = params_.use_halpern_restart;
829+
if (!halpern_gpu_current_only) computeAverageIterateGpu();
806830
#else
807831
computeAverageIterate(Ax_avg_, ATy_avg_);
808832
#endif
@@ -814,15 +838,21 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector<double>& output_x,
814838

815839
// 2. Check Convergence (GPU or CPU)
816840
#ifdef CUPDLP_GPU
817-
// Only use Halpern PDHG slack/iterates if we have performed at least one
818-
// iteration
819-
if (params_.use_halpern_restart && d_pdhg_primal_ != nullptr && iter > 0) {
820-
linalgGpuAx(d_pdhg_primal_, d_ax_next_);
821-
linalgGpuATy(d_pdhg_dual_, d_aty_next_);
822-
current_converged =
823-
checkConvergenceGpu(iter, d_pdhg_primal_, d_pdhg_dual_, d_ax_next_,
824-
d_aty_next_, params_.tolerance, current_results,
825-
"[L-GPU]", d_dSlackPos_, d_dSlackNeg_, true);
841+
if (params_.use_halpern_restart) {
842+
if (d_pdhg_primal_ != nullptr && iter > 0) {
843+
linalgGpuAx(d_pdhg_primal_, d_ax_next_);
844+
linalgGpuATy(d_pdhg_dual_, d_aty_next_);
845+
current_converged =
846+
checkConvergenceGpu(iter, d_pdhg_primal_, d_pdhg_dual_, d_ax_next_,
847+
d_aty_next_, params_.tolerance, current_results,
848+
"[L-GPU]", d_dSlackPos_, d_dSlackNeg_, true);
849+
} else {
850+
current_converged =
851+
checkConvergenceGpu(iter, d_x_current_, d_y_current_, d_ax_current_,
852+
d_aty_current_, params_.tolerance,
853+
current_results, "[L-GPU]", d_dSlackPos_,
854+
d_dSlackNeg_, false);
855+
}
826856
} else {
827857
current_converged =
828858
checkConvergenceGpu(iter, d_x_current_, d_y_current_, d_ax_current_,
@@ -854,7 +884,7 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector<double>& output_x,
854884

855885
// 3. Handle Convergence Success
856886
if (current_converged || average_converged) {
857-
bool prefer_avg = average_converged;
887+
bool prefer_avg = !params_.use_halpern_restart && average_converged;
858888

859889
// Copy result to output vectors
860890
#ifdef CUPDLP_GPU
@@ -1959,18 +1989,12 @@ void AdaptiveLinesearchParams::initialise() {
19591989
// =============================================================================
19601990

19611991
void PDLPSolver::initializeStepSizes() {
1962-
double cost_norm_sq = linalg::vectorNormSquared(lp_.col_cost_);
1963-
double rhs_norm_sq = linalg::vectorNormSquared(lp_.row_lower_);
1964-
1965-
if (std::min(cost_norm_sq, rhs_norm_sq) > 1e-6) {
1966-
stepsize_.beta = cost_norm_sq / rhs_norm_sq;
1967-
} else {
1968-
stepsize_.beta = 1.0;
1969-
}
1970-
1971-
// Match initial primal weight calculation from cuPDLPx
1972-
params_.omega = (unscaled_c_norm_ + 1.0) / (unscaled_rhs_norm_ + 1.0);
1973-
primal_weight_ = params_.omega;
1992+
// Align the initial geometry with cuPDLPx: start from a neutral primal
1993+
// weight and let restart/PID updates adapt it afterwards.
1994+
primal_weight_ = 1.0;
1995+
best_primal_weight_ = primal_weight_;
1996+
stepsize_.beta = primal_weight_ * primal_weight_;
1997+
params_.omega = primal_weight_;
19741998

19751999
if (params_.step_size_strategy != StepSizeStrategy::FIXED &&
19762000
params_.step_size_strategy != StepSizeStrategy::PID) {
@@ -2493,8 +2517,10 @@ void PDLPSolver::setupGpu() {
24932517
CUDA_CHECK(cudaMalloc(&d_is_equality_row_, a_num_rows_ * sizeof(bool)));
24942518
CUDA_CHECK(cudaMalloc(&d_x_current_, a_num_cols_ * sizeof(double)));
24952519
CUDA_CHECK(cudaMalloc(&d_y_current_, a_num_rows_ * sizeof(double)));
2496-
CUDA_CHECK(cudaMalloc(&d_x_avg_, a_num_cols_ * sizeof(double)));
2497-
CUDA_CHECK(cudaMalloc(&d_y_avg_, a_num_rows_ * sizeof(double)));
2520+
if (!params_.use_halpern_restart) {
2521+
CUDA_CHECK(cudaMalloc(&d_x_avg_, a_num_cols_ * sizeof(double)));
2522+
CUDA_CHECK(cudaMalloc(&d_y_avg_, a_num_rows_ * sizeof(double)));
2523+
}
24982524
CUDA_CHECK(cudaMalloc(&d_x_next_, a_num_cols_ * sizeof(double)));
24992525
CUDA_CHECK(cudaMalloc(&d_y_next_, a_num_rows_ * sizeof(double)));
25002526
CUDA_CHECK(cudaMalloc(&d_x_at_last_restart_, a_num_cols_ * sizeof(double)));
@@ -2518,15 +2544,19 @@ void PDLPSolver::setupGpu() {
25182544
CUDA_CHECK(cudaMalloc(&d_aty_current_, a_num_cols_ * sizeof(double)));
25192545
CUDA_CHECK(cudaMalloc(&d_ax_next_, a_num_rows_ * sizeof(double)));
25202546
CUDA_CHECK(cudaMalloc(&d_aty_next_, a_num_cols_ * sizeof(double)));
2521-
CUDA_CHECK(cudaMalloc(&d_ax_avg_, a_num_rows_ * sizeof(double)));
2522-
CUDA_CHECK(cudaMalloc(&d_aty_avg_, a_num_cols_ * sizeof(double)));
2523-
CUDA_CHECK(cudaMalloc(&d_x_sum_, a_num_cols_ * sizeof(double)));
2524-
CUDA_CHECK(cudaMalloc(&d_y_sum_, a_num_rows_ * sizeof(double)));
2547+
if (!params_.use_halpern_restart) {
2548+
CUDA_CHECK(cudaMalloc(&d_ax_avg_, a_num_rows_ * sizeof(double)));
2549+
CUDA_CHECK(cudaMalloc(&d_aty_avg_, a_num_cols_ * sizeof(double)));
2550+
CUDA_CHECK(cudaMalloc(&d_x_sum_, a_num_cols_ * sizeof(double)));
2551+
CUDA_CHECK(cudaMalloc(&d_y_sum_, a_num_rows_ * sizeof(double)));
2552+
}
25252553
CUDA_CHECK(cudaMalloc(&d_convergence_results_, 4 * sizeof(double)));
25262554
CUDA_CHECK(cudaMalloc(&d_dSlackPos_, a_num_cols_ * sizeof(double)));
25272555
CUDA_CHECK(cudaMalloc(&d_dSlackNeg_, a_num_cols_ * sizeof(double)));
2528-
CUDA_CHECK(cudaMalloc(&d_dSlackPosAvg_, a_num_cols_ * sizeof(double)));
2529-
CUDA_CHECK(cudaMalloc(&d_dSlackNegAvg_, a_num_cols_ * sizeof(double)));
2556+
if (!params_.use_halpern_restart) {
2557+
CUDA_CHECK(cudaMalloc(&d_dSlackPosAvg_, a_num_cols_ * sizeof(double)));
2558+
CUDA_CHECK(cudaMalloc(&d_dSlackNegAvg_, a_num_cols_ * sizeof(double)));
2559+
}
25302560
CUDA_CHECK(cudaMalloc(&d_col_scale_, a_num_cols_ * sizeof(double)));
25312561
CUDA_CHECK(cudaMalloc(&d_row_scale_, a_num_rows_ * sizeof(double)));
25322562

@@ -2602,16 +2632,20 @@ void PDLPSolver::setupGpu() {
26022632

26032633
CUDA_CHECK(cudaMemset(d_x_current_, 0, a_num_cols_ * sizeof(double)));
26042634
CUDA_CHECK(cudaMemset(d_y_current_, 0, a_num_rows_ * sizeof(double)));
2605-
CUDA_CHECK(cudaMemset(d_x_avg_, 0, a_num_cols_ * sizeof(double)));
2606-
CUDA_CHECK(cudaMemset(d_y_avg_, 0, a_num_rows_ * sizeof(double)));
2635+
if (!params_.use_halpern_restart) {
2636+
CUDA_CHECK(cudaMemset(d_x_avg_, 0, a_num_cols_ * sizeof(double)));
2637+
CUDA_CHECK(cudaMemset(d_y_avg_, 0, a_num_rows_ * sizeof(double)));
2638+
}
26072639
CUDA_CHECK(cudaMemset(d_x_next_, 0, a_num_cols_ * sizeof(double)));
26082640
CUDA_CHECK(cudaMemset(d_y_next_, 0, a_num_rows_ * sizeof(double)));
26092641
CUDA_CHECK(cudaMemset(d_ax_current_, 0, a_num_rows_ * sizeof(double)));
26102642
CUDA_CHECK(cudaMemset(d_ax_next_, 0, a_num_rows_ * sizeof(double)));
2611-
CUDA_CHECK(cudaMemset(d_ax_avg_, 0, a_num_rows_ * sizeof(double)));
2612-
CUDA_CHECK(cudaMemset(d_aty_avg_, 0, a_num_cols_ * sizeof(double)));
2613-
CUDA_CHECK(cudaMemset(d_x_sum_, 0, a_num_cols_ * sizeof(double)));
2614-
CUDA_CHECK(cudaMemset(d_y_sum_, 0, a_num_rows_ * sizeof(double)));
2643+
if (!params_.use_halpern_restart) {
2644+
CUDA_CHECK(cudaMemset(d_ax_avg_, 0, a_num_rows_ * sizeof(double)));
2645+
CUDA_CHECK(cudaMemset(d_aty_avg_, 0, a_num_cols_ * sizeof(double)));
2646+
CUDA_CHECK(cudaMemset(d_x_sum_, 0, a_num_cols_ * sizeof(double)));
2647+
CUDA_CHECK(cudaMemset(d_y_sum_, 0, a_num_rows_ * sizeof(double)));
2648+
}
26152649
CUDA_CHECK(cudaMemset(d_aty_current_, 0, a_num_cols_ * sizeof(double)));
26162650
sum_weights_gpu_ = 0.0;
26172651

@@ -2671,6 +2705,8 @@ void PDLPSolver::cleanupGpu() {
26712705
CUDA_CHECK(cudaFree(d_y_temp_diff_norm_result_));
26722706
CUDA_CHECK(cudaFree(d_x_current_));
26732707
CUDA_CHECK(cudaFree(d_y_current_));
2708+
CUDA_CHECK(cudaFree(d_x_avg_));
2709+
CUDA_CHECK(cudaFree(d_y_avg_));
26742710
CUDA_CHECK(cudaFree(d_x_next_));
26752711
CUDA_CHECK(cudaFree(d_y_next_));
26762712
CUDA_CHECK(cudaFree(d_ax_current_));
@@ -2855,6 +2891,9 @@ void PDLPSolver::updateAverageIteratesGpu(HighsInt inner_iter) {
28552891
}
28562892

28572893
void PDLPSolver::computeAverageIterateGpu() {
2894+
if (params_.use_halpern_restart || d_x_avg_ == nullptr || d_y_avg_ == nullptr)
2895+
return;
2896+
28582897
double dScale = sum_weights_gpu_ > 1e-10 ? 1.0 / sum_weights_gpu_ : 1.0;
28592898

28602899
// x_avg = x_sum * Scale

0 commit comments

Comments
 (0)