@@ -340,9 +340,22 @@ void PDLPSolver::preprocessLp() {
340340 scaling_.passLp (&processed_lp);
341341 unscaled_processed_lp_ = processed_lp;
342342
343- // 8. Compute and store norms of unscaled cost and rhs
343+ // 8. Compute cuPDLPx-style norms of the working problem data.
344+ //
345+ // Using ||row_lower|| alone is not compatible with the residual
346+ // normalization used by cuPDLPx after our slack-variable reformulation:
347+ // equalities should contribute once, one-sided constraints should contribute
348+ // their finite side, and transformed slack equalities contribute zero. The
349+ // PID restart controller is sensitive to this scaling.
344350 unscaled_c_norm_ = linalg::vectorNorm (processed_lp.col_cost_ );
345- unscaled_rhs_norm_ = linalg::vectorNorm (processed_lp.row_lower_ );
351+ double rhs_norm_sq = 0.0 ;
352+ for (HighsInt i = 0 ; i < processed_lp.num_row_ ; ++i) {
353+ const double lower = processed_lp.row_lower_ [i];
354+ const double upper = processed_lp.row_upper_ [i];
355+ if (std::isfinite (lower) && lower != upper) rhs_norm_sq += lower * lower;
356+ if (std::isfinite (upper)) rhs_norm_sq += upper * upper;
357+ }
358+ unscaled_rhs_norm_ = std::sqrt (rhs_norm_sq);
346359
347360 logger_.detailed (" Preprocessing complete. New dimensions: " +
348361 std::to_string (processed_lp.num_row_ ) + " rows, " +
@@ -502,6 +515,14 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
502515 setupGpu ();
503516#endif
504517 initializeStepSizes ();
518+ #ifdef CUPDLP_GPU
519+ CUDA_CHECK (cudaMemcpyAsync (d_primal_step_size_, &stepsize_.primal_step ,
520+ sizeof (double ), cudaMemcpyHostToDevice,
521+ gpu_stream_));
522+ CUDA_CHECK (cudaMemcpyAsync (d_dual_step_size_, &stepsize_.dual_step ,
523+ sizeof (double ), cudaMemcpyHostToDevice,
524+ gpu_stream_));
525+ #endif
505526 initialize (); // Resets vectors, caches, and sets initial x_current,
506527 // y_current
507528 best_primal_weight_ = primal_weight_;
@@ -519,27 +540,29 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
519540 x_anchor_ = x_current_;
520541 y_anchor_ = y_current_;
521542 halpern_iteration_ = 0 ;
522- #ifdef CUPDLP_GPU
523- CUDA_CHECK (cudaMemcpy (d_x_anchor_, d_x_current_,
524- lp_.num_col_ * sizeof (double ),
525- cudaMemcpyDeviceToDevice));
526- CUDA_CHECK (cudaMemcpy (d_y_anchor_, d_y_current_,
527- lp_.num_row_ * sizeof (double ),
528- cudaMemcpyDeviceToDevice));
529- #endif
530543 }
531544
532545 // Pre-calculate ax and aTy for current iterate
533546#ifdef CUPDLP_GPU
534- CUDA_CHECK (cudaMemset (d_x_sum_, 0 , lp_.num_col_ * sizeof (double )));
535- CUDA_CHECK (cudaMemset (d_y_sum_, 0 , lp_.num_row_ * sizeof (double )));
536- CUDA_CHECK (cudaMemset (d_x_avg_, 0 , lp_.num_col_ * sizeof (double )));
537- CUDA_CHECK (cudaMemset (d_y_avg_, 0 , lp_.num_row_ * sizeof (double )));
547+ if (!params_.use_halpern_restart ) {
548+ CUDA_CHECK (cudaMemset (d_x_sum_, 0 , lp_.num_col_ * sizeof (double )));
549+ CUDA_CHECK (cudaMemset (d_y_sum_, 0 , lp_.num_row_ * sizeof (double )));
550+ CUDA_CHECK (cudaMemset (d_x_avg_, 0 , lp_.num_col_ * sizeof (double )));
551+ CUDA_CHECK (cudaMemset (d_y_avg_, 0 , lp_.num_row_ * sizeof (double )));
552+ }
538553 sum_weights_gpu_ = 0.0 ;
539554 CUDA_CHECK (cudaMemcpy (d_x_current_, x.data (), lp_.num_col_ * sizeof (double ),
540555 cudaMemcpyHostToDevice));
541556 CUDA_CHECK (cudaMemcpy (d_y_current_, y.data (), lp_.num_row_ * sizeof (double ),
542557 cudaMemcpyHostToDevice));
558+ if (params_.use_halpern_restart ) {
559+ CUDA_CHECK (cudaMemcpy (d_x_anchor_, d_x_current_,
560+ lp_.num_col_ * sizeof (double ),
561+ cudaMemcpyDeviceToDevice));
562+ CUDA_CHECK (cudaMemcpy (d_y_anchor_, d_y_current_,
563+ lp_.num_row_ * sizeof (double ),
564+ cudaMemcpyDeviceToDevice));
565+ }
543566 linalgGpuAx (d_x_current_, d_ax_current_);
544567 linalgGpuATy (d_y_current_, d_aty_current_);
545568#else
@@ -580,12 +603,6 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
580603
581604 // -- Step 1 (Major, isolated for restart-FPE check) --
582605#ifdef CUPDLP_GPU
583- CUDA_CHECK (cudaMemcpyAsync (d_primal_step_size_, &stepsize_.primal_step ,
584- sizeof (double ), cudaMemcpyHostToDevice,
585- gpu_stream_));
586- CUDA_CHECK (cudaMemcpyAsync (d_dual_step_size_, &stepsize_.dual_step ,
587- sizeof (double ), cudaMemcpyHostToDevice,
588- gpu_stream_));
589606 CUDA_CHECK (cudaMemcpyAsync (d_halpern_iteration_, &halpern_iteration_,
590607 sizeof (int ), cudaMemcpyHostToDevice,
591608 gpu_stream_));
@@ -685,6 +702,12 @@ if (DEBUG_MODE){
685702 CUDA_CHECK (cudaMemcpyAsync (d_y_current_, d_pdhg_dual_,
686703 lp_.num_row_ * sizeof (double ),
687704 cudaMemcpyDeviceToDevice, gpu_stream_));
705+ CUDA_CHECK (cudaMemcpyAsync (d_primal_step_size_, &stepsize_.primal_step ,
706+ sizeof (double ), cudaMemcpyHostToDevice,
707+ gpu_stream_));
708+ CUDA_CHECK (cudaMemcpyAsync (d_dual_step_size_, &stepsize_.dual_step ,
709+ sizeof (double ), cudaMemcpyHostToDevice,
710+ gpu_stream_));
688711#else
689712 x_anchor_ = x_next_;
690713 y_anchor_ = y_next_;
@@ -740,7 +763,7 @@ double PDLPSolver::computeFixedPointError() {
740763 }
741764
742765 double movement =
743- primal_norm_sq * params_. omega + dual_norm_sq / params_. omega ;
766+ primal_norm_sq * primal_weight_ + dual_norm_sq / primal_weight_ ;
744767 double interaction = 2.0 * params_.eta * cross_term;
745768
746769 return std::sqrt (std::max (0.0 , movement + interaction));
@@ -802,7 +825,8 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector<double>& output_x,
802825 TerminationStatus& status) {
803826 // 1. Compute Average Iterate (GPU or CPU)
804827#ifdef CUPDLP_GPU
805- computeAverageIterateGpu ();
828+ const bool halpern_gpu_current_only = params_.use_halpern_restart ;
829+ if (!halpern_gpu_current_only) computeAverageIterateGpu ();
806830#else
807831 computeAverageIterate (Ax_avg_, ATy_avg_);
808832#endif
@@ -814,15 +838,21 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector<double>& output_x,
814838
815839 // 2. Check Convergence (GPU or CPU)
816840#ifdef CUPDLP_GPU
817- // Only use Halpern PDHG slack/iterates if we have performed at least one
818- // iteration
819- if (params_.use_halpern_restart && d_pdhg_primal_ != nullptr && iter > 0 ) {
820- linalgGpuAx (d_pdhg_primal_, d_ax_next_);
821- linalgGpuATy (d_pdhg_dual_, d_aty_next_);
822- current_converged =
823- checkConvergenceGpu (iter, d_pdhg_primal_, d_pdhg_dual_, d_ax_next_,
824- d_aty_next_, params_.tolerance , current_results,
825- " [L-GPU]" , d_dSlackPos_, d_dSlackNeg_, true );
841+ if (params_.use_halpern_restart ) {
842+ if (d_pdhg_primal_ != nullptr && iter > 0 ) {
843+ linalgGpuAx (d_pdhg_primal_, d_ax_next_);
844+ linalgGpuATy (d_pdhg_dual_, d_aty_next_);
845+ current_converged =
846+ checkConvergenceGpu (iter, d_pdhg_primal_, d_pdhg_dual_, d_ax_next_,
847+ d_aty_next_, params_.tolerance , current_results,
848+ " [L-GPU]" , d_dSlackPos_, d_dSlackNeg_, true );
849+ } else {
850+ current_converged =
851+ checkConvergenceGpu (iter, d_x_current_, d_y_current_, d_ax_current_,
852+ d_aty_current_, params_.tolerance ,
853+ current_results, " [L-GPU]" , d_dSlackPos_,
854+ d_dSlackNeg_, false );
855+ }
826856 } else {
827857 current_converged =
828858 checkConvergenceGpu (iter, d_x_current_, d_y_current_, d_ax_current_,
@@ -854,7 +884,7 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector<double>& output_x,
854884
855885 // 3. Handle Convergence Success
856886 if (current_converged || average_converged) {
857- bool prefer_avg = average_converged;
887+ bool prefer_avg = !params_. use_halpern_restart && average_converged;
858888
859889 // Copy result to output vectors
860890#ifdef CUPDLP_GPU
@@ -1959,18 +1989,12 @@ void AdaptiveLinesearchParams::initialise() {
19591989// =============================================================================
19601990
19611991void PDLPSolver::initializeStepSizes () {
1962- double cost_norm_sq = linalg::vectorNormSquared (lp_.col_cost_ );
1963- double rhs_norm_sq = linalg::vectorNormSquared (lp_.row_lower_ );
1964-
1965- if (std::min (cost_norm_sq, rhs_norm_sq) > 1e-6 ) {
1966- stepsize_.beta = cost_norm_sq / rhs_norm_sq;
1967- } else {
1968- stepsize_.beta = 1.0 ;
1969- }
1970-
1971- // Match initial primal weight calculation from cuPDLPx
1972- params_.omega = (unscaled_c_norm_ + 1.0 ) / (unscaled_rhs_norm_ + 1.0 );
1973- primal_weight_ = params_.omega ;
1992+ // Align the initial geometry with cuPDLPx: start from a neutral primal
1993+ // weight and let restart/PID updates adapt it afterwards.
1994+ primal_weight_ = 1.0 ;
1995+ best_primal_weight_ = primal_weight_;
1996+ stepsize_.beta = primal_weight_ * primal_weight_;
1997+ params_.omega = primal_weight_;
19741998
19751999 if (params_.step_size_strategy != StepSizeStrategy::FIXED &&
19762000 params_.step_size_strategy != StepSizeStrategy::PID) {
@@ -2493,8 +2517,10 @@ void PDLPSolver::setupGpu() {
24932517 CUDA_CHECK (cudaMalloc (&d_is_equality_row_, a_num_rows_ * sizeof (bool )));
24942518 CUDA_CHECK (cudaMalloc (&d_x_current_, a_num_cols_ * sizeof (double )));
24952519 CUDA_CHECK (cudaMalloc (&d_y_current_, a_num_rows_ * sizeof (double )));
2496- CUDA_CHECK (cudaMalloc (&d_x_avg_, a_num_cols_ * sizeof (double )));
2497- CUDA_CHECK (cudaMalloc (&d_y_avg_, a_num_rows_ * sizeof (double )));
2520+ if (!params_.use_halpern_restart ) {
2521+ CUDA_CHECK (cudaMalloc (&d_x_avg_, a_num_cols_ * sizeof (double )));
2522+ CUDA_CHECK (cudaMalloc (&d_y_avg_, a_num_rows_ * sizeof (double )));
2523+ }
24982524 CUDA_CHECK (cudaMalloc (&d_x_next_, a_num_cols_ * sizeof (double )));
24992525 CUDA_CHECK (cudaMalloc (&d_y_next_, a_num_rows_ * sizeof (double )));
25002526 CUDA_CHECK (cudaMalloc (&d_x_at_last_restart_, a_num_cols_ * sizeof (double )));
@@ -2518,15 +2544,19 @@ void PDLPSolver::setupGpu() {
25182544 CUDA_CHECK (cudaMalloc (&d_aty_current_, a_num_cols_ * sizeof (double )));
25192545 CUDA_CHECK (cudaMalloc (&d_ax_next_, a_num_rows_ * sizeof (double )));
25202546 CUDA_CHECK (cudaMalloc (&d_aty_next_, a_num_cols_ * sizeof (double )));
2521- CUDA_CHECK (cudaMalloc (&d_ax_avg_, a_num_rows_ * sizeof (double )));
2522- CUDA_CHECK (cudaMalloc (&d_aty_avg_, a_num_cols_ * sizeof (double )));
2523- CUDA_CHECK (cudaMalloc (&d_x_sum_, a_num_cols_ * sizeof (double )));
2524- CUDA_CHECK (cudaMalloc (&d_y_sum_, a_num_rows_ * sizeof (double )));
2547+ if (!params_.use_halpern_restart ) {
2548+ CUDA_CHECK (cudaMalloc (&d_ax_avg_, a_num_rows_ * sizeof (double )));
2549+ CUDA_CHECK (cudaMalloc (&d_aty_avg_, a_num_cols_ * sizeof (double )));
2550+ CUDA_CHECK (cudaMalloc (&d_x_sum_, a_num_cols_ * sizeof (double )));
2551+ CUDA_CHECK (cudaMalloc (&d_y_sum_, a_num_rows_ * sizeof (double )));
2552+ }
25252553 CUDA_CHECK (cudaMalloc (&d_convergence_results_, 4 * sizeof (double )));
25262554 CUDA_CHECK (cudaMalloc (&d_dSlackPos_, a_num_cols_ * sizeof (double )));
25272555 CUDA_CHECK (cudaMalloc (&d_dSlackNeg_, a_num_cols_ * sizeof (double )));
2528- CUDA_CHECK (cudaMalloc (&d_dSlackPosAvg_, a_num_cols_ * sizeof (double )));
2529- CUDA_CHECK (cudaMalloc (&d_dSlackNegAvg_, a_num_cols_ * sizeof (double )));
2556+ if (!params_.use_halpern_restart ) {
2557+ CUDA_CHECK (cudaMalloc (&d_dSlackPosAvg_, a_num_cols_ * sizeof (double )));
2558+ CUDA_CHECK (cudaMalloc (&d_dSlackNegAvg_, a_num_cols_ * sizeof (double )));
2559+ }
25302560 CUDA_CHECK (cudaMalloc (&d_col_scale_, a_num_cols_ * sizeof (double )));
25312561 CUDA_CHECK (cudaMalloc (&d_row_scale_, a_num_rows_ * sizeof (double )));
25322562
@@ -2602,16 +2632,20 @@ void PDLPSolver::setupGpu() {
26022632
26032633 CUDA_CHECK (cudaMemset (d_x_current_, 0 , a_num_cols_ * sizeof (double )));
26042634 CUDA_CHECK (cudaMemset (d_y_current_, 0 , a_num_rows_ * sizeof (double )));
2605- CUDA_CHECK (cudaMemset (d_x_avg_, 0 , a_num_cols_ * sizeof (double )));
2606- CUDA_CHECK (cudaMemset (d_y_avg_, 0 , a_num_rows_ * sizeof (double )));
2635+ if (!params_.use_halpern_restart ) {
2636+ CUDA_CHECK (cudaMemset (d_x_avg_, 0 , a_num_cols_ * sizeof (double )));
2637+ CUDA_CHECK (cudaMemset (d_y_avg_, 0 , a_num_rows_ * sizeof (double )));
2638+ }
26072639 CUDA_CHECK (cudaMemset (d_x_next_, 0 , a_num_cols_ * sizeof (double )));
26082640 CUDA_CHECK (cudaMemset (d_y_next_, 0 , a_num_rows_ * sizeof (double )));
26092641 CUDA_CHECK (cudaMemset (d_ax_current_, 0 , a_num_rows_ * sizeof (double )));
26102642 CUDA_CHECK (cudaMemset (d_ax_next_, 0 , a_num_rows_ * sizeof (double )));
2611- CUDA_CHECK (cudaMemset (d_ax_avg_, 0 , a_num_rows_ * sizeof (double )));
2612- CUDA_CHECK (cudaMemset (d_aty_avg_, 0 , a_num_cols_ * sizeof (double )));
2613- CUDA_CHECK (cudaMemset (d_x_sum_, 0 , a_num_cols_ * sizeof (double )));
2614- CUDA_CHECK (cudaMemset (d_y_sum_, 0 , a_num_rows_ * sizeof (double )));
2643+ if (!params_.use_halpern_restart ) {
2644+ CUDA_CHECK (cudaMemset (d_ax_avg_, 0 , a_num_rows_ * sizeof (double )));
2645+ CUDA_CHECK (cudaMemset (d_aty_avg_, 0 , a_num_cols_ * sizeof (double )));
2646+ CUDA_CHECK (cudaMemset (d_x_sum_, 0 , a_num_cols_ * sizeof (double )));
2647+ CUDA_CHECK (cudaMemset (d_y_sum_, 0 , a_num_rows_ * sizeof (double )));
2648+ }
26152649 CUDA_CHECK (cudaMemset (d_aty_current_, 0 , a_num_cols_ * sizeof (double )));
26162650 sum_weights_gpu_ = 0.0 ;
26172651
@@ -2671,6 +2705,8 @@ void PDLPSolver::cleanupGpu() {
26712705 CUDA_CHECK (cudaFree (d_y_temp_diff_norm_result_));
26722706 CUDA_CHECK (cudaFree (d_x_current_));
26732707 CUDA_CHECK (cudaFree (d_y_current_));
2708+ CUDA_CHECK (cudaFree (d_x_avg_));
2709+ CUDA_CHECK (cudaFree (d_y_avg_));
26742710 CUDA_CHECK (cudaFree (d_x_next_));
26752711 CUDA_CHECK (cudaFree (d_y_next_));
26762712 CUDA_CHECK (cudaFree (d_ax_current_));
@@ -2855,6 +2891,9 @@ void PDLPSolver::updateAverageIteratesGpu(HighsInt inner_iter) {
28552891}
28562892
28572893void PDLPSolver::computeAverageIterateGpu () {
2894+ if (params_.use_halpern_restart || d_x_avg_ == nullptr || d_y_avg_ == nullptr )
2895+ return ;
2896+
28582897 double dScale = sum_weights_gpu_ > 1e-10 ? 1.0 / sum_weights_gpu_ : 1.0 ;
28592898
28602899 // x_avg = x_sum * Scale
0 commit comments