Optimize kernelCheckDual (FULL_WARP_REDUCE before atomicAdd) and computeFixedPointErrorGpu (single fused delta kernel)

Yanyu000 · Yanyu000 · commit cc5182596553 · 2026-04-30T15:39:47.000+01:00
diff --git a/highs/pdlp/hipdlp/pdhg.cc b/highs/pdlp/hipdlp/pdhg.cc
@@ -771,23 +771,9 @@ double PDLPSolver::computeFixedPointError() {
 
 #ifdef CUPDLP_GPU
 double PDLPSolver::computeFixedPointErrorGpu() {
-  double alpha_minus_one = -1.0;
-
-  // 1. delta_x = x_next_ - reflected_x_
-  // (Assuming d_pdhg_primal_ maps to x_next_ and d_x_next_ is used as
-  // reflected_x_ in your minor/major steps)
-  CUDA_CHECK(cudaMemcpyAsync(d_delta_x_, d_pdhg_primal_,
-                             a_num_cols_ * sizeof(double),
-                             cudaMemcpyDeviceToDevice, gpu_stream_));
-  CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_cols_, &alpha_minus_one,
-                           d_x_next_, 1, d_delta_x_, 1));
-
-  // 2. delta_y = y_next_ - reflected_y_
-  CUDA_CHECK(cudaMemcpyAsync(d_delta_y_, d_pdhg_dual_, 
-                             a_num_rows_ * sizeof(double),
-                              cudaMemcpyDeviceToDevice, gpu_stream_));
-  CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_rows_, &alpha_minus_one,
-                           d_y_next_, 1, d_delta_y_, 1));
+  launchKernelComputeSolutionDelta_wrapper(
+      d_pdhg_primal_, d_x_next_, d_delta_x_, d_pdhg_dual_, d_y_next_,
+      d_delta_y_, a_num_cols_, a_num_rows_, gpu_stream_);
 
   // 3. AT_delta_y = A^T * delta_y
   linalgGpuATy(d_delta_y_, d_AT_delta_y_);
diff --git a/highs/pdlp/hipdlp/pdhg.cu b/highs/pdlp/hipdlp/pdhg.cu
@@ -112,6 +112,20 @@ __global__ void kernelScaleVector(
   }
 }
 
+__global__ void kernelComputeSolutionDelta(  // writes two element-wise differences (new - old) in a single kernel
+    const double* __restrict__ d_primal_new,  // read at indices i < n_cols
+    const double* __restrict__ d_primal_old,  // read at indices i < n_cols
+    double* __restrict__ d_delta_primal,  // out: d_primal_new[i] - d_primal_old[i] for i < n_cols
+    const double* __restrict__ d_dual_new,  // read at indices i < n_rows
+    const double* __restrict__ d_dual_old,  // read at indices i < n_rows
+    double* __restrict__ d_delta_dual, int n_cols, int n_rows) {  // out: d_dual_new[i] - d_dual_old[i] for i < n_rows
+  const int n = n_cols > n_rows ? n_cols : n_rows;  // loop bound is the longer of the two vectors
+  CUDA_GRID_STRIDE_LOOP(i, n) {  // NOTE(review): macro is defined outside this diff; presumably iterates i over [0, n) — confirm
+    if (i < n_cols) d_delta_primal[i] = d_primal_new[i] - d_primal_old[i];  // guard: i can exceed the primal length when n_rows > n_cols
+    if (i < n_rows) d_delta_dual[i] = d_dual_new[i] - d_dual_old[i];  // guard: i can exceed the dual length when n_cols > n_rows
+  }
+}
+
 // === KERNEL 4: Primal Convergence Check (Row-wise) ===
 __global__ void kernelCheckPrimal(
   double* d_results,
@@ -215,10 +229,15 @@ __global__ void kernelCheckDual(
     local_dual_obj_part += obj_term;
   }
 
-  // Atomic accumulation
-  atomicAdd(&d_results[IDX_DUAL_FEAS], local_dual_feas_sq);
-  atomicAdd(&d_results[IDX_PRIMAL_OBJ], local_primal_obj);
-  atomicAdd(&d_results[IDX_DUAL_OBJ], local_dual_obj_part);
+  FULL_WARP_REDUCE(local_dual_feas_sq);
+  FULL_WARP_REDUCE(local_primal_obj);
+  FULL_WARP_REDUCE(local_dual_obj_part);
+
+  if ((threadIdx.x & 31) == 0) {
+    atomicAdd(&d_results[IDX_DUAL_FEAS], local_dual_feas_sq);
+    atomicAdd(&d_results[IDX_PRIMAL_OBJ], local_primal_obj);
+    atomicAdd(&d_results[IDX_DUAL_OBJ], local_dual_obj_part);
+  }
 }
 
 // ============================================================================
@@ -389,6 +408,22 @@ void launchKernelScaleVector_wrapper(
     cudaGetLastError();
 }
 
+void launchKernelComputeSolutionDelta_wrapper(  // host-side launcher for kernelComputeSolutionDelta
+    const double* d_primal_new, const double* d_primal_old,
+    double* d_delta_primal, const double* d_dual_new,
+    const double* d_dual_old, double* d_delta_dual, int n_cols, int n_rows,
+    cudaStream_t stream) {  // the kernel is enqueued on `stream`; this function does not synchronize
+    const int block_size = 256;  // threads per block
+    const int n = n_cols > n_rows ? n_cols : n_rows;  // size the launch for the longer of the two vectors
+    dim3 config = GetLaunchConfig(n, block_size);  // NOTE(review): helper is defined outside this diff; only config.x is used, as the block count
+
+    kernelComputeSolutionDelta<<<config.x, block_size, 0, stream>>>(  // 0 bytes of dynamic shared memory
+        d_primal_new, d_primal_old, d_delta_primal, d_dual_new, d_dual_old,
+        d_delta_dual, n_cols, n_rows);
+
+    cudaGetLastError();  // NOTE(review): the returned status is discarded, so a launch error is cleared here without being reported
+}
+
 void launchCheckConvergenceKernels_wrapper(
     double* d_results,
     double* d_slack_pos, double* d_slack_neg,
@@ -494,4 +529,4 @@ void launchKernelHalpernBlend_wrapper(
         d_halpern_iteration, k_offset, reflection_coeff, n);
     cudaGetLastError();
 }
-} // extern "C"
+} // extern "C"
diff --git a/highs/pdlp/hipdlp/pdhg_kernels.hpp b/highs/pdlp/hipdlp/pdhg_kernels.hpp
@@ -28,6 +28,12 @@ void launchKernelUpdateAverages_wrapper(double* d_x_sum, double* d_y_sum,
 void launchKernelScaleVector_wrapper(double* d_out, const double* d_in,
                                      double scale, int n, cudaStream_t stream);
 
+void launchKernelComputeSolutionDelta_wrapper(  // launches the kernel that fills d_delta_primal and d_delta_dual on `stream`
+    const double* d_primal_new, const double* d_primal_old,  // primal inputs; n_cols entries of each are read
+    double* d_delta_primal, const double* d_dual_new,  // d_delta_primal[i] = d_primal_new[i] - d_primal_old[i]
+    const double* d_dual_old, double* d_delta_dual, int n_cols, int n_rows,  // d_delta_dual[i] = d_dual_new[i] - d_dual_old[i], n_rows entries
+    cudaStream_t stream);
+
 void launchCheckConvergenceKernels_wrapper(
     double* d_results, double* d_slack_pos, double* d_slack_neg,
     const double* d_x, const double* d_y, const double* d_ax,
@@ -74,4 +80,4 @@ void launchKernelHalpernBlend_wrapper(double* d_current,
 #ifdef __cplusplus
 }
 #endif
-#endif
+#endif