Skip to content

Commit cc51825

Browse files
committed
optimize kernelCheckDual and computeFixedPointErrorGpu
1 parent 5e2efaf commit cc51825

3 files changed

Lines changed: 50 additions & 23 deletions

File tree

highs/pdlp/hipdlp/pdhg.cc

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -771,23 +771,9 @@ double PDLPSolver::computeFixedPointError() {
771771

772772
#ifdef CUPDLP_GPU
773773
double PDLPSolver::computeFixedPointErrorGpu() {
774-
double alpha_minus_one = -1.0;
775-
776-
// 1. delta_x = x_next_ - reflected_x_
777-
// (Assuming d_pdhg_primal_ maps to x_next_ and d_x_next_ is used as
778-
// reflected_x_ in your minor/major steps)
779-
CUDA_CHECK(cudaMemcpyAsync(d_delta_x_, d_pdhg_primal_,
780-
a_num_cols_ * sizeof(double),
781-
cudaMemcpyDeviceToDevice, gpu_stream_));
782-
CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_cols_, &alpha_minus_one,
783-
d_x_next_, 1, d_delta_x_, 1));
784-
785-
// 2. delta_y = y_next_ - reflected_y_
786-
CUDA_CHECK(cudaMemcpyAsync(d_delta_y_, d_pdhg_dual_,
787-
a_num_rows_ * sizeof(double),
788-
cudaMemcpyDeviceToDevice, gpu_stream_));
789-
CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_rows_, &alpha_minus_one,
790-
d_y_next_, 1, d_delta_y_, 1));
774+
launchKernelComputeSolutionDelta_wrapper(
775+
d_pdhg_primal_, d_x_next_, d_delta_x_, d_pdhg_dual_, d_y_next_,
776+
d_delta_y_, a_num_cols_, a_num_rows_, gpu_stream_);
791777

792778
// 3. AT_delta_y = A^T * delta_y
793779
linalgGpuATy(d_delta_y_, d_AT_delta_y_);

highs/pdlp/hipdlp/pdhg.cu

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,20 @@ __global__ void kernelScaleVector(
112112
}
113113
}
114114

115+
__global__ void kernelComputeSolutionDelta(
116+
const double* __restrict__ d_primal_new,
117+
const double* __restrict__ d_primal_old,
118+
double* __restrict__ d_delta_primal,
119+
const double* __restrict__ d_dual_new,
120+
const double* __restrict__ d_dual_old,
121+
double* __restrict__ d_delta_dual, int n_cols, int n_rows) {
122+
const int n = n_cols > n_rows ? n_cols : n_rows;
123+
CUDA_GRID_STRIDE_LOOP(i, n) {
124+
if (i < n_cols) d_delta_primal[i] = d_primal_new[i] - d_primal_old[i];
125+
if (i < n_rows) d_delta_dual[i] = d_dual_new[i] - d_dual_old[i];
126+
}
127+
}
128+
115129
// === KERNEL 4: Primal Convergence Check (Row-wise) ===
116130
__global__ void kernelCheckPrimal(
117131
double* d_results,
@@ -215,10 +229,15 @@ __global__ void kernelCheckDual(
215229
local_dual_obj_part += obj_term;
216230
}
217231

218-
// Atomic accumulation
219-
atomicAdd(&d_results[IDX_DUAL_FEAS], local_dual_feas_sq);
220-
atomicAdd(&d_results[IDX_PRIMAL_OBJ], local_primal_obj);
221-
atomicAdd(&d_results[IDX_DUAL_OBJ], local_dual_obj_part);
232+
FULL_WARP_REDUCE(local_dual_feas_sq);
233+
FULL_WARP_REDUCE(local_primal_obj);
234+
FULL_WARP_REDUCE(local_dual_obj_part);
235+
236+
if ((threadIdx.x & 31) == 0) {
237+
atomicAdd(&d_results[IDX_DUAL_FEAS], local_dual_feas_sq);
238+
atomicAdd(&d_results[IDX_PRIMAL_OBJ], local_primal_obj);
239+
atomicAdd(&d_results[IDX_DUAL_OBJ], local_dual_obj_part);
240+
}
222241
}
223242

224243
// ============================================================================
@@ -389,6 +408,22 @@ void launchKernelScaleVector_wrapper(
389408
cudaGetLastError();
390409
}
391410

411+
void launchKernelComputeSolutionDelta_wrapper(
412+
const double* d_primal_new, const double* d_primal_old,
413+
double* d_delta_primal, const double* d_dual_new,
414+
const double* d_dual_old, double* d_delta_dual, int n_cols, int n_rows,
415+
cudaStream_t stream) {
416+
const int block_size = 256;
417+
const int n = n_cols > n_rows ? n_cols : n_rows;
418+
dim3 config = GetLaunchConfig(n, block_size);
419+
420+
kernelComputeSolutionDelta<<<config.x, block_size, 0, stream>>>(
421+
d_primal_new, d_primal_old, d_delta_primal, d_dual_new, d_dual_old,
422+
d_delta_dual, n_cols, n_rows);
423+
424+
cudaGetLastError();
425+
}
426+
392427
void launchCheckConvergenceKernels_wrapper(
393428
double* d_results,
394429
double* d_slack_pos, double* d_slack_neg,
@@ -494,4 +529,4 @@ void launchKernelHalpernBlend_wrapper(
494529
d_halpern_iteration, k_offset, reflection_coeff, n);
495530
cudaGetLastError();
496531
}
497-
} // extern "C"
532+
} // extern "C"

highs/pdlp/hipdlp/pdhg_kernels.hpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ void launchKernelUpdateAverages_wrapper(double* d_x_sum, double* d_y_sum,
2828
void launchKernelScaleVector_wrapper(double* d_out, const double* d_in,
2929
double scale, int n, cudaStream_t stream);
3030

31+
void launchKernelComputeSolutionDelta_wrapper(
32+
const double* d_primal_new, const double* d_primal_old,
33+
double* d_delta_primal, const double* d_dual_new,
34+
const double* d_dual_old, double* d_delta_dual, int n_cols, int n_rows,
35+
cudaStream_t stream);
36+
3137
void launchCheckConvergenceKernels_wrapper(
3238
double* d_results, double* d_slack_pos, double* d_slack_neg,
3339
const double* d_x, const double* d_y, const double* d_ax,
@@ -74,4 +80,4 @@ void launchKernelHalpernBlend_wrapper(double* d_current,
7480
#ifdef __cplusplus
7581
}
7682
#endif
77-
#endif
83+
#endif

0 commit comments

Comments
 (0)