@@ -112,6 +112,20 @@ __global__ void kernelScaleVector(
112112 }
113113}
114114
115+ __global__ void kernelComputeSolutionDelta (
116+ const double * __restrict__ d_primal_new,
117+ const double * __restrict__ d_primal_old,
118+ double * __restrict__ d_delta_primal,
119+ const double * __restrict__ d_dual_new,
120+ const double * __restrict__ d_dual_old,
121+ double * __restrict__ d_delta_dual, int n_cols, int n_rows) {
122+ const int n = n_cols > n_rows ? n_cols : n_rows;
123+ CUDA_GRID_STRIDE_LOOP (i, n) {
124+ if (i < n_cols) d_delta_primal[i] = d_primal_new[i] - d_primal_old[i];
125+ if (i < n_rows) d_delta_dual[i] = d_dual_new[i] - d_dual_old[i];
126+ }
127+ }
128+
115129// === KERNEL 4: Primal Convergence Check (Row-wise) ===
116130__global__ void kernelCheckPrimal (
117131 double * d_results,
@@ -215,10 +229,15 @@ __global__ void kernelCheckDual(
215229 local_dual_obj_part += obj_term;
216230 }
217231
218- // Atomic accumulation
219- atomicAdd (&d_results[IDX_DUAL_FEAS], local_dual_feas_sq);
220- atomicAdd (&d_results[IDX_PRIMAL_OBJ], local_primal_obj);
221- atomicAdd (&d_results[IDX_DUAL_OBJ], local_dual_obj_part);
232+ FULL_WARP_REDUCE (local_dual_feas_sq);
233+ FULL_WARP_REDUCE (local_primal_obj);
234+ FULL_WARP_REDUCE (local_dual_obj_part);
235+
236+ if ((threadIdx .x & 31 ) == 0 ) {
237+ atomicAdd (&d_results[IDX_DUAL_FEAS], local_dual_feas_sq);
238+ atomicAdd (&d_results[IDX_PRIMAL_OBJ], local_primal_obj);
239+ atomicAdd (&d_results[IDX_DUAL_OBJ], local_dual_obj_part);
240+ }
222241}
223242
224243// ============================================================================
@@ -389,6 +408,22 @@ void launchKernelScaleVector_wrapper(
389408 cudaGetLastError ();
390409}
391410
411+ void launchKernelComputeSolutionDelta_wrapper (
412+ const double * d_primal_new, const double * d_primal_old,
413+ double * d_delta_primal, const double * d_dual_new,
414+ const double * d_dual_old, double * d_delta_dual, int n_cols, int n_rows,
415+ cudaStream_t stream) {
416+ const int block_size = 256 ;
417+ const int n = n_cols > n_rows ? n_cols : n_rows;
418+ dim3 config = GetLaunchConfig (n, block_size);
419+
420+ kernelComputeSolutionDelta<<<config.x, block_size, 0 , stream>>> (
421+ d_primal_new, d_primal_old, d_delta_primal, d_dual_new, d_dual_old,
422+ d_delta_dual, n_cols, n_rows);
423+
424+ cudaGetLastError ();
425+ }
426+
392427void launchCheckConvergenceKernels_wrapper (
393428 double * d_results,
394429 double * d_slack_pos, double * d_slack_neg,
@@ -494,4 +529,4 @@ void launchKernelHalpernBlend_wrapper(
494529 d_halpern_iteration, k_offset, reflection_coeff, n);
495530 cudaGetLastError ();
496531}
497- } // extern "C"
532+ } // extern "C"
0 commit comments