Merge pull request QMCPACK#5594 from ye-luo/update-du-gpu

PDoakORNL · web-flow · commit ceb830bc07d1 · 2025-08-13T11:57:19.000-04:00
Slight adjustment in batched delayed update
diff --git a/src/QMCWaveFunctions/Fermion/DelayedUpdateBatched.h b/src/QMCWaveFunctions/Fermion/DelayedUpdateBatched.h
@@ -645,11 +645,11 @@ class DelayedUpdateBatched
     //std::copy_n(Ainv[rowchanged], norb, V[delay_count]);
     compute::BLAS::copy_batched(blas_handle, norb, invRow_mw_ptr, 1, V_row_mw_ptr, 1, nw);
     // handle accepted walkers
-    // the new Binv is [[X Y] [Z sigma]]
+    // the new Binv is [[X y] [z sigma]]
     //BLAS::gemv('T', norb, delay_count + 1, cminusone, V.data(), norb, psiV.data(), 1, czero, p.data(), 1);
     compute::BLAS::gemv_batched(blas_handle, 'T', norb, delay_count, cminusone_vec.device_data(), V_mw_ptr, norb,
                                 phiVGL_mw_ptr, 1, czero_vec.device_data(), p_mw_ptr, 1, n_accepted);
-    // Y
+    // y
     //BLAS::gemv('T', delay_count, delay_count, sigma, Binv.data(), lda_Binv, p.data(), 1, czero, Binv.data() + delay_count,
     //           lda_Binv);
     compute::BLAS::gemv_batched(blas_handle, 'T', delay_count, delay_count, ratio_inv_mw_ptr, Binv_mw_ptr, lda_Binv,
diff --git a/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu b/src/QMCWaveFunctions/detail/CUDA/matrix_update_helper.cu
@@ -321,20 +321,16 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[],
 
   if (iw < n_accepted)
   {
-    // real accept, settle y and Z
+    // real accept
     int* __restrict__ delay_list_iw = delay_list[iw];
     T* __restrict__ binvrow_iw      = binv[iw] + delay_count * binv_lda;
-    const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
-    T* __restrict__ phi_out_iw      = phi_out[iw];
-    T* __restrict__ dphi_out_iw     = dphi_out[iw];
-    T* __restrict__ d2phi_out_iw    = d2phi_out[iw];
-
     if (tid == 0)
     {
       delay_list_iw[delay_count] = rowchanged;
       binvrow_iw[delay_count]    = ratio_inv[iw];
     }
 
+    // Settle z by applying the final rescaling.
     const int num_delay_count_col_blocks = (delay_count + COLBS - 1) / COLBS;
     for (int ib = 0; ib < num_delay_count_col_blocks; ib++)
     {
@@ -343,6 +339,12 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[],
         binvrow_iw[col_id] *= ratio_inv[iw];
     }
 
+    // Save VGL
+    const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
+    T* __restrict__ phi_out_iw      = phi_out[iw];
+    T* __restrict__ dphi_out_iw     = dphi_out[iw];
+    T* __restrict__ d2phi_out_iw    = d2phi_out[iw];
+
     const int num_col_blocks = (norb + COLBS - 1) / COLBS;
     for (int ib = 0; ib < num_col_blocks; ib++)
     {
@@ -360,7 +362,7 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[],
   }
   else
   {
-    // fake accept. Set Y, Z with zero and x with 1
+    // pseudo accept
     T* __restrict__ Urow_iw   = phi_out[iw];
     const int num_blocks_norb = (norb + COLBS - 1) / COLBS;
     for (int ib = 0; ib < num_blocks_norb; ib++)
@@ -370,15 +372,17 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[],
         Urow_iw[col_id] = T(0);
     }
 
+    // Set y to zero
     T* __restrict__ binv_iw          = binv[iw];
     const int num_blocks_delay_count = (delay_count + COLBS - 1) / COLBS;
     for (int ib = 0; ib < num_blocks_delay_count; ib++)
     {
       const int col_id = ib * COLBS + tid;
       if (col_id < delay_count)
-        binv_iw[delay_count * binv_lda + col_id] = binv_iw[delay_count + binv_lda * col_id] = T(0);
+        binv_iw[delay_count + binv_lda * col_id] = T(0);
     }
 
+    // Set x to 1
     int* __restrict__ delay_list_iw = delay_list[iw];
     if (tid == 0)
     {
diff --git a/src/QMCWaveFunctions/detail/SYCL/matrix_update_helper.cpp b/src/QMCWaveFunctions/detail/SYCL/matrix_update_helper.cpp
@@ -233,20 +233,16 @@ sycl::event add_delay_list_save_sigma_VGL_batched(sycl::queue& aq,
 
                            if (iw < n_accepted)
                            {
-                             // real accept, settle y and Z
+                             // real accept
                              int* __restrict__ delay_list_iw = delay_list[iw];
                              T* __restrict__ binvrow_iw      = binv[iw] + delay_count * binv_lda;
-                             const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
-                             T* __restrict__ phi_out_iw      = phi_out[iw];
-                             T* __restrict__ dphi_out_iw     = dphi_out[iw];
-                             T* __restrict__ d2phi_out_iw    = d2phi_out[iw];
-
                              if (tid == 0)
                              {
                                delay_list_iw[delay_count] = rowchanged;
                                binvrow_iw[delay_count]    = ratio_inv[iw];
                              }
 
+                             // Settle z
                              const int num_delay_count_col_blocks = (delay_count + COLBS - 1) / COLBS;
                              for (int ib = 0; ib < num_delay_count_col_blocks; ib++)
                              {
@@ -255,6 +251,12 @@ sycl::event add_delay_list_save_sigma_VGL_batched(sycl::queue& aq,
                                  binvrow_iw[col_id] *= ratio_inv[iw];
                              }
 
+                             // Save VGL
+                             const T* __restrict__ phi_in_iw = phi_vgl_in[iw];
+                             T* __restrict__ phi_out_iw      = phi_out[iw];
+                             T* __restrict__ dphi_out_iw     = dphi_out[iw];
+                             T* __restrict__ d2phi_out_iw    = d2phi_out[iw];
+
                              const int num_col_blocks = (norb + COLBS - 1) / COLBS;
                              for (int ib = 0; ib < num_col_blocks; ib++)
                              {
@@ -272,7 +274,7 @@ sycl::event add_delay_list_save_sigma_VGL_batched(sycl::queue& aq,
                            }
                            else
                            {
-                             // fake accept. Set Y, Z with zero and x with 1
+                             // pseudo accept
                              T* __restrict__ Urow_iw   = phi_out[iw];
                              const int num_blocks_norb = (norb + COLBS - 1) / COLBS;
                              for (int ib = 0; ib < num_blocks_norb; ib++)
@@ -282,16 +284,17 @@ sycl::event add_delay_list_save_sigma_VGL_batched(sycl::queue& aq,
                                  Urow_iw[col_id] = T{};
                              }
 
+                             // Set y to zero
                              T* __restrict__ binv_iw          = binv[iw];
                              const int num_blocks_delay_count = (delay_count + COLBS - 1) / COLBS;
                              for (int ib = 0; ib < num_blocks_delay_count; ib++)
                              {
                                const int col_id = ib * COLBS + tid;
                                if (col_id < delay_count)
-                                 binv_iw[delay_count * binv_lda + col_id] = binv_iw[delay_count + binv_lda * col_id] =
-                                     T(0);
+                                 binv_iw[delay_count + binv_lda * col_id] = T(0);
                              }
 
+                             // Set x to 1
                              int* __restrict__ delay_list_iw = delay_list[iw];
                              if (tid == 0)
                              {

Original file line number	Diff line number	Diff line change
`@@ -321,20 +321,16 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[],`
`321`	`321`
`322`	`322`	`if (iw < n_accepted)`
`323`	`323`	`{`
`324`		`- // real accept, settle y and Z`
	`324`	`+ // real accept`
`325`	`325`	`int* __restrict__ delay_list_iw = delay_list[iw];`
`326`	`326`	`T* __restrict__ binvrow_iw = binv[iw] + delay_count * binv_lda;`
`327`		`- const T* __restrict__ phi_in_iw = phi_vgl_in[iw];`
`328`		`- T* __restrict__ phi_out_iw = phi_out[iw];`
`329`		`- T* __restrict__ dphi_out_iw = dphi_out[iw];`
`330`		`- T* __restrict__ d2phi_out_iw = d2phi_out[iw];`
`331`		`-`
`332`	`327`	`if (tid == 0)`
`333`	`328`	`{`
`334`	`329`	`delay_list_iw[delay_count] = rowchanged;`
`335`	`330`	`binvrow_iw[delay_count] = ratio_inv[iw];`
`336`	`331`	`}`
`337`	`332`
	`333`	`+ // Settle z by applying the final rescaling.`
`338`	`334`	`const int num_delay_count_col_blocks = (delay_count + COLBS - 1) / COLBS;`
`339`	`335`	`for (int ib = 0; ib < num_delay_count_col_blocks; ib++)`
`340`	`336`	`{`
`@@ -343,6 +339,12 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[],`
`343`	`339`	`binvrow_iw[col_id] *= ratio_inv[iw];`
`344`	`340`	`}`
`345`	`341`
	`342`	`+ // Save VGL`
	`343`	`+ const T* __restrict__ phi_in_iw = phi_vgl_in[iw];`
	`344`	`+ T* __restrict__ phi_out_iw = phi_out[iw];`
	`345`	`+ T* __restrict__ dphi_out_iw = dphi_out[iw];`
	`346`	`+ T* __restrict__ d2phi_out_iw = d2phi_out[iw];`
	`347`	`+`
`346`	`348`	`const int num_col_blocks = (norb + COLBS - 1) / COLBS;`
`347`	`349`	`for (int ib = 0; ib < num_col_blocks; ib++)`
`348`	`350`	`{`
`@@ -360,7 +362,7 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[],`
`360`	`362`	`}`
`361`	`363`	`else`
`362`	`364`	`{`
`363`		`- // fake accept. Set Y, Z with zero and x with 1`
	`365`	`+ // pseudo accept`
`364`	`366`	`T* __restrict__ Urow_iw = phi_out[iw];`
`365`	`367`	`const int num_blocks_norb = (norb + COLBS - 1) / COLBS;`
`366`	`368`	`for (int ib = 0; ib < num_blocks_norb; ib++)`
`@@ -370,15 +372,17 @@ __global__ void add_delay_list_save_sigma_VGL_kernel(int* const delay_list[],`
`370`	`372`	`Urow_iw[col_id] = T(0);`
`371`	`373`	`}`
`372`	`374`
	`375`	`+ // Set y to zero`
`373`	`376`	`T* __restrict__ binv_iw = binv[iw];`
`374`	`377`	`const int num_blocks_delay_count = (delay_count + COLBS - 1) / COLBS;`
`375`	`378`	`for (int ib = 0; ib < num_blocks_delay_count; ib++)`
`376`	`379`	`{`
`377`	`380`	`const int col_id = ib * COLBS + tid;`
`378`	`381`	`if (col_id < delay_count)`
`379`		`- binv_iw[delay_count * binv_lda + col_id] = binv_iw[delay_count + binv_lda * col_id] = T(0);`
	`382`	`+ binv_iw[delay_count + binv_lda * col_id] = T(0);`
`380`	`383`	`}`
`381`	`384`
	`385`	`+ // Set x to 1`
`382`	`386`	`int* __restrict__ delay_list_iw = delay_list[iw];`
`383`	`387`	`if (tid == 0)`
`384`	`388`	`{`
Original file line number	Diff line number	Diff line change
`@@ -233,20 +233,16 @@ sycl::event add_delay_list_save_sigma_VGL_batched(sycl::queue& aq,`
`233`	`233`
`234`	`234`	`if (iw < n_accepted)`
`235`	`235`	`{`
`236`		`- // real accept, settle y and Z`
	`236`	`+ // real accept`
`237`	`237`	`int* __restrict__ delay_list_iw = delay_list[iw];`
`238`	`238`	`T* __restrict__ binvrow_iw = binv[iw] + delay_count * binv_lda;`
`239`		`- const T* __restrict__ phi_in_iw = phi_vgl_in[iw];`
`240`		`- T* __restrict__ phi_out_iw = phi_out[iw];`
`241`		`- T* __restrict__ dphi_out_iw = dphi_out[iw];`
`242`		`- T* __restrict__ d2phi_out_iw = d2phi_out[iw];`
`243`		`-`
`244`	`239`	`if (tid == 0)`
`245`	`240`	`{`
`246`	`241`	`delay_list_iw[delay_count] = rowchanged;`
`247`	`242`	`binvrow_iw[delay_count] = ratio_inv[iw];`
`248`	`243`	`}`
`249`	`244`
	`245`	`+ // Settle z`
`250`	`246`	`const int num_delay_count_col_blocks = (delay_count + COLBS - 1) / COLBS;`
`251`	`247`	`for (int ib = 0; ib < num_delay_count_col_blocks; ib++)`
`252`	`248`	`{`
`@@ -255,6 +251,12 @@ sycl::event add_delay_list_save_sigma_VGL_batched(sycl::queue& aq,`
`255`	`251`	`binvrow_iw[col_id] *= ratio_inv[iw];`
`256`	`252`	`}`
`257`	`253`
	`254`	`+ // Save VGL`
	`255`	`+ const T* __restrict__ phi_in_iw = phi_vgl_in[iw];`
	`256`	`+ T* __restrict__ phi_out_iw = phi_out[iw];`
	`257`	`+ T* __restrict__ dphi_out_iw = dphi_out[iw];`
	`258`	`+ T* __restrict__ d2phi_out_iw = d2phi_out[iw];`
	`259`	`+`
`258`	`260`	`const int num_col_blocks = (norb + COLBS - 1) / COLBS;`
`259`	`261`	`for (int ib = 0; ib < num_col_blocks; ib++)`
`260`	`262`	`{`
`@@ -272,7 +274,7 @@ sycl::event add_delay_list_save_sigma_VGL_batched(sycl::queue& aq,`
`272`	`274`	`}`
`273`	`275`	`else`
`274`	`276`	`{`
`275`		`- // fake accept. Set Y, Z with zero and x with 1`
	`277`	`+ // pseudo accept`
`276`	`278`	`T* __restrict__ Urow_iw = phi_out[iw];`
`277`	`279`	`const int num_blocks_norb = (norb + COLBS - 1) / COLBS;`
`278`	`280`	`for (int ib = 0; ib < num_blocks_norb; ib++)`
`@@ -282,16 +284,17 @@ sycl::event add_delay_list_save_sigma_VGL_batched(sycl::queue& aq,`
`282`	`284`	`Urow_iw[col_id] = T{};`
`283`	`285`	`}`
`284`	`286`
	`287`	`+ // Set y to zero`
`285`	`288`	`T* __restrict__ binv_iw = binv[iw];`
`286`	`289`	`const int num_blocks_delay_count = (delay_count + COLBS - 1) / COLBS;`
`287`	`290`	`for (int ib = 0; ib < num_blocks_delay_count; ib++)`
`288`	`291`	`{`
`289`	`292`	`const int col_id = ib * COLBS + tid;`
`290`	`293`	`if (col_id < delay_count)`
`291`		`- binv_iw[delay_count * binv_lda + col_id] = binv_iw[delay_count + binv_lda * col_id] =`
`292`		`- T(0);`
	`294`	`+ binv_iw[delay_count + binv_lda * col_id] = T(0);`
`293`	`295`	`}`
`294`	`296`
	`297`	`+ // Set x to 1`
`295`	`298`	`int* __restrict__ delay_list_iw = delay_list[iw];`
`296`	`299`	`if (tid == 0)`
`297`	`300`	`{`