@@ -285,38 +285,40 @@ __kernel void calculate_cut_sparse(
285285}
286286
287287real1 single_bit_flip_worker_sparse (__constant uint * theta , __global const real1 * G_data , __global const uint * G_rows , __global const uint * G_cols , const int n , const bool is_spin_glass , const int k ) {
288- real1 energy = ZERO_R1 ;
289- for (int u = 0 ; u < n ; ++ u ) {
290- bool u_bit = get_const_bit (theta , u );
291- if (u == k ) {
292- u_bit = !u_bit ;
293- }
294- const size_t mCol = G_rows [u + 1 ];
295- for (int col = G_rows [u ]; col < mCol ; ++ col ) {
296- const int v = G_cols [col ];
297- const real1 val = G_data [col ];
298- bool v_bit = get_const_bit (theta , v );
299- if (v == k ) {
300- v_bit = !v_bit ;
301- }
302- if (u_bit != v_bit ) {
303- energy += val ;
304- } else if (is_spin_glass ) {
305- energy -= val ;
306- }
288+ // Only edges incident to k change when k is flipped.
289+ // Walk only the adjacency list of k: O(degree(k)) instead of O(n * degree).
290+ // Returns the energy DELTA (new_energy - base_energy) for this flip.
291+ // All threads share the same base state (best_theta), so deltas are
292+ // directly comparable for finding the best single-bit flip.
293+ const bool k_bit_old = get_const_bit (theta , k );
294+ const bool k_bit_new = !k_bit_old ;
295+ real1 delta = ZERO_R1 ;
296+ const uint row_end = G_rows [k + 1 ];
297+ for (uint col = G_rows [k ]; col < row_end ; ++ col ) {
298+ const int v = G_cols [col ];
299+ const real1 val = G_data [col ];
300+ const bool v_bit = get_const_bit (theta , v );
301+ const bool old_cut = k_bit_old != v_bit ;
302+ const bool new_cut = k_bit_new != v_bit ;
303+ if (old_cut != new_cut ) {
304+ delta += new_cut ? val : - val ;
307305 }
308306 }
309-
310- return energy ;
307+ // For spin glass, the same edge contributes to both cut and non-cut terms,
308+ // so the delta magnitude doubles (unlike-becomes-like costs twice: lose val + gain -val).
309+ if (is_spin_glass ) {
310+ delta *= 2.0 ;
311+ }
312+ return delta ;
311313}
312314
313315__kernel void single_bit_flips_sparse (
314316 __global const real1 * G_data ,
315317 __global const uint * G_rows ,
316318 __global const uint * G_cols ,
317319 __constant uint * best_theta ,
318- __constant int * args , // args[0] = n, args[1] = k
319- __global real1 * max_energy_ptr , // output: per-group min energy
320+ __constant int * args , // args[0] = n, args[1] = is_spin_glass
321+ __global real1 * max_energy_ptr , // output: per-group best delta
320322 __global int * max_index_ptr , // output: per-group best index (i)
321323 __local real1 * loc_energy , // local memory buffer
322324 __local int * loc_index // local memory buffer
@@ -331,6 +333,8 @@ __kernel void single_bit_flips_sparse(
331333 int best_i = i ;
332334
333335 for (; i < n ; i += max_i ) {
336+ // Worker returns delta; sign determines accept/reject.
337+ // Only the largest positive delta matters for selecting the best flip.
334338 const real1 energy = single_bit_flip_worker_sparse (best_theta , G_data , G_rows , G_cols , n , is_spin_glass , i );
335339 if (energy > best_energy ) {
336340 best_energy = energy ;
@@ -342,38 +346,62 @@ __kernel void single_bit_flips_sparse(
342346}
343347
344348real1 double_bit_flip_worker_sparse (__constant uint * theta , __global const real1 * G_data , __global const uint * G_rows , __global const uint * G_cols , const int n , const bool is_spin_glass , const int k , const int l ) {
345- real1 energy = ZERO_R1 ;
346- for (int u = 0 ; u < n ; ++ u ) {
347- bool u_bit = get_const_bit (theta , u );
348- if ((u == k ) || (u == l )) {
349- u_bit = !u_bit ;
349+ // Only edges incident to k or l change when both are flipped simultaneously.
350+ // Walk adjacency lists of k and l: O(degree(k) + degree(l)) instead of O(n * degree).
351+ // Returns the energy DELTA for this double flip.
352+ const bool k_bit_old = get_const_bit (theta , k );
353+ const bool k_bit_new = !k_bit_old ;
354+ const bool l_bit_old = get_const_bit (theta , l );
355+ const bool l_bit_new = !l_bit_old ;
356+ real1 delta = ZERO_R1 ;
357+
358+ // Contribution from edges incident to k (excluding the k-l edge, handled below)
359+ const uint k_row_end = G_rows [k + 1 ];
360+ for (uint col = G_rows [k ]; col < k_row_end ; ++ col ) {
361+ const int v = G_cols [col ];
362+ if (v == l ) continue ; // handle k-l edge separately
363+ const real1 val = G_data [col ];
364+ // v is not flipped, so v_bit unchanged
365+ const bool v_bit = get_const_bit (theta , v );
366+ const bool old_cut = k_bit_old != v_bit ;
367+ const bool new_cut = k_bit_new != v_bit ;
368+ if (old_cut != new_cut ) {
369+ delta += new_cut ? val : - val ;
350370 }
351- const size_t mCol = G_rows [ u + 1 ];
352- for ( int col = G_rows [ u ]; col < mCol ; ++ col ) {
353- const int v = G_cols [ col ];
354- const real1 val = G_data [ col ];
355- bool v_bit = get_const_bit ( theta , v );
356- if (( v == k ) || ( v == l )) {
357- v_bit = ! v_bit ;
358- }
359- if ( u_bit != v_bit ) {
360- energy += val ;
361- } else if ( is_spin_glass ) {
362- energy -= val ;
363- }
371+ }
372+
373+ // Contribution from edges incident to l (excluding the k-l edge)
374+ const uint l_row_end = G_rows [ l + 1 ];
375+ for ( uint col = G_rows [ l ]; col < l_row_end ; ++ col ) {
376+ const int v = G_cols [ col ];
377+ if ( v == k ) continue ; // handle k-l edge separately
378+ const real1 val = G_data [ col ];
379+ const bool v_bit = get_const_bit ( theta , v );
380+ const bool old_cut = l_bit_old != v_bit ;
381+ const bool new_cut = l_bit_new != v_bit ;
382+ if ( old_cut != new_cut ) {
383+ delta += new_cut ? val : - val ;
364384 }
365385 }
366386
367- return energy ;
387+ // Handle the k-l edge: both endpoints flip, so XOR status may or may not change.
388+ // old_cut = (k_bit_old != l_bit_old); new_cut = (k_bit_new != l_bit_new).
389+ // Since both bits flip, new_cut == old_cut — the k-l edge contribution is unchanged.
390+ // No delta from the k-l edge. (Included here explicitly for clarity.)
391+
392+ if (is_spin_glass ) {
393+ delta *= 2.0 ;
394+ }
395+ return delta ;
368396}
369397
370398__kernel void double_bit_flips_sparse (
371399 __global const real1 * G_data ,
372400 __global const uint * G_rows ,
373401 __global const uint * G_cols ,
374402 __constant uint * best_theta ,
375- __constant int * args , // args[0] = n, args[1] = k
376- __global real1 * max_energy_ptr , // output: per-group min energy
403+ __constant int * args , // args[0] = n, args[1] = is_spin_glass
404+ __global real1 * max_energy_ptr , // output: per-group best delta
377405 __global int * max_index_ptr , // output: per-group best index (i)
378406 __local real1 * loc_energy , // local memory buffer
379407 __local int * loc_index // local memory buffer
@@ -403,6 +431,7 @@ __kernel void double_bit_flips_sparse(
403431 }
404432 const int l = c + k + 1 ;
405433
434+ // Worker returns delta; sign determines accept/reject.
406435 const real1 energy = double_bit_flip_worker_sparse (best_theta , G_data , G_rows , G_cols , n , is_spin_glass , k , l );
407436 if (energy > best_energy ) {
408437 best_energy = energy ;
@@ -879,12 +908,10 @@ inline size_t gray_code_next(ulong* theta, const size_t curr_idx, const size_t o
879908 size_t curr = curr_idx + 1U ;
880909 prev = prev ^ (prev >> 1U );
881910 curr = curr ^ (curr >> 1U );
882- size_t diff = prev ^ curr ;
883- size_t flip_bit = 0 ;
884- while (!((diff >> flip_bit ) & 1U )) {
885- ++ flip_bit ;
886- }
887- flip_bit += offset ;
911+ const ulong diff = prev ^ curr ;
912+ // ctz gives the position of the lowest set bit in a single native instruction.
913+ // diff is always non-zero (consecutive Gray codes differ in exactly one bit).
914+ const size_t flip_bit = (size_t )ctz (diff ) + offset ;
888915 theta [flip_bit >> 6U ] ^= 1UL << (flip_bit & 63U );
889916
890917 return flip_bit ;
@@ -1117,25 +1144,31 @@ __kernel void gray_sparse(
11171144 for (; i < gray_iterations ; i += max_i ) {
11181145 for (int block = 0 ; block < blocks ; ++ block ) {
11191146 const size_t flip_bit = gray_code_next (theta_local , i , block << 6U );
1120- real1 energy = ZERO_R1 ;
1121- for (uint u = 0 ; u < n ; u ++ ) {
1122- const bool u_bit = get_local_bit (theta_local , u );
1123- const size_t mCol = G_rows [u + 1 ];
1124- for (int col = G_rows [u ]; col < mCol ; ++ col ) {
1125- const int v = G_cols [col ];
1126- const real1 val = G_data [col ];
1127- const bool v_bit = get_local_bit (theta_local , v );
1128- if (u_bit != v_bit ) {
1129- energy += val ;
1130- } else if (is_spin_glass ) {
1131- energy -= val ;
1132- }
1147+ // Incremental update: only edges incident to flip_bit change.
1148+ // After gray_code_next, theta_local already has the bit flipped.
1149+ // Determine the new and old bit values.
1150+ const bool new_bit = get_local_bit (theta_local , flip_bit );
1151+ const bool old_bit = !new_bit ;
1152+ real1 delta = ZERO_R1 ;
1153+ const uint row_end = G_rows [flip_bit + 1 ];
1154+ for (uint col = G_rows [flip_bit ]; col < row_end ; ++ col ) {
1155+ const uint v = G_cols [col ];
1156+ const real1 val = G_data [col ];
1157+ const bool v_bit = get_local_bit (theta_local , v );
1158+ const bool old_cut = old_bit != v_bit ;
1159+ const bool new_cut = new_bit != v_bit ;
1160+ if (old_cut != new_cut ) {
1161+ delta += new_cut ? val : - val ;
11331162 }
11341163 }
1164+ if (is_spin_glass ) {
1165+ delta *= 2.0 ;
1166+ }
11351167
1136- if (energy > best_energy ) {
1137- best_energy = energy ;
1168+ if (delta > ZERO_R1 ) {
1169+ best_energy += delta ;
11381170 } else {
1171+ // Reject: undo the flip
11391172 theta_local [flip_bit >> 6U ] ^= 1UL << (flip_bit & 63U );
11401173 }
11411174 }
@@ -1204,25 +1237,27 @@ __kernel void gray_sparse_segmented(
12041237 for (; i < gray_iterations ; i += max_i ) {
12051238 for (int block = 0 ; block < blocks ; ++ block ) {
12061239 const size_t flip_bit = gray_code_next (theta_local , i , block << 6U );
1207- real1 energy = ZERO_R1 ;
1208- for (uint u = 0 ; u < n ; u ++ ) {
1209- const size_t u_offset = u * n ;
1210- const bool u_bit = get_local_bit (theta_local , u );
1211- const uint row_end = G_rows [u + 1 ];
1212- for (uint col = G_rows [u ]; col < row_end ; ++ col ) {
1213- const int v = G_cols [col ];
1214- const real1 val = get_G_m (G_data , col , segment_size );
1215- const bool v_bit = get_local_bit (theta_local , v );
1216- if (u_bit != v_bit ) {
1217- energy += val ;
1218- } else if (is_spin_glass ) {
1219- energy -= val ;
1220- }
1240+ // Incremental update: only edges incident to flip_bit change.
1241+ const bool new_bit = get_local_bit (theta_local , flip_bit );
1242+ const bool old_bit = !new_bit ;
1243+ real1 delta = ZERO_R1 ;
1244+ const uint row_end = G_rows [flip_bit + 1 ];
1245+ for (uint col = G_rows [flip_bit ]; col < row_end ; ++ col ) {
1246+ const int v = G_cols [col ];
1247+ const real1 val = get_G_m (G_data , col , segment_size );
1248+ const bool v_bit = get_local_bit (theta_local , v );
1249+ const bool old_cut = old_bit != v_bit ;
1250+ const bool new_cut = new_bit != v_bit ;
1251+ if (old_cut != new_cut ) {
1252+ delta += new_cut ? val : - val ;
12211253 }
12221254 }
1255+ if (is_spin_glass ) {
1256+ delta *= 2.0 ;
1257+ }
12231258
1224- if (energy > best_energy ) {
1225- best_energy = energy ;
1259+ if (delta > ZERO_R1 ) {
1260+ best_energy += delta ;
12261261 } else {
12271262 theta_local [flip_bit >> 6U ] ^= 1UL << (flip_bit & 63U );
12281263 }
0 commit comments