Skip to content

Commit c55fe47

Browse files
(Anthropic) Claude improves OpenCL kernels
1 parent 413876a commit c55fe47

4 files changed

Lines changed: 234 additions & 196 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project]
99
name = "pyqrackising"
10-
version = "9.15.2"
10+
version = "9.15.3"
1111
requires-python = ">=3.8"
1212
description = "Fast MAXCUT, TSP, and sampling heuristics from near-ideal transverse field Ising model (TFIM)"
1313
readme = {file = "README.txt", content-type = "text/markdown"}

pyqrackising/kernels.cl

Lines changed: 116 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -285,38 +285,40 @@ __kernel void calculate_cut_sparse(
285285
}
286286

287287
real1 single_bit_flip_worker_sparse(__constant uint* theta, __global const real1* G_data, __global const uint* G_rows, __global const uint* G_cols, const int n, const bool is_spin_glass, const int k) {
288-
real1 energy = ZERO_R1;
289-
for (int u = 0; u < n; ++u) {
290-
bool u_bit = get_const_bit(theta, u);
291-
if (u == k) {
292-
u_bit = !u_bit;
293-
}
294-
const size_t mCol = G_rows[u + 1];
295-
for (int col = G_rows[u]; col < mCol; ++col) {
296-
const int v = G_cols[col];
297-
const real1 val = G_data[col];
298-
bool v_bit = get_const_bit(theta, v);
299-
if (v == k) {
300-
v_bit = !v_bit;
301-
}
302-
if (u_bit != v_bit) {
303-
energy += val;
304-
} else if (is_spin_glass) {
305-
energy -= val;
306-
}
288+
// Only edges incident to k change when k is flipped.
289+
// Walk only the adjacency list of k: O(degree(k)) instead of O(n * degree).
290+
// Returns the energy DELTA (new_energy - base_energy) for this flip.
291+
// All threads share the same base state (best_theta), so deltas are
292+
// directly comparable for finding the best single-bit flip.
293+
const bool k_bit_old = get_const_bit(theta, k);
294+
const bool k_bit_new = !k_bit_old;
295+
real1 delta = ZERO_R1;
296+
const uint row_end = G_rows[k + 1];
297+
for (uint col = G_rows[k]; col < row_end; ++col) {
298+
const int v = G_cols[col];
299+
const real1 val = G_data[col];
300+
const bool v_bit = get_const_bit(theta, v);
301+
const bool old_cut = k_bit_old != v_bit;
302+
const bool new_cut = k_bit_new != v_bit;
303+
if (old_cut != new_cut) {
304+
delta += new_cut ? val : -val;
307305
}
308306
}
309-
310-
return energy;
307+
// For spin glass, the same edge contributes to both cut and non-cut terms,
308+
// so the delta magnitude doubles (unlike-becomes-like costs twice: lose val + gain -val).
309+
if (is_spin_glass) {
310+
delta *= 2.0;
311+
}
312+
return delta;
311313
}
312314

313315
__kernel void single_bit_flips_sparse(
314316
__global const real1* G_data,
315317
__global const uint* G_rows,
316318
__global const uint* G_cols,
317319
__constant uint* best_theta,
318-
__constant int* args, // args[0] = n, args[1] = k
319-
__global real1* max_energy_ptr, // output: per-group min energy
320+
__constant int* args, // args[0] = n, args[1] = is_spin_glass
321+
__global real1* max_energy_ptr, // output: per-group best delta
320322
__global int* max_index_ptr, // output: per-group best index (i)
321323
__local real1* loc_energy, // local memory buffer
322324
__local int* loc_index // local memory buffer
@@ -331,6 +333,8 @@ __kernel void single_bit_flips_sparse(
331333
int best_i = i;
332334

333335
for (; i < n; i += max_i) {
336+
// Worker returns delta; sign determines accept/reject.
337+
// Only the largest positive delta matters for selecting the best flip.
334338
const real1 energy = single_bit_flip_worker_sparse(best_theta, G_data, G_rows, G_cols, n, is_spin_glass, i);
335339
if (energy > best_energy) {
336340
best_energy = energy;
@@ -342,38 +346,62 @@ __kernel void single_bit_flips_sparse(
342346
}
343347

344348
real1 double_bit_flip_worker_sparse(__constant uint* theta, __global const real1* G_data, __global const uint* G_rows, __global const uint* G_cols, const int n, const bool is_spin_glass, const int k, const int l) {
345-
real1 energy = ZERO_R1;
346-
for (int u = 0; u < n; ++u) {
347-
bool u_bit = get_const_bit(theta, u);
348-
if ((u == k) || (u == l)) {
349-
u_bit = !u_bit;
349+
// Only edges incident to k or l change when both are flipped simultaneously.
350+
// Walk adjacency lists of k and l: O(degree(k) + degree(l)) instead of O(n * degree).
351+
// Returns the energy DELTA for this double flip.
352+
const bool k_bit_old = get_const_bit(theta, k);
353+
const bool k_bit_new = !k_bit_old;
354+
const bool l_bit_old = get_const_bit(theta, l);
355+
const bool l_bit_new = !l_bit_old;
356+
real1 delta = ZERO_R1;
357+
358+
// Contribution from edges incident to k (excluding the k-l edge, handled below)
359+
const uint k_row_end = G_rows[k + 1];
360+
for (uint col = G_rows[k]; col < k_row_end; ++col) {
361+
const int v = G_cols[col];
362+
if (v == l) continue; // handle k-l edge separately
363+
const real1 val = G_data[col];
364+
// v is not flipped, so v_bit unchanged
365+
const bool v_bit = get_const_bit(theta, v);
366+
const bool old_cut = k_bit_old != v_bit;
367+
const bool new_cut = k_bit_new != v_bit;
368+
if (old_cut != new_cut) {
369+
delta += new_cut ? val : -val;
350370
}
351-
const size_t mCol = G_rows[u + 1];
352-
for (int col = G_rows[u]; col < mCol; ++col) {
353-
const int v = G_cols[col];
354-
const real1 val = G_data[col];
355-
bool v_bit = get_const_bit(theta, v);
356-
if ((v == k) || (v == l)) {
357-
v_bit = !v_bit;
358-
}
359-
if (u_bit != v_bit) {
360-
energy += val;
361-
} else if (is_spin_glass) {
362-
energy -= val;
363-
}
371+
}
372+
373+
// Contribution from edges incident to l (excluding the k-l edge)
374+
const uint l_row_end = G_rows[l + 1];
375+
for (uint col = G_rows[l]; col < l_row_end; ++col) {
376+
const int v = G_cols[col];
377+
if (v == k) continue; // handle k-l edge separately
378+
const real1 val = G_data[col];
379+
const bool v_bit = get_const_bit(theta, v);
380+
const bool old_cut = l_bit_old != v_bit;
381+
const bool new_cut = l_bit_new != v_bit;
382+
if (old_cut != new_cut) {
383+
delta += new_cut ? val : -val;
364384
}
365385
}
366386

367-
return energy;
387+
// Handle the k-l edge: both endpoints flip, so XOR status may or may not change.
388+
// old_cut = (k_bit_old != l_bit_old); new_cut = (k_bit_new != l_bit_new).
389+
// Since both bits flip, new_cut == old_cut — the k-l edge contribution is unchanged.
390+
// No delta from the k-l edge. (Included here explicitly for clarity.)
391+
392+
if (is_spin_glass) {
393+
delta *= 2.0;
394+
}
395+
return delta;
368396
}
369397

370398
__kernel void double_bit_flips_sparse(
371399
__global const real1* G_data,
372400
__global const uint* G_rows,
373401
__global const uint* G_cols,
374402
__constant uint* best_theta,
375-
__constant int* args, // args[0] = n, args[1] = k
376-
__global real1* max_energy_ptr, // output: per-group min energy
403+
__constant int* args, // args[0] = n, args[1] = is_spin_glass
404+
__global real1* max_energy_ptr, // output: per-group best delta
377405
__global int* max_index_ptr, // output: per-group best index (i)
378406
__local real1* loc_energy, // local memory buffer
379407
__local int* loc_index // local memory buffer
@@ -403,6 +431,7 @@ __kernel void double_bit_flips_sparse(
403431
}
404432
const int l = c + k + 1;
405433

434+
// Worker returns delta; sign determines accept/reject.
406435
const real1 energy = double_bit_flip_worker_sparse(best_theta, G_data, G_rows, G_cols, n, is_spin_glass, k, l);
407436
if (energy > best_energy) {
408437
best_energy = energy;
@@ -879,12 +908,10 @@ inline size_t gray_code_next(ulong* theta, const size_t curr_idx, const size_t o
879908
size_t curr = curr_idx + 1U;
880909
prev = prev ^ (prev >> 1U);
881910
curr = curr ^ (curr >> 1U);
882-
size_t diff = prev ^ curr;
883-
size_t flip_bit = 0;
884-
while (!((diff >> flip_bit) & 1U)) {
885-
++flip_bit;
886-
}
887-
flip_bit += offset;
911+
const ulong diff = prev ^ curr;
912+
// ctz gives the position of the lowest set bit in a single native instruction.
913+
// diff is always non-zero (consecutive Gray codes differ in exactly one bit).
914+
const size_t flip_bit = (size_t)ctz(diff) + offset;
888915
theta[flip_bit >> 6U] ^= 1UL << (flip_bit & 63U);
889916

890917
return flip_bit;
@@ -1117,25 +1144,31 @@ __kernel void gray_sparse(
11171144
for (; i < gray_iterations; i += max_i) {
11181145
for (int block = 0; block < blocks; ++block) {
11191146
const size_t flip_bit = gray_code_next(theta_local, i, block << 6U);
1120-
real1 energy = ZERO_R1;
1121-
for (uint u = 0; u < n; u++) {
1122-
const bool u_bit = get_local_bit(theta_local, u);
1123-
const size_t mCol = G_rows[u + 1];
1124-
for (int col = G_rows[u]; col < mCol; ++col) {
1125-
const int v = G_cols[col];
1126-
const real1 val = G_data[col];
1127-
const bool v_bit = get_local_bit(theta_local, v);
1128-
if (u_bit != v_bit) {
1129-
energy += val;
1130-
} else if (is_spin_glass) {
1131-
energy -= val;
1132-
}
1147+
// Incremental update: only edges incident to flip_bit change.
1148+
// After gray_code_next, theta_local already has the bit flipped.
1149+
// Determine the new and old bit values.
1150+
const bool new_bit = get_local_bit(theta_local, flip_bit);
1151+
const bool old_bit = !new_bit;
1152+
real1 delta = ZERO_R1;
1153+
const uint row_end = G_rows[flip_bit + 1];
1154+
for (uint col = G_rows[flip_bit]; col < row_end; ++col) {
1155+
const uint v = G_cols[col];
1156+
const real1 val = G_data[col];
1157+
const bool v_bit = get_local_bit(theta_local, v);
1158+
const bool old_cut = old_bit != v_bit;
1159+
const bool new_cut = new_bit != v_bit;
1160+
if (old_cut != new_cut) {
1161+
delta += new_cut ? val : -val;
11331162
}
11341163
}
1164+
if (is_spin_glass) {
1165+
delta *= 2.0;
1166+
}
11351167

1136-
if (energy > best_energy) {
1137-
best_energy = energy;
1168+
if (delta > ZERO_R1) {
1169+
best_energy += delta;
11381170
} else {
1171+
// Reject: undo the flip
11391172
theta_local[flip_bit >> 6U] ^= 1UL << (flip_bit & 63U);
11401173
}
11411174
}
@@ -1204,25 +1237,27 @@ __kernel void gray_sparse_segmented(
12041237
for (; i < gray_iterations; i += max_i) {
12051238
for (int block = 0; block < blocks; ++block) {
12061239
const size_t flip_bit = gray_code_next(theta_local, i, block << 6U);
1207-
real1 energy = ZERO_R1;
1208-
for (uint u = 0; u < n; u++) {
1209-
const size_t u_offset = u * n;
1210-
const bool u_bit = get_local_bit(theta_local, u);
1211-
const uint row_end = G_rows[u + 1];
1212-
for (uint col = G_rows[u]; col < row_end; ++col) {
1213-
const int v = G_cols[col];
1214-
const real1 val = get_G_m(G_data, col, segment_size);
1215-
const bool v_bit = get_local_bit(theta_local, v);
1216-
if (u_bit != v_bit) {
1217-
energy += val;
1218-
} else if (is_spin_glass) {
1219-
energy -= val;
1220-
}
1240+
// Incremental update: only edges incident to flip_bit change.
1241+
const bool new_bit = get_local_bit(theta_local, flip_bit);
1242+
const bool old_bit = !new_bit;
1243+
real1 delta = ZERO_R1;
1244+
const uint row_end = G_rows[flip_bit + 1];
1245+
for (uint col = G_rows[flip_bit]; col < row_end; ++col) {
1246+
const int v = G_cols[col];
1247+
const real1 val = get_G_m(G_data, col, segment_size);
1248+
const bool v_bit = get_local_bit(theta_local, v);
1249+
const bool old_cut = old_bit != v_bit;
1250+
const bool new_cut = new_bit != v_bit;
1251+
if (old_cut != new_cut) {
1252+
delta += new_cut ? val : -val;
12211253
}
12221254
}
1255+
if (is_spin_glass) {
1256+
delta *= 2.0;
1257+
}
12231258

1224-
if (energy > best_energy) {
1225-
best_energy = energy;
1259+
if (delta > ZERO_R1) {
1260+
best_energy += delta;
12261261
} else {
12271262
theta_local[flip_bit >> 6U] ^= 1UL << (flip_bit & 63U);
12281263
}

0 commit comments

Comments
 (0)