switch to f32

selmanozleyen · selmanozleyen · commit 6b7d7b270d6c · 2025-09-10T13:38:14.000Z
diff --git a/src/rapids_singlecell/pertpy_gpu/_distances_standalone.py b/src/rapids_singlecell/pertpy_gpu/_distances_standalone.py
@@ -76,7 +76,7 @@ def compute_pairwise_means_gpu(
     num_pairs = len(pair_left)  # k * (k-1) pairs instead of k²
 
     # Allocate output for off-diagonal distances only
-    d_other_offdiag = cp.zeros(num_pairs, dtype=np.float64)
+    d_other_offdiag = cp.zeros(num_pairs, dtype=np.float32)
 
     # Choose optimal block size
     props = cp.cuda.runtime.getDeviceProperties(0)
@@ -85,7 +85,7 @@ def compute_pairwise_means_gpu(
     chosen_threads = None
     shared_mem_size = 0  # TODO: think of a better way to do this
     for tpb in (1024, 512, 256, 128, 64, 32):
-        required = tpb * cp.dtype(cp.float64).itemsize
+        required = tpb * cp.dtype(cp.float32).itemsize
         if required <= max_smem:
             chosen_threads = tpb
             shared_mem_size = required
@@ -111,7 +111,7 @@ def compute_pairwise_means_gpu(
     )
 
     # Build full k x k matrix
-    pairwise_means = cp.zeros((k, k), dtype=np.float64)
+    pairwise_means = cp.zeros((k, k), dtype=np.float32)
 
     # Fill the full matrix
     for i, idx in enumerate(pair_indices.get()):
@@ -322,10 +322,9 @@ def pairwise_edistance_gpu(
     df : pd.DataFrame
         Final edistance matrix
     """
-    # 1. Prepare data (same as original)
     _assert_categorical_obs(adata, key=groupby)
 
-    embedding = cp.array(adata.obsm[obsm_key]).astype(np.float64)
+    embedding = cp.array(adata.obsm[obsm_key]).astype(np.float32)  # Changed from float64
     original_groups = adata.obs[groupby]
     group_map = {v: i for i, v in enumerate(original_groups.cat.categories.values)}
     group_labels = cp.array([group_map[c] for c in original_groups], dtype=cp.int32)
diff --git a/src/rapids_singlecell/pertpy_gpu/kernels/edistance_kernels.cu b/src/rapids_singlecell/pertpy_gpu/kernels/edistance_kernels.cu
@@ -9,22 +9,22 @@ extern "C" {
  * Each block processes one group pair, threads collaborate within the block
  */
 __global__ void compute_group_distances(
-    const double* __restrict__ embedding,
+    const float* __restrict__ embedding,
     const int* __restrict__ cat_offsets,
     const int* __restrict__ cell_indices,
     const int* __restrict__ pair_left,
     const int* __restrict__ pair_right,
-    double* __restrict__ d_other,
+    float* __restrict__ d_other,
     int k,
     int n_features)
 {
-    extern __shared__ double shared_sums[];
+    extern __shared__ float shared_sums[];
 
     const int thread_id = threadIdx.x;
     const int block_id = blockIdx.x;
     const int block_size = blockDim.x;
 
-    double local_sum = 0.0;
+    float local_sum = 0.0f;
 
     const int a = pair_left[block_id];
     const int b = pair_right[block_id];
@@ -46,14 +46,14 @@ __global__ void compute_group_distances(
         for (int jb = start_b; jb < end_b; ++jb) {
             const int idx_j = cell_indices[jb];
 
-            double dist_sq = 0.0;
+            float dist_sq = 0.0f;
             #pragma unroll
             for (int feat = 0; feat < n_features; ++feat) {
-                double diff = embedding[idx_i * n_features + feat] -
+                float diff = embedding[idx_i * n_features + feat] -
                             embedding[idx_j * n_features + feat];
                 dist_sq += diff * diff;
             }
-            local_sum += sqrt(dist_sq);
+            local_sum += sqrtf(dist_sq);
         }
     }
 
@@ -70,7 +70,7 @@ __global__ void compute_group_distances(
 
     if (thread_id == 0) {
         // Store mean between-group distance
-        d_other[block_id] = shared_sums[0] / (double)(n_a * n_b);
+        d_other[block_id] = shared_sums[0] / (float)(n_a * n_b);
     }
 }