Drop zero-initialization of dense point features; mask per-point loads to match the CUDA impl

Lenny Wang · Lenny Wang · commit d7458d888dc1 · 2025-08-28T00:15:56.000Z
diff --git a/benchmarks/voxelization_benchmark.py b/benchmarks/voxelization_benchmark.py
@@ -27,7 +27,7 @@
     help="Number of input points",
 )
 @click.option(
-    "--max_num_points_per_voxel",
+    "--max-num-points-per-voxel",
     required=False,
     type=int,
     default=4,
@@ -41,7 +41,7 @@
     help="Voxel dimension same for x,y,z",
 )
 @click.option(
-    "--grid_range",
+    "--grid-range",
     required=False,
     type=float,
     default=50,
@@ -56,14 +56,14 @@
     "--iteration-time-ms",
     required=False,
     type=int,
-    default=10000,
+    default=100,
     help="Time in milliseconds to run benchmark",
 )
 @click.option(
     "--warmup-time-ms",
     required=False,
     type=int,
-    default=100,
+    default=10,
     help="Time in milliseconds to warmup before recording times",
 )
 @click.option(
diff --git a/conch/kernels/vision/voxelization.py b/conch/kernels/vision/voxelization.py
@@ -136,10 +136,11 @@ def generate_voxels_triton_kernel(  # noqa: PLR0913, D417
     # store all feature points, including padded 0s
     for point_idx in range(0, max_num_points_per_voxel):
         input_idx = flat_voxel_idx * max_num_points_per_voxel + point_idx
-        point_x = tl.load(dense_point_features_ptr + input_idx * 4 + 0, mask=valid_voxel)
-        point_y = tl.load(dense_point_features_ptr + input_idx * 4 + 1, mask=valid_voxel)
-        point_z = tl.load(dense_point_features_ptr + input_idx * 4 + 2, mask=valid_voxel)
-        point_w = tl.load(dense_point_features_ptr + input_idx * 4 + 3, mask=valid_voxel)
+        valid_point = (point_idx < num_points_in_voxel) and valid_voxel
+        point_x = tl.load(dense_point_features_ptr + input_idx * 4 + 0, mask=valid_point, other=0)
+        point_y = tl.load(dense_point_features_ptr + input_idx * 4 + 1, mask=valid_point, other=0)
+        point_z = tl.load(dense_point_features_ptr + input_idx * 4 + 2, mask=valid_point, other=0)
+        point_w = tl.load(dense_point_features_ptr + input_idx * 4 + 3, mask=valid_point, other=0)
 
         output_idx = voxel_idx * max_num_points_per_voxel + point_idx
         tl.store(point_features_ptr + output_idx * 4 + 0, point_x, mask=valid_voxel)
diff --git a/conch/ops/vision/voxelization.py b/conch/ops/vision/voxelization.py
@@ -68,9 +68,9 @@ def generate_voxels(
     # same as original nvidia cuda impl
     num_elements_per_voxel_index = 4
 
-    # dense (must set to 0s)
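+    # dense: counts are zero-initialized; features may stay uninitialized (kernel masks loads by per-voxel point count)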
     dense_num_points_per_voxel = torch.zeros((param.max_num_voxels), dtype=torch.int32, device=device)
-    dense_point_features = torch.zeros(
+    dense_point_features = torch.empty(
         (param.max_num_voxels, param.max_num_points_per_voxel, num_features_per_point), dtype=torch.float, device=device
     )