stackav-oss
diff --git a/‎benchmarks/voxelization_benchmark.py‎
Lines changed: 29 additions & 8 deletions b/‎benchmarks/voxelization_benchmark.py‎
Lines changed: 29 additions & 8 deletions
diff --git a/‎conch/kernels/vision/voxelization.py‎
Lines changed: 1 addition & 104 deletions b/‎conch/kernels/vision/voxelization.py‎
Lines changed: 1 addition & 104 deletions
@@ -8,13 +8,9 @@
 import click
 import torch
 
-from conch.ops.vision.voxelization import (
-    VoxelizationParameter,
-    collect_point_features,
-    generate_voxels,
-    voxelization_stable,
-)
+from conch.ops.vision.voxelization import VoxelizationParameter, generate_voxels
 from conch.platforms import current_platform
+from conch.reference.vision.voxelization import collect_point_features, voxelization_stable
 from conch.utils.benchmark import BenchmarkMetadata, benchmark_it
 
 
@@ -82,6 +78,11 @@
     is_flag=True,
     help="Flag for printing results in CSV format",
 )
+@click.option(
+    "--compile-ref",
+    is_flag=True,
+    help="Flag to torch.compile() the reference impl",
+)
 @click.option(
     "--cuda-ref",
     is_flag=True,
@@ -97,6 +98,7 @@ def main(
     warmup_time_ms: int,
     gpu: str,
     csv: bool,
+    compile_ref: bool,
     cuda_ref: bool,
 ) -> None:
     """Benchmark voxelization.
@@ -106,11 +108,12 @@ def main(
         max_num_points_per_voxel: Max number of points per voxel for output feature tensor.
         voxel_dim: Voxel dimensions for x,y,z
         grid_range: Grid boundary for x,y,z
-        torch_ref: Flag to enable torch reference implementation.
+        torch_ref: Flag to use pure torch reference implementation instead of hybrid triton/torch.
         iteration_time_ms: Time in milliseconds to run benchmark.
         warmup_time_ms: Time in milliseconds to warmup before recording times.
         gpu: Which gpu to run on.
         csv: Flag to indicate whether or not to print results in CSV format.
+        compile_ref: Flag to torch.compile() the pure torch reference implementation.
         cuda_ref: Flag to enable CUDA reference implementation.
     """
 
@@ -137,7 +140,7 @@ def main(
     def generate_voxels_torch(
         points: torch.Tensor, param: VoxelizationParameter
     ) -> tuple[torch.tensor, torch.tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """triton/torch hybrid, 2-step, stable voxelization first then generate a feature tensor."""
+        """reference triton/torch hybrid, 2-step, stable voxelization first then generate a feature tensor."""
         use_triton = not torch_ref
         actual_num_points_per_voxel, point_raw_indices, flat_voxel_indices = voxelization_stable(
             points, param, use_triton=use_triton
@@ -223,11 +226,29 @@ def generate_voxels_torch(
             warmup_time_ms=warmup_time_ms,
         )
 
+    reference_compiled_result = None
+    reference_compiled_fn = None
+
+    if compile_ref and torch_ref:
+        # Only the pure torch reference is compiled (torch_ref => use_triton=False in generate_voxels_torch)
+        reference_compiled_fn = torch.compile(generate_voxels_torch)
+    # Store the compiled timing separately so baseline_result is not overwritten and both get printed
+    if reference_compiled_fn is not None:
+        reference_compiled_result = benchmark_it(
+            lambda: reference_compiled_fn(*args),
+            tag="Baseline (Torch compiled)",
+            metadata=metadata,
+            iteration_time_ms=iteration_time_ms,
+            warmup_time_ms=warmup_time_ms,
+        )
+
     conch_result.print_parameters(csv=csv)
     conch_result.print_results(csv=csv)
     baseline_result.print_results(csv=csv)
     if reference_cuda_result:
         reference_cuda_result.print_results(csv=csv)
+    if reference_compiled_result:
+        reference_compiled_result.print_results(csv=csv)
 
 
 if __name__ == "__main__":
 
@@ -8,59 +8,6 @@
 import triton.language as tl
 
 
-@triton.jit
-def filter_and_label_points_triton_kernel(  # noqa: PLR0913, D417
-    # input
-    points_ptr: torch.Tensor,
-    num_points: int,
-    num_features_per_point: int,
-    # parameters
-    min_x: float,
-    min_y: float,
-    min_z: float,
-    max_x: float,
-    max_y: float,
-    max_z: float,
-    voxel_dim_x: float,
-    voxel_dim_y: float,
-    voxel_dim_z: float,
-    grid_dim_x: int,
-    grid_dim_y: int,
-    grid_dim_z: int,
-    max_num_voxels: int,
-    # output
-    point_voxel_indices_ptr: torch.Tensor,
-    # Constants
-    cxpr_block_size: tl.constexpr,
-) -> None:
-    """Filter valid points and label each with a voxel index.
-
-    Args:
-        points_ptr: input points, shape (num_points, num_features_per_point).
-        voxelization parameters
-        point_voxel_indices_ptr: output per point flattened voxel indices, shape (num_points).
-    """
-    block_idx = tl.program_id(axis=0)
-    point_idx = block_idx * cxpr_block_size + tl.arange(0, cxpr_block_size)
-    point_mask = point_idx < num_points
-
-    point_x = tl.load(points_ptr + point_idx * num_features_per_point + 0, mask=point_mask, other=max_x + voxel_dim_x)
-    point_y = tl.load(points_ptr + point_idx * num_features_per_point + 1, mask=point_mask, other=max_y + voxel_dim_y)
-    point_z = tl.load(points_ptr + point_idx * num_features_per_point + 2, mask=point_mask, other=max_z + voxel_dim_z)
-
-    voxel_x = tl.floor((point_x - min_x) / voxel_dim_x).to(tl.int32)
-    voxel_y = tl.floor((point_y - min_y) / voxel_dim_y).to(tl.int32)
-    voxel_z = tl.floor((point_z - min_z) / voxel_dim_z).to(tl.int32)
-
-    valid_x = (voxel_x >= 0) & (voxel_x < grid_dim_x)
-    valid_y = (voxel_y >= 0) & (voxel_y < grid_dim_y)
-    valid_z = (voxel_z >= 0) & (voxel_z < grid_dim_z)
-    valid_point = point_mask & valid_x & valid_y & valid_z
-
-    flat_voxel_idx = tl.where(valid_point, ((voxel_z * grid_dim_y + voxel_y) * grid_dim_x + voxel_x), max_num_voxels)
-    tl.store(point_voxel_indices_ptr + point_idx, flat_voxel_idx, mask=point_mask)
-
-
 @triton.jit
 def generate_dense_voxels_triton_kernel(  # noqa: PLR0913, D417
     # input
@@ -187,7 +134,7 @@ def generate_voxels_triton_kernel(  # noqa: PLR0913, D417
     tl.store(voxel_indices_ptr + voxel_idx * 4 + 2, voxel_z, mask=valid_voxel)
 
     # store all feature points, including padded 0s
-    for point_idx in range(0, max_num_points_per_voxel, 1):
+    for point_idx in range(0, max_num_points_per_voxel):
         input_idx = flat_voxel_idx * max_num_points_per_voxel + point_idx
         point_x = tl.load(dense_point_features_ptr + input_idx * 4 + 0, mask=valid_voxel)
         point_y = tl.load(dense_point_features_ptr + input_idx * 4 + 1, mask=valid_voxel)
@@ -199,53 +146,3 @@ def generate_voxels_triton_kernel(  # noqa: PLR0913, D417
         tl.store(point_features_ptr + output_idx * 4 + 1, point_y, mask=valid_voxel)
         tl.store(point_features_ptr + output_idx * 4 + 2, point_z, mask=valid_voxel)
         tl.store(point_features_ptr + output_idx * 4 + 3, point_w, mask=valid_voxel)
-
-
-@triton.jit
-def collect_point_features_triton_kernel(  # noqa: PLR0913, D417
-    # input
-    points_ptr: torch.Tensor,
-    num_features_per_point: int,
-    segment_offsets_ptr: torch.Tensor,
-    num_filled_voxels: int,
-    point_indices_ptr: torch.Tensor,
-    # parameters
-    max_num_points_per_voxel: int,
-    # output
-    point_features_ptr: torch.Tensor,
-    capped_num_points_per_voxel_ptr: torch.Tensor,
-    # Constants
-    cxpr_block_size: tl.constexpr,
-) -> None:
-    """Group valid points into dense voxels.
-
-    Args:
-        points_ptr: input points tensor, shape (num_points, num_features_per_point)
-        segment_offsets_ptr: input segment end offsets, shape (num_filled_voxels)
-        point_indices_ptr: input raw point indices, shape (num_valid_points)
-        voxelization parameters
-        point_features_ptr: output voxel point features, shape (num_filled_voxels, max_num_points_per_voxel, num_features_per_point)
-        capped_num_points_per_voxel_ptr: output number of points per voxel tensor after capping, shape (num_filled_voxels)
-    """
-    block_idx = tl.program_id(axis=0)
-    voxel_idx = block_idx * cxpr_block_size + tl.arange(0, cxpr_block_size)
-    voxel_mask = voxel_idx < num_filled_voxels
-
-    # top n filtering
-    segment_start = tl.load(segment_offsets_ptr + voxel_idx - 1, mask=(voxel_mask & (voxel_idx > 0)), other=0)
-    segment_end = tl.load(segment_offsets_ptr + voxel_idx, mask=voxel_mask, other=0)
-    num_points_in_voxel = segment_end - segment_start
-    num_points_in_voxel = tl.minimum(num_points_in_voxel, max_num_points_per_voxel)
-    tl.store(capped_num_points_per_voxel_ptr + voxel_idx, num_points_in_voxel, mask=voxel_mask)
-
-    for voxel_point_idx in range(0, max_num_points_per_voxel, 1):
-        # this mask is sufficient since other num_points_in_voxel == 0
-        per_voxel_mask = voxel_point_idx < num_points_in_voxel
-
-        raw_point_idx = tl.load(point_indices_ptr + segment_start + voxel_point_idx, mask=per_voxel_mask)
-        output_idx = voxel_idx * max_num_points_per_voxel + voxel_point_idx
-        for feature_idx in range(0, num_features_per_point, 1):
-            value = tl.load(
-                points_ptr + raw_point_idx * num_features_per_point + feature_idx, mask=per_voxel_mask, other=0
-            )
-            tl.store(point_features_ptr + output_idx * num_features_per_point + feature_idx, value, mask=voxel_mask)