Merge pull request #39 from stackav-oss/feature/jmanning/voxelization-cleanup

jmanning-stackav · web-flow · commit f5142d8a045f · 2025-09-02T17:13:27.000-04:00
Clean up voxelization
diff --git a/benchmarks/voxelization_benchmark.py b/benchmarks/voxelization_benchmark.py
@@ -11,6 +11,7 @@
 from conch.ops.vision.voxelization import VoxelizationParameter, generate_voxels
 from conch.platforms import current_platform
 from conch.reference.vision.voxelization import collect_point_features, voxelization_stable
+from conch.third_party.vllm.utils import seed_everything
 from conch.utils.benchmark import BenchmarkMetadata, benchmark_it
 
 
@@ -116,6 +117,8 @@ def main(
         compile_ref: Flag to torch.compile() the pure torch reference implementation.
         cuda_ref: Flag to enable CUDA reference implementation.
     """
+    seed: Final = 0
+    seed_everything(seed)
 
     device: Final = torch.device(gpu)
     torch.set_default_device(device)
@@ -139,7 +142,7 @@ def main(
 
     def generate_voxels_torch(
         points: torch.Tensor, param: VoxelizationParameter
-    ) -> tuple[torch.tensor, torch.tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """reference triton/torch hybrid, 2-step, stable voxelization first then generate a feature tensor."""
         use_triton = not torch_ref
         actual_num_points_per_voxel, point_raw_indices, flat_voxel_indices = voxelization_stable(
@@ -163,7 +166,7 @@ def generate_voxels_torch(
     print(f"number of filled voxels: {actual_num_points_per_voxel.shape[0]}")
     print(f"Avg number of points per voxel: {torch.mean(actual_num_points_per_voxel, dtype=torch.float32)}")
     print(f"Max number of points per voxel: {torch.max(actual_num_points_per_voxel)}")
-    overflow_count = (actual_num_points_per_voxel > param.max_num_points_per_voxel).int().sum()
+    overflow_count = (actual_num_points_per_voxel > param.max_num_points_per_voxel).to(torch.int32).sum()
     print(f"Number of voxels with overflowing points: {overflow_count}")
 
     # Benchmark implementations
diff --git a/conch/kernels/vision/voxelization.py b/conch/kernels/vision/voxelization.py
@@ -8,8 +8,8 @@
 import triton.language as tl
 
 
-@triton.jit
-def generate_dense_voxels_triton_kernel(  # noqa: PLR0913, D417
+@triton.jit  # type: ignore[misc]
+def generate_dense_voxels_triton_kernel(
     # input
     points_ptr: torch.Tensor,
     num_points: int,
@@ -73,8 +73,8 @@ def generate_dense_voxels_triton_kernel(  # noqa: PLR0913, D417
     tl.store(dense_point_features_ptr + output_idx * 4 + 3, point_w, mask=output_mask)
 
 
-@triton.jit
-def generate_voxels_triton_kernel(  # noqa: PLR0913, D417
+@triton.jit  # type: ignore[misc]
+def generate_voxels_triton_kernel(
     # input
     dense_point_features_ptr: torch.Tensor,
     dense_num_points_per_voxel_ptr: torch.Tensor,
diff --git a/conch/ops/vision/voxelization.py b/conch/ops/vision/voxelization.py
@@ -61,10 +61,9 @@ def generate_voxels(
             empty points are filled with 0.
             voxel_indices, shape [num_filled_voxels, 4], only first 3 fields are used for x,y,z indices.
     """
-    assert points.is_cuda
     device = points.device
     num_points, num_features_per_point = points.shape
-    assert num_features_per_point == 4  # noqa: PLR2004
+    assert num_features_per_point == 4
     # same as original nvidia cuda impl
     num_elements_per_voxel_index = 4
 
@@ -105,7 +104,7 @@ def generate_voxels(
         dense_num_points_per_voxel,
         dense_point_features,
         cxpr_block_size=block_size,
-        num_warps=block_size // num_threads_per_warp,  # pyright: ignore[reportCallIssue]
+        num_warps=block_size // num_threads_per_warp,
     )
 
     # compress into contiguous/sparse filled voxels
@@ -122,7 +121,7 @@ def generate_voxels(
         point_features,
         voxel_indices,
         cxpr_block_size=block_size,
-        num_warps=block_size // num_threads_per_warp,  # pyright: ignore[reportCallIssue]
+        num_warps=block_size // num_threads_per_warp,
     )
 
     total_filled_voxels = num_filled_voxels.cpu()[0]
diff --git a/conch/reference/vision/voxelization.py b/conch/reference/vision/voxelization.py
@@ -9,8 +9,9 @@
 
 from conch.ops.vision.voxelization import VoxelizationParameter
 
-@triton.jit
-def filter_and_label_points_triton_kernel(  # noqa: PLR0913, D417
+
+@triton.jit  # type: ignore[misc]
+def filter_and_label_points_triton_kernel(
     # input
     points_ptr: torch.Tensor,
     num_points: int,
@@ -62,7 +63,7 @@ def filter_and_label_points_triton_kernel(  # noqa: PLR0913, D417
     tl.store(point_voxel_indices_ptr + point_idx, flat_voxel_idx, mask=point_mask)
 
 
-def filter_and_label_points_torch(  # noqa: PLR0913, D417
+def filter_and_label_points_torch(
     points: torch.Tensor,
     min_range: tuple[float, float, float],
     voxel_dim: tuple[float, float, float],
@@ -122,7 +123,6 @@ def voxelization_stable(
             contiguous with segment size specified in num_points_per_voxel.
             flat_voxel_indices, shape [num_filled_voxels].
     """
-    assert points.is_cuda
     device = points.device
     num_points, num_features_per_point = points.shape
 
@@ -154,7 +154,7 @@ def voxelization_stable(
             param.max_num_voxels,
             point_voxel_indices,
             cxpr_block_size=block_size,
-            num_warps=block_size // num_threads_per_warp,  # pyright: ignore[reportCallIssue]
+            num_warps=block_size // num_threads_per_warp,
         )
     else:
         filter_and_label_points_torch(
@@ -177,8 +177,8 @@ def voxelization_stable(
     return num_points_per_voxel.to(torch.int32), sorted_raw_indices, voxel_indices
 
 
-@triton.jit
-def collect_point_features_triton_kernel(  # noqa: PLR0913, D417
+@triton.jit  # type: ignore[misc]
+def collect_point_features_triton_kernel(
     # input
     points_ptr: torch.Tensor,
     num_features_per_point: int,
@@ -227,7 +227,7 @@ def collect_point_features_triton_kernel(  # noqa: PLR0913, D417
             tl.store(point_features_ptr + output_idx * num_features_per_point + feature_idx, value, mask=voxel_mask)
 
 
-def collect_point_features_torch(  # noqa: PLR0913, D417
+def collect_point_features_torch(
     points: torch.Tensor,
     num_points_per_voxel: torch.Tensor,
     segment_offsets: torch.Tensor,
@@ -287,7 +287,6 @@ def collect_point_features(
         filled with 0.
         capped_num_points_per_voxel: shape [num_filled_voxels], number of points in each voxel after max capping.
     """
-    assert points.is_cuda
     device = points.device
     num_points, num_features_per_point = points.shape
 
@@ -319,7 +318,7 @@ def collect_point_features(
             point_features,
             capped_num_points_per_voxel,
             cxpr_block_size=block_size,
-            num_warps=block_size // num_threads_per_warp,  # pyright: ignore[reportCallIssue]
+            num_warps=block_size // num_threads_per_warp,
         )
     else:
         collect_point_features_torch(
diff --git a/tests/voxelization_test.py b/tests/voxelization_test.py
@@ -1,14 +1,17 @@
 # Copyright 2025 Stack AV Co.
 # SPDX-License-Identifier: Apache-2.0
 
-# pyright: reportPrivateUsage=false
 """Test voxelization."""
 
+from typing import Final
+
 import pytest
 import torch
 
 from conch.ops.vision.voxelization import VoxelizationParameter, generate_voxels
+from conch.platforms import current_platform
 from conch.reference.vision.voxelization import collect_point_features, voxelization_stable
+from conch.third_party.vllm.utils import seed_everything
 
 
 def voxel_coords_to_flat_indices(coords: torch.Tensor, grid_dim: tuple[int, int, int]) -> torch.Tensor:
@@ -23,18 +26,28 @@ def voxel_coords_to_flat_indices(coords: torch.Tensor, grid_dim: tuple[int, int,
 
 # whether or not use Triton for the reference Torch implementation
 @pytest.mark.parametrize("use_triton", [True, False])
-def test_voxelization(use_triton: bool) -> None:
+@pytest.mark.parametrize("num_points", [1000, 500000])
+@pytest.mark.parametrize("range_xyz", [50.0, 80.0])
+@pytest.mark.parametrize("voxel_dim", [2.5, 5.0])
+@pytest.mark.parametrize("max_num_points_per_voxel", [4, 10])
+def test_voxelization(
+    use_triton: bool, num_points: int, range_xyz: float, voxel_dim: float, max_num_points_per_voxel: int
+) -> None:
     """Test triton/pytorch voxelization."""
-    num_points = 500000
+    seed: Final = 0
+    seed_everything(seed)
+
+    device: Final = torch.device(current_platform.device)
+    torch.set_default_device(device)
+
     num_features_per_point = 4
-    range_xyz = 50.0
-    points = torch.randn((num_points, num_features_per_point), device="cuda") * range_xyz
+    points = torch.randn((num_points, num_features_per_point)) * range_xyz
 
     param = VoxelizationParameter(
         min_range=(-range_xyz, -range_xyz, -range_xyz),
         max_range=(range_xyz, range_xyz, range_xyz),
-        voxel_dim=(2.5, 2.5, 2.5),
-        max_num_points_per_voxel=4,
+        voxel_dim=(voxel_dim, voxel_dim, voxel_dim),
+        max_num_points_per_voxel=max_num_points_per_voxel,
     )
 
     print(f"Grid dimensions: {param.grid_dim}")