rmurai0610 · ZJLi2013 · May 28, 2026
diff --git a/cuda_rasterizer/auxiliary.h b/cuda_rasterizer/auxiliary.h
@@ -156,7 +156,7 @@ __forceinline__ __device__ bool in_frustum(int idx,
 		if (prefiltered)
 		{
 			printf("Point is filtered although prefiltered is set. This shouldn't happen!");
-			__trap();
+			__builtin_trap();
 		}
 		return false;
 	}

diff --git a/cuda_rasterizer/backward.cu b/cuda_rasterizer/backward.cu
@@ -13,7 +13,6 @@
 #include "auxiliary.h"
 #include "math.h"
 #include <cooperative_groups.h>
-#include <cooperative_groups/reduce.h>
 namespace cg = cooperative_groups;
 
 // Backward pass for conversion of spherical harmonics to RGB for
@@ -538,10 +537,14 @@ __global__ void preprocessCUDA(
 		computeCov3D(idx, scales[idx], scale_modifier, rotations[idx], dL_dcov3D, dL_dscale, dL_drot);
 }
 
+__device__ void inline reduce_add(float &a, float b) { a += b; }
+__device__ void inline reduce_add(float2 &a, float2 b) { a.x += b.x; a.y += b.y; }
+__device__ void inline reduce_add(float3 &a, float3 b) { a.x += b.x; a.y += b.y; a.z += b.z; }
+__device__ void inline reduce_add(float4 &a, float4 b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; }
 template <typename T>
 __device__ void inline reduce_helper(int lane, int i, T *data) {
   if (lane < i) {
-    data[lane] += data[lane + i];
+    reduce_add(data[lane], data[lane + i]);
   }
 }
 
@@ -817,7 +820,7 @@ void BACKWARD::preprocess(
 	// Somewhat long, thus it is its own kernel rather than being part of 
 	// "preprocess". When done, loss gradient w.r.t. 3D means has been
 	// modified and gradient w.r.t. 3D covariance matrix has been computed.	
-	computeCov2DCUDA << <(P + 255) / 256, 256 >> > (
+	computeCov2DCUDA <<<(P + 255) / 256, 256 >>> (
 		P,
 		means3D,
 		radii,
@@ -835,7 +838,7 @@ void BACKWARD::preprocess(
 	// Propagate gradients for remaining steps: finish 3D mean gradients,
 	// propagate color gradients to SH (if desireD), propagate 3D covariance
 	// matrix gradients to scale and rotation.
-	preprocessCUDA<NUM_CHANNELS> << < (P + 255) / 256, 256 >> > (
+	preprocessCUDA<NUM_CHANNELS> <<< (P + 255) / 256, 256 >>> (
 		P, D, M,
 		(float3*)means3D,
 		radii,
@@ -879,7 +882,7 @@ void BACKWARD::render(
 	float* dL_dcolors,
 	float* dL_ddepths)
 {
-	renderCUDA<NUM_CHANNELS> << <grid, block >> >(
+	renderCUDA<NUM_CHANNELS> <<<grid, block >>>(
 		ranges,
 		point_list,
 		W, H,

diff --git a/cuda_rasterizer/backward.h b/cuda_rasterizer/backward.h
@@ -14,7 +14,6 @@
 
 #include <cuda.h>
 #include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #define GLM_FORCE_CUDA
 #include <glm/glm.hpp>
 

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
@@ -14,7 +14,6 @@
 #include "helper_math.h"
 #include "math.h"
 #include <cooperative_groups.h>
-#include <cooperative_groups/reduce.h>
 namespace cg = cooperative_groups;
 
 // Forward method for converting the input spherical harmonics
@@ -408,7 +407,7 @@ void FORWARD::render(
 	float* out_opacity,
 	int* n_touched)
 {
-	renderCUDA<NUM_CHANNELS> << <grid, block >> > (
+	renderCUDA<NUM_CHANNELS> <<<grid, block >>> (
 		ranges,
 		point_list,
 		W, H,
@@ -451,7 +450,7 @@ void FORWARD::preprocess(int P, int D, int M,
 	uint32_t* tiles_touched,
 	bool prefiltered)
 {
-	preprocessCUDA<NUM_CHANNELS> << <(P + 255) / 256, 256 >> > (
+	preprocessCUDA<NUM_CHANNELS> <<<(P + 255) / 256, 256 >>> (
 		P, D, M,
 		means3D,
 		scales,

diff --git a/cuda_rasterizer/forward.h b/cuda_rasterizer/forward.h
@@ -14,7 +14,6 @@
 
 #include <cuda.h>
 #include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #define GLM_FORCE_CUDA
 #include <glm/glm.hpp>
 

diff --git a/cuda_rasterizer/helper_math.h b/cuda_rasterizer/helper_math.h
@@ -1034,16 +1034,13 @@ inline __device__ __host__ float smoothstep(float a, float b, float x) {
   return (y * y * (3.0f - (2.0f * y)));
 }
 inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) {
-  float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
-  return (y * y * (make_float2(3.0f) - (make_float2(2.0f) * y)));
+    return make_float2(smoothstep(a.x, b.x, x.x), smoothstep(a.y, b.y, x.y));
 }
 inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) {
-  float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
-  return (y * y * (make_float3(3.0f) - (make_float3(2.0f) * y)));
+    return make_float3(smoothstep(a.x, b.x, x.x), smoothstep(a.y, b.y, x.y), smoothstep(a.z, b.z, x.z));
 }
 inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) {
-  float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
-  return (y * y * (make_float4(3.0f) - (make_float4(2.0f) * y)));
+    return make_float4(smoothstep(a.x, b.x, x.x), smoothstep(a.y, b.y, x.y), smoothstep(a.z, b.z, x.z), smoothstep(a.w, b.w, x.w));
 }
 
-#endif
+#endif
diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu
@@ -16,14 +16,12 @@
 #include <numeric>
 #include <cuda.h>
 #include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <cub/cub.cuh>
 #include <cub/device/device_radix_sort.cuh>
 #define GLM_FORCE_CUDA
 #include <glm/glm.hpp>
 
 #include <cooperative_groups.h>
-#include <cooperative_groups/reduce.h>
 namespace cg = cooperative_groups;
 
 #include "auxiliary.h"
@@ -145,7 +143,7 @@ void CudaRasterizer::Rasterizer::markVisible(
 	float* projmatrix,
 	bool* present)
 {
-	checkFrustum << <(P + 255) / 256, 256 >> > (
+	checkFrustum <<<(P + 255) / 256, 256 >>> (
 		P,
 		means3D,
 		viewmatrix, projmatrix,
@@ -289,7 +287,7 @@ int CudaRasterizer::Rasterizer::forward(
 
 	// For each instance to be rendered, produce adequate [ tile | depth ] key 
 	// and corresponding dublicated Gaussian indices to be sorted
-	duplicateWithKeys << <(P + 255) / 256, 256 >> > (
+	duplicateWithKeys <<<(P + 255) / 256, 256 >>> (
 		P,
 		geomState.means2D,
 		geomState.depths,
@@ -314,7 +312,7 @@ int CudaRasterizer::Rasterizer::forward(
 
 	// Identify start and end of per-tile workloads in sorted list
 	if (num_rendered > 0)
-		identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > (
+		identifyTileRanges <<<(num_rendered + 255) / 256, 256 >>> (
 			num_rendered,
 			binningState.point_list_keys,
 			imgState.ranges);

diff --git a/cuda_rasterizer/rasterizer_impl.h b/cuda_rasterizer/rasterizer_impl.h
@@ -21,7 +21,7 @@ namespace CudaRasterizer
 	template <typename T>
 	static void obtain(char*& chunk, T*& ptr, std::size_t count, std::size_t alignment)
 	{
-		std::size_t offset = (reinterpret_cast<std::uintptr_t>(chunk) + alignment - 1) & ~(alignment - 1);
+		std::size_t offset = (reinterpret_cast<uintptr_t>(chunk) + alignment - 1) & ~(alignment - 1);
 		ptr = reinterpret_cast<T*>(offset);
 		chunk = reinterpret_cast<char*>(ptr + count);
 	}