Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cuda_rasterizer/auxiliary.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ __forceinline__ __device__ bool in_frustum(int idx,
if (prefiltered)
{
printf("Point is filtered although prefiltered is set. This shouldn't happen!");
__trap();
__builtin_trap();
}
return false;
}
Expand Down
13 changes: 8 additions & 5 deletions cuda_rasterizer/backward.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
#include "auxiliary.h"
#include "math.h"
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

// Backward pass for conversion of spherical harmonics to RGB for
Expand Down Expand Up @@ -538,10 +537,14 @@ __global__ void preprocessCUDA(
computeCov3D(idx, scales[idx], scale_modifier, rotations[idx], dL_dcov3D, dL_dscale, dL_drot);
}

__device__ void inline reduce_add(float &a, float b) { a += b; }
__device__ void inline reduce_add(float2 &a, float2 b) { a.x += b.x; a.y += b.y; }
__device__ void inline reduce_add(float3 &a, float3 b) { a.x += b.x; a.y += b.y; a.z += b.z; }
__device__ void inline reduce_add(float4 &a, float4 b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; }
template <typename T>
__device__ void inline reduce_helper(int lane, int i, T *data) {
if (lane < i) {
data[lane] += data[lane + i];
reduce_add(data[lane], data[lane + i]);
}
}

Expand Down Expand Up @@ -817,7 +820,7 @@ void BACKWARD::preprocess(
// Somewhat long, thus it is its own kernel rather than being part of
// "preprocess". When done, loss gradient w.r.t. 3D means has been
// modified and gradient w.r.t. 3D covariance matrix has been computed.
computeCov2DCUDA << <(P + 255) / 256, 256 >> > (
computeCov2DCUDA <<<(P + 255) / 256, 256 >>> (
P,
means3D,
radii,
Expand All @@ -835,7 +838,7 @@ void BACKWARD::preprocess(
// Propagate gradients for remaining steps: finish 3D mean gradients,
// propagate color gradients to SH (if desireD), propagate 3D covariance
// matrix gradients to scale and rotation.
preprocessCUDA<NUM_CHANNELS> << < (P + 255) / 256, 256 >> > (
preprocessCUDA<NUM_CHANNELS> <<< (P + 255) / 256, 256 >>> (
P, D, M,
(float3*)means3D,
radii,
Expand Down Expand Up @@ -879,7 +882,7 @@ void BACKWARD::render(
float* dL_dcolors,
float* dL_ddepths)
{
renderCUDA<NUM_CHANNELS> << <grid, block >> >(
renderCUDA<NUM_CHANNELS> <<<grid, block >>>(
ranges,
point_list,
W, H,
Expand Down
1 change: 0 additions & 1 deletion cuda_rasterizer/backward.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define GLM_FORCE_CUDA
#include <glm/glm.hpp>

Expand Down
5 changes: 2 additions & 3 deletions cuda_rasterizer/forward.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
#include "helper_math.h"
#include "math.h"
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

// Forward method for converting the input spherical harmonics
Expand Down Expand Up @@ -408,7 +407,7 @@ void FORWARD::render(
float* out_opacity,
int* n_touched)
{
renderCUDA<NUM_CHANNELS> << <grid, block >> > (
renderCUDA<NUM_CHANNELS> <<<grid, block >>> (
ranges,
point_list,
W, H,
Expand Down Expand Up @@ -451,7 +450,7 @@ void FORWARD::preprocess(int P, int D, int M,
uint32_t* tiles_touched,
bool prefiltered)
{
preprocessCUDA<NUM_CHANNELS> << <(P + 255) / 256, 256 >> > (
preprocessCUDA<NUM_CHANNELS> <<<(P + 255) / 256, 256 >>> (
P, D, M,
means3D,
scales,
Expand Down
1 change: 0 additions & 1 deletion cuda_rasterizer/forward.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define GLM_FORCE_CUDA
#include <glm/glm.hpp>

Expand Down
11 changes: 4 additions & 7 deletions cuda_rasterizer/helper_math.h
Original file line number Diff line number Diff line change
Expand Up @@ -1034,16 +1034,13 @@ inline __device__ __host__ float smoothstep(float a, float b, float x) {
return (y * y * (3.0f - (2.0f * y)));
}
inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) {
float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
return (y * y * (make_float2(3.0f) - (make_float2(2.0f) * y)));
return make_float2(smoothstep(a.x, b.x, x.x), smoothstep(a.y, b.y, x.y));
}
inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) {
float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
return (y * y * (make_float3(3.0f) - (make_float3(2.0f) * y)));
return make_float3(smoothstep(a.x, b.x, x.x), smoothstep(a.y, b.y, x.y), smoothstep(a.z, b.z, x.z));
}
inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) {
float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
return (y * y * (make_float4(3.0f) - (make_float4(2.0f) * y)));
return make_float4(smoothstep(a.x, b.x, x.x), smoothstep(a.y, b.y, x.y), smoothstep(a.z, b.z, x.z), smoothstep(a.w, b.w, x.w));
}

#endif
#endif
8 changes: 3 additions & 5 deletions cuda_rasterizer/rasterizer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,12 @@
#include <numeric>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cub/cub.cuh>
#include <cub/device/device_radix_sort.cuh>
#define GLM_FORCE_CUDA
#include <glm/glm.hpp>

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

#include "auxiliary.h"
Expand Down Expand Up @@ -145,7 +143,7 @@ void CudaRasterizer::Rasterizer::markVisible(
float* projmatrix,
bool* present)
{
checkFrustum << <(P + 255) / 256, 256 >> > (
checkFrustum <<<(P + 255) / 256, 256 >>> (
P,
means3D,
viewmatrix, projmatrix,
Expand Down Expand Up @@ -289,7 +287,7 @@ int CudaRasterizer::Rasterizer::forward(

// For each instance to be rendered, produce adequate [ tile | depth ] key
// and corresponding dublicated Gaussian indices to be sorted
duplicateWithKeys << <(P + 255) / 256, 256 >> > (
duplicateWithKeys <<<(P + 255) / 256, 256 >>> (
P,
geomState.means2D,
geomState.depths,
Expand All @@ -314,7 +312,7 @@ int CudaRasterizer::Rasterizer::forward(

// Identify start and end of per-tile workloads in sorted list
if (num_rendered > 0)
identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > (
identifyTileRanges <<<(num_rendered + 255) / 256, 256 >>> (
num_rendered,
binningState.point_list_keys,
imgState.ranges);
Expand Down
2 changes: 1 addition & 1 deletion cuda_rasterizer/rasterizer_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ namespace CudaRasterizer
template <typename T>
static void obtain(char*& chunk, T*& ptr, std::size_t count, std::size_t alignment)
{
std::size_t offset = (reinterpret_cast<std::uintptr_t>(chunk) + alignment - 1) & ~(alignment - 1);
std::size_t offset = (reinterpret_cast<uintptr_t>(chunk) + alignment - 1) & ~(alignment - 1);
ptr = reinterpret_cast<T*>(offset);
chunk = reinterpret_cast<char*>(ptr + count);
}
Expand Down