@@ -44,17 +44,16 @@ struct RasterizeInternalNodesFunctor
4444
4545 const PairT *dPairs;
4646 const RootT *dRoot;
47- Mask<5 > *dUpperMasks;
48- Mask<4 > (*dLowerMasks)[Mask<5 >::SIZE];
47+ Mask<5 > *dUpperMasks;
48+ Mask<4 > (*dLowerMasks)[Mask<5 >::SIZE];
4949
5050 __device__ void operator ()(size_t pairID) const
5151 {
5252 const auto &pair = dPairs[pairID];
5353
5454 // Locate the root tile containing this leaf origin
5555 const auto *tile = dRoot->probeTile (pair.origin );
56- uint64_t tileIdx = util::PtrDiff (tile, dRoot->tile (0 ))
57- / sizeof (typename RootT::Tile);
56+ uint64_t tileIdx = util::PtrDiff (tile, dRoot->tile (0 )) / sizeof (typename RootT::Tile);
5857
5958 // Offsets of the enclosing upper and lower nodes
6059 const uint32_t upperBit = UpperT::CoordToOffset (pair.origin );
@@ -65,20 +64,18 @@ struct RasterizeInternalNodesFunctor
6564 }
6665};
6766
68- // / @brief Fills leaf voxel value masks via exact point-to-triangle UDF .
67+ // / @brief Fills leaf voxel value masks via exact point-to-triangle distance calculation .
6968// /
7069// / Intended to be called via nanovdb::util::cuda::operatorKernelInstance.
7170// / Launched as <<<pairCount, MaxThreadsPerBlock>>> - 1 CTA per leaf/triangle
72- // / pair, 512 threads (one per voxel in the 8^3 leaf). Each CTA:
73- // / 1. Each thread decodes its voxel local coords (lx, ly, lz) and
74- // / computes closestPointOnTriangleToPoint from the voxel center
75- // / to the pair's triangle.
76- // / 2. Warp ballot builds a local 16-word mask without atomics (same
77- // / pattern as evaluateAndCountSubBoxesKernel).
71+ // / pair, 512 threads (one per voxel in the 8^3 leaf). For each CTA:
72+ // / 1. Each thread computes closestPointOnTriangleToPoint from the voxel
73+ // / center (lx,ly,lz) to the triangle.
74+ // / 2. Warp ballot builds a local 16-word mask without atomics
7875// / 3. Thread 0 locates the destination leaf via probeLeaf().
7976// / 4. Threads 0..7 each pack two 32-bit ballots into one uint64_t and
8077// / atomicOr into the corresponding mask word, allowing multiple CTAs
81- // / writing to the same leaf to coexist .
78+ // / to work concurrently on the same leaf.
8279// /
8380// / @note Degenerate triangles (zero area) are handled implicitly: the face
8481// / interior test naturally fails and the code falls through to the
@@ -91,32 +88,27 @@ struct RasterizeLeafNodesFunctor
9188 static constexpr int MaxThreadsPerBlock = 512 ;
9289 static constexpr int MinBlocksPerMultiprocessor = 1 ;
9390
94- const PairT *dPairs;
95- const TriangleT *dTriangles;
91+ const PairT *dPairs;
92+ const TriangleT *dTriangles;
9693 NanoGrid<BuildT> *dGrid;
97- float bandWidthSqr;
94+ float bandWidthSqr;
9895
9996 __device__ void operator ()() const
10097 {
10198 const uint64_t pairID = blockIdx .x ;
102- const int threadID = threadIdx .x ; // 0..511 = voxel index within leaf
99+ const int threadID = threadIdx .x ; // 0..511 = voxel index within leaf
103100
104101 const auto &pair = dPairs[pairID];
105- const auto &tri = dTriangles[pair.triangleID ];
102+ const auto &tri = dTriangles[pair.triangleID ];
106103
107104 // Decode voxel local coords within the 8^3 leaf
108105 // Bit ordering: threadID = lx + ly*8 + lz*64 matches NanoVDB Mask<3> layout
109106 const int lx = threadID & 0x7 ;
110107 const int ly = (threadID >> 3 ) & 0x7 ;
111108 const int lz = (threadID >> 6 ) & 0x7 ;
112109
113- const nanovdb::Vec3f voxelCenter (
114- float (pair.origin [0 ] + lx),
115- float (pair.origin [1 ] + ly),
116- float (pair.origin [2 ] + lz));
117-
118- const bool hit = nanovdb::math::pointToTriangleDistSqr (
119- tri[0 ], tri[1 ], tri[2 ], voxelCenter) <= bandWidthSqr;
110+ const nanovdb::Vec3f voxelCenter (float (pair.origin [0 ] + lx), float (pair.origin [1 ] + ly), float (pair.origin [2 ] + lz));
111+ const bool hit = nanovdb::math::pointToTriangleDistSqr (tri[0 ], tri[1 ], tri[2 ], voxelCenter) <= bandWidthSqr;
120112
121113 // Build a per-block local mask via warp ballot (avoids per-voxel atomics).
122114 // 512 threads -> 16 warps -> 16 x 32-bit ballot words.
@@ -128,8 +120,7 @@ struct RasterizeLeafNodesFunctor
128120
129121 // Threads 0..7 each pack two ballots into one uint64_t and atomicOr into the
130122 // corresponding mask word.
131- auto *leaf = const_cast <nanovdb::NanoLeaf<BuildT>*>(
132- dGrid->tree ().root ().probeLeaf (pair.origin ));
123+ auto *leaf = const_cast <nanovdb::NanoLeaf<BuildT>*>(dGrid->tree ().root ().probeLeaf (pair.origin ));
133124
134125 if (threadID < int (nanovdb::Mask<3 >::WORD_COUNT)) {
135126 const uint64_t word = uint64_t (s_ballots[2 *threadID])
@@ -147,7 +138,7 @@ struct RasterizeLeafNodesFunctor
147138// /
148139// / Intended to be called via nanovdb::util::cuda::operatorKernelInstance.
149140// / Launched as <<<pairCount, MaxThreadsPerBlock>>> - 1 CTA per leaf/triangle
150- // / pair, 512 threads (one per voxel in the 8^3 leaf). Each CTA:
141+ // / pair, 512 threads (one per voxel in the 8^3 leaf). For each CTA:
151142// / 1. Probes the leaf pointer from the grid using the pair's origin.
152143// / 2. Each thread skips its voxel if inactive in the leaf mask.
153144// / 3. Active threads compute pointToTriangleDistSqr from the voxel center
@@ -163,19 +154,19 @@ struct RasterizeLeafNodesFunctor
163154template <typename BuildT, typename PairT, typename TriangleT>
164155struct ComputeUDFFunctor
165156{
166- static constexpr int MaxThreadsPerBlock = 512 ;
157+ static constexpr int MaxThreadsPerBlock = 512 ;
167158 static constexpr int MinBlocksPerMultiprocessor = 1 ;
168159
169- const PairT *dPairs;
170- const TriangleT *dTriangles;
160+ const PairT *dPairs;
161+ const TriangleT *dTriangles;
171162 const NanoGrid<BuildT> *dGrid;
172- float *dSidecar;
173- float bandWidthSqr;
163+ float *dSidecar;
164+ float bandWidthSqr;
174165
175166 __device__ void operator ()() const
176167 {
177168 const uint64_t pairID = blockIdx .x ;
178- const int threadID = threadIdx .x ;
169+ const int threadID = threadIdx .x ;
179170
180171 const auto &pair = dPairs[pairID];
181172 const auto *leaf = dGrid->tree ().root ().probeLeaf (pair.origin );
@@ -186,14 +177,10 @@ struct ComputeUDFFunctor
186177 const int ly = (threadID >> 3 ) & 0x7 ;
187178 const int lz = (threadID >> 6 ) & 0x7 ;
188179
189- const nanovdb::Vec3f voxelCenter (
190- float (pair.origin [0 ] + lx),
191- float (pair.origin [1 ] + ly),
192- float (pair.origin [2 ] + lz));
180+ const nanovdb::Vec3f voxelCenter (float (pair.origin [0 ] + lx), float (pair.origin [1 ] + ly), float (pair.origin [2 ] + lz));
193181
194182 const auto &tri = dTriangles[pair.triangleID ];
195- const float distSqr = nanovdb::math::pointToTriangleDistSqr (
196- tri[0 ], tri[1 ], tri[2 ], voxelCenter);
183+ const float distSqr = nanovdb::math::pointToTriangleDistSqr (tri[0 ], tri[1 ], tri[2 ], voxelCenter);
197184
198185 if (distSqr >= bandWidthSqr) return ;
199186
0 commit comments