99#ifndef OPENVDB_POINTS_RASTERIZE_SDF_IMPL_HAS_BEEN_INCLUDED
1010#define OPENVDB_POINTS_RASTERIZE_SDF_IMPL_HAS_BEEN_INCLUDED
1111
12+ // / @brief Dev option to experiment with batched rasterization, regardless of
13+ // / the ISA in use.
14+ #define OPENVDB_DISABLE_BATCHED_TRANSFERS 0
15+
1216namespace openvdb {
1317OPENVDB_USE_VERSION_NAMESPACE
1418namespace OPENVDB_VERSION_NAME {
@@ -177,6 +181,18 @@ struct SignedDistanceFieldTransfer :
177181 using FilteredTransferT = FilteredTransfer<FilterT>;
178182 using PositionHandleT = AttributeHandle<Vec3f, PositionCodecT>;
179183
184+ template <typename RealT>
185+ static constexpr size_t GetBatchSize ()
186+ {
187+ #if OPENVDB_DISABLE_BATCHED_TRANSFERS
188+ return 1 ;
189+ #else
190+ using namespace openvdb ::util;
191+ using NativeT = typename simd::SimdNativeT<RealT>::Type;
192+ return simd::SimdTraits<NativeT>::size;
193+ #endif
194+ }
195+
180196 // typically the max radius of all points rounded up
181197 inline Vec3i range (const Coord&, size_t ) const { return mMaxKernelWidth ; }
182198
@@ -192,7 +208,7 @@ struct SignedDistanceFieldTransfer :
192208 mPosition = std::make_unique<PositionHandleT>(leaf.constAttributeArray (mPIdx ));
193209 mRadius .reset (leaf);
194210 // if CPG, store leaf id in upper 32 bits of mask
195- if (CPG) mPLeafMask = (Index64 (mIds ->find (&leaf)->second ) << 32 );
211+ if constexpr (CPG) mPLeafMask = (Index64 (mIds ->find (&leaf)->second ) << 32 );
196212 return true ;
197213 }
198214
@@ -305,25 +321,59 @@ struct SphericalTransfer :
305321 const std::unordered_map<const PointDataTree::LeafNodeType*, Index>* ids = nullptr )
306322 : SphericalTransfer(pidx, Vec3i(width), rt, source, filter, interrupt, surface, cpg, ids) {}
307323
324+ static constexpr size_t GetBatchSize ()
325+ {
326+ return BaseT::template GetBatchSize<RealT>();
327+ }
328+
308329 // / @brief For each point, stamp a sphere with a given radius by running
309330 // / over all intersecting voxels and calculating if this point is closer
310331 // / than the currently held distance value. Note that the default value
311332 // / of the surface buffer should be the background value of the surface.
333+ inline void rasterizePoints (const Coord& ijk,
334+ const Index start,
335+ const Index end,
336+ const CoordBBox& bounds)
337+ {
338+ constexpr auto N2 = BaseT::template GetBatchSize<Real>();
339+ if constexpr (N2 == 1 ) {
340+ // Fallback to per point rasterization
341+ for (Index i = start; i < end; ++i) {
342+ this ->rasterizePoint (ijk, i, bounds);
343+ }
344+ }
345+ else {
346+ // Batched/vectorized rasterization. Expect power of two for
347+ // batched size
348+ static_assert ((N2 > 1 ) && !(N2 & (N2 - 1 )));
349+
350+ std::array<int64_t , N2> ids;
351+ Index offset = 0 ;
352+ for (Index i = start; i < end; ++i) {
353+ if (!BaseT::filter (i)) continue ;
354+ ids[offset++] = int64_t (i);
355+ if (offset == N2) {
356+ this ->rasterizeN2 <N2>(ijk, ids, bounds);
357+ offset = 0 ;
358+ }
359+ }
360+
361+ if (offset == 0 ) return ;
362+ else if (offset == 1 ) this ->rasterizePoint (ijk, Index (ids[0 ]), bounds);
363+ else {
364+ for (; offset < N2; ++offset) ids[offset] = int64_t (-1 );
365+ this ->rasterizeN2 <N2>(ijk, ids, bounds);
366+ }
367+ }
368+ }
369+
370+ // / @brief Single point rasterization
312371 inline void rasterizePoint (const Coord& ijk,
313372 const Index id,
314373 const CoordBBox& bounds)
315374 {
316375 if (!BaseT::filter (id)) return ;
317-
318- // @note GCC 6.3 warns here in optimised builds
319- #if defined(__GNUC__) && !defined(__clang__)
320- #pragma GCC diagnostic push
321- #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
322- #endif
323376 Vec3d P = ijk.asVec3d () + Vec3d (this ->mPosition ->get (id));
324- #if defined(__GNUC__) && !defined(__clang__)
325- #pragma GCC diagnostic pop
326- #endif
327377 P = this ->transformSourceToTarget (P);
328378 this ->rasterizePoint (P, id, bounds, this ->mRadius .eval (id));
329379 }
@@ -382,10 +432,128 @@ struct SphericalTransfer :
382432 const RealT max = r.max ();
383433 CoordBBox intersectBox (Coord::round (P - max), Coord::round (P + max));
384434 intersectBox.intersect (bounds);
385- if (intersectBox.empty ()) return ;
435+
436+ this ->stamp <RealT, const Index*>
437+ (P.x (), P.y (), P.z (), r.get (), r.minSq (), r.maxSq (), &id, intersectBox);
438+ }
439+
440+ private:
441+ template <size_t Size>
442+ inline void rasterizeN2 (const Coord& ijk,
443+ const std::array<int64_t , BaseT::template GetBatchSize<Real>()>& points,
444+ const CoordBBox& bounds)
445+ {
446+ using namespace openvdb ::util;
447+ using SimdT = typename simd::SimdT<RealT, Size>::Type;
448+ using SimdIT = typename simd::SimdT<int64_t , Size>::Type;
449+
450+ // simd containers
451+ SimdT px, py, pz, rad, rmax, rmin2, rmax2;
452+ // temporaries - 3 components per point, 4 for varying radii
453+ // x,x,x,x, y.y.y.y, z.z.z.z etc
454+ std::array<RealT, (RadiusType::Fixed?3 :4 )*Size> cache;
455+
456+ const Vec3d ijkd = ijk.asVec3d ();
457+ Vec3d tmp;
458+
459+ const SimdIT ids = simd::load<Size>(points.data ());
460+ int64_t firstInvalidIdx =
461+ simd::horizontal_find_first (simd::eq (ids, SimdIT (-1 )));
462+ if (firstInvalidIdx == -1 ) firstInvalidIdx = Size;
463+ // It's guaranteed that at least two indices are valid in the
464+ // "points" array (if it's one then rasterizePoint is called).
465+ OPENVDB_ASSERT (firstInvalidIdx >= 2 );
466+
467+ // convert AoS to SoA
468+ for (size_t i = 0 ; i < Size; ++i) {
469+ if (int64_t (i) < firstInvalidIdx) {
470+ OPENVDB_ASSERT (points[i] != -1 );
471+ tmp = ijkd + Vec3d (this ->mPosition ->get (Index (points[i])));
472+ tmp = this ->transformSourceToTarget (tmp);
473+ }
474+ cache[i+(Size*0 )] = tmp[0 ];
475+ cache[i+(Size*1 )] = tmp[1 ];
476+ cache[i+(Size*2 )] = tmp[2 ];
477+ }
478+
479+ px = simd::load<Size>(cache.data () + (Size*0 ));
480+ py = simd::load<Size>(cache.data () + (Size*1 ));
481+ pz = simd::load<Size>(cache.data () + (Size*2 ));
482+
483+ if constexpr (RadiusType::Fixed) {
484+ // all points have the same radius, just fill from one
485+ const auto reval = this ->mRadius .eval (Index (points[0 ]));
486+ rad = SimdT (reval.get ());
487+ rmax = SimdT (reval.max ());
488+ rmax2 = SimdT (reval.maxSq ());
489+ rmin2 = SimdT (reval.minSq ());
490+ }
491+ else {
492+ // varying radius, convert AoS to SoA
493+ for (int64_t i = 0 ; i < firstInvalidIdx; ++i) {
494+ const auto reval = this ->mRadius .eval (Index (points[i]));
495+ cache[i+(Size*0 )] = reval.get ();
496+ cache[i+(Size*1 )] = reval.max ();
497+ cache[i+(Size*2 )] = reval.maxSq ();
498+ cache[i+(Size*3 )] = reval.minSq ();
499+ }
500+
501+ // Broadcast first radius into remaining elements
502+ for (int64_t i = firstInvalidIdx; i < int64_t (Size); ++i) {
503+ cache[i+(Size*0 )] = cache[0 +(Size*0 )];
504+ cache[i+(Size*1 )] = cache[0 +(Size*1 )];
505+ cache[i+(Size*2 )] = cache[0 +(Size*2 )];
506+ cache[i+(Size*3 )] = cache[0 +(Size*3 )];
507+ }
508+
509+ rad = simd::load<Size>(cache.data () + (Size*0 ));
510+ rmax = simd::load<Size>(cache.data () + (Size*1 ));
511+ rmax2 = simd::load<Size>(cache.data () + (Size*2 ));
512+ rmin2 = simd::load<Size>(cache.data () + (Size*3 ));
513+ }
514+
515+ // @note Could technically improve this when Size == 4 and only do a
516+ // single -/+ by horizontallying into 2xVCL4 types.
517+ CoordBBox intersectBox (
518+ Coord::round (Vec3d (
519+ simd::horizontal_min (simd::sub (px, rmax)),
520+ simd::horizontal_min (simd::sub (py, rmax)),
521+ simd::horizontal_min (simd::sub (pz, rmax))
522+ )),
523+ Coord::round (Vec3d (
524+ simd::horizontal_max (simd::add (px, rmax)),
525+ simd::horizontal_max (simd::add (py, rmax)),
526+ simd::horizontal_max (simd::add (pz, rmax))
527+ ))
528+ );
529+ intersectBox.intersect (bounds);
530+
531+ this ->stamp <SimdT, SimdIT>
532+ (px, py, pz, rad, rmin2, rmax2, ids, intersectBox);
533+ }
534+
535+ template <typename ScalarT, typename IdT> // / RealT or SimdT
536+ inline void stamp (const ScalarT& Px,
537+ const ScalarT& Py,
538+ const ScalarT& Pz,
539+ const ScalarT& r,
540+ const ScalarT& Rmin2,
541+ const ScalarT& Rmax2,
542+ const IdT& ids,
543+ const CoordBBox& intersection)
544+ {
545+ using namespace openvdb ::util;
546+
547+ OPENVDB_ASSERT (simd::horizontal_and (simd::is_finite (r)));
548+ OPENVDB_ASSERT (simd::horizontal_and (simd::is_finite (Px)));
549+ OPENVDB_ASSERT (simd::horizontal_and (simd::is_finite (Py)));
550+ OPENVDB_ASSERT (simd::horizontal_and (simd::is_finite (Pz)));
551+
552+ if (intersection.empty ()) return ;
386553
387554 auto * const data = this ->template buffer <0 >();
388- auto * const cpg = CPG ? this ->template buffer <CPG ? 1 : 0 >() : nullptr ;
555+ [[maybe_unused]] auto * const cpg = CPG ? this ->template buffer <CPG ? 1 : 0 >() : nullptr ;
556+
389557 auto & mask = *(this ->template mask <0 >());
390558
391559 // If min2 == 0.0, then the index space radius is equal to or less than
@@ -396,46 +564,61 @@ struct SphericalTransfer :
396564 // incorrectly setting these voxels to inactive -background values as
397565 // x2y2z2 will never be < 0.0. We still want the lteq logic in the
398566 // (x2y2z2 <= min2) check as this is valid when min2 > 0.0.
399- const RealT min2 = r.minSq () == 0.0 ? -1.0 : r.minSq ();
400- const RealT max2 = r.maxSq ();
567+ const ScalarT min2 = simd::select (simd::eq (Rmin2, ScalarT (0.0 )), ScalarT (-1.0 ), Rmin2);
568+ const ScalarT max2 = Rmax2;
569+ const ScalarT vdx (this ->mDx );
401570
402- const Coord& a (intersectBox .min ());
403- const Coord& b (intersectBox .max ());
571+ const Coord& a (intersection .min ());
572+ const Coord& b (intersection .max ());
404573 for (Coord c = a; c.x () <= b.x (); ++c.x ()) {
405- const RealT x2 = static_cast <RealT>( math::Pow2 ( c.x () - P[ 0 ] ));
574+ const ScalarT x2 = simd::square ( simd::sub ( ScalarT ( RealT ( c.x ())), Px ));
406575 const Index i = ((c.x () & (DIM-1u )) << 2 *LOG2DIM); // unsigned bit shift mult
407576 for (c.y () = a.y (); c.y () <= b.y (); ++c.y ()) {
408- const RealT x2y2 = static_cast <RealT> (x2 + math::Pow2 ( c.y () - P[ 1 ] ));
577+ const ScalarT x2y2 = simd::add (x2, simd::square ( simd::sub ( ScalarT ( RealT ( c.y ())), Py) ));
409578 const Index ij = i + ((c.y () & (DIM-1u )) << LOG2DIM);
410579 for (c.z () = a.z (); c.z () <= b.z (); ++c.z ()) {
411580 const Index offset = ij + /* k*/ (c.z () & (DIM-1u ));
412581 if (!mask.isOn (offset)) continue ; // inside existing level set or not in range
413582
414- const RealT x2y2z2 = static_cast <RealT>(x2y2 + math::Pow2 (c.z () - P[2 ]));
415- if (x2y2z2 >= max2) continue ; // outside narrow band of particle in positive direction
416- if (x2y2z2 <= min2) { // outside narrow band of the particle in negative direction. can disable this to fill interior
583+ const ScalarT x2y2z2 = simd::add (x2y2, simd::square (simd::sub (ScalarT (RealT (c.z ())), Pz)));
584+ OPENVDB_ASSERT (simd::horizontal_and (simd::is_finite (x2y2z2)));
585+
586+ // If all outside the maximum band, continue
587+ if (simd::horizontal_and (simd::gte (x2y2z2, max2))) continue ;
588+ // If any inside the minimum band, set the maximum negative background
589+ if (simd::horizontal_or (simd::lte (x2y2z2, min2))) {
417590 data[offset] = -(this ->mBackground );
418591 mask.setOff (offset);
419592 continue ;
420593 }
421594
422- const ValueT d = ValueT (this ->mDx * (math::Sqrt (x2y2z2) - r.get ())); // back to world space
595+ // Compute distance to surface (the mul() takes us back to world space)
596+ const ScalarT dist = simd::mul (vdx, (simd::sub (simd::sqrt (x2y2z2), r)));
597+ // keep original precision for horizontal_find_first below
598+ const auto mindist = simd::horizontal_min (dist);
599+ // Convert to surface precision
600+ const ValueT d = ValueT (mindist);
601+
423602 ValueT& v = data[offset];
424603 if (d < v) {
425- v = d;
426- OPENVDB_NO_TYPE_CONVERSION_WARNING_BEGIN
427- if (CPG) cpg[offset] = Int64 (this ->mPLeafMask | Index64 (id));
428- OPENVDB_NO_TYPE_CONVERSION_WARNING_END
429- // transfer attributes - we can't use this here as the exposed
430- // function signatures take vector of attributes (i.e. an unbounded
431- // size). If we instead clamped the attribute transfer to a fixed
432- // amount of attributes we could get rid of the closest point logic
433- // entirely. @todo consider adding another signature for increased
434- // performance
435- // this->foreach([&](auto* buffer, const size_t idx) {
436- // using Type = typename std::remove_pointer<decltype(buffer)>::type;
437- // buffer[offset] = mAttributes.template get<Type>(idx);
438- // })
604+ v = d; // replace surface value
605+ if constexpr (CPG) {
606+ const int id = simd::horizontal_find_first (simd::eq (ScalarT (mindist), dist));
607+ OPENVDB_ASSERT (id != -1 );
608+ // transfer attributes - we can't use this here as the exposed
609+ // function signatures take vector of attributes (i.e. an unbounded
610+ // size). If we instead clamped the attribute transfer to a fixed
611+ // amount of attributes we could get rid of the closest point logic
612+ // entirely. @todo consider adding another signature for increased
613+ // performance
614+ // this->foreach([&](auto* buffer, const size_t idx) {
615+ // using Type = typename std::remove_pointer<decltype(buffer)>::type;
616+ // buffer[offset] = mAttributes.template get<Type>(idx);
617+ // })
618+ OPENVDB_NO_TYPE_CONVERSION_WARNING_BEGIN
619+ cpg[offset] = Int64 (this ->mPLeafMask | Index64 (ids[id]));
620+ OPENVDB_NO_TYPE_CONVERSION_WARNING_END
621+ }
439622 }
440623 }
441624 }
0 commit comments