Skip to content

Commit d928399

Browse files
committed
Initial manual vectorization/matching of SphericalTransfer scheme i.e. RasterizeSDF with spheres
Signed-off-by: Nick Avramoussis <4256455+Idclip@users.noreply.github.com>
1 parent 251f075 commit d928399

3 files changed

Lines changed: 233 additions & 43 deletions

File tree

openvdb/openvdb/points/PointRasterizeSDF.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
#include <openvdb/thread/Threading.h>
8080
#include <openvdb/util/NullInterrupter.h>
8181
#include <openvdb/util/Assert.h>
82+
#include <openvdb/util/Simd.h>
8283

8384
#include <unordered_map>
8485

openvdb/openvdb/points/impl/PointRasterizeEllipsoidsSDFImpl.h

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -390,13 +390,16 @@ struct EllipsoidTransferQuat final :
390390
return ret;
391391
}
392392

393-
inline void rasterizePoint(const Coord& ijk,
394-
const Index id,
393+
inline void rasterizePoints(const Coord& ijk,
394+
const Index start,
395+
const Index end,
395396
const CoordBBox& bounds)
396397
{
397398
Mat3s R;
398-
R.setToRotation(mRotationHandle->get(id));
399-
this->BaseT::rasterizePoint(ijk, id, bounds, R);
399+
for (Index i = start; i < end; ++i) {
400+
R.setToRotation(mRotationHandle->get(i));
401+
this->BaseT::rasterizePoint(ijk, i, bounds, R);
402+
}
400403
}
401404

402405
private:
@@ -439,11 +442,14 @@ struct EllipsoidTransferMat3 final :
439442
return ret;
440443
}
441444

442-
inline void rasterizePoint(const Coord& ijk,
443-
const Index id,
445+
inline void rasterizePoints(const Coord& ijk,
446+
const Index start,
447+
const Index end,
444448
const CoordBBox& bounds)
445449
{
446-
this->BaseT::rasterizePoint(ijk, id, bounds, mXformHandle->get(id));
450+
for (Index i = start; i < end; ++i) {
451+
this->BaseT::rasterizePoint(ijk, i, bounds, mXformHandle->get(i));
452+
}
447453
}
448454

449455
private:

openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h

Lines changed: 219 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
#ifndef OPENVDB_POINTS_RASTERIZE_SDF_IMPL_HAS_BEEN_INCLUDED
1010
#define OPENVDB_POINTS_RASTERIZE_SDF_IMPL_HAS_BEEN_INCLUDED
1111

12+
/// @brief Dev option to experiment with batched rasterization, regardless of
13+
/// the ISA in use.
14+
#define OPENVDB_DISABLE_BATCHED_TRANSFERS 0
15+
1216
namespace openvdb {
1317
OPENVDB_USE_VERSION_NAMESPACE
1418
namespace OPENVDB_VERSION_NAME {
@@ -177,6 +181,18 @@ struct SignedDistanceFieldTransfer :
177181
using FilteredTransferT = FilteredTransfer<FilterT>;
178182
using PositionHandleT = AttributeHandle<Vec3f, PositionCodecT>;
179183

184+
template <typename RealT>
185+
static constexpr size_t GetBatchSize()
186+
{
187+
#if OPENVDB_DISABLE_BATCHED_TRANSFERS
188+
return 1;
189+
#else
190+
using namespace openvdb::util;
191+
using NativeT = typename simd::SimdNativeT<RealT>::Type;
192+
return simd::SimdTraits<NativeT>::size;
193+
#endif
194+
}
195+
180196
// typically the max radius of all points rounded up
181197
inline Vec3i range(const Coord&, size_t) const { return mMaxKernelWidth; }
182198

@@ -192,7 +208,7 @@ struct SignedDistanceFieldTransfer :
192208
mPosition = std::make_unique<PositionHandleT>(leaf.constAttributeArray(mPIdx));
193209
mRadius.reset(leaf);
194210
// if CPG, store leaf id in upper 32 bits of mask
195-
if (CPG) mPLeafMask = (Index64(mIds->find(&leaf)->second) << 32);
211+
if constexpr(CPG) mPLeafMask = (Index64(mIds->find(&leaf)->second) << 32);
196212
return true;
197213
}
198214

@@ -305,25 +321,59 @@ struct SphericalTransfer :
305321
const std::unordered_map<const PointDataTree::LeafNodeType*, Index>* ids = nullptr)
306322
: SphericalTransfer(pidx, Vec3i(width), rt, source, filter, interrupt, surface, cpg, ids) {}
307323

324+
static constexpr size_t GetBatchSize()
325+
{
326+
return BaseT::template GetBatchSize<RealT>();
327+
}
328+
308329
/// @brief For each point, stamp a sphere with a given radius by running
309330
/// over all intersecting voxels and calculating if this point is closer
310331
/// than the currently held distance value. Note that the default value
311332
/// of the surface buffer should be the background value of the surface.
333+
inline void rasterizePoints(const Coord& ijk,
334+
const Index start,
335+
const Index end,
336+
const CoordBBox& bounds)
337+
{
338+
constexpr auto N2 = BaseT::template GetBatchSize<Real>();
339+
if constexpr(N2 == 1) {
340+
// Fallback to per point rasterization
341+
for (Index i = start; i < end; ++i) {
342+
this->rasterizePoint(ijk, i, bounds);
343+
}
344+
}
345+
else {
346+
// Batched/vectorized rasterization. Expect power of two for
347+
// batched size
348+
static_assert((N2 > 1) && !(N2 & (N2 - 1)));
349+
350+
std::array<int64_t, N2> ids;
351+
Index offset = 0;
352+
for (Index i = start; i < end; ++i) {
353+
if (!BaseT::filter(i)) continue;
354+
ids[offset++] = int64_t(i);
355+
if (offset == N2) {
356+
this->rasterizeN2<N2>(ijk, ids, bounds);
357+
offset = 0;
358+
}
359+
}
360+
361+
if (offset == 0) return;
362+
else if (offset == 1) this->rasterizePoint(ijk, Index(ids[0]), bounds);
363+
else {
364+
for (; offset < N2; ++offset) ids[offset] = int64_t(-1);
365+
this->rasterizeN2<N2>(ijk, ids, bounds);
366+
}
367+
}
368+
}
369+
370+
/// @brief Single point rasterization
312371
inline void rasterizePoint(const Coord& ijk,
313372
const Index id,
314373
const CoordBBox& bounds)
315374
{
316375
if (!BaseT::filter(id)) return;
317-
318-
// @note GCC 6.3 warns here in optimised builds
319-
#if defined(__GNUC__) && !defined(__clang__)
320-
#pragma GCC diagnostic push
321-
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
322-
#endif
323376
Vec3d P = ijk.asVec3d() + Vec3d(this->mPosition->get(id));
324-
#if defined(__GNUC__) && !defined(__clang__)
325-
#pragma GCC diagnostic pop
326-
#endif
327377
P = this->transformSourceToTarget(P);
328378
this->rasterizePoint(P, id, bounds, this->mRadius.eval(id));
329379
}
@@ -382,10 +432,128 @@ struct SphericalTransfer :
382432
const RealT max = r.max();
383433
CoordBBox intersectBox(Coord::round(P - max), Coord::round(P + max));
384434
intersectBox.intersect(bounds);
385-
if (intersectBox.empty()) return;
435+
436+
this->stamp<RealT, const Index*>
437+
(P.x(), P.y(), P.z(), r.get(), r.minSq(), r.maxSq(), &id, intersectBox);
438+
}
439+
440+
private:
441+
template <size_t Size>
442+
inline void rasterizeN2(const Coord& ijk,
443+
const std::array<int64_t, BaseT::template GetBatchSize<Real>()>& points,
444+
const CoordBBox& bounds)
445+
{
446+
using namespace openvdb::util;
447+
using SimdT = typename simd::SimdT<RealT, Size>::Type;
448+
using SimdIT = typename simd::SimdT<int64_t, Size>::Type;
449+
450+
// simd containers
451+
SimdT px, py, pz, rad, rmax, rmin2, rmax2;
452+
// temporaries - 3 components per point, 4 for varying radii
453+
// x,x,x,x, y.y.y.y, z.z.z.z etc
454+
std::array<RealT, (RadiusType::Fixed?3:4)*Size> cache;
455+
456+
const Vec3d ijkd = ijk.asVec3d();
457+
Vec3d tmp;
458+
459+
const SimdIT ids = simd::load<Size>(points.data());
460+
int64_t firstInvalidIdx =
461+
simd::horizontal_find_first(simd::eq(ids, SimdIT(-1)));
462+
if (firstInvalidIdx == -1) firstInvalidIdx = Size;
463+
// It's guaranteed that at least two indices are valid in the
464+
// "points" array (if it's one then rasterizePoint is called).
465+
OPENVDB_ASSERT(firstInvalidIdx >= 2);
466+
467+
// convert AoS to SoA
468+
for (size_t i = 0; i < Size; ++i) {
469+
if (int64_t(i) < firstInvalidIdx) {
470+
OPENVDB_ASSERT(points[i] != -1);
471+
tmp = ijkd + Vec3d(this->mPosition->get(Index(points[i])));
472+
tmp = this->transformSourceToTarget(tmp);
473+
}
474+
cache[i+(Size*0)] = tmp[0];
475+
cache[i+(Size*1)] = tmp[1];
476+
cache[i+(Size*2)] = tmp[2];
477+
}
478+
479+
px = simd::load<Size>(cache.data() + (Size*0));
480+
py = simd::load<Size>(cache.data() + (Size*1));
481+
pz = simd::load<Size>(cache.data() + (Size*2));
482+
483+
if constexpr(RadiusType::Fixed) {
484+
// all points have the same radius, just fill from one
485+
const auto reval = this->mRadius.eval(Index(points[0]));
486+
rad = SimdT(reval.get());
487+
rmax = SimdT(reval.max());
488+
rmax2 = SimdT(reval.maxSq());
489+
rmin2 = SimdT(reval.minSq());
490+
}
491+
else {
492+
// varying radius, convert AoS to SoA
493+
for (int64_t i = 0; i < firstInvalidIdx; ++i) {
494+
const auto reval = this->mRadius.eval(Index(points[i]));
495+
cache[i+(Size*0)] = reval.get();
496+
cache[i+(Size*1)] = reval.max();
497+
cache[i+(Size*2)] = reval.maxSq();
498+
cache[i+(Size*3)] = reval.minSq();
499+
}
500+
501+
// Broadcast first radius into remaining elements
502+
for (int64_t i = firstInvalidIdx; i < int64_t(Size); ++i) {
503+
cache[i+(Size*0)] = cache[0+(Size*0)];
504+
cache[i+(Size*1)] = cache[0+(Size*1)];
505+
cache[i+(Size*2)] = cache[0+(Size*2)];
506+
cache[i+(Size*3)] = cache[0+(Size*3)];
507+
}
508+
509+
rad = simd::load<Size>(cache.data() + (Size*0));
510+
rmax = simd::load<Size>(cache.data() + (Size*1));
511+
rmax2 = simd::load<Size>(cache.data() + (Size*2));
512+
rmin2 = simd::load<Size>(cache.data() + (Size*3));
513+
}
514+
515+
// @note Could technically improve this when Size == 4 and only do a
516+
// single -/+ by horizontallying into 2xVCL4 types.
517+
CoordBBox intersectBox(
518+
Coord::round(Vec3d(
519+
simd::horizontal_min(simd::sub(px, rmax)),
520+
simd::horizontal_min(simd::sub(py, rmax)),
521+
simd::horizontal_min(simd::sub(pz, rmax))
522+
)),
523+
Coord::round(Vec3d(
524+
simd::horizontal_max(simd::add(px, rmax)),
525+
simd::horizontal_max(simd::add(py, rmax)),
526+
simd::horizontal_max(simd::add(pz, rmax))
527+
))
528+
);
529+
intersectBox.intersect(bounds);
530+
531+
this->stamp<SimdT, SimdIT>
532+
(px, py, pz, rad, rmin2, rmax2, ids, intersectBox);
533+
}
534+
535+
template<typename ScalarT, typename IdT> /// RealT or SimdT
536+
inline void stamp(const ScalarT& Px,
537+
const ScalarT& Py,
538+
const ScalarT& Pz,
539+
const ScalarT& r,
540+
const ScalarT& Rmin2,
541+
const ScalarT& Rmax2,
542+
const IdT& ids,
543+
const CoordBBox& intersection)
544+
{
545+
using namespace openvdb::util;
546+
547+
OPENVDB_ASSERT(simd::horizontal_and(simd::is_finite(r)));
548+
OPENVDB_ASSERT(simd::horizontal_and(simd::is_finite(Px)));
549+
OPENVDB_ASSERT(simd::horizontal_and(simd::is_finite(Py)));
550+
OPENVDB_ASSERT(simd::horizontal_and(simd::is_finite(Pz)));
551+
552+
if (intersection.empty()) return;
386553

387554
auto* const data = this->template buffer<0>();
388-
auto* const cpg = CPG ? this->template buffer<CPG ? 1 : 0>() : nullptr;
555+
[[maybe_unused]] auto* const cpg = CPG ? this->template buffer<CPG ? 1 : 0>() : nullptr;
556+
389557
auto& mask = *(this->template mask<0>());
390558

391559
// If min2 == 0.0, then the index space radius is equal to or less than
@@ -396,46 +564,61 @@ struct SphericalTransfer :
396564
// incorrectly setting these voxels to inactive -background values as
397565
// x2y2z2 will never be < 0.0. We still want the lteq logic in the
398566
// (x2y2z2 <= min2) check as this is valid when min2 > 0.0.
399-
const RealT min2 = r.minSq() == 0.0 ? -1.0 : r.minSq();
400-
const RealT max2 = r.maxSq();
567+
const ScalarT min2 = simd::select(simd::eq(Rmin2, ScalarT(0.0)), ScalarT(-1.0), Rmin2);
568+
const ScalarT max2 = Rmax2;
569+
const ScalarT vdx(this->mDx);
401570

402-
const Coord& a(intersectBox.min());
403-
const Coord& b(intersectBox.max());
571+
const Coord& a(intersection.min());
572+
const Coord& b(intersection.max());
404573
for (Coord c = a; c.x() <= b.x(); ++c.x()) {
405-
const RealT x2 = static_cast<RealT>(math::Pow2(c.x() - P[0]));
574+
const ScalarT x2 = simd::square(simd::sub(ScalarT(RealT(c.x())), Px));
406575
const Index i = ((c.x() & (DIM-1u)) << 2*LOG2DIM); // unsigned bit shift mult
407576
for (c.y() = a.y(); c.y() <= b.y(); ++c.y()) {
408-
const RealT x2y2 = static_cast<RealT>(x2 + math::Pow2(c.y() - P[1]));
577+
const ScalarT x2y2 = simd::add(x2, simd::square(simd::sub(ScalarT(RealT(c.y())), Py)));
409578
const Index ij = i + ((c.y() & (DIM-1u)) << LOG2DIM);
410579
for (c.z() = a.z(); c.z() <= b.z(); ++c.z()) {
411580
const Index offset = ij + /*k*/(c.z() & (DIM-1u));
412581
if (!mask.isOn(offset)) continue; // inside existing level set or not in range
413582

414-
const RealT x2y2z2 = static_cast<RealT>(x2y2 + math::Pow2(c.z() - P[2]));
415-
if (x2y2z2 >= max2) continue; //outside narrow band of particle in positive direction
416-
if (x2y2z2 <= min2) { //outside narrow band of the particle in negative direction. can disable this to fill interior
583+
const ScalarT x2y2z2 = simd::add(x2y2, simd::square(simd::sub(ScalarT(RealT(c.z())), Pz)));
584+
OPENVDB_ASSERT(simd::horizontal_and(simd::is_finite(x2y2z2)));
585+
586+
// If all outside the maximum band, continue
587+
if (simd::horizontal_and(simd::gte(x2y2z2, max2))) continue;
588+
// If any inside the minimum band, set the maximum negative background
589+
if (simd::horizontal_or(simd::lte(x2y2z2, min2))) {
417590
data[offset] = -(this->mBackground);
418591
mask.setOff(offset);
419592
continue;
420593
}
421594

422-
const ValueT d = ValueT(this->mDx * (math::Sqrt(x2y2z2) - r.get())); // back to world space
595+
// Compute distance to surface (the mul() takes us back to world space)
596+
const ScalarT dist = simd::mul(vdx, (simd::sub(simd::sqrt(x2y2z2), r)));
597+
// keep original precision for horizontal_find_first below
598+
const auto mindist = simd::horizontal_min(dist);
599+
// Convert to surface precision
600+
const ValueT d = ValueT(mindist);
601+
423602
ValueT& v = data[offset];
424603
if (d < v) {
425-
v = d;
426-
OPENVDB_NO_TYPE_CONVERSION_WARNING_BEGIN
427-
if (CPG) cpg[offset] = Int64(this->mPLeafMask | Index64(id));
428-
OPENVDB_NO_TYPE_CONVERSION_WARNING_END
429-
// transfer attributes - we can't use this here as the exposed
430-
// function signatures take vector of attributes (i.e. an unbounded
431-
// size). If we instead clamped the attribute transfer to a fixed
432-
// amount of attributes we could get rid of the closest point logic
433-
// entirely. @todo consider adding another signature for increased
434-
// performance
435-
// this->foreach([&](auto* buffer, const size_t idx) {
436-
// using Type = typename std::remove_pointer<decltype(buffer)>::type;
437-
// buffer[offset] = mAttributes.template get<Type>(idx);
438-
// })
604+
v = d; // replace surface value
605+
if constexpr(CPG) {
606+
const int id = simd::horizontal_find_first(simd::eq(ScalarT(mindist), dist));
607+
OPENVDB_ASSERT(id != -1);
608+
// transfer attributes - we can't use this here as the exposed
609+
// function signatures take vector of attributes (i.e. an unbounded
610+
// size). If we instead clamped the attribute transfer to a fixed
611+
// amount of attributes we could get rid of the closest point logic
612+
// entirely. @todo consider adding another signature for increased
613+
// performance
614+
// this->foreach([&](auto* buffer, const size_t idx) {
615+
// using Type = typename std::remove_pointer<decltype(buffer)>::type;
616+
// buffer[offset] = mAttributes.template get<Type>(idx);
617+
// })
618+
OPENVDB_NO_TYPE_CONVERSION_WARNING_BEGIN
619+
cpg[offset] = Int64(this->mPLeafMask | Index64(ids[id]));
620+
OPENVDB_NO_TYPE_CONVERSION_WARNING_END
621+
}
439622
}
440623
}
441624
}

0 commit comments

Comments
 (0)