Skip to content

Commit e3f7707

Browse files
committed
Some refactoring
1 parent 530fd54 commit e3f7707

6 files changed

Lines changed: 213 additions & 183 deletions

File tree

internal/SmallVector.h

Lines changed: 11 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ template <typename T, typename Allocator = aligned_allocator<T, alignof(T)>> cla
2121
static const uint32_t OwnerBit = (1u << (8u * sizeof(uint32_t) - 1u));
2222
static const uint32_t CapacityMask = ~OwnerBit;
2323

24-
protected:
24+
protected:
2525
SmallVectorImpl(T *begin, T *end, const uint32_t capacity, const Allocator &alloc)
2626
: Allocator(alloc), begin_(begin), size_(uint32_t(end - begin)), capacity_(capacity) {}
2727

@@ -47,7 +47,7 @@ template <typename T, typename Allocator = aligned_allocator<T, alignof(T)>> cla
4747
reserve(new_capacity);
4848
}
4949

50-
public:
50+
public:
5151
using iterator = T *;
5252
using const_iterator = const T *;
5353

@@ -68,16 +68,11 @@ template <typename T, typename Allocator = aligned_allocator<T, alignof(T)>> cla
6868
capacity_ = 0;
6969
}
7070

71-
reserve(rhs.capacity_ & CapacityMask);
71+
reserve(rhs.size_);
7272

73-
size_ = rhs.size_;
74-
75-
if (rhs.size_) {
76-
T *src = rhs.begin_ + rhs.size_ - 1;
77-
T *dst = begin_ + size_ - 1;
78-
do {
79-
new (dst--) T(*src--);
80-
} while (src >= rhs.begin_);
73+
while (size_ < rhs.size_) {
74+
new (begin_ + size_) T(*(rhs.begin_ + size_));
75+
++size_;
8176
}
8277

8378
return (*this);
@@ -102,14 +97,12 @@ template <typename T, typename Allocator = aligned_allocator<T, alignof(T)>> cla
10297
size_ = std::exchange(rhs.size_, 0);
10398
capacity_ = std::exchange(rhs.capacity_, 0);
10499
} else {
105-
reserve(rhs.capacity_ & CapacityMask);
106-
107-
size_ = rhs.size_;
100+
reserve(rhs.size_);
108101

109-
T *dst = begin_ + size_ - 1;
110102
while (rhs.size_) {
111-
new (dst--) T(std::move(*(rhs.begin_ + --rhs.size_)));
112-
(rhs.begin_ + rhs.size_)->~T();
103+
new (begin_ + size_) T(std::move(*(rhs.begin_ + size_)));
104+
++size_;
105+
--rhs.size_;
113106
}
114107
}
115108

@@ -378,7 +371,7 @@ template <typename T, int N, int AlignmentOfT = alignof(T), typename Allocator =
378371
class SmallVector : public SmallVectorImpl<T, Allocator> {
379372
alignas(AlignmentOfT) char buffer_[sizeof(T) * N];
380373

381-
public:
374+
public:
382375
SmallVector(const Allocator &alloc = Allocator()) // NOLINT
383376
: SmallVectorImpl<T, Allocator>((T *)buffer_, (T *)buffer_, N, alloc) {}
384377
explicit SmallVector(const uint32_t size, const T &val = T(), const Allocator &alloc = Allocator()) // NOLINT

internal/simd/simd.h

Lines changed: 34 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -110,6 +110,14 @@
110110

111111
namespace Ray {
112112
namespace NS {
113+
template <typename T> struct same_size_uint {
114+
using type =
115+
std::conditional_t<sizeof(T) == 1, uint8_t,
116+
std::conditional_t<sizeof(T) == 2, uint16_t,
117+
std::conditional_t<sizeof(T) == 4, uint32_t,
118+
std::conditional_t<sizeof(T) == 8, uint64_t,
119+
void>>>>; // void as "not found"
120+
};
113121

114122
enum vector_aligned_tag { vector_aligned };
115123

@@ -162,19 +170,20 @@ template <typename T, int S> class fixed_size_simd {
162170
}
163171

164172
fixed_size_simd<T, S> &operator|=(const fixed_size_simd<T, S> &rhs) {
165-
const auto *src2 = reinterpret_cast<const uint8_t *>(&rhs.comp_[0]);
166-
167-
auto *dst = reinterpret_cast<uint8_t *>(&comp_[0]);
168-
169-
for (int i = 0; i < S * sizeof(T); i++) {
173+
const auto *src2 = reinterpret_cast<const typename same_size_uint<T>::type *>(&rhs.comp_[0]);
174+
auto *dst = reinterpret_cast<typename same_size_uint<T>::type *>(&comp_[0]);
175+
for (int i = 0; i < S; i++) {
170176
dst[i] |= src2[i];
171177
}
172-
173178
return *this;
174179
}
175180

176181
fixed_size_simd<T, S> &operator^=(const fixed_size_simd<T, S> &rhs) {
177-
UNROLLED_FOR_S(i, S, { comp_[i] ^= rhs.comp_[i]; })
182+
const auto *src2 = reinterpret_cast<const typename same_size_uint<T>::type *>(&rhs.comp_[0]);
183+
auto *dst = reinterpret_cast<typename same_size_uint<T>::type *>(&comp_[0]);
184+
for (int i = 0; i < S; i++) {
185+
dst[i] ^= src2[i];
186+
}
178187
return *this;
179188
}
180189

@@ -257,15 +266,17 @@ template <typename T, int S> class fixed_size_simd {
257266
}
258267

259268
fixed_size_simd<T, S> &operator&=(const fixed_size_simd<T, S> &rhs) {
260-
UNROLLED_FOR_S(i, S,
261-
{ reinterpret_cast<uint32_t &>(comp_[i]) &= reinterpret_cast<const uint32_t &>(rhs.comp_[i]); })
269+
UNROLLED_FOR_S(i, S, {
270+
reinterpret_cast<typename same_size_uint<T>::type &>(comp_[i]) &=
271+
reinterpret_cast<const typename same_size_uint<T>::type &>(rhs.comp_[i]);
272+
})
262273
return *this;
263274
}
264275

265276
fixed_size_simd<T, S> operator~() const {
266277
fixed_size_simd<T, S> ret;
267278
UNROLLED_FOR_S(i, S, {
268-
const uint32_t temp = ~reinterpret_cast<const uint32_t &>(comp_[i]);
279+
const auto temp = ~reinterpret_cast<const typename same_size_uint<T>::type &>(comp_[i]);
269280
ret.comp_[i] = reinterpret_cast<const T &>(temp);
270281
})
271282
return ret;
@@ -336,15 +347,13 @@ template <typename T, int S> class fixed_size_simd {
336347
}
337348

338349
bool all_zeros(const fixed_size_simd<int, S> &mask) const {
339-
const auto *src1 = reinterpret_cast<const uint8_t *>(&comp_[0]);
340-
const auto *src2 = reinterpret_cast<const uint8_t *>(&mask.comp_[0]);
341-
342-
for (int i = 0; i < S * sizeof(T); i++) {
350+
const auto *src1 = reinterpret_cast<const typename same_size_uint<T>::type *>(&comp_[0]);
351+
const auto *src2 = reinterpret_cast<const typename same_size_uint<T>::type *>(&mask.comp_[0]);
352+
for (int i = 0; i < S; i++) {
343353
if ((src1[i] & src2[i]) != 0) {
344354
return false;
345355
}
346356
}
347-
348357
return true;
349358
}
350359

@@ -397,14 +406,14 @@ template <typename T, int S> class fixed_size_simd {
397406
}
398407

399408
static fixed_size_simd<T, S> and_not(const fixed_size_simd<T, S> &v1, const fixed_size_simd<T, S> &v2) {
400-
const auto *src1 = reinterpret_cast<const uint8_t *>(&v1.comp_[0]);
401-
const auto *src2 = reinterpret_cast<const uint8_t *>(&v2.comp_[0]);
409+
const auto *src1 = reinterpret_cast<const typename same_size_uint<T>::type *>(&v1.comp_[0]);
410+
const auto *src2 = reinterpret_cast<const typename same_size_uint<T>::type *>(&v2.comp_[0]);
402411

403412
fixed_size_simd<T, S> ret;
404413

405-
auto *dst = reinterpret_cast<uint8_t *>(&ret.comp_[0]);
414+
auto *dst = reinterpret_cast<typename same_size_uint<T>::type *>(&ret.comp_[0]);
406415

407-
for (int i = 0; i < S * sizeof(T); i++) {
416+
for (int i = 0; i < S; i++) {
408417
dst[i] = (~src1[i]) & src2[i];
409418
}
410419

@@ -413,26 +422,23 @@ template <typename T, int S> class fixed_size_simd {
413422

414423
static fixed_size_simd<float, S> floor(const fixed_size_simd<float, S> &v1) {
415424
fixed_size_simd<float, S> temp;
416-
UNROLLED_FOR_S(i, S, { temp.comp_[i] = float(int(v1.comp_[i]) - (v1.comp_[i] < 0.0f)); })
425+
UNROLLED_FOR_S(i, S, { temp.comp_[i] = std::floor(v1.comp_[i]); })
417426
return temp;
418427
}
419428

420429
static fixed_size_simd<float, S> ceil(const fixed_size_simd<float, S> &v1) {
421430
fixed_size_simd<float, S> temp;
422-
UNROLLED_FOR_S(i, S, {
423-
int _v = int(v1.comp_[i]);
424-
temp.comp_[i] = float(_v + (v1.comp_[i] != _v));
425-
})
431+
UNROLLED_FOR_S(i, S, { temp.comp_[i] = std::ceil(v1.comp_[i]); })
426432
return temp;
427433
}
428434

429435
#define DEFINE_BITS_OPERATOR(OP) \
430436
friend fixed_size_simd<T, S> operator OP(const fixed_size_simd<T, S> &v1, const fixed_size_simd<T, S> &v2) { \
431-
const auto *src1 = reinterpret_cast<const uint8_t *>(&v1.comp_[0]); \
432-
const auto *src2 = reinterpret_cast<const uint8_t *>(&v2.comp_[0]); \
437+
const auto *src1 = reinterpret_cast<const typename same_size_uint<T>::type *>(&v1.comp_[0]); \
438+
const auto *src2 = reinterpret_cast<const typename same_size_uint<T>::type *>(&v2.comp_[0]); \
433439
fixed_size_simd<T, S> ret; \
434-
auto *dst = reinterpret_cast<uint8_t *>(&ret.comp_[0]); \
435-
for (int i = 0; i < S * sizeof(T); i++) { \
440+
auto *dst = reinterpret_cast<typename same_size_uint<T>::type *>(&ret.comp_[0]); \
441+
for (int i = 0; i < S; i++) { \
436442
dst[i] = src1[i] OP src2[i]; \
437443
} \
438444
return ret; \

internal/simd/simd_avx.h

Lines changed: 63 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -30,8 +30,8 @@
3030
namespace Ray {
3131
namespace NS {
3232

33-
template <> force_inline __m256 _mm_cast(__m256i x) { return _mm256_castsi256_ps(x); }
34-
template <> force_inline __m256i _mm_cast(__m256 x) { return _mm256_castps_si256(x); }
33+
template <> force_inline __m256 _mm_cast(const __m256i x) { return _mm256_castsi256_ps(x); }
34+
template <> force_inline __m256i _mm_cast(const __m256 x) { return _mm256_castps_si256(x); }
3535

3636
template <> class fixed_size_simd<int, 8>;
3737
template <> class fixed_size_simd<unsigned, 8>;
@@ -417,14 +417,22 @@ template <> class fixed_size_simd<int, 8> {
417417

418418
force_inline void vectorcall blend_to(const fixed_size_simd<int, 8> mask, const fixed_size_simd<int, 8> v1) {
419419
validate_mask(mask);
420+
#if defined(USE_AVX2) || defined(USE_AVX512)
421+
vec_ = _mm256_blendv_epi8(vec_, v1.vec_, mask.vec_);
422+
#else
420423
vec_ = _mm256_castps_si256(
421424
_mm256_blendv_ps(_mm256_castsi256_ps(vec_), _mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(mask.vec_)));
425+
#endif
422426
}
423427

424428
force_inline void vectorcall blend_inv_to(const fixed_size_simd<int, 8> mask, const fixed_size_simd<int, 8> v1) {
425429
validate_mask(mask);
430+
#if defined(USE_AVX2) || defined(USE_AVX512)
431+
vec_ = _mm256_blendv_epi8(v1.vec_, vec_, mask.vec_);
432+
#else
426433
vec_ = _mm256_castps_si256(
427434
_mm256_blendv_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(vec_), _mm256_castsi256_ps(mask.vec_)));
435+
#endif
428436
}
429437

430438
force_inline int movemask() const { return _mm256_movemask_ps(_mm256_castsi256_ps(vec_)); }
@@ -469,22 +477,38 @@ template <> class fixed_size_simd<int, 8> {
469477

470478
force_inline static fixed_size_simd<int, 8> vectorcall and_not(const fixed_size_simd<int, 8> v1,
471479
const fixed_size_simd<int, 8> v2) {
480+
#if defined(USE_AVX2) || defined(USE_AVX512)
481+
return _mm256_andnot_si256(v1.vec_, v2.vec_);
482+
#else
472483
return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_)));
484+
#endif
473485
}
474486

475487
friend force_inline fixed_size_simd<int, 8> vectorcall operator&(const fixed_size_simd<int, 8> v1,
476488
const fixed_size_simd<int, 8> v2) {
489+
#if defined(USE_AVX2) || defined(USE_AVX512)
490+
return _mm256_and_si256(v1.vec_, v2.vec_);
491+
#else
477492
return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_)));
493+
#endif
478494
}
479495

480496
friend force_inline fixed_size_simd<int, 8> vectorcall operator|(const fixed_size_simd<int, 8> v1,
481497
const fixed_size_simd<int, 8> v2) {
498+
#if defined(USE_AVX2) || defined(USE_AVX512)
499+
return _mm256_or_si256(v1.vec_, v2.vec_);
500+
#else
482501
return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_)));
502+
#endif
483503
}
484504

485505
friend force_inline fixed_size_simd<int, 8> vectorcall operator^(const fixed_size_simd<int, 8> v1,
486506
const fixed_size_simd<int, 8> v2) {
507+
#if defined(USE_AVX2) || defined(USE_AVX512)
508+
return _mm256_xor_si256(v1.vec_, v2.vec_);
509+
#else
487510
return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_)));
511+
#endif
488512
}
489513

490514
friend avx2_inline fixed_size_simd<int, 8> vectorcall operator+(const fixed_size_simd<int, 8> v1,
@@ -760,8 +784,12 @@ template <> class fixed_size_simd<unsigned, 8> {
760784
return operator-=(fixed_size_simd<unsigned, 8>{rhs});
761785
}
762786

763-
fixed_size_simd<unsigned, 8> &vectorcall operator*=(const fixed_size_simd<unsigned, 8> rhs) {
787+
avx2_inline fixed_size_simd<unsigned, 8> &vectorcall operator*=(const fixed_size_simd<unsigned, 8> rhs) {
788+
#if defined(USE_AVX2) || defined(USE_AVX512)
789+
vec_ = _mm256_mullo_epi32(vec_, rhs.vec_);
790+
#else
764791
UNROLLED_FOR(i, 8, { comp_[i] *= rhs.comp_[i]; })
792+
#endif
765793
return *this;
766794
}
767795

@@ -855,15 +883,23 @@ template <> class fixed_size_simd<unsigned, 8> {
855883
force_inline void vectorcall blend_to(const fixed_size_simd<unsigned, 8> mask,
856884
const fixed_size_simd<unsigned, 8> v1) {
857885
validate_mask(mask);
886+
#if defined(USE_AVX2) || defined(USE_AVX512)
887+
vec_ = _mm256_blendv_epi8(vec_, v1.vec_, mask.vec_);
888+
#else
858889
vec_ = _mm256_castps_si256(
859890
_mm256_blendv_ps(_mm256_castsi256_ps(vec_), _mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(mask.vec_)));
891+
#endif
860892
}
861893

862894
force_inline void vectorcall blend_inv_to(const fixed_size_simd<unsigned, 8> mask,
863895
const fixed_size_simd<unsigned, 8> v1) {
864896
validate_mask(mask);
897+
#if defined(USE_AVX2) || defined(USE_AVX512)
898+
vec_ = _mm256_blendv_epi8(v1.vec_, vec_, mask.vec_);
899+
#else
865900
vec_ = _mm256_castps_si256(
866901
_mm256_blendv_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(vec_), _mm256_castsi256_ps(mask.vec_)));
902+
#endif
867903
}
868904

869905
force_inline int movemask() const { return _mm256_movemask_ps(_mm256_castsi256_ps(vec_)); }
@@ -908,22 +944,38 @@ template <> class fixed_size_simd<unsigned, 8> {
908944

909945
force_inline static fixed_size_simd<unsigned, 8> vectorcall and_not(const fixed_size_simd<unsigned, 8> v1,
910946
const fixed_size_simd<unsigned, 8> v2) {
947+
#if defined(USE_AVX2) || defined(USE_AVX512)
948+
return _mm256_andnot_si256(v1.vec_, v2.vec_);
949+
#else
911950
return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_)));
951+
#endif
912952
}
913953

914954
friend force_inline fixed_size_simd<unsigned, 8> vectorcall operator&(const fixed_size_simd<unsigned, 8> v1,
915955
const fixed_size_simd<unsigned, 8> v2) {
956+
#if defined(USE_AVX2) || defined(USE_AVX512)
957+
return _mm256_and_si256(v1.vec_, v2.vec_);
958+
#else
916959
return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_)));
960+
#endif
917961
}
918962

919963
friend force_inline fixed_size_simd<unsigned, 8> vectorcall operator|(const fixed_size_simd<unsigned, 8> v1,
920964
const fixed_size_simd<unsigned, 8> v2) {
965+
#if defined(USE_AVX2) || defined(USE_AVX512)
966+
return _mm256_or_si256(v1.vec_, v2.vec_);
967+
#else
921968
return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_)));
969+
#endif
922970
}
923971

924972
friend force_inline fixed_size_simd<unsigned, 8> vectorcall operator^(const fixed_size_simd<unsigned, 8> v1,
925973
const fixed_size_simd<unsigned, 8> v2) {
974+
#if defined(USE_AVX2) || defined(USE_AVX512)
975+
return _mm256_xor_si256(v1.vec_, v2.vec_);
976+
#else
926977
return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_)));
978+
#endif
927979
}
928980

929981
friend avx2_inline fixed_size_simd<unsigned, 8> vectorcall operator+(const fixed_size_simd<unsigned, 8> v1,
@@ -1328,17 +1380,25 @@ force_inline fixed_size_simd<int, 8> vectorcall select(const fixed_size_simd<U,
13281380
const fixed_size_simd<int, 8> vec1,
13291381
const fixed_size_simd<int, 8> vec2) {
13301382
validate_mask(mask);
1383+
#if defined(USE_AVX2) || defined(USE_AVX512)
1384+
return _mm256_blendv_epi8(vec2.vec_, vec1.vec_, mask.vec_);
1385+
#else
13311386
return _mm256_castps_si256(
13321387
_mm256_blendv_ps(_mm256_castsi256_ps(vec2.vec_), _mm256_castsi256_ps(vec1.vec_), _mm_cast<__m256>(mask.vec_)));
1388+
#endif
13331389
}
13341390

13351391
template <typename U>
13361392
force_inline fixed_size_simd<unsigned, 8> vectorcall select(const fixed_size_simd<U, 8> mask,
13371393
const fixed_size_simd<unsigned, 8> vec1,
13381394
const fixed_size_simd<unsigned, 8> vec2) {
13391395
validate_mask(mask);
1396+
#if defined(USE_AVX2) || defined(USE_AVX512)
1397+
return _mm256_blendv_epi8(vec2.vec_, vec1.vec_, mask.vec_);
1398+
#else
13401399
return _mm256_castps_si256(
13411400
_mm256_blendv_ps(_mm256_castsi256_ps(vec2.vec_), _mm256_castsi256_ps(vec1.vec_), _mm_cast<__m256>(mask.vec_)));
1401+
#endif
13421402
}
13431403

13441404
} // namespace NS

0 commit comments

Comments
 (0)