Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 98 additions & 42 deletions src/include/OpenImageIO/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -614,8 +614,9 @@ class vbool4 {
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a);
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
/// value a[i].
template<int i> OIIO_FORCEINLINE vbool4 broadcast_element(const vbool4& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
Expand Down Expand Up @@ -765,8 +766,9 @@ class vbool8 {
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a);
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
/// value a[i].
template<int i> OIIO_FORCEINLINE vbool8 broadcast_element(const vbool8& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
Expand Down Expand Up @@ -1158,8 +1160,9 @@ vint4 srl (const vint4& val, const unsigned int bits);
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE vint4 shuffle (const vint4& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a);
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
/// value a[i].
template<int i> OIIO_FORCEINLINE vint4 broadcast_element(const vint4& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
Expand Down Expand Up @@ -1458,8 +1461,9 @@ vint8 srl (const vint8& val, const unsigned int bits);
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
OIIO_FORCEINLINE vint8 shuffle (const vint8& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a);
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
/// value a[i].
template<int i> OIIO_FORCEINLINE vint8 broadcast_element(const vint8& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
Expand Down Expand Up @@ -1768,8 +1772,9 @@ template<int i> vint16 shuffle4 (const vint16& a);
template<int i0, int i1, int i2, int i3>
vint16 shuffle (const vint16& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> vint16 shuffle (const vint16& a);
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
/// value a[i].
template<int i> vint16 broadcast_element(const vint16& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
Expand Down Expand Up @@ -2093,8 +2098,9 @@ class vfloat4 {
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
/// value a[i].
template<int i> OIIO_FORCEINLINE vfloat4 broadcast_element(const vfloat4& a);

/// Return { a[i0], a[i1], b[i2], b[i3] }, where i0..i3 are the extracted
/// 2-bit indices packed into the template parameter i (going from the low
Expand Down Expand Up @@ -2716,8 +2722,8 @@ class vfloat8 {
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i,...>(a)
template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a);
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
/// value a[i]. It is the same as shuffle<i,i,i,i,i,i,i,i>(a).
template<int i> OIIO_FORCEINLINE vfloat8 broadcast_element(const vfloat8& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
Expand Down Expand Up @@ -3046,8 +3052,9 @@ template<int i> OIIO_FORCEINLINE vfloat16 shuffle4 (const vfloat16& a);
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE vfloat16 shuffle (const vfloat16& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> vfloat16 shuffle (const vfloat16& a);
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
/// value a[i].
template<int i> vfloat16 broadcast_element(const vfloat16& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
Expand Down Expand Up @@ -3468,11 +3475,17 @@ OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
#endif
}

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
/// Replicate a single lane across the whole vector: every lane of the
/// result holds a[i], where `i` is a compile-time lane index.
template<int i>
OIIO_FORCEINLINE vbool4
broadcast_element(const vbool4& a)
{
    // Naming the same source lane in all four shuffle slots yields a
    // vector filled with a[i].
    return shuffle<i, i, i, i>(a);
}

// DEPRECATED(3.1): single-index shuffle<i> is the old spelling of
// broadcast_element<i>; it is retained here as a thin forwarder. Call
// broadcast_element instead.
template<int i>
OIIO_FORCEINLINE vbool4
shuffle(const vbool4& a)
{
    return broadcast_element<i>(a);
}


/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
Expand Down Expand Up @@ -3796,10 +3809,15 @@ OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
#endif
}

template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
/// Replicate a single lane across the whole vector: every lane of the
/// result holds a[i], where `i` is a compile-time lane index.
template<int i>
OIIO_FORCEINLINE vbool8
broadcast_element(const vbool8& a)
{
    // All eight shuffle slots name the same source lane.
    return shuffle<i, i, i, i, i, i, i, i>(a);
}

// DEPRECATED(3.1): single-index shuffle<i> is the old spelling of
// broadcast_element<i>; it is retained here as a thin forwarder. Call
// broadcast_element instead.
template<int i>
OIIO_FORCEINLINE vbool8
shuffle(const vbool8& a)
{
    return broadcast_element<i>(a);
}


template<int i>
OIIO_FORCEINLINE bool extract (const vbool8& a) {
Expand Down Expand Up @@ -4739,7 +4757,14 @@ OIIO_FORCEINLINE vint4 shuffle (const vint4& a) {
#endif
}

template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a) { return shuffle<i,i,i,i>(a); }
/// Replicate a single lane across the whole vector: every lane of the
/// result holds a[i], where `i` is a compile-time lane index.
template<int i>
OIIO_FORCEINLINE vint4
broadcast_element(const vint4& a)
{
    // Naming the same source lane in all four shuffle slots yields a
    // vector filled with a[i].
    return shuffle<i, i, i, i>(a);
}

// DEPRECATED(3.1): single-index shuffle<i> is the old spelling of
// broadcast_element<i>; it is retained here as a thin forwarder. Call
// broadcast_element instead.
template<int i>
OIIO_FORCEINLINE vint4
shuffle(const vint4& a)
{
    return broadcast_element<i>(a);
}


template<int i>
Expand Down Expand Up @@ -5579,10 +5604,15 @@ OIIO_FORCEINLINE vint8 shuffle (const vint8& a) {
#endif
}

template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a) {
/// Replicate a single lane across the whole vector: every lane of the
/// result holds a[i], where `i` is a compile-time lane index.
template<int i>
OIIO_FORCEINLINE vint8
broadcast_element(const vint8& a)
{
    // All eight shuffle slots name the same source lane.
    return shuffle<i, i, i, i, i, i, i, i>(a);
}

// DEPRECATED(3.1): single-index shuffle<i> is the old spelling of
// broadcast_element<i>; it is retained here as a thin forwarder. Call
// broadcast_element instead.
template<int i>
OIIO_FORCEINLINE vint8
shuffle(const vint8& a)
{
    return broadcast_element<i>(a);
}


template<int i>
OIIO_FORCEINLINE int extract (const vint8& v) {
Expand Down Expand Up @@ -6390,8 +6420,15 @@ vint16 shuffle (const vint16& a) {
#endif
}

template<int i> vint16 shuffle (const vint16& a) {
return shuffle<i,i,i,i> (a);
/// Replicate a single lane across the whole vector: every lane of the
/// result holds a[i], where `i` is a compile-time lane index.
template<int i>
vint16
broadcast_element(const vint16& a)
{
    // The scalar a[i] is converted to a vint16 by the return statement
    // (presumably via the broadcasting scalar constructor, which is
    // declared outside this view).
    return a[i];
}

// DEPRECATED(3.1): old name and nonstandard use. The single-index
// shuffle<i> forwards to broadcast_element<i>, which callers should use
// directly; the attribute below makes remaining uses emit a warning.
template<int i>
OIIO_DEPRECATED("Use broadcast_element (3.1)")
vint16
shuffle(const vint16& a)
{
    return broadcast_element<i>(a);
}


Expand Down Expand Up @@ -7248,19 +7285,26 @@ OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) {
#endif
}

template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) { return shuffle<i,i,i,i>(a); }
/// Replicate a single lane across the whole vector: every lane of the
/// result holds a[i], where `i` is a compile-time lane index.
template<int i>
OIIO_FORCEINLINE vfloat4
broadcast_element(const vfloat4& a)
{
    // Generic path: name the same source lane in all four shuffle slots.
    return shuffle<i, i, i, i>(a);
}

// DEPRECATED(3.1): single-index shuffle<i> is the old spelling of
// broadcast_element<i>; it is retained here as a thin forwarder. Call
// broadcast_element instead.
template<int i>
OIIO_FORCEINLINE vfloat4
shuffle(const vfloat4& a)
{
    return broadcast_element<i>(a);
}

#if OIIO_SIMD_NEON
template<> OIIO_FORCEINLINE vfloat4 shuffle<0> (const vfloat4& a) {
template<> OIIO_FORCEINLINE vfloat4 broadcast_element<0> (const vfloat4& a) {
float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,0);
}
template<> OIIO_FORCEINLINE vfloat4 shuffle<1> (const vfloat4& a) {
template<> OIIO_FORCEINLINE vfloat4 broadcast_element<1> (const vfloat4& a) {
float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,1);
}
template<> OIIO_FORCEINLINE vfloat4 shuffle<2> (const vfloat4& a) {
template<> OIIO_FORCEINLINE vfloat4 broadcast_element<2> (const vfloat4& a) {
float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,0);
}
template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) {
template<> OIIO_FORCEINLINE vfloat4 broadcast_element<3> (const vfloat4& a) {
float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,1);
}
#endif
Expand Down Expand Up @@ -8260,9 +8304,9 @@ OIIO_FORCEINLINE matrix44 matrix44::transposed () const {

OIIO_FORCEINLINE vfloat3 matrix44::transformp (const vfloat3 &V) const {
#if OIIO_SIMD_SSE
vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
shuffle<2>(V) * m_row[2] + m_row[3];
R = R / shuffle<3>(R);
vfloat4 R = broadcast_element<0>(V) * m_row[0] + broadcast_element<1>(V) * m_row[1] +
broadcast_element<2>(V) * m_row[2] + m_row[3];
R = R / broadcast_element<3>(R);
return vfloat3 (R.xyz0());
#else
value_t a, b, c, w;
Expand All @@ -8276,8 +8320,8 @@ OIIO_FORCEINLINE vfloat3 matrix44::transformp (const vfloat3 &V) const {

OIIO_FORCEINLINE vfloat3 matrix44::transformv (const vfloat3 &V) const {
#if OIIO_SIMD_SSE
vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
shuffle<2>(V) * m_row[2];
vfloat4 R = broadcast_element<0>(V) * m_row[0] + broadcast_element<1>(V) * m_row[1] +
broadcast_element<2>(V) * m_row[2];
return vfloat3 (R.xyz0());
#else
value_t a, b, c;
Expand All @@ -8291,8 +8335,8 @@ OIIO_FORCEINLINE vfloat3 matrix44::transformv (const vfloat3 &V) const {
OIIO_FORCEINLINE vfloat3 matrix44::transformvT (const vfloat3 &V) const {
#if OIIO_SIMD_SSE
matrix44 T = transposed();
vfloat4 R = shuffle<0>(V) * T[0] + shuffle<1>(V) * T[1] +
shuffle<2>(V) * T[2];
vfloat4 R = broadcast_element<0>(V) * T[0] + broadcast_element<1>(V) * T[1] +
broadcast_element<2>(V) * T[2];
return vfloat3 (R.xyz0());
#else
value_t a, b, c;
Expand All @@ -8306,8 +8350,8 @@ OIIO_FORCEINLINE vfloat3 matrix44::transformvT (const vfloat3 &V) const {
OIIO_FORCEINLINE vfloat4 operator* (const vfloat4 &V, const matrix44& M)
{
#if OIIO_SIMD_SSE
return shuffle<0>(V) * M[0] + shuffle<1>(V) * M[1] +
shuffle<2>(V) * M[2] + shuffle<3>(V) * M[3];
return broadcast_element<0>(V) * M[0] + broadcast_element<1>(V) * M[1] +
broadcast_element<2>(V) * M[2] + broadcast_element<3>(V) * M[3];
#else
float a, b, c, w;
a = V[0] * M[0][0] + V[1] * M[1][0] + V[2] * M[2][0] + V[3] * M[3][0];
Expand Down Expand Up @@ -9029,14 +9073,19 @@ OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) {
#endif
}

template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) {
template<int i> OIIO_FORCEINLINE vfloat8 broadcast_element(const vfloat8& a) {
#if OIIO_SIMD_AVX >= 2
return _mm256_permutevar8x32_ps (a, vint8(i));
#else
return shuffle<i,i,i,i,i,i,i,i>(a);
return a[i];
#endif
}

// DEPRECATED(3.1): single-index shuffle<i> is the old spelling of
// broadcast_element<i>; it is retained here as a thin forwarder. Call
// broadcast_element instead.
template<int i>
OIIO_FORCEINLINE vfloat8
shuffle(const vfloat8& a)
{
    return broadcast_element<i>(a);
}


template<int i>
OIIO_FORCEINLINE float extract (const vfloat8& v) {
Expand Down Expand Up @@ -9099,9 +9148,9 @@ OIIO_FORCEINLINE vfloat8 vreduce_add (const vfloat8& v) {
vfloat8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_ps(v.simd(), _mm256_setzero_ps());
vfloat8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_ps(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_ps());
// get efgh in the 0-idx slot
vfloat8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
vfloat8 efgh = broadcast_element<4>(abcd_0_0_0_efgh_0_0_0);
vfloat8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
return shuffle<0>(final_sum);
return broadcast_element<0>(final_sum);
#else
vfloat4 hadd4 = vreduce_add(v.lo()) + vreduce_add(v.hi());
return vfloat8(hadd4, hadd4);
Expand Down Expand Up @@ -9908,7 +9957,14 @@ vfloat16 shuffle (const vfloat16& a) {
#endif
}

template<int i> vfloat16 shuffle (const vfloat16& a) {
/// Replicate a single lane across the whole vector: every lane of the
/// result holds a[i], where `i` is a compile-time lane index.
template<int i>
vfloat16
broadcast_element(const vfloat16& a)
{
    // The scalar a[i] is converted to a vfloat16 by the return statement
    // (presumably via the broadcasting scalar constructor, which is
    // declared outside this view).
    return a[i];
}

// DEPRECATED(3.1): old name and nonstandard use.
//
// The single-index shuffle<i> is superseded by broadcast_element<i>, and
// the deprecation message tells callers to switch to it. This wrapper
// therefore forwards to broadcast_element<i> so that the old and new
// spellings return the same thing, matching what the deprecated vint16
// shuffle<i> does.
//
// NOTE(review): this wrapper previously returned shuffle<i,i,i,i>(a), the
// four-index form, which for a 16-lane vector is presumably not a
// broadcast of a[i] to every lane (that overload is defined outside this
// view). Confirm that no remaining caller depends on the old result.
template<int i>
OIIO_DEPRECATED("Use broadcast_element (3.1)")
vfloat16 shuffle(const vfloat16& a) {
    return broadcast_element<i>(a);
}

Expand Down
2 changes: 1 addition & 1 deletion src/libOpenImageIO/imagebufalgo_pixelmath.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1733,7 +1733,7 @@ over_impl_rgbafloat(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
for (int x = 0; x < w; ++x, r += 4, a += 4, b += 4) {
vfloat4 a_simd(a);
vfloat4 b_simd(b);
vfloat4 alpha = shuffle<3>(a_simd);
vfloat4 alpha = broadcast_element<3>(a_simd);
vfloat4 one_minus_alpha = one - clamp(alpha, zero, one);
vfloat4 result = a_simd + one_minus_alpha * b_simd;
result.store(r);
Expand Down
22 changes: 12 additions & 10 deletions src/libtexture/texturesys.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3016,10 +3016,10 @@ TextureSystemImpl::sample_bicubic(
if (s_onetile & t_onetile) {
// If we thought it was one tile, realize that it isn't unless
// it's ascending.
s_onetile &= all(stex
== (simd::shuffle<0>(stex) + (*(vint4*)iota)));
t_onetile &= all(ttex
== (simd::shuffle<0>(ttex) + (*(vint4*)iota)));
s_onetile &= all(
stex == (simd::broadcast_element<0>(stex) + (*(vint4*)iota)));
t_onetile &= all(
ttex == (simd::broadcast_element<0>(ttex) + (*(vint4*)iota)));
}
bool onetile = (s_onetile & t_onetile);
if (onetile & allvalid) {
Expand Down Expand Up @@ -3199,15 +3199,17 @@ TextureSystemImpl::sample_bicubic(
simd::vfloat4 col[4];
for (int j = 0; j < 4; ++j) {
simd::vfloat4 lx = lerp(texel_simd[j][0], texel_simd[j][1],
shuffle<0>(h) /*h0x*/);
broadcast_element<0>(h) /*h0x*/);
simd::vfloat4 rx = lerp(texel_simd[j][2], texel_simd[j][3],
shuffle<1>(h) /*h1x*/);
col[j] = lerp(lx, rx, shuffle<1>(g) /*g1x*/);
broadcast_element<1>(h) /*h1x*/);
col[j] = lerp(lx, rx, broadcast_element<1>(g) /*g1x*/);
}
simd::vfloat4 ly = lerp(col[0], col[1], shuffle<2>(h) /*h0y*/);
simd::vfloat4 ry = lerp(col[2], col[3], shuffle<3>(h) /*h1y*/);
simd::vfloat4 ly = lerp(col[0], col[1],
broadcast_element<2>(h) /*h0y*/);
simd::vfloat4 ry = lerp(col[2], col[3],
broadcast_element<3>(h) /*h1y*/);
simd::vfloat4 weight_simd = weight;
accum += weight_simd * lerp(ly, ry, shuffle<3>(g) /*g1y*/);
accum += weight_simd * lerp(ly, ry, broadcast_element<3>(g) /*g1y*/);
if (daccumds_) {
simd::vfloat4 scalex = weight_simd * float(spec.width);
simd::vfloat4 scaley = weight_simd * float(spec.height);
Expand Down
Loading
Loading