diff --git a/src/include/OpenImageIO/simd.h b/src/include/OpenImageIO/simd.h index 599f1c10d6..a523ef6709 100644 --- a/src/include/OpenImageIO/simd.h +++ b/src/include/OpenImageIO/simd.h @@ -614,8 +614,9 @@ class vbool4 { template OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a); -/// shuffle(a) is the same as shuffle(a) -template OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a); +/// broadcast_element(a) returns a simd variable in which all lanes have +/// value a[i]. +template OIIO_FORCEINLINE vbool4 broadcast_element(const vbool4& a); /// Helper: as rapid as possible extraction of one component, when the /// index is fixed. @@ -765,8 +766,9 @@ class vbool8 { template OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a); -/// shuffle(a) is the same as shuffle(a) -template OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a); +/// broadcast_element(a) returns a simd variable in which all lanes have +/// value a[i]. +template OIIO_FORCEINLINE vbool8 broadcast_element(const vbool8& a); /// Helper: as rapid as possible extraction of one component, when the /// index is fixed. @@ -1158,8 +1160,9 @@ vint4 srl (const vint4& val, const unsigned int bits); template OIIO_FORCEINLINE vint4 shuffle (const vint4& a); -/// shuffle(a) is the same as shuffle(a) -template OIIO_FORCEINLINE vint4 shuffle (const vint4& a); +/// broadcast_element(a) returns a simd variable in which all lanes have +/// value a[i]. +template OIIO_FORCEINLINE vint4 broadcast_element(const vint4& a); /// Helper: as rapid as possible extraction of one component, when the /// index is fixed. @@ -1458,8 +1461,9 @@ vint8 srl (const vint8& val, const unsigned int bits); template OIIO_FORCEINLINE vint8 shuffle (const vint8& a); -/// shuffle(a) is the same as shuffle(a) -template OIIO_FORCEINLINE vint8 shuffle (const vint8& a); +/// broadcast_element(a) returns a simd variable in which all lanes have +/// value a[i]. 
+template OIIO_FORCEINLINE vint8 broadcast_element(const vint8& a); /// Helper: as rapid as possible extraction of one component, when the /// index is fixed. @@ -1768,8 +1772,9 @@ template vint16 shuffle4 (const vint16& a); template vint16 shuffle (const vint16& a); -/// shuffle(a) is the same as shuffle(a) -template vint16 shuffle (const vint16& a); +/// broadcast_element(a) returns a simd variable in which all lanes have +/// value a[i]. +template vint16 broadcast_element(const vint16& a); /// Helper: as rapid as possible extraction of one component, when the /// index is fixed. @@ -2093,8 +2098,9 @@ class vfloat4 { template OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a); -/// shuffle(a) is the same as shuffle(a) -template OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a); +/// broadcast_element(a) returns a simd variable in which all lanes have +/// value a[i]. +template OIIO_FORCEINLINE vfloat4 broadcast_element(const vfloat4& a); /// Return { a[i0], a[i1], b[i2], b[i3] }, where i0..i3 are the extracted /// 2-bit indices packed into the template parameter i (going from the low @@ -2716,8 +2722,8 @@ class vfloat8 { template OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a); -/// shuffle(a) is the same as shuffle(a) -template OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a); +/// broadcast_element<i>(a) returns a simd variable in which all lanes have value a[i]. +template OIIO_FORCEINLINE vfloat8 broadcast_element(const vfloat8& a); /// Helper: as rapid as possible extraction of one component, when the /// index is fixed. @@ -3046,8 +3052,9 @@ template OIIO_FORCEINLINE vfloat16 shuffle4 (const vfloat16& a); template OIIO_FORCEINLINE vfloat16 shuffle (const vfloat16& a); -/// shuffle(a) is the same as shuffle(a) -template vfloat16 shuffle (const vfloat16& a); +/// broadcast_element(a) returns a simd variable in which all lanes have +/// value a[i]. 
+template vfloat16 broadcast_element(const vfloat16& a); /// Helper: as rapid as possible extraction of one component, when the /// index is fixed. @@ -3468,11 +3475,17 @@ OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) { #endif } -/// shuffle(a) is the same as shuffle(a) -template OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) { +/// broadcast_element(a) returns a simd variable in which all lanes have +/// value a[i]. +template OIIO_FORCEINLINE vbool4 broadcast_element(const vbool4& a) { return shuffle(a); } +// DEPRECATED(3.1): old name; use broadcast_element instead +template OIIO_FORCEINLINE vbool4 shuffle(const vbool4& a) { + return broadcast_element(a); +} + /// Helper: as rapid as possible extraction of one component, when the /// index is fixed. @@ -3796,10 +3809,15 @@ OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) { #endif } -template OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) { +template OIIO_FORCEINLINE vbool8 broadcast_element(const vbool8& a) { return shuffle(a); } +// DEPRECATED(3.1): old name; use broadcast_element instead +template OIIO_FORCEINLINE vbool8 shuffle(const vbool8& a) { + return broadcast_element(a); +} + template OIIO_FORCEINLINE bool extract (const vbool8& a) { @@ -4739,7 +4757,14 @@ OIIO_FORCEINLINE vint4 shuffle (const vint4& a) { #endif } -template OIIO_FORCEINLINE vint4 shuffle (const vint4& a) { return shuffle(a); } +template OIIO_FORCEINLINE vint4 broadcast_element(const vint4& a) { + return shuffle(a); +} + +// DEPRECATED(3.1): old name; use broadcast_element instead +template OIIO_FORCEINLINE vint4 shuffle(const vint4& a) { + return broadcast_element(a); +} template @@ -5579,10 +5604,15 @@ OIIO_FORCEINLINE vint8 shuffle (const vint8& a) { #endif } -template OIIO_FORCEINLINE vint8 shuffle (const vint8& a) { +template OIIO_FORCEINLINE vint8 broadcast_element(const vint8& a) { return shuffle(a); } +// DEPRECATED(3.1): old name; use broadcast_element instead +template OIIO_FORCEINLINE vint8 shuffle(const vint8& a) { 
+ return broadcast_element(a); +} + template OIIO_FORCEINLINE int extract (const vint8& v) { @@ -6390,8 +6420,15 @@ vint16 shuffle (const vint16& a) { #endif } -template vint16 shuffle (const vint16& a) { - return shuffle (a); +template vint16 broadcast_element(const vint16& a) { + return a[i]; +} + +// DEPRECATED(3.1): old name and nonstandard use +template +OIIO_DEPRECATED("Use broadcast_element (3.1)") +vint16 shuffle(const vint16& a) { + return broadcast_element (a); } @@ -7248,19 +7285,26 @@ OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) { #endif } -template OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) { return shuffle(a); } +template OIIO_FORCEINLINE vfloat4 broadcast_element(const vfloat4& a) { + return shuffle(a); +} + +// DEPRECATED(3.1): old name; use broadcast_element instead +template OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) { + return broadcast_element(a); +} #if OIIO_SIMD_NEON -template<> OIIO_FORCEINLINE vfloat4 shuffle<0> (const vfloat4& a) { +template<> OIIO_FORCEINLINE vfloat4 broadcast_element<0> (const vfloat4& a) { float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,0); } -template<> OIIO_FORCEINLINE vfloat4 shuffle<1> (const vfloat4& a) { +template<> OIIO_FORCEINLINE vfloat4 broadcast_element<1> (const vfloat4& a) { float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,1); } -template<> OIIO_FORCEINLINE vfloat4 shuffle<2> (const vfloat4& a) { +template<> OIIO_FORCEINLINE vfloat4 broadcast_element<2> (const vfloat4& a) { float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,0); } -template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) { +template<> OIIO_FORCEINLINE vfloat4 broadcast_element<3> (const vfloat4& a) { float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,1); } #endif @@ -8260,9 +8304,9 @@ OIIO_FORCEINLINE matrix44 matrix44::transposed () const { OIIO_FORCEINLINE vfloat3 matrix44::transformp (const vfloat3 &V) const { #if OIIO_SIMD_SSE - vfloat4 R = 
shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] + - shuffle<2>(V) * m_row[2] + m_row[3]; - R = R / shuffle<3>(R); + vfloat4 R = broadcast_element<0>(V) * m_row[0] + broadcast_element<1>(V) * m_row[1] + + broadcast_element<2>(V) * m_row[2] + m_row[3]; + R = R / broadcast_element<3>(R); return vfloat3 (R.xyz0()); #else value_t a, b, c, w; @@ -8276,8 +8320,8 @@ OIIO_FORCEINLINE vfloat3 matrix44::transformp (const vfloat3 &V) const { OIIO_FORCEINLINE vfloat3 matrix44::transformv (const vfloat3 &V) const { #if OIIO_SIMD_SSE - vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] + - shuffle<2>(V) * m_row[2]; + vfloat4 R = broadcast_element<0>(V) * m_row[0] + broadcast_element<1>(V) * m_row[1] + + broadcast_element<2>(V) * m_row[2]; return vfloat3 (R.xyz0()); #else value_t a, b, c; @@ -8291,8 +8335,8 @@ OIIO_FORCEINLINE vfloat3 matrix44::transformv (const vfloat3 &V) const { OIIO_FORCEINLINE vfloat3 matrix44::transformvT (const vfloat3 &V) const { #if OIIO_SIMD_SSE matrix44 T = transposed(); - vfloat4 R = shuffle<0>(V) * T[0] + shuffle<1>(V) * T[1] + - shuffle<2>(V) * T[2]; + vfloat4 R = broadcast_element<0>(V) * T[0] + broadcast_element<1>(V) * T[1] + + broadcast_element<2>(V) * T[2]; return vfloat3 (R.xyz0()); #else value_t a, b, c; @@ -8306,8 +8350,8 @@ OIIO_FORCEINLINE vfloat3 matrix44::transformvT (const vfloat3 &V) const { OIIO_FORCEINLINE vfloat4 operator* (const vfloat4 &V, const matrix44& M) { #if OIIO_SIMD_SSE - return shuffle<0>(V) * M[0] + shuffle<1>(V) * M[1] + - shuffle<2>(V) * M[2] + shuffle<3>(V) * M[3]; + return broadcast_element<0>(V) * M[0] + broadcast_element<1>(V) * M[1] + + broadcast_element<2>(V) * M[2] + broadcast_element<3>(V) * M[3]; #else float a, b, c, w; a = V[0] * M[0][0] + V[1] * M[1][0] + V[2] * M[2][0] + V[3] * M[3][0]; @@ -9029,14 +9073,19 @@ OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) { #endif } -template OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) { +template OIIO_FORCEINLINE vfloat8 
broadcast_element(const vfloat8& a) { #if OIIO_SIMD_AVX >= 2 return _mm256_permutevar8x32_ps (a, vint8(i)); #else - return shuffle(a); + return a[i]; #endif } +// DEPRECATED(3.1): old name; use broadcast_element instead +template OIIO_FORCEINLINE vfloat8 shuffle(const vfloat8& a) { + return broadcast_element(a); +} + template OIIO_FORCEINLINE float extract (const vfloat8& v) { @@ -9099,9 +9148,9 @@ OIIO_FORCEINLINE vfloat8 vreduce_add (const vfloat8& v) { vfloat8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_ps(v.simd(), _mm256_setzero_ps()); vfloat8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_ps(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_ps()); // get efgh in the 0-idx slot - vfloat8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0); + vfloat8 efgh = broadcast_element<4>(abcd_0_0_0_efgh_0_0_0); vfloat8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh; - return shuffle<0>(final_sum); + return broadcast_element<0>(final_sum); #else vfloat4 hadd4 = vreduce_add(v.lo()) + vreduce_add(v.hi()); return vfloat8(hadd4, hadd4); @@ -9908,7 +9957,14 @@ vfloat16 shuffle (const vfloat16& a) { #endif } -template vfloat16 shuffle (const vfloat16& a) { +template vfloat16 broadcast_element(const vfloat16& a) { + return a[i]; +} + +// DEPRECATED(3.1): old name and nonstandard use +template +OIIO_DEPRECATED("Use broadcast_element (3.1)") +vfloat16 shuffle(const vfloat16& a) { return shuffle (a); } diff --git a/src/libOpenImageIO/imagebufalgo_pixelmath.cpp b/src/libOpenImageIO/imagebufalgo_pixelmath.cpp index 75ccc53c1d..33c98a709a 100644 --- a/src/libOpenImageIO/imagebufalgo_pixelmath.cpp +++ b/src/libOpenImageIO/imagebufalgo_pixelmath.cpp @@ -1733,7 +1733,7 @@ over_impl_rgbafloat(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, for (int x = 0; x < w; ++x, r += 4, a += 4, b += 4) { vfloat4 a_simd(a); vfloat4 b_simd(b); - vfloat4 alpha = shuffle<3>(a_simd); + vfloat4 alpha = broadcast_element<3>(a_simd); vfloat4 one_minus_alpha = one - clamp(alpha, zero, one); vfloat4 result = a_simd + one_minus_alpha * b_simd; 
result.store(r); diff --git a/src/libtexture/texturesys.cpp b/src/libtexture/texturesys.cpp index 305394d467..84ee61fbfd 100644 --- a/src/libtexture/texturesys.cpp +++ b/src/libtexture/texturesys.cpp @@ -3016,10 +3016,10 @@ TextureSystemImpl::sample_bicubic( if (s_onetile & t_onetile) { // If we thought it was one tile, realize that it isn't unless // it's ascending. - s_onetile &= all(stex - == (simd::shuffle<0>(stex) + (*(vint4*)iota))); - t_onetile &= all(ttex - == (simd::shuffle<0>(ttex) + (*(vint4*)iota))); + s_onetile &= all( + stex == (simd::broadcast_element<0>(stex) + (*(vint4*)iota))); + t_onetile &= all( + ttex == (simd::broadcast_element<0>(ttex) + (*(vint4*)iota))); } bool onetile = (s_onetile & t_onetile); if (onetile & allvalid) { @@ -3199,15 +3199,17 @@ TextureSystemImpl::sample_bicubic( simd::vfloat4 col[4]; for (int j = 0; j < 4; ++j) { simd::vfloat4 lx = lerp(texel_simd[j][0], texel_simd[j][1], - shuffle<0>(h) /*h0x*/); + broadcast_element<0>(h) /*h0x*/); simd::vfloat4 rx = lerp(texel_simd[j][2], texel_simd[j][3], - shuffle<1>(h) /*h1x*/); - col[j] = lerp(lx, rx, shuffle<1>(g) /*g1x*/); + broadcast_element<1>(h) /*h1x*/); + col[j] = lerp(lx, rx, broadcast_element<1>(g) /*g1x*/); } - simd::vfloat4 ly = lerp(col[0], col[1], shuffle<2>(h) /*h0y*/); - simd::vfloat4 ry = lerp(col[2], col[3], shuffle<3>(h) /*h1y*/); + simd::vfloat4 ly = lerp(col[0], col[1], + broadcast_element<2>(h) /*h0y*/); + simd::vfloat4 ry = lerp(col[2], col[3], + broadcast_element<3>(h) /*h1y*/); simd::vfloat4 weight_simd = weight; - accum += weight_simd * lerp(ly, ry, shuffle<3>(g) /*g1y*/); + accum += weight_simd * lerp(ly, ry, broadcast_element<3>(g) /*g1y*/); if (daccumds_) { simd::vfloat4 scalex = weight_simd * float(spec.width); simd::vfloat4 scaley = weight_simd * float(spec.height); diff --git a/src/libutil/simd_test.cpp b/src/libutil/simd_test.cpp index c12b62f4db..b34fb6bbd6 100644 --- a/src/libutil/simd_test.cpp +++ b/src/libutil/simd_test.cpp @@ -1205,14 +1205,14 @@ 
test_shuffle4() OIIO_CHECK_SIMD_EQUAL((shuffle<0, 0, 2, 2>(a)), VEC(0, 0, 2, 2)); OIIO_CHECK_SIMD_EQUAL((shuffle<1, 1, 3, 3>(a)), VEC(1, 1, 3, 3)); OIIO_CHECK_SIMD_EQUAL((shuffle<0, 1, 0, 1>(a)), VEC(0, 1, 0, 1)); - OIIO_CHECK_SIMD_EQUAL((shuffle<2>(a)), VEC(ELEM(2))); + OIIO_CHECK_SIMD_EQUAL((broadcast_element<2>(a)), VEC(ELEM(2))); benchmark("shuffle<...> ", [&](const VEC& v) { return shuffle<3, 2, 1, 0>(v); }, a); - benchmark("shuffle<0> ", [&](const VEC& v) { return shuffle<0>(v); }, a); - benchmark("shuffle<1> ", [&](const VEC& v) { return shuffle<1>(v); }, a); - benchmark("shuffle<2> ", [&](const VEC& v) { return shuffle<2>(v); }, a); - benchmark("shuffle<3> ", [&](const VEC& v) { return shuffle<3>(v); }, a); + benchmark("broadcast_element<0> ", [&](const VEC& v) { return broadcast_element<0>(v); }, a); + benchmark("broadcast_element<1> ", [&](const VEC& v) { return broadcast_element<1>(v); }, a); + benchmark("broadcast_element<2> ", [&](const VEC& v) { return broadcast_element<2>(v); }, a); + benchmark("broadcast_element<3> ", [&](const VEC& v) { return broadcast_element<3>(v); }, a); } @@ -1227,17 +1227,17 @@ void test_shuffle8 () OIIO_CHECK_SIMD_EQUAL ((shuffle<0,0,2,2,0,0,2,2>(a)), VEC(0,0,2,2,0,0,2,2)); OIIO_CHECK_SIMD_EQUAL ((shuffle<1,1,3,3,1,1,3,3>(a)), VEC(1,1,3,3,1,1,3,3)); OIIO_CHECK_SIMD_EQUAL ((shuffle<0,1,0,1,0,1,0,1>(a)), VEC(0,1,0,1,0,1,0,1)); - OIIO_CHECK_SIMD_EQUAL ((shuffle<2>(a)), VEC(ELEM(2))); + OIIO_CHECK_SIMD_EQUAL ((broadcast_element<2>(a)), VEC(ELEM(2))); benchmark ("shuffle<...> ", [&](const VEC& v){ return shuffle<7,6,5,4,3,2,1,0>(v); }, a); - benchmark ("shuffle<0> ", [&](const VEC& v){ return shuffle<0>(v); }, a); - benchmark ("shuffle<1> ", [&](const VEC& v){ return shuffle<1>(v); }, a); - benchmark ("shuffle<2> ", [&](const VEC& v){ return shuffle<2>(v); }, a); - benchmark ("shuffle<3> ", [&](const VEC& v){ return shuffle<3>(v); }, a); - benchmark ("shuffle<4> ", [&](const VEC& v){ return shuffle<4>(v); }, a); - benchmark 
("shuffle<5> ", [&](const VEC& v){ return shuffle<5>(v); }, a); - benchmark ("shuffle<6> ", [&](const VEC& v){ return shuffle<6>(v); }, a); - benchmark ("shuffle<7> ", [&](const VEC& v){ return shuffle<7>(v); }, a); + benchmark ("broadcast_element<0> ", [&](const VEC& v){ return broadcast_element<0>(v); }, a); + benchmark ("broadcast_element<1> ", [&](const VEC& v){ return broadcast_element<1>(v); }, a); + benchmark ("broadcast_element<2> ", [&](const VEC& v){ return broadcast_element<2>(v); }, a); + benchmark ("broadcast_element<3> ", [&](const VEC& v){ return broadcast_element<3>(v); }, a); + benchmark ("broadcast_element<4> ", [&](const VEC& v){ return broadcast_element<4>(v); }, a); + benchmark ("broadcast_element<5> ", [&](const VEC& v){ return broadcast_element<5>(v); }, a); + benchmark ("broadcast_element<6> ", [&](const VEC& v){ return broadcast_element<6>(v); }, a); + benchmark ("broadcast_element<7> ", [&](const VEC& v){ return broadcast_element<7>(v); }, a); } @@ -1257,11 +1257,11 @@ void test_shuffle16 () // Shuffle within groups of 4 OIIO_CHECK_SIMD_EQUAL ((shuffle<3,2,1,0>(a)), VEC(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12)); - OIIO_CHECK_SIMD_EQUAL ((shuffle<3>(a)), - VEC(3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15)); + OIIO_CHECK_SIMD_EQUAL ((broadcast_element<3>(a)), VEC(3)); - benchmark ("shuffle4<> ", [&](const VEC& v){ return shuffle<3,2,1,0>(v); }, a); - benchmark ("shuffle<> ", [&](const VEC& v){ return shuffle<3,2,1,0>(v); }, a); + benchmark ("shuffle4<> ", [&](const VEC& v){ return shuffle4<3,2,1,0>(v); }, a); + benchmark ("shuffle4<> ", [&](const VEC& v){ return shuffle4<3>(v); }, a); + benchmark ("broadcast_element<> ", [&](const VEC& v){ return broadcast_element<3>(v); }, a); }