Skip to content

Commit 76972d3

Browse files
lgritz authored and zachlewis committed
simd.h: renaming and fixing of shuffle template (AcademySoftwareFoundation#4739)
While I was investigating some SonarQube warnings about our simd `shuffle<>` templates (which are a false positive, and I have a separate PR to simply silence them), I got to thinking about the naming, and want to make a couple of changes:

* I decided that the 1-template-argument version of this function, `shuffle<int i>(simd_type)`, would be clearer and more self-documenting if renamed `broadcast_element`, to emphasize that it takes just one simd lane/element and broadcasts it to all lanes. (The multi-argument shuffle really is doing a true shuffle, giving an index for each lane to make a permutation or swizzle, so I'm not renaming that one.) To avoid breaking source compatibility, I am also leaving the old name in place as a synonym, but commenting it as deprecated, and I will phase out its use. It will disappear entirely from a future OIIO version in which it is safe to break compatibility.

* For 16-wide simd, the 1-arg template we called shuffle was not doing the same operation -- it was replicating one group of 4 elements instead of a single element. We didn't use it anywhere, so I'm redefining it to do the analogous thing as it does for 4-wide and 8-wide.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
1 parent 82fe312 commit 76972d3

4 files changed

Lines changed: 129 additions & 71 deletions

File tree

src/include/OpenImageIO/simd.h

Lines changed: 98 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -619,8 +619,9 @@ class vbool4 {
619619
template<int i0, int i1, int i2, int i3>
620620
OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a);
621621

622-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
623-
template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a);
622+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
623+
/// value a[i].
624+
template<int i> OIIO_FORCEINLINE vbool4 broadcast_element(const vbool4& a);
624625

625626
/// Helper: as rapid as possible extraction of one component, when the
626627
/// index is fixed.
@@ -770,8 +771,9 @@ class vbool8 {
770771
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
771772
OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a);
772773

773-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
774-
template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a);
774+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
775+
/// value a[i].
776+
template<int i> OIIO_FORCEINLINE vbool8 broadcast_element(const vbool8& a);
775777

776778
/// Helper: as rapid as possible extraction of one component, when the
777779
/// index is fixed.
@@ -1163,8 +1165,9 @@ vint4 srl (const vint4& val, const unsigned int bits);
11631165
template<int i0, int i1, int i2, int i3>
11641166
OIIO_FORCEINLINE vint4 shuffle (const vint4& a);
11651167

1166-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1167-
template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a);
1168+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
1169+
/// value a[i].
1170+
template<int i> OIIO_FORCEINLINE vint4 broadcast_element(const vint4& a);
11681171

11691172
/// Helper: as rapid as possible extraction of one component, when the
11701173
/// index is fixed.
@@ -1463,8 +1466,9 @@ vint8 srl (const vint8& val, const unsigned int bits);
14631466
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
14641467
OIIO_FORCEINLINE vint8 shuffle (const vint8& a);
14651468

1466-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1467-
template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a);
1469+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
1470+
/// value a[i].
1471+
template<int i> OIIO_FORCEINLINE vint8 broadcast_element(const vint8& a);
14681472

14691473
/// Helper: as rapid as possible extraction of one component, when the
14701474
/// index is fixed.
@@ -1773,8 +1777,9 @@ template<int i> vint16 shuffle4 (const vint16& a);
17731777
template<int i0, int i1, int i2, int i3>
17741778
vint16 shuffle (const vint16& a);
17751779

1776-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1777-
template<int i> vint16 shuffle (const vint16& a);
1780+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
1781+
/// value a[i].
1782+
template<int i> vint16 broadcast_element(const vint16& a);
17781783

17791784
/// Helper: as rapid as possible extraction of one component, when the
17801785
/// index is fixed.
@@ -2098,8 +2103,9 @@ class vfloat4 {
20982103
template<int i0, int i1, int i2, int i3>
20992104
OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
21002105

2101-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2102-
template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
2106+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
2107+
/// value a[i].
2108+
template<int i> OIIO_FORCEINLINE vfloat4 broadcast_element(const vfloat4& a);
21032109

21042110
/// Return { a[i0], a[i1], b[i2], b[i3] }, where i0..i3 are the extracted
21052111
/// 2-bit indices packed into the template parameter i (going from the low
@@ -2721,8 +2727,8 @@ class vfloat8 {
27212727
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
27222728
OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a);
27232729

2724-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i,...>(a)
2725-
template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a);
2730+
/// broadcast_element<i>(a) is the same as shuffle<i,i,i,i,...>(a)
2731+
template<int i> OIIO_FORCEINLINE vfloat8 broadcast_element(const vfloat8& a);
27262732

27272733
/// Helper: as rapid as possible extraction of one component, when the
27282734
/// index is fixed.
@@ -3051,8 +3057,9 @@ template<int i> OIIO_FORCEINLINE vfloat16 shuffle4 (const vfloat16& a);
30513057
template<int i0, int i1, int i2, int i3>
30523058
OIIO_FORCEINLINE vfloat16 shuffle (const vfloat16& a);
30533059

3054-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
3055-
template<int i> vfloat16 shuffle (const vfloat16& a);
3060+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
3061+
/// value a[i].
3062+
template<int i> vfloat16 broadcast_element(const vfloat16& a);
30563063

30573064
/// Helper: as rapid as possible extraction of one component, when the
30583065
/// index is fixed.
@@ -3473,11 +3480,17 @@ OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
34733480
#endif
34743481
}
34753482

3476-
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
3477-
template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
3483+
/// broadcast_element<i>(a) returns a simd variable in which all lanes have
3484+
/// value a[i].
3485+
template<int i> OIIO_FORCEINLINE vbool4 broadcast_element(const vbool4& a) {
34783486
return shuffle<i,i,i,i>(a);
34793487
}
34803488

3489+
// DEPRECATED(3.1): old name; use broadcast_element instead
3490+
template<int i> OIIO_FORCEINLINE vbool4 shuffle(const vbool4& a) {
3491+
return broadcast_element<i>(a);
3492+
}
3493+
34813494

34823495
/// Helper: as rapid as possible extraction of one component, when the
34833496
/// index is fixed.
@@ -3801,10 +3814,15 @@ OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
38013814
#endif
38023815
}
38033816

3804-
template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
3817+
template<int i> OIIO_FORCEINLINE vbool8 broadcast_element(const vbool8& a) {
38053818
return shuffle<i,i,i,i,i,i,i,i>(a);
38063819
}
38073820

3821+
// DEPRECATED(3.1): old name; use broadcast_element instead
3822+
template<int i> OIIO_FORCEINLINE vbool8 shuffle(const vbool8& a) {
3823+
return broadcast_element<i>(a);
3824+
}
3825+
38083826

38093827
template<int i>
38103828
OIIO_FORCEINLINE bool extract (const vbool8& a) {
@@ -4744,7 +4762,14 @@ OIIO_FORCEINLINE vint4 shuffle (const vint4& a) {
47444762
#endif
47454763
}
47464764

4747-
template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a) { return shuffle<i,i,i,i>(a); }
4765+
template<int i> OIIO_FORCEINLINE vint4 broadcast_element(const vint4& a) {
4766+
return shuffle<i,i,i,i>(a);
4767+
}
4768+
4769+
// DEPRECATED(3.1): old name; use broadcast_element instead
4770+
template<int i> OIIO_FORCEINLINE vint4 shuffle(const vint4& a) {
4771+
return broadcast_element<i>(a);
4772+
}
47484773

47494774

47504775
template<int i>
@@ -5584,10 +5609,15 @@ OIIO_FORCEINLINE vint8 shuffle (const vint8& a) {
55845609
#endif
55855610
}
55865611

5587-
template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a) {
5612+
template<int i> OIIO_FORCEINLINE vint8 broadcast_element(const vint8& a) {
55885613
return shuffle<i,i,i,i,i,i,i,i>(a);
55895614
}
55905615

5616+
// DEPRECATED(3.1): old name; use broadcast_element instead
5617+
template<int i> OIIO_FORCEINLINE vint8 shuffle(const vint8& a) {
5618+
return broadcast_element<i>(a);
5619+
}
5620+
55915621

55925622
template<int i>
55935623
OIIO_FORCEINLINE int extract (const vint8& v) {
@@ -6395,8 +6425,15 @@ vint16 shuffle (const vint16& a) {
63956425
#endif
63966426
}
63976427

6398-
template<int i> vint16 shuffle (const vint16& a) {
6399-
return shuffle<i,i,i,i> (a);
6428+
template<int i> vint16 broadcast_element(const vint16& a) {
6429+
return a[i];
6430+
}
6431+
6432+
// DEPRECATED(3.1): old name and nonstandard use
6433+
template<int i>
6434+
OIIO_DEPRECATED("Use broadcast_element (3.1)")
6435+
vint16 shuffle(const vint16& a) {
6436+
return broadcast_element<i> (a);
64006437
}
64016438

64026439

@@ -7253,19 +7290,26 @@ OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) {
72537290
#endif
72547291
}
72557292

7256-
template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) { return shuffle<i,i,i,i>(a); }
7293+
template<int i> OIIO_FORCEINLINE vfloat4 broadcast_element(const vfloat4& a) {
7294+
return shuffle<i,i,i,i>(a);
7295+
}
7296+
7297+
// DEPRECATED(3.1): old name; use broadcast_element instead
7298+
template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) {
7299+
return broadcast_element<i>(a);
7300+
}
72577301

72587302
#if OIIO_SIMD_NEON
7259-
template<> OIIO_FORCEINLINE vfloat4 shuffle<0> (const vfloat4& a) {
7303+
template<> OIIO_FORCEINLINE vfloat4 broadcast_element<0> (const vfloat4& a) {
72607304
float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,0);
72617305
}
7262-
template<> OIIO_FORCEINLINE vfloat4 shuffle<1> (const vfloat4& a) {
7306+
template<> OIIO_FORCEINLINE vfloat4 broadcast_element<1> (const vfloat4& a) {
72637307
float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,1);
72647308
}
7265-
template<> OIIO_FORCEINLINE vfloat4 shuffle<2> (const vfloat4& a) {
7309+
template<> OIIO_FORCEINLINE vfloat4 broadcast_element<2> (const vfloat4& a) {
72667310
float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,0);
72677311
}
7268-
template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) {
7312+
template<> OIIO_FORCEINLINE vfloat4 broadcast_element<3> (const vfloat4& a) {
72697313
float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,1);
72707314
}
72717315
#endif
@@ -8265,9 +8309,9 @@ OIIO_FORCEINLINE matrix44 matrix44::transposed () const {
82658309

82668310
OIIO_FORCEINLINE vfloat3 matrix44::transformp (const vfloat3 &V) const {
82678311
#if OIIO_SIMD_SSE
8268-
vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
8269-
shuffle<2>(V) * m_row[2] + m_row[3];
8270-
R = R / shuffle<3>(R);
8312+
vfloat4 R = broadcast_element<0>(V) * m_row[0] + broadcast_element<1>(V) * m_row[1] +
8313+
broadcast_element<2>(V) * m_row[2] + m_row[3];
8314+
R = R / broadcast_element<3>(R);
82718315
return vfloat3 (R.xyz0());
82728316
#else
82738317
value_t a, b, c, w;
@@ -8281,8 +8325,8 @@ OIIO_FORCEINLINE vfloat3 matrix44::transformp (const vfloat3 &V) const {
82818325

82828326
OIIO_FORCEINLINE vfloat3 matrix44::transformv (const vfloat3 &V) const {
82838327
#if OIIO_SIMD_SSE
8284-
vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
8285-
shuffle<2>(V) * m_row[2];
8328+
vfloat4 R = broadcast_element<0>(V) * m_row[0] + broadcast_element<1>(V) * m_row[1] +
8329+
broadcast_element<2>(V) * m_row[2];
82868330
return vfloat3 (R.xyz0());
82878331
#else
82888332
value_t a, b, c;
@@ -8296,8 +8340,8 @@ OIIO_FORCEINLINE vfloat3 matrix44::transformv (const vfloat3 &V) const {
82968340
OIIO_FORCEINLINE vfloat3 matrix44::transformvT (const vfloat3 &V) const {
82978341
#if OIIO_SIMD_SSE
82988342
matrix44 T = transposed();
8299-
vfloat4 R = shuffle<0>(V) * T[0] + shuffle<1>(V) * T[1] +
8300-
shuffle<2>(V) * T[2];
8343+
vfloat4 R = broadcast_element<0>(V) * T[0] + broadcast_element<1>(V) * T[1] +
8344+
broadcast_element<2>(V) * T[2];
83018345
return vfloat3 (R.xyz0());
83028346
#else
83038347
value_t a, b, c;
@@ -8311,8 +8355,8 @@ OIIO_FORCEINLINE vfloat3 matrix44::transformvT (const vfloat3 &V) const {
83118355
OIIO_FORCEINLINE vfloat4 operator* (const vfloat4 &V, const matrix44& M)
83128356
{
83138357
#if OIIO_SIMD_SSE
8314-
return shuffle<0>(V) * M[0] + shuffle<1>(V) * M[1] +
8315-
shuffle<2>(V) * M[2] + shuffle<3>(V) * M[3];
8358+
return broadcast_element<0>(V) * M[0] + broadcast_element<1>(V) * M[1] +
8359+
broadcast_element<2>(V) * M[2] + broadcast_element<3>(V) * M[3];
83168360
#else
83178361
float a, b, c, w;
83188362
a = V[0] * M[0][0] + V[1] * M[1][0] + V[2] * M[2][0] + V[3] * M[3][0];
@@ -9034,14 +9078,19 @@ OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) {
90349078
#endif
90359079
}
90369080

9037-
template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) {
9081+
template<int i> OIIO_FORCEINLINE vfloat8 broadcast_element(const vfloat8& a) {
90389082
#if OIIO_SIMD_AVX >= 2
90399083
return _mm256_permutevar8x32_ps (a, vint8(i));
90409084
#else
9041-
return shuffle<i,i,i,i,i,i,i,i>(a);
9085+
return a[i];
90429086
#endif
90439087
}
90449088

9089+
// DEPRECATED(3.1): old name; use broadcast_element instead
9090+
template<int i> OIIO_FORCEINLINE vfloat8 shuffle(const vfloat8& a) {
9091+
return broadcast_element<i>(a);
9092+
}
9093+
90459094

90469095
template<int i>
90479096
OIIO_FORCEINLINE float extract (const vfloat8& v) {
@@ -9104,9 +9153,9 @@ OIIO_FORCEINLINE vfloat8 vreduce_add (const vfloat8& v) {
91049153
vfloat8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_ps(v.simd(), _mm256_setzero_ps());
91059154
vfloat8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_ps(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_ps());
91069155
// get efgh in the 0-idx slot
9107-
vfloat8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
9156+
vfloat8 efgh = broadcast_element<4>(abcd_0_0_0_efgh_0_0_0);
91089157
vfloat8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
9109-
return shuffle<0>(final_sum);
9158+
return broadcast_element<0>(final_sum);
91109159
#else
91119160
vfloat4 hadd4 = vreduce_add(v.lo()) + vreduce_add(v.hi());
91129161
return vfloat8(hadd4, hadd4);
@@ -9913,7 +9962,14 @@ vfloat16 shuffle (const vfloat16& a) {
99139962
#endif
99149963
}
99159964

9916-
template<int i> vfloat16 shuffle (const vfloat16& a) {
9965+
template<int i> vfloat16 broadcast_element(const vfloat16& a) {
9966+
return a[i];
9967+
}
9968+
9969+
// DEPRECATED(3.1): old name and nonstandard use
9970+
template<int i>
9971+
OIIO_DEPRECATED("Use broadcast_element (3.1)")
9972+
vfloat16 shuffle(const vfloat16& a) {
99179973
return shuffle<i,i,i,i> (a);
99189974
}
99199975

src/libOpenImageIO/imagebufalgo_pixelmath.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1733,7 +1733,7 @@ over_impl_rgbafloat(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
17331733
for (int x = 0; x < w; ++x, r += 4, a += 4, b += 4) {
17341734
vfloat4 a_simd(a);
17351735
vfloat4 b_simd(b);
1736-
vfloat4 alpha = shuffle<3>(a_simd);
1736+
vfloat4 alpha = broadcast_element<3>(a_simd);
17371737
vfloat4 one_minus_alpha = one - clamp(alpha, zero, one);
17381738
vfloat4 result = a_simd + one_minus_alpha * b_simd;
17391739
result.store(r);

src/libtexture/texturesys.cpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3018,10 +3018,10 @@ TextureSystemImpl::sample_bicubic(
30183018
if (s_onetile & t_onetile) {
30193019
// If we thought it was one tile, realize that it isn't unless
30203020
// it's ascending.
3021-
s_onetile &= all(stex
3022-
== (simd::shuffle<0>(stex) + (*(vint4*)iota)));
3023-
t_onetile &= all(ttex
3024-
== (simd::shuffle<0>(ttex) + (*(vint4*)iota)));
3021+
s_onetile &= all(
3022+
stex == (simd::broadcast_element<0>(stex) + (*(vint4*)iota)));
3023+
t_onetile &= all(
3024+
ttex == (simd::broadcast_element<0>(ttex) + (*(vint4*)iota)));
30253025
}
30263026
bool onetile = (s_onetile & t_onetile);
30273027
if (onetile & allvalid) {
@@ -3201,15 +3201,17 @@ TextureSystemImpl::sample_bicubic(
32013201
simd::vfloat4 col[4];
32023202
for (int j = 0; j < 4; ++j) {
32033203
simd::vfloat4 lx = lerp(texel_simd[j][0], texel_simd[j][1],
3204-
shuffle<0>(h) /*h0x*/);
3204+
broadcast_element<0>(h) /*h0x*/);
32053205
simd::vfloat4 rx = lerp(texel_simd[j][2], texel_simd[j][3],
3206-
shuffle<1>(h) /*h1x*/);
3207-
col[j] = lerp(lx, rx, shuffle<1>(g) /*g1x*/);
3206+
broadcast_element<1>(h) /*h1x*/);
3207+
col[j] = lerp(lx, rx, broadcast_element<1>(g) /*g1x*/);
32083208
}
3209-
simd::vfloat4 ly = lerp(col[0], col[1], shuffle<2>(h) /*h0y*/);
3210-
simd::vfloat4 ry = lerp(col[2], col[3], shuffle<3>(h) /*h1y*/);
3209+
simd::vfloat4 ly = lerp(col[0], col[1],
3210+
broadcast_element<2>(h) /*h0y*/);
3211+
simd::vfloat4 ry = lerp(col[2], col[3],
3212+
broadcast_element<3>(h) /*h1y*/);
32113213
simd::vfloat4 weight_simd = weight;
3212-
accum += weight_simd * lerp(ly, ry, shuffle<3>(g) /*g1y*/);
3214+
accum += weight_simd * lerp(ly, ry, broadcast_element<3>(g) /*g1y*/);
32133215
if (daccumds_) {
32143216
simd::vfloat4 scalex = weight_simd * float(spec.width);
32153217
simd::vfloat4 scaley = weight_simd * float(spec.height);

0 commit comments

Comments
 (0)