11// SPDX-License-Identifier: BSD-3-Clause
22// Copyright Contributors to the OpenColorIO Project.
33
4-
54#ifndef INCLUDED_OCIO_AVX_H
65#define INCLUDED_OCIO_AVX_H
76
109
1110#include < immintrin.h>
1211
13- #include < OpenColorIO/OpenColorIO.h>
1412#include " BitDepthUtils.h"
13+ #include < OpenColorIO/OpenColorIO.h>
1514
1615// Macros for alignment declarations
1716#define AVX_SIMD_BYTES 32
@@ -31,14 +30,21 @@ inline __m256 avx_movehl_ps(__m256 a, __m256 b)
3130 return _mm256_castpd_ps (_mm256_unpackhi_pd (_mm256_castps_pd (b), _mm256_castps_pd (a)));
3231}
3332
// Clamps every float lane of `value` to the range [0, maxValue].
// The lower bound is applied first, then the upper bound.
inline __m256 avx_clamp(__m256 value, const __m256 & maxValue)
{
    const __m256 zero        = _mm256_setzero_ps();
    const __m256 nonNegative = _mm256_max_ps(value, zero);
    return _mm256_min_ps(nonNegative, maxValue);
}
3938
40- inline void avxRGBATranspose_4x4_4x4 (__m256 row0, __m256 row1, __m256 row2, __m256 row3,
41- __m256 &out_r, __m256 &out_g, __m256 &out_b, __m256 &out_a )
39+ inline void avxRGBATranspose_4x4_4x4 (
40+ __m256 row0,
41+ __m256 row1,
42+ __m256 row2,
43+ __m256 row3,
44+ __m256 & out_r,
45+ __m256 & out_g,
46+ __m256 & out_b,
47+ __m256 & out_a)
4248{
4349 // the rgba transpose result will look like this
4450 //
@@ -61,14 +67,13 @@ inline void avxRGBATranspose_4x4_4x4(__m256 row0, __m256 row1, __m256 row2, __m2
6167 out_g = avx_movehl_ps (tmp2, tmp0);
6268 out_b = avx_movelh_ps (tmp1, tmp3);
6369 out_a = avx_movehl_ps (tmp3, tmp1);
64-
6570}
6671
// Zero-extends the eight lowest bytes of `a` into eight 32-bit integers.
inline __m256i avx_load_u8(__m128i a)
{
    // _mm_cvtepu8_epi32 only consumes the lowest 32 bits of its operand, so the
    // shuffle just has to bring 32-bit lane 1 (bytes 4..7) down to lane 0.
    const __m128i shifted = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 1));

    const __m128i lowQuad  = _mm_cvtepu8_epi32(a);       // bytes 0..3
    const __m128i highQuad = _mm_cvtepu8_epi32(shifted); // bytes 4..7

    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowQuad), highQuad, 1);
}
@@ -93,31 +98,34 @@ inline __m128i avx_pack_u8(__m256i a, __m256i b)
9398
9499// Note Packing functions perform no 0.0 - 1.0 normalization
95100// but perform 0 - max value clamping for integer formats
96- template <BitDepth BD > struct AVXRGBAPack {};
101+ template <BitDepth BD > struct AVXRGBAPack
102+ {
103+ };
97104
98- template <>
99- struct AVXRGBAPack <BIT_DEPTH_UINT8 >
105+ template <> struct AVXRGBAPack <BIT_DEPTH_UINT8 >
100106{
101- static inline void Load (const uint8_t *in, __m256& r, __m256& g, __m256& b, __m256& a)
107+ static inline void Load (const uint8_t * in, __m256 & r, __m256 & g, __m256 & b, __m256 & a)
102108 {
103- __m256i rgba_00_07 = _mm256_loadu_si256 ((const __m256i*)in);
109+ __m256i rgba_00_07 = _mm256_loadu_si256 ((const __m256i *)in);
104110
105- __m128i rgba_00_03 =_mm256_castsi256_si128 (rgba_00_07);
106- __m128i rgba_04_07 =_mm256_extractf128_si256 (rgba_00_07, 1 );
111+ __m128i rgba_00_03 = _mm256_castsi256_si128 (rgba_00_07);
112+ __m128i rgba_04_07 = _mm256_extractf128_si256 (rgba_00_07, 1 );
107113
108114 // : 0, 1, 2, 3 | 4, 5, 6, 7 | 8, 9, 10, 11 | 12, 13, 14, 15
109115 // rgba_x03 : r0, g0, b0, a0 | r1, g1, b1, a1 | r2, g2, b2, a2 | r3, g3, b3, a3
110116 // rgba_x47 : r4, g4, b4, a4 | r5, g5, b5, a5 | r6, g6, b6, a6 | r7, g7, b7, a7
111117
112118 __m256 rgba0 = _mm256_cvtepi32_ps (avx_load_u8 (rgba_00_03));
113- __m256 rgba1 = _mm256_cvtepi32_ps (avx_load_u8 (_mm_shuffle_epi32 (rgba_00_03, _MM_SHUFFLE (3 , 2 , 3 , 2 ))));
119+ __m256 rgba1 = _mm256_cvtepi32_ps (
120+ avx_load_u8 (_mm_shuffle_epi32 (rgba_00_03, _MM_SHUFFLE (3 , 2 , 3 , 2 ))));
114121
115122 __m256 rgba2 = _mm256_cvtepi32_ps (avx_load_u8 (rgba_04_07));
116- __m256 rgba3 = _mm256_cvtepi32_ps (avx_load_u8 (_mm_shuffle_epi32 (rgba_04_07, _MM_SHUFFLE (3 , 2 , 3 , 2 ))));
123+ __m256 rgba3 = _mm256_cvtepi32_ps (
124+ avx_load_u8 (_mm_shuffle_epi32 (rgba_04_07, _MM_SHUFFLE (3 , 2 , 3 , 2 ))));
117125
118126 avxRGBATranspose_4x4_4x4 (rgba0, rgba1, rgba2, rgba3, r, g, b, a);
119127 }
120- static inline void Store (uint8_t *out, __m256 r, __m256 g, __m256 b, __m256 a)
128+ static inline void Store (uint8_t * out, __m256 r, __m256 g, __m256 b, __m256 a)
121129 {
122130 __m256 rgba0, rgba1, rgba2, rgba3;
123131 const __m256 maxValue = _mm256_set1_ps (255 .0f );
@@ -129,7 +137,8 @@ struct AVXRGBAPack<BIT_DEPTH_UINT8>
129137 rgba2 = avx_clamp (rgba2, maxValue);
130138 rgba3 = avx_clamp (rgba3, maxValue);
131139
132- // NOTE note using cvtps which will round based on MXCSR register defaults to _MM_ROUND_NEAREST
140+ // NOTE: using cvtps, which rounds according to the MXCSR register
141+ // (defaults to _MM_ROUND_NEAREST)
133142 __m256i rgba01 = _mm256_cvtps_epi32 (rgba0);
134143 __m256i rgba23 = _mm256_cvtps_epi32 (rgba1);
135144 __m256i rgba45 = _mm256_cvtps_epi32 (rgba2);
@@ -140,15 +149,15 @@ struct AVXRGBAPack<BIT_DEPTH_UINT8>
140149
141150 __m256i rgba = _mm256_insertf128_si256 (_mm256_castsi128_si256 (lo), hi, 1 );
142151
143- _mm256_storeu_si256 ((__m256i*)out, rgba);
152+ _mm256_storeu_si256 ((__m256i *)out, rgba);
144153 }
145154};
146155
// Zero-extends the eight 16-bit values held in `a` into eight 32-bit integers.
inline __m256i avx_unpack_u16(__m128i a)
{
    // _mm_cvtepu16_epi32 only consumes the lowest 64 bits of its operand, so swap
    // the two 64-bit halves to expose elements 4..7.
    const __m128i swapped = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2));

    const __m128i lowQuad  = _mm_cvtepu16_epi32(a);       // elements 0..3
    const __m128i highQuad = _mm_cvtepu16_epi32(swapped); // elements 4..7

    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowQuad), highQuad, 1);
}
@@ -176,16 +185,15 @@ inline __m128i avx_pack_u16(__m256i a)
176185 return _mm_or_si128 (lo, hi);
177186}
178187
179- template <BitDepth BD >
180- struct AVXRGBAPack16
188+ template <BitDepth BD > struct AVXRGBAPack16
181189{
182190 typedef typename BitDepthInfo<BD >::Type Type;
183191
184- static inline void Load (const Type *in, __m256& r, __m256& g, __m256& b, __m256& a)
192+ static inline void Load (const Type * in, __m256 & r, __m256 & g, __m256 & b, __m256 & a)
185193 {
186194 // const __m256 scale = _mm256_set1_ps(1.0f / (float)BitDepthInfo<BD>::maxValue);
187- __m256i rgba_00_03 = _mm256_loadu_si256 ((const __m256i*)(in + 0 ));
188- __m256i rgba_04_07 = _mm256_loadu_si256 ((const __m256i*)(in + 16 ));
195+ __m256i rgba_00_03 = _mm256_loadu_si256 ((const __m256i *)(in + 0 ));
196+ __m256i rgba_04_07 = _mm256_loadu_si256 ((const __m256i *)(in + 16 ));
189197
190198 __m256 rgba0 = _mm256_cvtepi32_ps (avx_unpack_u16 (_mm256_castsi256_si128 (rgba_00_03)));
191199 __m256 rgba1 = _mm256_cvtepi32_ps (avx_unpack_u16 (_mm256_extractf128_si256 (rgba_00_03, 1 )));
@@ -195,7 +203,7 @@ struct AVXRGBAPack16
195203 avxRGBATranspose_4x4_4x4 (rgba0, rgba1, rgba2, rgba3, r, g, b, a);
196204 }
197205
198- static inline void Store (Type *out, __m256 r, __m256 g, __m256 b, __m256 a)
206+ static inline void Store (Type * out, __m256 r, __m256 g, __m256 b, __m256 a)
199207 {
200208 __m256 rgba0, rgba1, rgba2, rgba3;
201209 __m128i lo, hi;
@@ -209,7 +217,8 @@ struct AVXRGBAPack16
209217 rgba2 = avx_clamp (rgba2, maxValue);
210218 rgba3 = avx_clamp (rgba3, maxValue);
211219
212- // NOTE note using cvtps which will round based on MXCSR register defaults to _MM_ROUND_NEAREST
220+ // NOTE: using cvtps, which rounds according to the MXCSR register
221+ // (defaults to _MM_ROUND_NEAREST)
213222 __m256i rgba01 = _mm256_cvtps_epi32 (rgba0);
214223 __m256i rgba23 = _mm256_cvtps_epi32 (rgba1);
215224 __m256i rgba45 = _mm256_cvtps_epi32 (rgba2);
@@ -219,65 +228,61 @@ struct AVXRGBAPack16
219228 hi = avx_pack_u16 (rgba23);
220229
221230 rgba = _mm256_insertf128_si256 (_mm256_castsi128_si256 (lo), hi, 1 );
222- _mm256_storeu_si256 ((__m256i*)(out+ 0 ), rgba);
231+ _mm256_storeu_si256 ((__m256i *)(out + 0 ), rgba);
223232
224233 lo = avx_pack_u16 (rgba45);
225234 hi = avx_pack_u16 (rgba67);
226235
227236 rgba = _mm256_insertf128_si256 (_mm256_castsi128_si256 (lo), hi, 1 );
228- _mm256_storeu_si256 ((__m256i*)(out+ 16 ), rgba);
237+ _mm256_storeu_si256 ((__m256i *)(out + 16 ), rgba);
229238 }
230239};
231240
232- template <>
233- struct AVXRGBAPack <BIT_DEPTH_UINT10 >
241+ template <> struct AVXRGBAPack <BIT_DEPTH_UINT10 >
234242{
235- static inline void Load (const uint16_t *in, __m256& r, __m256& g, __m256& b, __m256& a)
243+ static inline void Load (const uint16_t * in, __m256 & r, __m256 & g, __m256 & b, __m256 & a)
236244 {
237245 AVXRGBAPack16<BIT_DEPTH_UINT10 >::Load (in, r, g, b, a);
238246 }
239- static inline void Store (uint16_t *out, __m256 r, __m256 g, __m256 b, __m256 a)
247+ static inline void Store (uint16_t * out, __m256 r, __m256 g, __m256 b, __m256 a)
240248 {
241249 AVXRGBAPack16<BIT_DEPTH_UINT10 >::Store (out, r, g, b, a);
242250 }
243251};
244252
245- template <>
246- struct AVXRGBAPack <BIT_DEPTH_UINT12 >
253+ template <> struct AVXRGBAPack <BIT_DEPTH_UINT12 >
247254{
248- static inline void Load (const uint16_t *in, __m256& r, __m256& g, __m256& b, __m256& a)
255+ static inline void Load (const uint16_t * in, __m256 & r, __m256 & g, __m256 & b, __m256 & a)
249256 {
250257 AVXRGBAPack16<BIT_DEPTH_UINT12 >::Load (in, r, g, b, a);
251258 }
252- static inline void Store (uint16_t *out, __m256 r, __m256 g, __m256 b, __m256 a)
259+ static inline void Store (uint16_t * out, __m256 r, __m256 g, __m256 b, __m256 a)
253260 {
254261 AVXRGBAPack16<BIT_DEPTH_UINT12 >::Store (out, r, g, b, a);
255262 }
256263};
257264
258- template <>
259- struct AVXRGBAPack <BIT_DEPTH_UINT16 >
265+ template <> struct AVXRGBAPack <BIT_DEPTH_UINT16 >
260266{
261- static inline void Load (const uint16_t *in, __m256& r, __m256& g, __m256& b, __m256& a)
267+ static inline void Load (const uint16_t * in, __m256 & r, __m256 & g, __m256 & b, __m256 & a)
262268 {
263269 AVXRGBAPack16<BIT_DEPTH_UINT16 >::Load (in, r, g, b, a);
264270 }
265- static inline void Store (uint16_t *out, __m256 r, __m256 g, __m256 b, __m256 a)
271+ static inline void Store (uint16_t * out, __m256 r, __m256 g, __m256 b, __m256 a)
266272 {
267273 AVXRGBAPack16<BIT_DEPTH_UINT16 >::Store (out, r, g, b, a);
268274 }
269275};
270276
271277#if OCIO_USE_F16C
272278
273- template <>
274- struct AVXRGBAPack <BIT_DEPTH_F16 >
279+ template <> struct AVXRGBAPack <BIT_DEPTH_F16 >
275280{
276- static inline void Load (const half *in, __m256& r, __m256& g, __m256& b, __m256& a)
281+ static inline void Load (const half * in, __m256 & r, __m256 & g, __m256 & b, __m256 & a)
277282 {
278283
279- __m256i rgba_00_03 = _mm256_loadu_si256 ((const __m256i*)(in + 0 ));
280- __m256i rgba_04_07 = _mm256_loadu_si256 ((const __m256i*)(in + 16 ));
284+ __m256i rgba_00_03 = _mm256_loadu_si256 ((const __m256i *)(in + 0 ));
285+ __m256i rgba_04_07 = _mm256_loadu_si256 ((const __m256i *)(in + 16 ));
281286
282287 __m256 rgba0 = _mm256_cvtph_ps (_mm256_castsi256_si128 (rgba_00_03));
283288 __m256 rgba1 = _mm256_cvtph_ps (_mm256_extractf128_si256 (rgba_00_03, 1 ));
@@ -287,7 +292,7 @@ struct AVXRGBAPack<BIT_DEPTH_F16>
287292 avxRGBATranspose_4x4_4x4 (rgba0, rgba1, rgba2, rgba3, r, g, b, a);
288293 }
289294
290- static inline void Store (half *out, __m256 r, __m256 g, __m256 b, __m256 a)
295+ static inline void Store (half * out, __m256 r, __m256 g, __m256 b, __m256 a)
291296 {
292297 __m256 rgba0, rgba1, rgba2, rgba3;
293298 __m256i rgba;
@@ -300,36 +305,34 @@ struct AVXRGBAPack<BIT_DEPTH_F16>
300305 __m128i rgba12_16 = _mm256_cvtps_ph (rgba3, 0 );
301306
302307 rgba = _mm256_insertf128_si256 (_mm256_castsi128_si256 (rgba00_03), rgba04_07, 1 );
303- _mm256_storeu_si256 ((__m256i*)(out+ 0 ), rgba);
308+ _mm256_storeu_si256 ((__m256i *)(out + 0 ), rgba);
304309
305310 rgba = _mm256_insertf128_si256 (_mm256_castsi128_si256 (rgba08_11), rgba12_16, 1 );
306- _mm256_storeu_si256 ((__m256i*)(out+ 16 ), rgba);
311+ _mm256_storeu_si256 ((__m256i *)(out + 16 ), rgba);
307312 }
308313};
309314
310315#endif
311316
312- template <>
313- struct AVXRGBAPack <BIT_DEPTH_F32 >
317+ template <> struct AVXRGBAPack <BIT_DEPTH_F32 >
314318{
315- static inline void Load (const float *in, __m256& r, __m256& g, __m256& b, __m256& a)
319+ static inline void Load (const float * in, __m256 & r, __m256 & g, __m256 & b, __m256 & a)
316320 {
317- __m256 rgba0 = _mm256_loadu_ps (in + 0 );
318- __m256 rgba1 = _mm256_loadu_ps (in + 8 );
321+ __m256 rgba0 = _mm256_loadu_ps (in + 0 );
322+ __m256 rgba1 = _mm256_loadu_ps (in + 8 );
319323 __m256 rgba2 = _mm256_loadu_ps (in + 16 );
320324 __m256 rgba3 = _mm256_loadu_ps (in + 24 );
321325
322326 avxRGBATranspose_4x4_4x4 (rgba0, rgba1, rgba2, rgba3, r, g, b, a);
323-
324327 }
325328
326- static inline void Store (float *out, __m256 r, __m256 g, __m256 b, __m256 a)
329+ static inline void Store (float * out, __m256 r, __m256 g, __m256 b, __m256 a)
327330 {
328331 __m256 rgba0, rgba1, rgba2, rgba3;
329332 avxRGBATranspose_4x4_4x4 (r, g, b, a, rgba0, rgba1, rgba2, rgba3);
330333
331- _mm256_storeu_ps (out + 0 , rgba0);
332- _mm256_storeu_ps (out + 8 , rgba1);
334+ _mm256_storeu_ps (out + 0 , rgba0);
335+ _mm256_storeu_ps (out + 8 , rgba1);
333336 _mm256_storeu_ps (out + 16 , rgba2);
334337 _mm256_storeu_ps (out + 24 , rgba3);
335338 }
0 commit comments