1515
/*
 * Declares `var`, a simd_float16 with every lane set to `value`.
 *
 * Expands to two declarations in the enclosing scope: a 16-element float
 * scratch array named <var>arr and the vector `var` itself. `value` is
 * expanded once per lane, so pass an expression without side effects.
 * The scratch array is read through simd_packed_float16, presumably because
 * a plain float array only guarantees float alignment (confirm against the
 * simd headers).
 */
#define simd_float_16_init(var, value) \
    float var##arr[16] = { \
        value, value, value, value, \
        value, value, value, value, \
        value, value, value, value, \
        value, value, value, value \
    }; \
    simd_float16 var = simd_make_float16(*(simd_packed_float16 *)(var##arr));
1919
/*
 * Declares `var`, a simd_float8 with every lane set to `value`.
 *
 * Expands to two declarations in the enclosing scope: an 8-element float
 * scratch array named <var>arr and the vector `var` itself. `value` is
 * expanded once per lane, so pass an expression without side effects.
 * The scratch array is read through simd_packed_float8, presumably because
 * a plain float array only guarantees float alignment (confirm against the
 * simd headers).
 */
#define simd_float_8_init(var, value) \
    float var##arr[8] = { \
        value, value, value, value, \
        value, value, value, value \
    }; \
    simd_float8 var = simd_make_float8(*(simd_packed_float8 *)(var##arr));
2323
/*
 * Declares `var`, a simd_float4 with every lane set to `value`.
 *
 * Expands to two declarations in the enclosing scope: a 4-element float
 * scratch array named <var>arr and the vector `var` itself. `value` is
 * expanded once per lane, so pass an expression without side effects.
 * The scratch array is read through simd_packed_float4, presumably because
 * a plain float array only guarantees float alignment (confirm against the
 * simd headers).
 */
#define simd_float_4_init(var, value) \
    float var##arr[4] = { \
        value, value, \
        value, value \
    }; \
    simd_float4 var = simd_make_float4(*(simd_packed_float4 *)(var##arr));
2727
/*
 * Declares `var`, a simd_float3 with every lane set to `value`.
 *
 * Expands to two declarations in the enclosing scope: a 3-element float
 * scratch array named <var>arr and the vector `var` itself. `value` is
 * expanded once per lane, so pass an expression without side effects.
 *
 * The lanes are passed to simd_make_float3 one by one. Dereferencing the
 * scratch array through a simd_float3 pointer would load a whole simd_float3,
 * which Apple's simd headers define with the size of a 4-lane vector, and so
 * read past the end of the 3-float array.
 */
#define simd_float_3_init(var, value) \
    float var##arr[3] = { value, value, value }; \
    simd_float3 var = simd_make_float3(var##arr[0], var##arr[1], var##arr[2]);
3131
/*
 * Declares `var`, a simd_float2 with both lanes set to `value`.
 *
 * Expands to two declarations in the enclosing scope: a 2-element float
 * scratch array named <var>arr and the vector `var` itself. `value` is
 * expanded once per lane, so pass an expression without side effects.
 * The scratch array is read through simd_packed_float2, presumably because
 * a plain float array only guarantees float alignment (confirm against the
 * simd headers).
 */
#define simd_float_2_init(var, value) \
    float var##arr[2] = { \
        value, \
        value \
    }; \
    simd_float2 var = simd_make_float2(*(simd_packed_float2 *)(var##arr));
3535
3636
3737#define vector_dot_ (NUM ) float op_vec_dot_##NUM(const float* a, const float *b, int size)\
@@ -83,7 +83,7 @@ float op_vec_dot_3(const float* a, const float *b, int size)
8383 {\
8484 simd_float_##NUM##_init(s_min, min)\
8585 simd_float_##NUM##_init(s_max, max)\
86- ((simd_float##NUM*) c)[i] = simd_clamp(((simd_float ##NUM*) a)[i], s_min, s_max);\
86+ ((simd_float##NUM*) c)[i] = simd_clamp(((simd_packed_float ##NUM*) a)[i], s_min, s_max);\
8787 }\
8888 int left = size % NUM;\
8989 for (int i = 0; i < left; ++i)\
@@ -93,7 +93,6 @@ float op_vec_dot_3(const float* a, const float *b, int size)
9393}
9494
9595op_vec_clamp_ (2 )
96- op_vec_clamp_ (3 )
9796op_vec_clamp_ (4 )
9897op_vec_clamp_ (8 )
9998op_vec_clamp_ (16 )
@@ -104,8 +103,8 @@ op_vec_clamp_(16)
104103 int iterations = size / NUM;\
105104 for (int i = 0; i < iterations; ++i)\
106105 {\
107- simd_float_##NUM##_init(s_b, b)\
108- ((simd_float##NUM*) c)[i] = simd_max(((simd_float ##NUM*) a)[i], s_b);\
106+ simd_float_##NUM##_init(s_b, b)\
107+ ((simd_float##NUM*) c)[i] = simd_max(((simd_packed_float ##NUM*) a)[i], s_b);\
109108 }\
110109 int left = size % NUM;\
111110 for (int i = 0; i < left; ++i)\
@@ -114,14 +113,23 @@ op_vec_clamp_(16)
114113 }\
115114}
116115
116+ void op_vec_max_sc_4 (const float * a , float b , float * c , int size )\
117+ {\
118+ int iterations = size / 4 ;
119+ for (int i = 0 ; i < iterations ; ++ i )
120+ {
121+ simd_float_4_init (s_b , b )
122+ simd_float4 s_a = ((simd_packed_float4 * ) a )[i ];
123+ ((simd_packed_float4 * ) c )[i ] = simd_max (s_a , s_b );
124+ }
125+ int left = size % 4 ;
126+ for (int i = 0 ; i < left ; ++ i )
127+ {
128+ c [iterations * 4 + i ] = simd_max (a [iterations * 4 + i ], b );
129+ }
130+ }
117131
118132
119- op_vec_max_sc_ (2 )
120- op_vec_max_sc_ (3 )
121- op_vec_max_sc_ (4 )
122- op_vec_max_sc_ (8 )
123- op_vec_max_sc_ (16 )
124-
/*
 * Vector widths, in lanes, that get_optimal_vector_size() can report.
 * Each enumerator's numeric value equals the lane count it names.
 */
typedef enum {
    two = 2,
    three = 3,
    four = 4,
    eight = 8,
    sixteen = 16
} optimal_vector_size;
@@ -146,24 +154,6 @@ optimal_vector_size get_optimal_vector_size(int size){
146154 return values [optimalIndex ];
147155}
148156
149- #define get_optimized (func ) func##_fn func##_get_optimized(int size){\
150- optimal_vector_size value = get_optimal_vector_size(size);\
151- switch (value) {\
152- case two:\
153- return func##_2;\
154- case three:\
155- return func##_3;\
156- case four:\
157- return func##_4;\
158- case eight:\
159- return func##_8;\
160- case sixteen:\
161- return func##_16;\
162- default:\
163- return func##_4;\
164- }\
165- }
166-
/* Pointer to a dot-product kernel: takes two float inputs and a length, returns a float. */
typedef float (*op_vec_dot_fn)(const float *a,
                               const float *b,
                               int size);

/* Pointer to a clamp kernel: reads a, writes c, takes the min/max bounds and a length. */
typedef void (*op_vec_clamp_fn)(const float *a,
                                float *c,
                                float min,
                                float max,
                                int size);
@@ -176,9 +166,6 @@ op_vec_clamp_fn op_vec_clamp_get_optimized(int size);
176166
177167op_vec_max_sc_fn op_vec_max_sc_get_optimized (int size );
178168
179- get_optimized (op_vec_dot )
180- get_optimized (op_vec_clamp )
181- get_optimized (op_vec_max_sc )
182169
183170void op_vec_max (const float * a , const float * b , float * c , int size ){
184171 vDSP_vmax (a , 1 , b , 1 , c , 1 , size );
@@ -194,9 +181,6 @@ float op_vec_dot(const float *a, const float *b, int size) {
194181#else
195182 return op_vec_dot_4 (a , b , size );
196183#endif
197- // float c;
198- // vDSP_dotpr(a, 1, b, 1, &c, size);
199- // return c;
200184}
201185
202186void op_vec_clamp (const float * a , float * c , float min , float max , int size ){
@@ -215,7 +199,7 @@ void op_vec_sub(const float *a, const float *b, float *result, int size){
215199 vDSP_vsub (b , 1 , a , 1 , result , 1 , size );
216200}
217201
/*
 * Sums the `size` floats in `a` and stores the total in *result.
 * Delegates to vDSP_sve with a stride of 1.
 */
void op_vec_sum(const float *a, float *result, int size)
{
    vDSP_sve(a, 1, result, size);
}
221205
@@ -247,6 +231,15 @@ void op_vec_exp(const float *a, float *c, int size) {
247231 vvexpf (c , a , & size );
248232}
249233
/*
 * Element-wise power over `size` elements, delegated to vForce's vvpowf.
 * `a` is passed in the base position and `b` in the exponent position
 * (vvpowf takes the exponent array before the base array), so
 * c[i] = a[i] raised to b[i].
 */
void op_vec_pow(const float *a, const float *b, float *c, int size)
{
    vvpowf(c, b, a, &size);
}
237+
/*
 * Raises each of the `size` elements of `a` to the scalar power `b`,
 * delegated to vForce's vvpowsf. The scalar exponent is passed by address
 * ahead of the base array, so c[i] = a[i] raised to b.
 */
void op_vec_pow_sc(const float *a, const float b, float *c, int size)
{
    vvpowsf(c, &b, a, &size);
}
241+
242+
/*
 * Element-wise logarithm over `size` elements, delegated to vForce's vvlogf:
 * the result for a[i] is written to c[i].
 */
void op_vec_log(const float *a, float *c, int size)
{
    vvlogf(c, a, &size);
}
@@ -263,7 +256,7 @@ void op_vec_div(const float *a, const float *b, float *c, int size) {
263256 vDSP_vdiv (b , 1 , a , 1 , c , 1 , size );
264257}
265258
266- void op_vec_magnitudes (float * a , float * b , float * c , int size ) {
259+ void op_vec_magn_sq (float * a , float * b , float * c , int size ) {
267260 DSPSplitComplex split = {a , b };
268261 vDSP_zvmags (& split , 1 , c , 1 , size );
269262}
@@ -281,3 +274,5 @@ void op_mat_transp(const float *a, float *b, int M, int N) {
281274}
282275
283276
277+
278+
0 commit comments