@@ -313,13 +313,13 @@ exp_neg_rational_approx_f16(MLAS_FLOAT16X8 x)
313313 MLAS_FLOAT16X8 d1v = MlasBroadcastF16Float16x8 (d1);
314314 MLAS_FLOAT16X8 d2v = MlasBroadcastF16Float16x8 (d2);
315315 MLAS_FLOAT16X8 x2 = MlasMultiplyFloat16 (x, x);
316- MLAS_FLOAT16X8 num = MlasMultiplyAddFloat16 (c1v, x,c0v);
317- num = MlasMultiplyAddFloat16 (c2v, x2,num);
318- MLAS_FLOAT16X8 den = MlasMultiplyAddFloat16 (d1v, x,d0v);
319- den = MlasMultiplyAddFloat16 (d2v, x2,den);
320- MLAS_FLOAT16X8 recip = MlasapproximatereciprocalFloat16 (den);
321- recip = MlasMultiplyFloat16 (recip, MlasreciprocalsqrtFloat16 (den, recip));
322- recip = MlasMultiplyFloat16 (recip, MlasreciprocalsqrtFloat16 (den, recip));
316+ MLAS_FLOAT16X8 num = MlasMultiplyAddFloat16 (c1v, x, c0v);
317+ num = MlasMultiplyAddFloat16 (c2v, x2, num);
318+ MLAS_FLOAT16X8 den = MlasMultiplyAddFloat16 (d1v, x, d0v);
319+ den = MlasMultiplyAddFloat16 (d2v, x2, den);
320+ MLAS_FLOAT16X8 recip = MlasApproximateReciprocalFloat16 (den);
321+ recip = MlasMultiplyFloat16 (recip, MlasReciprocalSqrtFloat16 (den, recip));
322+ recip = MlasMultiplyFloat16 (recip, MlasReciprocalSqrtFloat16 (den, recip));
323323 MLAS_FLOAT16X8 result = MlasMultiplyFloat16 (num, recip);
324324 return result;
325325}
@@ -354,32 +354,32 @@ MlasNeonErfKernelFp16(const _mlas_fp16_* Input, _mlas_fp16_* Output, size_t N)
354354 size_t i = 0 ;
355355 for (; i + 8 <= N; i += 8 ) {
356356 MLAS_FLOAT16X8 x = MlasLoadFloat16x8 (&Input[i]);
357- MLAS_UINT16X8 neg_mask = MlasComparelessthanFloat16 (x, vzero);
358- MLAS_FLOAT16X8 sign = MlasselectFloat16 (neg_mask, vneg_one, vone);
357+ MLAS_UINT16X8 neg_mask = MlasCompareLessThanFloat16 (x, vzero);
358+ MLAS_FLOAT16X8 sign = MlasSelectFloat16 (neg_mask, vneg_one, vone);
359359 MLAS_FLOAT16X8 absx = MlasAbsFloat16 (x);
360- MLAS_UINT16X8 use_mask = MlasComparelessthanFloat16 (absx, vth);
360+ MLAS_UINT16X8 use_mask = MlasCompareLessThanFloat16 (absx, vth);
361361 MLAS_FLOAT16X8 absx_clamped = MlasMinimumFloat16 (absx, vth);
362- MLAS_FLOAT16X8 denom = MlasMultiplyAddFloat16 (vp, absx_clamped,vone);
363- MLAS_FLOAT16X8 t = MlasapproximatereciprocalFloat16 (denom);
364- t = MlasMultiplyFloat16 (t, MlasreciprocalsqrtFloat16 (denom, t));
365- t = MlasMultiplyFloat16 (t, MlasreciprocalsqrtFloat16 (denom, t));
362+ MLAS_FLOAT16X8 denom = MlasMultiplyAddFloat16 (vp, absx_clamped, vone);
363+ MLAS_FLOAT16X8 t = MlasApproximateReciprocalFloat16 (denom);
364+ t = MlasMultiplyFloat16 (t, MlasReciprocalSqrtFloat16 (denom, t));
365+ t = MlasMultiplyFloat16 (t, MlasReciprocalSqrtFloat16 (denom, t));
366366 MLAS_FLOAT16X8 t2 = MlasMultiplyFloat16 (t, t);
367367 MLAS_FLOAT16X8 t3 = MlasMultiplyFloat16 (t2, t);
368368 MLAS_FLOAT16X8 t4 = MlasMultiplyFloat16 (t3, t);
369369 MLAS_FLOAT16X8 t5 = MlasMultiplyFloat16 (t4, t);
370370 MLAS_FLOAT16X8 poly = MlasMultiplyFloat16 (va1, t);
371- poly = MlasMultiplyAddFloat16 (va2, t2,poly);
372- poly = MlasMultiplyAddFloat16 (va3, t3,poly);
373- poly = MlasMultiplyAddFloat16 (va4, t4,poly);
374- poly = MlasMultiplyAddFloat16 (va5, t5,poly);
371+ poly = MlasMultiplyAddFloat16 (va2, t2, poly);
372+ poly = MlasMultiplyAddFloat16 (va3, t3, poly);
373+ poly = MlasMultiplyAddFloat16 (va4, t4, poly);
374+ poly = MlasMultiplyAddFloat16 (va5, t5, poly);
375375 MLAS_FLOAT16X8 x2 = MlasMultiplyFloat16 (absx_clamped, absx_clamped);
376376 MLAS_FLOAT16X8 exp_neg_x2 = exp_neg_rational_approx_f16 (x2);
377377 MLAS_FLOAT16X8 poly_mul_exp = MlasMultiplyFloat16 (poly, exp_neg_x2);
378378 MLAS_FLOAT16X8 one_minus_term = MlasSubtractFloat16 (vone, poly_mul_exp);
379379 MLAS_FLOAT16X8 erf_approx = MlasMultiplyFloat16 (sign, one_minus_term);
380380 erf_approx = MlasMinimumFloat16 (erf_approx, vone);
381381 erf_approx = MlasMaximumFloat16 (erf_approx, vneg_one);
382- MLAS_FLOAT16X8 result = MlasselectFloat16 (use_mask, erf_approx, sign);
382+ MLAS_FLOAT16X8 result = MlasSelectFloat16 (use_mask, erf_approx, sign);
383383 MlasStoreFloat16x8 (&Output[i], result);
384384 }
385385
0 commit comments