@@ -28,16 +28,15 @@ static inline float fast_sin_poly(float x) {
2828
2929static inline float fast_sin_internal (float theta ) {
3030 // 1. Range Reduction to [-PI, PI]
31- // Use roundf for branchless centering
32- float x = theta - roundf (theta * INV_TWO_PI_F ) * TWO_PI_F ;
31+ // Manual round implementation to avoid undefined 'roundf' in some libm builds
32+ float quot = theta * INV_TWO_PI_F ;
33+ float x = theta - (float )((int )(quot + (quot > 0 ? 0.5f : -0.5f ))) * TWO_PI_F ;
3334
3435 // 2. Branchless Symmetry Reduction to [-PI/2, PI/2]
35- // Ternary operators here are mapped to conditional move instructions (VSEL)
36- // which prevents CPU pipeline stalls.
3736 float x_abs = fabsf (x );
3837 float x_folded = (x_abs > HALF_PI_F ) ? PI_F - x_abs : x_abs ;
3938
40- // Restore sign
39+ // Restore sign branchlessly
4140 float result_x = (x < 0.0f ) ? - x_folded : x_folded ;
4241
4342 return fast_sin_poly (result_x );
@@ -47,14 +46,14 @@ static inline float fast_atan2_internal(float y, float x) {
4746 float ay = fabsf (y ) + 1e-10f ;
4847 float ax = fabsf (x );
4948
50- // Determine which axis is dominant without standard branching
49+ // Determine which axis is dominant
5150 float z = (ax >= ay ) ? y / ax : x / ay ;
5251 float abs_z = fabsf (z );
5352
5453 // Parabolic approximation for atan(z)
5554 float angle = (0.7853982f + 0.273f * (1.0f - abs_z )) * z ;
5655
57- // Quadrant adjustment (mapped to VSEL)
56+ // Quadrant adjustment
5857 angle = (ax < ay ) ? 1.5707963f - angle : angle ;
5958
6059 if (x < 0.0f ) {
0 commit comments