Skip to content

Commit 5c96bf1

Browse files
committed
hopefully get rid of branching shit
1 parent e21b01d commit 5c96bf1

1 file changed

Lines changed: 35 additions & 18 deletions

File tree

pybricks/experimental/pb_module_experimental.c

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,39 +10,53 @@
1010
#include "py/runtime.h"
1111
#include <math.h>
1212

13+
// Hardcoded constants for 98MHz register-speed access
1314
static const float PI_F = 3.1415926535f;
1415
static const float TWO_PI_F = 6.2831853071f;
1516
static const float HALF_PI_F = 1.5707963267f;
1617
static const float INV_TWO_PI_F = 0.1591549431f;
1718

1819
// -----------------------------------------------------------------------------
19-
// Internal Engines (Inlined)
20+
// Core Math Engines (Optimized for ARM VSEL/VCMPE instructions)
2021
// -----------------------------------------------------------------------------
2122

2223
// Odd degree-5 polynomial approximation of sin(x): x + c3*x^3 + c5*x^5, where
// c3 and c5 are close to the Taylor coefficients -1/6 and 1/120.
// fast_sin_internal calls this with x already folded into [-PI/2, PI/2].
static inline float fast_sin_poly(float x) {
2324
float x2 = x * x;
25+
// Horner's method in x2: the return expression costs 3 muls and 2 adds
// (computing x2 above is one additional mul).
2426
return x * (1.0f + x2 * (-0.1666665f + x2 * 0.0083322f));
2527
}
2628

2729
// Approximates sin(theta): reduces theta to [-PI, PI], folds the result into
// [-PI/2, PI/2] using the symmetry sin(PI - x) = sin(x), then evaluates
// fast_sin_poly on the folded value.
static inline float fast_sin_internal(float theta) {
28-
float quot = theta * INV_TWO_PI_F;
29-
float x = theta - (float)((int)(quot + (quot > 0 ? 0.5f : -0.5f))) * TWO_PI_F;
30-
if (x > HALF_PI_F) { x = PI_F - x; }
31-
else if (x < -HALF_PI_F) { x = -PI_F - x; }
32-
return fast_sin_poly(x);
30+
// 1. Range reduction to [-PI, PI]: subtract the nearest whole multiple of 2*PI.
31+
// roundf selects that multiple directly, replacing the removed int cast that
// used a sign-dependent +/-0.5 offset.
32+
float x = theta - roundf(theta * INV_TWO_PI_F) * TWO_PI_F;
33+
34+
// 2. Symmetry reduction to [-PI/2, PI/2], performed on |x| without if/else.
35+
// NOTE(review): the ternaries below are intended to compile to conditional
36+
// selects (e.g. VSEL) rather than branches; whether they do depends on the
// compiler and the target FPU - confirm in the generated assembly.
37+
float x_abs = fabsf(x);
38+
float x_folded = (x_abs > HALF_PI_F) ? PI_F - x_abs : x_abs;
39+
40+
// Restore sign: sin is odd, so the fold was done on |x| and the sign of x
// is reapplied here.
41+
float result_x = (x < 0.0f) ? -x_folded : x_folded;
42+
43+
return fast_sin_poly(result_x);
3344
}
3445

3546
static inline float fast_atan2_internal(float y, float x) {
3647
float ay = fabsf(y) + 1e-10f;
3748
float ax = fabsf(x);
38-
float z, angle;
39-
if (ax >= ay) {
40-
z = y / ax;
41-
angle = (0.7853982f + 0.273f * (1.0f - fabsf(z))) * z;
42-
} else {
43-
z = x / ay;
44-
angle = 1.5707963f - (0.7853982f + 0.273f * (1.0f - fabsf(z))) * z;
45-
}
49+
50+
// Determine which axis is dominant without standard branching
51+
float z = (ax >= ay) ? y / ax : x / ay;
52+
float abs_z = fabsf(z);
53+
54+
// Parabolic approximation for atan(z)
55+
float angle = (0.7853982f + 0.273f * (1.0f - abs_z)) * z;
56+
57+
// Quadrant adjustment (mapped to VSEL)
58+
angle = (ax < ay) ? 1.5707963f - angle : angle;
59+
4660
if (x < 0.0f) {
4761
angle += (y >= 0.0f) ? PI_F : -PI_F;
4862
}
@@ -69,7 +83,7 @@ static mp_obj_t experimental_atan2(mp_obj_t y_in, mp_obj_t x_in) {
6983
static MP_DEFINE_CONST_FUN_OBJ_2(experimental_atan2_obj, experimental_atan2);
7084

7185
// -----------------------------------------------------------------------------
72-
// Detailed Internal Benchmark (Defeats Compiler Shortcuts)
86+
// Anti-Optimization Benchmark
7387
// -----------------------------------------------------------------------------
7488

7589
static mp_obj_t experimental_benchmark_detailed(mp_obj_t n_in) {
@@ -80,7 +94,6 @@ static mp_obj_t experimental_benchmark_detailed(mp_obj_t n_in) {
8094

8195
t0 = mp_hal_ticks_ms();
8296
for (int32_t i = 0; i < n; i++) {
83-
// Varying input prevents the compiler from pre-calculating
8497
result += fast_sin_internal((float)i * inv_n);
8598
}
8699

@@ -91,7 +104,6 @@ static mp_obj_t experimental_benchmark_detailed(mp_obj_t n_in) {
91104

92105
t2 = mp_hal_ticks_ms();
93106
for (int32_t i = 0; i < n; i++) {
94-
// Changing y and x forces different branches in atan2
95107
result += fast_atan2_internal((float)i, (float)(n - i));
96108
}
97109
t3 = mp_hal_ticks_ms();
@@ -106,6 +118,10 @@ static mp_obj_t experimental_benchmark_detailed(mp_obj_t n_in) {
106118
}
107119
static MP_DEFINE_CONST_FUN_OBJ_1(experimental_benchmark_detailed_obj, experimental_benchmark_detailed);
108120

121+
// -----------------------------------------------------------------------------
122+
// Registry
123+
// -----------------------------------------------------------------------------
124+
109125
static const mp_rom_map_elem_t experimental_globals_table[] = {
110126
{ MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_experimental) },
111127
{ MP_ROM_QSTR(MP_QSTR_sin), MP_ROM_PTR(&experimental_sin_obj) },
@@ -123,4 +139,5 @@ const mp_obj_module_t pb_module_experimental = {
123139
#if !MICROPY_MODULE_BUILTIN_SUBPACKAGES
124140
MP_REGISTER_MODULE(MP_QSTR_pybricks_dot_experimental, pb_module_experimental);
125141
#endif
126-
#endif
142+
143+
#endif // PYBRICKS_PY_EXPERIMENTAL

0 commit comments

Comments
 (0)