1010#include "py/runtime.h"
1111#include <math.h>
1212
13- // ARM Cortex-M4 Hardware Cycle Counter registers
14- #define DWT_CONTROL (*((volatile uint32_t*)0xE0001000))
15- #define DWT_CYCCNT (*((volatile uint32_t*)0xE0001004))
16- #define DEMCR (*((volatile uint32_t*)0xE000EDFC))
13+ // Architecture Detection
14+ #if defined(__ARM_ARCH_7M__ ) || defined(__ARM_ARCH_7EM__ )
15+ #define IS_CORTEX_M 1
16+ #define DWT_CONTROL (*((volatile uint32_t*)0xE0001000))
17+ #define DWT_CYCCNT (*((volatile uint32_t*)0xE0001004))
18+ #define DEMCR (*((volatile uint32_t*)0xE000EDFC))
19+ #else
20+ #define IS_CORTEX_M 0
21+ #endif
1722
1823static const float PI_F = 3.141592653589793f ;
1924static const float TWO_PI_F = 6.283185307179586f ;
2025static const float HALF_PI_F = 1.570796326794896f ;
2126static const float INV_TWO_PI_F = 0.159154943091895f ;
2227
23- // -----------------------------------------------------------------------------
24- // Core Math Engines (High-Precision VMLA )
25- // -----------------------------------------------------------------------------
28+ // ------------------------------------------------------------
29+ // Core Math Engines (Optimized per Architecture )
30+ // ------------------------------------------------------------
2631
2732static inline float fast_sin_internal (float theta ) {
33+ // Range Reduction (Common)
2834 float x = theta * INV_TWO_PI_F ;
2935 x = theta - (float )((int )(x + (x > 0 ? 0.5f : -0.5f ))) * TWO_PI_F ;
3036
3137 if (x > HALF_PI_F ) { x = PI_F - x ; }
3238 else if (x < - HALF_PI_F ) { x = - PI_F - x ; }
3339
3440 float x2 = x * x ;
35- float res = -0.000195152f ;
36- res = 0.008332152f + (x2 * res );
37- res = -0.166666567f + (x2 * res );
38- res = 1.0f + (x2 * res );
41+ float res ;
42+
43+ #if IS_CORTEX_M
44+ // Spike Optimization: Pattern for Hardware VMLA (Multiply-Accumulate)
45+ res = -0.000195152f ;
46+ res = 0.008332152f + (x2 * res );
47+ res = -0.166666567f + (x2 * res );
48+ res = 1.0f + (x2 * res );
49+ #else
50+ // EV3 Optimization: Software-Float Register-Friendly Polynomial
51+ // Horner's method helps the ARM9 emulator keep the 'x2' value
52+ // in a CPU register (r4-r7) to avoid re-calculating or memory loads.
53+ float c3 = -0.000195152f ;
54+ float c2 = 0.008332152f ;
55+ float c1 = -0.166666567f ;
56+ res = 1.0f + x2 * (c1 + x2 * (c2 + x2 * c3 ));
57+ #endif
3958
4059 return x * res ;
4160}
@@ -45,22 +64,32 @@ static inline float fast_atan2_internal(float y, float x) {
4564 float abs_y = fabsf (y ) + 1e-10f ;
4665 float abs_x = fabsf (x );
4766 float angle ;
67+
68+ float r = (abs_x >= abs_y ) ? (y / x ) : (x / y );
69+ float r2 = r * r ;
70+
71+ #if IS_CORTEX_M
72+ float den = 1.0f + (r2 * 0.28086f );
73+ #else
74+ // EV3: Avoid unnecessary float constants to save on pool loads
75+ float coeff = 0.28086f ;
76+ float den = 1.0f + (r2 * coeff );
77+ #endif
78+
79+ float res_atan = r * (1.0f / den );
80+
4881 if (abs_x >= abs_y ) {
49- float r = y / x ;
50- float den = 1.0f + (r * r * 0.28086f );
51- angle = r * (1.0f / den );
52- if (x < 0.0f ) { angle += (y >= 0.0f ) ? PI_F : - PI_F ; }
82+ angle = res_atan ;
83+ if (x < 0.0f ) angle += (y >= 0.0f ) ? PI_F : - PI_F ;
5384 } else {
54- float r = x / y ;
55- float den = 1.0f + (r * r * 0.28086f );
56- angle = (y > 0.0f ? HALF_PI_F : - HALF_PI_F ) - r * (1.0f / den );
85+ angle = (y > 0.0f ? HALF_PI_F : - HALF_PI_F ) - res_atan ;
5786 }
5887 return angle ;
5988}
6089
61- // -----------------------------------------------------------------------------
90+ // ------------------------------------------------------------
6291// Wrappers & Hardware Benchmarks
63- // -----------------------------------------------------------------------------
92+ // ----------------------------
6493
6594static mp_obj_t experimental_sin (mp_obj_t theta_in ) {
6695 return mp_obj_new_float_from_f (fast_sin_internal (mp_obj_get_float (theta_in )));
@@ -77,26 +106,29 @@ static mp_obj_t experimental_atan2(mp_obj_t y_in, mp_obj_t x_in) {
77106}
78107static MP_DEFINE_CONST_FUN_OBJ_2 (experimental_atan2_obj , experimental_atan2 ) ;
79108
80- // Benchmark with seed to prevent constant folding
81109static mp_obj_t experimental_benchmark_hardware (mp_obj_t seed_in ) {
82110 float seed = mp_obj_get_float (seed_in );
83- DEMCR |= 0x01000000 ; DWT_CONTROL |= 1 ;
84-
85- uint32_t start , cyc_sin ;
86- volatile float res ;
87-
88- DWT_CYCCNT = 0 ;
89- start = DWT_CYCCNT ;
90- // Chain operations to force the FPU to wait for the previous result
91- res = fast_sin_internal (seed );
92- res = fast_sin_internal (res + 0.01f );
93- __asm volatile ("dsb" );
94- cyc_sin = DWT_CYCCNT - start ;
95-
96- (void )res ;
97-
98- // Return avg cycles for one operation
99- return mp_obj_new_int (cyc_sin / 2 );
111+
112+ #if IS_CORTEX_M
113+ DEMCR |= 0x01000000 ; DWT_CONTROL |= 1 ;
114+ DWT_CYCCNT = 0 ;
115+ uint32_t start = DWT_CYCCNT ;
116+ volatile float res = fast_sin_internal (seed );
117+ res = fast_sin_internal (res + 0.01f );
118+ __asm volatile ("dsb" );
119+ uint32_t cyc = DWT_CYCCNT - start ;
120+ return mp_obj_new_int (cyc / 2 );
121+ #else
122+ // EV3: No Cycle Counter. Use microsecond-level timing via ticks_ms.
123+ uint32_t t0 = mp_hal_ticks_ms ();
124+ volatile float res = seed ;
125+ for (int i = 0 ; i < 20000 ; i ++ ) {
126+ res = fast_sin_internal (res );
127+ }
128+ uint32_t dt = mp_hal_ticks_ms () - t0 ;
129+ // Return time in microseconds for 100 ops to keep scale
130+ return mp_obj_new_int ((dt * 1000 ) / 20000 );
131+ #endif
100132}
101133static MP_DEFINE_CONST_FUN_OBJ_1 (experimental_benchmark_hardware_obj , experimental_benchmark_hardware ) ;
102134
0 commit comments