1010#include "py/runtime.h"
1111#include <math.h>
1212
13- // Architecture Detection
13+ // Architecture Detection & Optimization Macros
1414#if defined(__ARM_ARCH_7M__ ) || defined(__ARM_ARCH_7EM__ )
1515 #define IS_CORTEX_M 1
16+ #define ACCEL_RAM __attribute__((section(".data"), noinline))
1617 #define DWT_CONTROL (*((volatile uint32_t*)0xE0001000))
1718 #define DWT_CYCCNT (*((volatile uint32_t*)0xE0001004))
1819 #define DEMCR (*((volatile uint32_t*)0xE000EDFC))
1920#else
2021 #define IS_CORTEX_M 0
22+ #define ACCEL_RAM // EV3/Linux handles RAM loading automatically
2123#endif
2224
2325static const float PI_F = 3.141592653589793f ;
2426static const float TWO_PI_F = 6.283185307179586f ;
2527static const float HALF_PI_F = 1.570796326794896f ;
2628static const float INV_TWO_PI_F = 0.159154943091895f ;
2729
28- // ------------------------------------------------------------
29- // Core Math Engines (Optimized per Architecture )
30- // ------------------------------------------------------------
30+ // -----------------------------------------------------------------------------
31+ // Core Math Engines (RAM Accelerated on Spike )
32+ // -----------------------------------------------------------------------------
3133
32- static inline float fast_sin_internal (float theta ) {
33- // Range Reduction (Common)
34+ ACCEL_RAM static float fast_sin_internal (float theta ) {
3435 float x = theta * INV_TWO_PI_F ;
3536 x = theta - (float )((int )(x + (x > 0 ? 0.5f : -0.5f ))) * TWO_PI_F ;
3637
@@ -41,41 +42,24 @@ static inline float fast_sin_internal(float theta) {
4142 float res ;
4243
4344 #if IS_CORTEX_M
44- // Spike Optimization: Pattern for Hardware VMLA (Multiply-Accumulate)
4545 res = -0.000195152f ;
4646 res = 0.008332152f + (x2 * res );
4747 res = -0.166666567f + (x2 * res );
4848 res = 1.0f + (x2 * res );
4949 #else
50- // EV3 Optimization: Software-Float Register-Friendly Polynomial
51- // Horner's method helps the ARM9 emulator keep the 'x2' value
52- // in a CPU register (r4-r7) to avoid re-calculating or memory loads.
53- float c3 = -0.000195152f ;
54- float c2 = 0.008332152f ;
55- float c1 = -0.166666567f ;
56- res = 1.0f + x2 * (c1 + x2 * (c2 + x2 * c3 ));
50+ res = 1.0f + x2 * (-0.166666567f + x2 * (0.008332152f + x2 * -0.000195152f ));
5751 #endif
5852
5953 return x * res ;
6054}
6155
62- static inline float fast_atan2_internal (float y , float x ) {
56+ ACCEL_RAM static float fast_atan2_internal (float y , float x ) {
6357 if (x == 0.0f && y == 0.0f ) return 0.0f ;
6458 float abs_y = fabsf (y ) + 1e-10f ;
6559 float abs_x = fabsf (x );
6660 float angle ;
67-
6861 float r = (abs_x >= abs_y ) ? (y / x ) : (x / y );
69- float r2 = r * r ;
70-
71- #if IS_CORTEX_M
72- float den = 1.0f + (r2 * 0.28086f );
73- #else
74- // EV3: Avoid unnecessary float constants to save on pool loads
75- float coeff = 0.28086f ;
76- float den = 1.0f + (r2 * coeff );
77- #endif
78-
62+ float den = 1.0f + (r * r * 0.28086f );
7963 float res_atan = r * (1.0f / den );
8064
8165 if (abs_x >= abs_y ) {
@@ -87,9 +71,9 @@ static inline float fast_atan2_internal(float y, float x) {
8771 return angle ;
8872}
8973
90- // ------------------------------------------------------------
74+ // -----------------------------------------------------------------------------
9175// Wrappers & Hardware Benchmarks
92- // ----------------------------
76+ // -----------------------------------------------------------------------------
9377
9478static mp_obj_t experimental_sin (mp_obj_t theta_in ) {
9579 return mp_obj_new_float_from_f (fast_sin_internal (mp_obj_get_float (theta_in )));
@@ -110,24 +94,25 @@ static mp_obj_t experimental_benchmark_hardware(mp_obj_t seed_in) {
11094 float seed = mp_obj_get_float (seed_in );
11195
11296 #if IS_CORTEX_M
97+ // Spike Prime High-Res Cycle Counter
11398 DEMCR |= 0x01000000 ; DWT_CONTROL |= 1 ;
11499 DWT_CYCCNT = 0 ;
115100 uint32_t start = DWT_CYCCNT ;
116101 volatile float res = fast_sin_internal (seed );
117102 res = fast_sin_internal (res + 0.01f );
118103 __asm volatile ("dsb" );
119- uint32_t cyc = DWT_CYCCNT - start ;
120- return mp_obj_new_int (cyc / 2 );
104+ return mp_obj_new_int ((DWT_CYCCNT - start ) / 2 );
121105 #else
122- // EV3: No Cycle Counter. Use microsecond-level timing via ticks_ms.
106+ // EV3 / Generic: average over many calls, timed with mp_hal_ticks_ms() (millisecond resolution)
123107 uint32_t t0 = mp_hal_ticks_ms ();
124108 volatile float res = seed ;
125- for (int i = 0 ; i < 20000 ; i ++ ) {
126- res = fast_sin_internal (res );
109+ int loops = 50000 ;
110+ for (int i = 0 ; i < loops ; i ++ ) {
111+ res = fast_sin_internal (res + 0.0001f );
127112 }
128- uint32_t dt = mp_hal_ticks_ms () - t0 ;
129- // Return time in microseconds for 100 ops to keep scale
130- return mp_obj_new_int ((dt * 1000 ) / 20000 );
113+ uint32_t dt_ms = mp_hal_ticks_ms () - t0 ;
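114+ // Return the average time per call in nanoseconds (dt_ms * 1e6 / loops) to match the format Python expects; NOTE: dt_ms * 1000000 is uint32_t math and wraps above ~4294 ms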
115+ return mp_obj_new_int ((dt_ms * 1000000 ) / loops );
131116 #endif
132117}
133118static MP_DEFINE_CONST_FUN_OBJ_1 (experimental_benchmark_hardware_obj , experimental_benchmark_hardware ) ;
0 commit comments