1010#include "py/runtime.h"
1111#include <math.h>
1212
13+ // ARM Cortex-M4 Hardware Cycle Counter registers (memory-mapped, hence the volatile accesses)
14+ #define DWT_CONTROL (*((volatile uint32_t*)0xE0001000)) // control register; bit 0 is set by the benchmark to start the counter
15+ #define DWT_CYCCNT (*((volatile uint32_t*)0xE0001004)) // 32-bit cycle count; sampled before and after the code under test
16+ #define DEMCR (*((volatile uint32_t*)0xE000EDFC)) // bit 24 (0x01000000) is set by the benchmark before the counter is used
17+
// Single-precision constants for pi, 2*pi and pi/2. The literals carry more
// digits than a float can hold; the compiler rounds them to the nearest float.
1318static const float PI_F = 3.141592653589793f ;
1419static const float TWO_PI_F = 6.283185307179586f ;
1520static const float HALF_PI_F = 1.570796326794896f ;
@@ -28,8 +33,7 @@ static inline float fast_sin_internal(float theta) {
2833
2934 float x2 = x * x ;
3035
31- // By writing it as a = a + b * c, the GCC compiler for Cortex-M4
32- // will automatically generate the VMLA (Multiply-Accumulate) instruction.
36+ // GCC Pattern for VMLA (Multiply-Accumulate)
3337 float res = -0.000195152f ;
3438 res = 0.008332152f + (x2 * res );
3539 res = -0.166666567f + (x2 * res );
@@ -47,7 +51,6 @@ static inline float fast_atan2_internal(float y, float x) {
4751
4852 if (abs_x >= abs_y ) {
4953 float r = y / x ;
50- // Simplified for auto-VMLA
5154 float den = 1.0f + (r * r * 0.28086f );
5255 angle = r * (1.0f / den );
5356 if (x < 0.0f ) {
@@ -62,51 +65,7 @@ static inline float fast_atan2_internal(float y, float x) {
6265}
6366
6467// -----------------------------------------------------------------------------
65- // Unrolled Benchmark
66- // -----------------------------------------------------------------------------
67-
68- static mp_obj_t experimental_benchmark_detailed (mp_obj_t n_in ) {
69- int32_t n = mp_obj_get_int (n_in );
70- volatile float result = 0.0f ;
71- uint32_t t0 , t1 , t2 , t3 ;
72- float inv_n = 1.0f / (float )n ;
73-
74- t0 = mp_hal_ticks_ms ();
75- // 4x Unrolling for Sin
76- for (int32_t i = 0 ; i < n ; i += 4 ) {
77- result += fast_sin_internal ((float )(i ) * inv_n );
78- result += fast_sin_internal ((float )(i + 1 ) * inv_n );
79- result += fast_sin_internal ((float )(i + 2 ) * inv_n );
80- result += fast_sin_internal ((float )(i + 3 ) * inv_n );
81- }
82-
83- t1 = mp_hal_ticks_ms ();
84- // 4x Unrolling for Cos
85- for (int32_t i = 0 ; i < n ; i += 4 ) {
86- result += fast_sin_internal (((float )(i ) * inv_n ) + HALF_PI_F );
87- result += fast_sin_internal (((float )(i + 1 ) * inv_n ) + HALF_PI_F );
88- result += fast_sin_internal (((float )(i + 2 ) * inv_n ) + HALF_PI_F );
89- result += fast_sin_internal (((float )(i + 3 ) * inv_n ) + HALF_PI_F );
90- }
91-
92- t2 = mp_hal_ticks_ms ();
93- for (int32_t i = 0 ; i < n ; i ++ ) {
94- result += fast_atan2_internal ((float )i , (float )(n - i ));
95- }
96- t3 = mp_hal_ticks_ms ();
97-
98- mp_obj_t tuple [4 ] = {
99- mp_obj_new_int (t1 - t0 ),
100- mp_obj_new_int (t2 - t1 ),
101- mp_obj_new_int (t3 - t2 ),
102- mp_obj_new_int (t3 - t0 )
103- };
104- return mp_obj_new_tuple (4 , tuple );
105- }
106- static MP_DEFINE_CONST_FUN_OBJ_1 (experimental_benchmark_detailed_obj , experimental_benchmark_detailed ) ;
107-
108- // -----------------------------------------------------------------------------
109- // Standard Wrappers
68+ // MicroPython Wrappers
11069// -----------------------------------------------------------------------------
11170
11271static mp_obj_t experimental_sin (mp_obj_t theta_in ) {
@@ -124,12 +83,55 @@ static mp_obj_t experimental_atan2(mp_obj_t y_in, mp_obj_t x_in) {
12483}
12584static MP_DEFINE_CONST_FUN_OBJ_2 (experimental_atan2_obj , experimental_atan2 ) ;
12685
86+ // -----------------------------------------------------------------------------
87+ // Hardware benchmark: per-call cycle counts read from the DWT cycle counter
88+ // -----------------------------------------------------------------------------
89+
90+ static mp_obj_t experimental_benchmark_hardware (void ) {
91+ // Enable DWT Cycle Counter
92+ DEMCR |= 0x01000000 ;
93+ DWT_CYCCNT = 0 ;
94+ DWT_CONTROL |= 1 ;
95+
96+ float test_val = 1.1f ;
97+ uint32_t start , cyc_sin , cyc_cos , cyc_atan ;
98+
99+ // Measure Sin
100+ start = DWT_CYCCNT ;
101+ volatile float s = fast_sin_internal (test_val );
102+ cyc_sin = DWT_CYCCNT - start ;
103+
104+ // Measure Cos
105+ start = DWT_CYCCNT ;
106+ volatile float c = fast_sin_internal (test_val + HALF_PI_F );
107+ cyc_cos = DWT_CYCCNT - start ;
108+
109+ // Measure Atan2
110+ start = DWT_CYCCNT ;
111+ volatile float a = fast_atan2_internal (test_val , 0.5f );
112+ cyc_atan = DWT_CYCCNT - start ;
113+
114+ (void )s ; (void )c ; (void )a ; // Prevent optimization cleanup
115+
116+ mp_obj_t tuple [3 ] = {
117+ mp_obj_new_int (cyc_sin ),
118+ mp_obj_new_int (cyc_cos ),
119+ mp_obj_new_int (cyc_atan )
120+ };
121+ return mp_obj_new_tuple (3 , tuple );
122+ }
123+ static MP_DEFINE_CONST_FUN_OBJ_0 (experimental_benchmark_hardware_obj , experimental_benchmark_hardware ) ;
124+
125+ // -----------------------------------------------------------------------------
126+ // Registry
127+ // -----------------------------------------------------------------------------
128+
// Module globals table: each entry pairs an attribute name (qstr) with the
// object exposed under that name.
127129static const mp_rom_map_elem_t experimental_globals_table [] = {
128130 { MP_ROM_QSTR (MP_QSTR___name__ ), MP_ROM_QSTR (MP_QSTR_experimental ) },
129131 { MP_ROM_QSTR (MP_QSTR_sin ), MP_ROM_PTR (& experimental_sin_obj ) },
130132 { MP_ROM_QSTR (MP_QSTR_cos ), MP_ROM_PTR (& experimental_cos_obj ) },
131133 { MP_ROM_QSTR (MP_QSTR_atan2 ), MP_ROM_PTR (& experimental_atan2_obj ) },
132- { MP_ROM_QSTR (MP_QSTR_benchmark_detailed ), MP_ROM_PTR (& experimental_benchmark_detailed_obj ) },
134+ { MP_ROM_QSTR (MP_QSTR_benchmark_hardware ), MP_ROM_PTR (& experimental_benchmark_hardware_obj ) },
133135};
// Wrap the table above as the constant dict pb_module_experimental_globals.
134136static MP_DEFINE_CONST_DICT (pb_module_experimental_globals , experimental_globals_table ) ;
135137
0 commit comments