Skip to content

Commit 0ceeae6

Browse files
committed
better benchmarking
1 parent cfc45e2 commit 0ceeae6

1 file changed

Lines changed: 51 additions & 49 deletions

File tree

pybricks/experimental/pb_module_experimental.c

Lines changed: 51 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@
1010
#include "py/runtime.h"
1111
#include <math.h>
1212

13+
// ARM Cortex-M4 Hardware Cycle Counter registers
14+
#define DWT_CONTROL (*((volatile uint32_t*)0xE0001000))
15+
#define DWT_CYCCNT (*((volatile uint32_t*)0xE0001004))
16+
#define DEMCR (*((volatile uint32_t*)0xE000EDFC))
17+
1318
static const float PI_F = 3.141592653589793f;
1419
static const float TWO_PI_F = 6.283185307179586f;
1520
static const float HALF_PI_F = 1.570796326794896f;
@@ -28,8 +33,7 @@ static inline float fast_sin_internal(float theta) {
2833

2934
float x2 = x * x;
3035

31-
// By writing it as a = a + b * c, the GCC compiler for Cortex-M4
32-
// will automatically generate the VMLA (Multiply-Accumulate) instruction.
36+
// Written in the form a = b + (x2 * a) so that GCC can emit the Cortex-M4 VMLA (multiply-accumulate) instruction.
3337
float res = -0.000195152f;
3438
res = 0.008332152f + (x2 * res);
3539
res = -0.166666567f + (x2 * res);
@@ -47,7 +51,6 @@ static inline float fast_atan2_internal(float y, float x) {
4751

4852
if (abs_x >= abs_y) {
4953
float r = y / x;
50-
// Simplified for auto-VMLA
5154
float den = 1.0f + (r * r * 0.28086f);
5255
angle = r * (1.0f / den);
5356
if (x < 0.0f) {
@@ -62,51 +65,7 @@ static inline float fast_atan2_internal(float y, float x) {
6265
}
6366

6467
// -----------------------------------------------------------------------------
65-
// Unrolled Benchmark
66-
// -----------------------------------------------------------------------------
67-
68-
static mp_obj_t experimental_benchmark_detailed(mp_obj_t n_in) {
69-
int32_t n = mp_obj_get_int(n_in);
70-
volatile float result = 0.0f;
71-
uint32_t t0, t1, t2, t3;
72-
float inv_n = 1.0f / (float)n;
73-
74-
t0 = mp_hal_ticks_ms();
75-
// 4x Unrolling for Sin
76-
for (int32_t i = 0; i < n; i += 4) {
77-
result += fast_sin_internal((float)(i) * inv_n);
78-
result += fast_sin_internal((float)(i+1) * inv_n);
79-
result += fast_sin_internal((float)(i+2) * inv_n);
80-
result += fast_sin_internal((float)(i+3) * inv_n);
81-
}
82-
83-
t1 = mp_hal_ticks_ms();
84-
// 4x Unrolling for Cos
85-
for (int32_t i = 0; i < n; i += 4) {
86-
result += fast_sin_internal(((float)(i) * inv_n) + HALF_PI_F);
87-
result += fast_sin_internal(((float)(i+1) * inv_n) + HALF_PI_F);
88-
result += fast_sin_internal(((float)(i+2) * inv_n) + HALF_PI_F);
89-
result += fast_sin_internal(((float)(i+3) * inv_n) + HALF_PI_F);
90-
}
91-
92-
t2 = mp_hal_ticks_ms();
93-
for (int32_t i = 0; i < n; i++) {
94-
result += fast_atan2_internal((float)i, (float)(n - i));
95-
}
96-
t3 = mp_hal_ticks_ms();
97-
98-
mp_obj_t tuple[4] = {
99-
mp_obj_new_int(t1 - t0),
100-
mp_obj_new_int(t2 - t1),
101-
mp_obj_new_int(t3 - t2),
102-
mp_obj_new_int(t3 - t0)
103-
};
104-
return mp_obj_new_tuple(4, tuple);
105-
}
106-
static MP_DEFINE_CONST_FUN_OBJ_1(experimental_benchmark_detailed_obj, experimental_benchmark_detailed);
107-
108-
// -----------------------------------------------------------------------------
109-
// Standard Wrappers
68+
// MicroPython Wrappers
11069
// -----------------------------------------------------------------------------
11170

11271
static mp_obj_t experimental_sin(mp_obj_t theta_in) {
@@ -124,12 +83,55 @@ static mp_obj_t experimental_atan2(mp_obj_t y_in, mp_obj_t x_in) {
12483
}
12584
static MP_DEFINE_CONST_FUN_OBJ_2(experimental_atan2_obj, experimental_atan2);
12685

86+
// -----------------------------------------------------------------------------
87+
// Hardware cycle-counter benchmark (DWT CYCCNT)
88+
// -----------------------------------------------------------------------------
89+
90+
static mp_obj_t experimental_benchmark_hardware(void) {
91+
// Enable DWT Cycle Counter
92+
DEMCR |= 0x01000000;
93+
DWT_CYCCNT = 0;
94+
DWT_CONTROL |= 1;
95+
96+
float test_val = 1.1f;
97+
uint32_t start, cyc_sin, cyc_cos, cyc_atan;
98+
99+
// Measure Sin
100+
start = DWT_CYCCNT;
101+
volatile float s = fast_sin_internal(test_val);
102+
cyc_sin = DWT_CYCCNT - start;
103+
104+
// Measure Cos
105+
start = DWT_CYCCNT;
106+
volatile float c = fast_sin_internal(test_val + HALF_PI_F);
107+
cyc_cos = DWT_CYCCNT - start;
108+
109+
// Measure Atan2
110+
start = DWT_CYCCNT;
111+
volatile float a = fast_atan2_internal(test_val, 0.5f);
112+
cyc_atan = DWT_CYCCNT - start;
113+
114+
(void)s; (void)c; (void)a; // Prevent optimization cleanup
115+
116+
mp_obj_t tuple[3] = {
117+
mp_obj_new_int(cyc_sin),
118+
mp_obj_new_int(cyc_cos),
119+
mp_obj_new_int(cyc_atan)
120+
};
121+
return mp_obj_new_tuple(3, tuple);
122+
}
123+
static MP_DEFINE_CONST_FUN_OBJ_0(experimental_benchmark_hardware_obj, experimental_benchmark_hardware);
124+
125+
// -----------------------------------------------------------------------------
126+
// Registry
127+
// -----------------------------------------------------------------------------
128+
127129
static const mp_rom_map_elem_t experimental_globals_table[] = {
128130
{ MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_experimental) },
129131
{ MP_ROM_QSTR(MP_QSTR_sin), MP_ROM_PTR(&experimental_sin_obj) },
130132
{ MP_ROM_QSTR(MP_QSTR_cos), MP_ROM_PTR(&experimental_cos_obj) },
131133
{ MP_ROM_QSTR(MP_QSTR_atan2), MP_ROM_PTR(&experimental_atan2_obj) },
132-
{ MP_ROM_QSTR(MP_QSTR_benchmark_detailed), MP_ROM_PTR(&experimental_benchmark_detailed_obj) },
134+
{ MP_ROM_QSTR(MP_QSTR_benchmark_hardware), MP_ROM_PTR(&experimental_benchmark_hardware_obj) },
133135
};
134136
static MP_DEFINE_CONST_DICT(pb_module_experimental_globals, experimental_globals_table);
135137

0 commit comments

Comments
 (0)