Skip to content

Commit a18a6fa

Browse files
committed
EV3 benchmark fix and RAM optimization
1 parent 5f69eaa commit a18a6fa

1 file changed

Lines changed: 21 additions & 36 deletions

File tree

pybricks/experimental/pb_module_experimental.c

Lines changed: 21 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -10,27 +10,28 @@
1010
#include "py/runtime.h"
1111
#include <math.h>
1212

13-
// Architecture Detection
13+
// Architecture Detection & Optimization Macros
1414
#if defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__)
1515
#define IS_CORTEX_M 1
16+
#define ACCEL_RAM __attribute__((section(".data"), noinline))
1617
#define DWT_CONTROL (*((volatile uint32_t*)0xE0001000))
1718
#define DWT_CYCCNT (*((volatile uint32_t*)0xE0001004))
1819
#define DEMCR (*((volatile uint32_t*)0xE000EDFC))
1920
#else
2021
#define IS_CORTEX_M 0
22+
#define ACCEL_RAM // EV3/Linux handles RAM loading automatically
2123
#endif
2224

2325
static const float PI_F = 3.141592653589793f;
2426
static const float TWO_PI_F = 6.283185307179586f;
2527
static const float HALF_PI_F = 1.570796326794896f;
2628
static const float INV_TWO_PI_F = 0.159154943091895f;
2729

28-
// ------------------------------------------------------------
29-
// Core Math Engines (Optimized per Architecture)
30-
// ------------------------------------------------------------
30+
// -----------------------------------------------------------------------------
31+
// Core Math Engines (RAM Accelerated on Spike)
32+
// -----------------------------------------------------------------------------
3133

32-
static inline float fast_sin_internal(float theta) {
33-
// Range Reduction (Common)
34+
ACCEL_RAM static float fast_sin_internal(float theta) {
3435
float x = theta * INV_TWO_PI_F;
3536
x = theta - (float)((int)(x + (x > 0 ? 0.5f : -0.5f))) * TWO_PI_F;
3637

@@ -41,41 +42,24 @@ static inline float fast_sin_internal(float theta) {
4142
float res;
4243

4344
#if IS_CORTEX_M
44-
// Spike Optimization: Pattern for Hardware VMLA (Multiply-Accumulate)
4545
res = -0.000195152f;
4646
res = 0.008332152f + (x2 * res);
4747
res = -0.166666567f + (x2 * res);
4848
res = 1.0f + (x2 * res);
4949
#else
50-
// EV3 Optimization: Software-Float Register-Friendly Polynomial
51-
// Horner's method helps the ARM9 emulator keep the 'x2' value
52-
// in a CPU register (r4-r7) to avoid re-calculating or memory loads.
53-
float c3 = -0.000195152f;
54-
float c2 = 0.008332152f;
55-
float c1 = -0.166666567f;
56-
res = 1.0f + x2 * (c1 + x2 * (c2 + x2 * c3));
50+
res = 1.0f + x2 * (-0.166666567f + x2 * (0.008332152f + x2 * -0.000195152f));
5751
#endif
5852

5953
return x * res;
6054
}
6155

62-
static inline float fast_atan2_internal(float y, float x) {
56+
ACCEL_RAM static float fast_atan2_internal(float y, float x) {
6357
if (x == 0.0f && y == 0.0f) return 0.0f;
6458
float abs_y = fabsf(y) + 1e-10f;
6559
float abs_x = fabsf(x);
6660
float angle;
67-
6861
float r = (abs_x >= abs_y) ? (y / x) : (x / y);
69-
float r2 = r * r;
70-
71-
#if IS_CORTEX_M
72-
float den = 1.0f + (r2 * 0.28086f);
73-
#else
74-
// EV3: Avoid unnecessary float constants to save on pool loads
75-
float coeff = 0.28086f;
76-
float den = 1.0f + (r2 * coeff);
77-
#endif
78-
62+
float den = 1.0f + (r * r * 0.28086f);
7963
float res_atan = r * (1.0f / den);
8064

8165
if (abs_x >= abs_y) {
@@ -87,9 +71,9 @@ static inline float fast_atan2_internal(float y, float x) {
8771
return angle;
8872
}
8973

90-
// ------------------------------------------------------------
74+
// -----------------------------------------------------------------------------
9175
// Wrappers & Hardware Benchmarks
92-
// ----------------------------
76+
// -----------------------------------------------------------------------------
9377

9478
static mp_obj_t experimental_sin(mp_obj_t theta_in) {
9579
return mp_obj_new_float_from_f(fast_sin_internal(mp_obj_get_float(theta_in)));
@@ -110,24 +94,25 @@ static mp_obj_t experimental_benchmark_hardware(mp_obj_t seed_in) {
11094
float seed = mp_obj_get_float(seed_in);
11195

11296
#if IS_CORTEX_M
97+
// Spike Prime High-Res Cycle Counter
11398
DEMCR |= 0x01000000; DWT_CONTROL |= 1;
11499
DWT_CYCCNT = 0;
115100
uint32_t start = DWT_CYCCNT;
116101
volatile float res = fast_sin_internal(seed);
117102
res = fast_sin_internal(res + 0.01f);
118103
__asm volatile ("dsb");
119-
uint32_t cyc = DWT_CYCCNT - start;
120-
return mp_obj_new_int(cyc / 2);
104+
return mp_obj_new_int((DWT_CYCCNT - start) / 2);
121105
#else
122-
// EV3: No Cycle Counter. Use microsecond-level timing via ticks_ms.
106+
// EV3 / Generic: average over many iterations, timed with millisecond ticks
123107
uint32_t t0 = mp_hal_ticks_ms();
124108
volatile float res = seed;
125-
for(int i=0; i<20000; i++) {
126-
res = fast_sin_internal(res);
109+
int loops = 50000;
110+
for(int i=0; i<loops; i++) {
111+
res = fast_sin_internal(res + 0.0001f);
127112
}
128-
uint32_t dt = mp_hal_ticks_ms() - t0;
129-
// Return time in microseconds for 100 ops to keep scale
130-
return mp_obj_new_int((dt * 1000) / 20000);
113+
uint32_t dt_ms = mp_hal_ticks_ms() - t0;
114+
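// Return the average time per call in nanoseconds (ms * 1000000 / loops) to match Python expected format.
// NOTE: dt_ms * 1000000 is evaluated in uint32_t and wraps once dt_ms reaches 4295 ms.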
115+
return mp_obj_new_int((dt_ms * 1000000) / loops);
131116
#endif
132117
}
133118
static MP_DEFINE_CONST_FUN_OBJ_1(experimental_benchmark_hardware_obj, experimental_benchmark_hardware);

0 commit comments

Comments
 (0)