Skip to content

Commit 5429fa3

Browse files
committed
ISA backends float3 and float4 - cleanup history squash
Working for both neon32 and neon64. Update math_backend.cpp. Further SSE SIMD additions. AVX2 float3 added. Added normalize_magnitude. Added divide fast to float3 (may copy to float4). Move static spheremesh to drawSphere (initialize on first use) so platform has a chance to load the math backend. All float3 and float4 functions and ISAs completed: all options of float3 and float4 functions in ISAs and math_c; neon still to be done, but that will be on mac. Update math_backend.cpp. Mac ISA neon update. Added float3. Restructured the classes to look more like the final version of the x86 classes. Linux required changes. Update build-macos-clang.yml. Update build-macos-clang.yml. Revert "Update build-macos-clang.yml": this reverts commit 29dfc56. Revert "Update build-macos-clang.yml": this reverts commit 2abad2b. Update CMakeLists.txt. Fix macs stupid build. Remove god awful rolling average from frame time tracker. Use intrinsic headers instead: each ISA implementation now uses a header for that ISA's intrinsic functions; these are then used in the impl files. This will make it easier for matrix functions when those are implemented. Fixed comment saying 256 when it should be 512 for avx512. Consolidated initializers for function tables. Update neon_intrinsics.h. Fixes for some neon intrinsics: no idea if this is the best way to do these, but they work at least. v_cross is especially messy at the moment; we basically just do it as a C math function. Need to look into getting this done correctly.
1 parent 13ec0df commit 5429fa3

36 files changed

Lines changed: 1478 additions & 416 deletions

Engine/source/CMakeLists.txt

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -503,12 +503,17 @@ set(IS_ARM FALSE)
503503

504504
if(ARCH MATCHES "x86_64|amd64|i[3-6]86")
505505
set(IS_X86 TRUE)
506-
elseif(ARCH MATCHES "arm64|aarch64")
506+
endif()
507+
508+
if(ARCH MATCHES "arm64|aarch64|arm")
507509
set(IS_ARM TRUE)
508510
endif()
509511

510512
# always available
511513
add_math_backend(scalar MATH_SIMD_SCALAR)
514+
message(STATUS "Processor: ${CMAKE_SYSTEM_PROCESSOR}")
515+
message(STATUS "IS_X86=${IS_X86}")
516+
message(STATUS "IS_ARM=${IS_ARM}")
512517

513518
# x86 family
514519
if(IS_X86)

Engine/source/gfx/gfxDrawUtil.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -853,7 +853,11 @@ void GFXDrawUtil::drawThickLine(F32 x1, F32 y1, F32 z1, F32 x2, F32 y2, F32 z2,
853853
// 3D World Draw Misc
854854
//-----------------------------------------------------------------------------
855855

856-
static SphereMesh gSphere;
856+
// Returns the sphere mesh used by drawSphere, constructing it on first call.
//
// The mesh is a function-local static rather than a namespace-scope global so
// that it is built on first use instead of during static initialization.
// The accessor is `static` (internal linkage): it replaces a file-local global,
// and within the visible code it is only used by drawSphere, so it should not
// export a generically named symbol from this translation unit.
static SphereMesh& getSphere()
{
   static SphereMesh instance;
   return instance;
}
857861

858862
void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const Point3F &pos, const ColorI &color, bool drawTop, bool drawBottom, const MatrixF *xfm )
859863
{
@@ -868,7 +872,7 @@ void GFXDrawUtil::drawSphere( const GFXStateBlockDesc &desc, F32 radius, const P
868872
GFX->pushWorldMatrix();
869873
GFX->multWorld(mat);
870874

871-
const SphereMesh::TriangleMesh * sphereMesh = gSphere.getMesh(2);
875+
const SphereMesh::TriangleMesh * sphereMesh = getSphere().getMesh(2);
872876
S32 numPoly = sphereMesh->numPoly;
873877
S32 totalPoly = 0;
874878
GFXVertexBufferHandle<GFXVertexPCT> verts(mDevice, numPoly*3, GFXBufferTypeVolatile);
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#pragma once
2+
#include <cmath> // for sqrtf, etc.
3+
#include "../mConstants.h"
4+
5+
// Safely loads a float3 -> simd 4 lane backend
6+
namespace math_backend::float3
7+
{
8+
//----------------------------------------------------------
9+
// Add two float4 vectors: r = a + b
10+
inline void float3_add_impl(const float* a, const float* b, float* r)
11+
{
12+
f32x4 va = v_load3_vec(a);
13+
f32x4 vb = v_load3_vec(b);
14+
f32x4 vr = v_add(va, vb);
15+
v_store3(r, vr);
16+
}
17+
18+
// Subtract: r = a - b
19+
inline void float3_sub_impl(const float* a, const float* b, float* r)
20+
{
21+
f32x4 va = v_load3_vec(a);
22+
f32x4 vb = v_load3_vec(b);
23+
f32x4 vr = v_sub(va, vb);
24+
v_store3(r, vr);
25+
}
26+
27+
// Multiply element-wise: r = a * b
28+
inline void float3_mul_impl(const float* a, const float* b, float* r)
29+
{
30+
f32x4 va = v_load3_vec(a);
31+
f32x4 vb = v_load3_vec(b);
32+
f32x4 vr = v_mul(va, vb);
33+
v_store3(r, vr);
34+
}
35+
36+
// Multiply by scalar: r = a * s
37+
inline void float3_mul_scalar_impl(const float* a, float s, float* r)
38+
{
39+
f32x4 va = v_load3_vec(a);
40+
f32x4 vs = v_set1(s);
41+
f32x4 vr = v_mul(va, vs);
42+
v_store3(r, vr);
43+
}
44+
45+
// Divide element-wise: r = a / b
46+
inline void float3_div_impl(const float* a, const float* b, float* r)
47+
{
48+
f32x4 va = v_load3_vec(a);
49+
f32x4 vb = v_load3_vec(b);
50+
f32x4 vr = v_div(va, vb);
51+
v_store3(r, vr);
52+
}
53+
54+
// Divide by scalar: r = a / s
55+
inline void float3_div_scalar_impl(const float* a, float s, float* r)
56+
{
57+
f32x4 va = v_load3_vec(a);
58+
f32x4 vs = v_set1(s);
59+
f32x4 vr = v_div(va, vs);
60+
v_store3(r, vr);
61+
}
62+
63+
// Dot product: returns scalar
64+
inline float float3_dot_impl(const float* a, const float* b)
65+
{
66+
f32x4 va = v_load3_vec(a);
67+
f32x4 vb = v_load3_vec(b);
68+
f32x4 vdot = v_dot3(va, vb);
69+
return v_extract0(vdot); // first lane is the sum of 3 elements
70+
}
71+
72+
// Length squared
73+
inline float float3_length_squared_impl(const float* a)
74+
{
75+
return float3_dot_impl(a, a);
76+
}
77+
78+
// Length
79+
inline float float3_length_impl(const float* a)
80+
{
81+
return std::sqrt(float3_length_squared_impl(a));
82+
}
83+
84+
// Normalize in-place
85+
inline void float3_normalize_impl(float* a)
86+
{
87+
f32x4 va = v_load3_vec(a);
88+
f32x4 invLen = v_rsqrt_nr(v_dot3(va, va)); // fully abstracted
89+
f32x4 vnorm = v_mul(va, invLen);
90+
v_store3(a, vnorm);
91+
}
92+
93+
// Normalize with magnitude: r = normalize(a) * r
94+
inline void float3_normalize_mag_impl(float* a, float r)
95+
{
96+
f32x4 va = v_load3_vec(a);
97+
98+
// invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a))
99+
f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot3(va, va)));
100+
101+
f32x4 vnorm = v_mul(va, invLen);
102+
v_store(a, vnorm);
103+
}
104+
105+
// Linear interpolation: r = from + (to - from) * f
106+
inline void float3_lerp_impl(const float* from, const float* to, float f, float* r)
107+
{
108+
f32x4 vfrom = v_load3_vec(from);
109+
f32x4 vto = v_load3_vec(to);
110+
f32x4 vf = v_set1(f);
111+
f32x4 vr = v_add(vfrom, v_mul(vf, v_sub(vto, vfrom)));
112+
v_store3(r, vr);
113+
}
114+
115+
inline void float3_cross_impl(const float* a, const float* b, float* r)
116+
{
117+
f32x4 va = v_load3_vec(a);
118+
f32x4 vb = v_load3_vec(b);
119+
f32x4 vcross = v_cross(va, vb);
120+
v_store3(r, vcross);
121+
}
122+
123+
}

Engine/source/math/impl/float4_c.cpp

Lines changed: 0 additions & 60 deletions
This file was deleted.

Engine/source/math/impl/float4_impl.inl

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ namespace math_backend::float4
6565
{
6666
f32x4 va = v_load(a);
6767
f32x4 vb = v_load(b);
68-
f32x4 vmul = v_mul(va, vb);
69-
return v_hadd4(vmul);
68+
f32x4 vdot = v_dot4(va, vb); // calls ISA-specific implementation
69+
return v_extract0(vdot);
7070
}
7171

7272
// Length squared
@@ -84,21 +84,22 @@ namespace math_backend::float4
8484
// Normalize in-place
8585
inline void float4_normalize_impl(float* a)
8686
{
87-
float len = float4_length_impl(a);
88-
if (len > POINT_EPSILON) // safe threshold
89-
{
90-
float4_mul_scalar_impl(a, 1.0f / len, a);
91-
}
87+
f32x4 va = v_load(a);
88+
f32x4 invLen = v_rsqrt_nr(v_dot4(va, va)); // fully abstracted
89+
f32x4 vnorm = v_mul(va, invLen);
90+
v_store(a, vnorm);
9291
}
9392

9493
// Normalize with magnitude: r = normalize(a) * r
9594
inline void float4_normalize_mag_impl(float* a, float r)
9695
{
97-
float len = float4_length_impl(a);
98-
if (len > POINT_EPSILON)
99-
{
100-
float4_mul_scalar_impl(a, r / len, a);
101-
}
96+
f32x4 va = v_load(a);
97+
98+
// invLen = r / sqrt(dot(a,a)) = r * rsqrt(dot(a,a))
99+
f32x4 invLen = v_mul(v_set1(r), v_rsqrt_nr(v_dot4(va, va)));
100+
101+
f32x4 vnorm = v_mul(va, invLen);
102+
v_store(a, vnorm);
102103
}
103104

104105
// Linear interpolation: r = from + (to - from) * f
@@ -111,4 +112,12 @@ namespace math_backend::float4
111112
v_store(r, vr);
112113
}
113114

115+
inline void float4_cross_impl(const float* a, const float* b, float* r)
116+
{
117+
f32x4 va = v_load(a);
118+
f32x4 vb = v_load(b);
119+
f32x4 vcross = v_cross(va, vb);
120+
v_store(r, vcross);
121+
}
122+
114123
} // namespace math_backend::float4

0 commit comments

Comments
 (0)