Skip to content

Commit 13ec0df

Browse files
committed
Merge branch 'TEST-PR-Streaming-and-Lazy-loading-ghosts' of https://github.com/marauder2k9-torque/Torque3D into TEST-PR-Streaming-and-Lazy-loading-ghosts
2 parents 43d1a91 + 25ccdde commit 13ec0df

9 files changed

Lines changed: 248 additions & 16 deletions

File tree

Engine/source/math/impl/float4_impl.inl

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22
#include <cmath> // for sqrtf, etc.
3+
#include "../mConstants.h"
34

45
namespace math_backend::float4
56
{
@@ -46,7 +47,7 @@ namespace math_backend::float4
4647
{
4748
f32x4 va = v_load(a);
4849
f32x4 vb = v_load(b);
49-
f32x4 vr = _mm_div_ps(va, vb);
50+
f32x4 vr = v_div(va, vb);
5051
v_store(r, vr);
5152
}
5253

@@ -84,7 +85,7 @@ namespace math_backend::float4
8485
inline void float4_normalize_impl(float* a)
8586
{
8687
float len = float4_length_impl(a);
87-
if (len > 1e-6f) // safe threshold
88+
if (len > POINT_EPSILON) // safe threshold
8889
{
8990
float4_mul_scalar_impl(a, 1.0f / len, a);
9091
}
@@ -94,7 +95,7 @@ namespace math_backend::float4
9495
inline void float4_normalize_mag_impl(float* a, float r)
9596
{
9697
float len = float4_length_impl(a);
97-
if (len > 1e-6f)
98+
if (len > POINT_EPSILON)
9899
{
99100
float4_mul_scalar_impl(a, r / len, a);
100101
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
2+
#include "float4_dispatch.h"
3+
#include <immintrin.h> // AVX/AVX2 intrinsics
4+
5+
namespace
{
   // Four packed single-precision floats held in one 128-bit register.
   using f32x4 = __m128;

   // Reads four consecutive floats starting at p. _mm_loadu_ps places no
   // alignment requirement on the address.
   inline f32x4 v_load(const float* p)
   {
      return _mm_loadu_ps(p);
   }

   // Writes the four lanes of v to dst. The store is likewise unaligned.
   inline void v_store(float* dst, f32x4 v)
   {
      _mm_storeu_ps(dst, v);
   }

   // Replicates the scalar s into every lane.
   inline f32x4 v_set1(float s)
   {
      return _mm_set1_ps(s);
   }

   // Lane-wise product a * b.
   inline f32x4 v_mul(f32x4 a, f32x4 b)
   {
      return _mm_mul_ps(a, b);
   }

   // Lane-wise quotient a / b.
   inline f32x4 v_div(f32x4 a, f32x4 b)
   {
      return _mm_div_ps(a, b);
   }

   // Lane-wise sum a + b.
   inline f32x4 v_add(f32x4 a, f32x4 b)
   {
      return _mm_add_ps(a, b);
   }

   // Lane-wise difference a - b.
   inline f32x4 v_sub(f32x4 a, f32x4 b)
   {
      return _mm_sub_ps(a, b);
   }

   // Adds the four lanes of a together and returns the total as a scalar
   // (the reduction step behind dot product, length, etc.).
   inline float v_hadd4(f32x4 a)
   {
      const f32x4 pairSums = _mm_hadd_ps(a, a);               // [a0+a1, a2+a3, a0+a1, a2+a3]
      const f32x4 total    = _mm_hadd_ps(pairSums, pairSums); // every lane = a0+a1+a2+a3
      return _mm_cvtss_f32(total);                            // read lane 0
   }

   // Dot product of the four floats at a with the four floats at b, computed
   // with the dedicated dot-product instruction. Mask 0xF1: the high nibble
   // selects all four lanes for multiplication, the low nibble writes the sum
   // to lane 0 only.
   float float4_dot_avx(const float* a, const float* b)
   {
      const f32x4 dp = _mm_dp_ps(v_load(a), v_load(b), 0xF1);
      return _mm_cvtss_f32(dp);
   }
}
47+
48+
#include "float4_impl.inl"
49+
50+
namespace math_backend::float4::dispatch
51+
{
52+
// Install AVX backend
53+
void install_avx()
54+
{
55+
gFloat4.add = float4_add_impl;
56+
gFloat4.sub = float4_sub_impl;
57+
gFloat4.mul = float4_mul_impl;
58+
gFloat4.mul_scalar = float4_mul_scalar_impl;
59+
gFloat4.div = float4_div_impl;
60+
gFloat4.div_scalar = float4_div_scalar_impl;
61+
gFloat4.dot = float4_dot_avx;
62+
gFloat4.length = float4_length_impl;
63+
gFloat4.lengthSquared = float4_length_squared_impl;
64+
gFloat4.normalize = float4_normalize_impl;
65+
gFloat4.lerp = float4_lerp_impl;
66+
}
67+
}

Engine/source/math/isa/avx2/float4.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,15 @@ namespace
3434
__m128 t2 = _mm_hadd_ps(t1, t1); // sums again: first element = a0+a1+a2+a3
3535
return _mm_cvtss_f32(t2); // extract first element
3636
}
37+
38+
// Dot product of the four floats at a with the four floats at b, using the
// dedicated dot-product instruction. Mask 0xF1 multiplies all four lanes and
// writes their sum to lane 0 only, which is then returned as a scalar.
float float4_dot_avx(const float* a, const float* b)
{
   const __m128 lhs = _mm_loadu_ps(a);
   const __m128 rhs = _mm_loadu_ps(b);
   return _mm_cvtss_f32(_mm_dp_ps(lhs, rhs, 0xF1));
}
3746
}
3847

3948
#include "float4_impl.inl"
@@ -49,7 +58,7 @@ namespace math_backend::float4::dispatch
4958
gFloat4.mul_scalar = float4_mul_scalar_impl;
5059
gFloat4.div = float4_div_impl;
5160
gFloat4.div_scalar = float4_div_scalar_impl;
52-
gFloat4.dot = float4_dot_impl;
61+
gFloat4.dot = float4_dot_avx;
5362
gFloat4.length = float4_length_impl;
5463
gFloat4.lengthSquared = float4_length_squared_impl;
5564
gFloat4.normalize = float4_normalize_impl;

Engine/source/math/isa/sse2/float4.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -40,19 +40,19 @@ namespace
4040

4141
namespace math_backend::float4::dispatch
4242
{
43-
// Install AVX2 backend
43+
// Install SSE2 backend
4444
void install_sse2()
4545
{
46-
gFloat4.add = float4_add_impl;
47-
gFloat4.sub = float4_sub_impl;
48-
gFloat4.mul = float4_mul_impl;
49-
gFloat4.mul_scalar = float4_mul_scalar_impl;
50-
gFloat4.div = float4_div_impl;
51-
gFloat4.div_scalar = float4_div_scalar_impl;
52-
gFloat4.dot = float4_dot_impl;
53-
gFloat4.length = float4_length_impl;
46+
gFloat4.add = float4_add_impl;
47+
gFloat4.sub = float4_sub_impl;
48+
gFloat4.mul = float4_mul_impl;
49+
gFloat4.mul_scalar = float4_mul_scalar_impl;
50+
gFloat4.div = float4_div_impl;
51+
gFloat4.div_scalar = float4_div_scalar_impl;
52+
gFloat4.dot = float4_dot_impl;
53+
gFloat4.length = float4_length_impl;
5454
gFloat4.lengthSquared = float4_length_squared_impl;
55-
gFloat4.normalize = float4_normalize_impl;
56-
gFloat4.lerp = float4_lerp_impl;
55+
gFloat4.normalize = float4_normalize_impl;
56+
gFloat4.lerp = float4_lerp_impl;
5757
}
5858
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
2+
#include "float4_dispatch.h"
3+
#include <smmintrin.h> // SSE41 intrinsics
4+
5+
namespace
{
   // Four packed single-precision floats held in one 128-bit register.
   using f32x4 = __m128;

   // Reads four consecutive floats starting at p. _mm_loadu_ps places no
   // alignment requirement on the address.
   inline f32x4 v_load(const float* p)
   {
      return _mm_loadu_ps(p);
   }

   // Writes the four lanes of v to dst. The store is likewise unaligned.
   inline void v_store(float* dst, f32x4 v)
   {
      _mm_storeu_ps(dst, v);
   }

   // Replicates the scalar s into every lane.
   inline f32x4 v_set1(float s)
   {
      return _mm_set1_ps(s);
   }

   // Lane-wise product a * b.
   inline f32x4 v_mul(f32x4 a, f32x4 b)
   {
      return _mm_mul_ps(a, b);
   }

   // Lane-wise quotient a / b.
   inline f32x4 v_div(f32x4 a, f32x4 b)
   {
      return _mm_div_ps(a, b);
   }

   // Lane-wise sum a + b.
   inline f32x4 v_add(f32x4 a, f32x4 b)
   {
      return _mm_add_ps(a, b);
   }

   // Lane-wise difference a - b.
   inline f32x4 v_sub(f32x4 a, f32x4 b)
   {
      return _mm_sub_ps(a, b);
   }

   // Adds the four lanes of a together and returns the total as a scalar
   // (the reduction step behind dot product, length, etc.).
   inline float v_hadd4(f32x4 a)
   {
      const f32x4 pairSums = _mm_hadd_ps(a, a);               // [a0+a1, a2+a3, a0+a1, a2+a3]
      const f32x4 total    = _mm_hadd_ps(pairSums, pairSums); // every lane = a0+a1+a2+a3
      return _mm_cvtss_f32(total);                            // read lane 0
   }

   // Dot product of the four floats at a with the four floats at b, computed
   // with the SSE4.1 dot-product instruction. Mask 0xF1: the high nibble
   // selects all four lanes for multiplication, the low nibble writes the sum
   // to lane 0 only.
   float float4_dot_sse41(const float* a, const float* b)
   {
      const f32x4 dp = _mm_dp_ps(v_load(a), v_load(b), 0xF1);
      return _mm_cvtss_f32(dp);
   }
}
47+
48+
#include "float4_impl.inl"
49+
50+
namespace math_backend::float4::dispatch
51+
{
52+
// Install SSE41 backend
53+
void install_sse41()
54+
{
55+
gFloat4.add = float4_add_impl;
56+
gFloat4.sub = float4_sub_impl;
57+
gFloat4.mul = float4_mul_impl;
58+
gFloat4.mul_scalar = float4_mul_scalar_impl;
59+
gFloat4.div = float4_div_impl;
60+
gFloat4.div_scalar = float4_div_scalar_impl;
61+
gFloat4.dot = float4_dot_sse41;
62+
gFloat4.length = float4_length_impl;
63+
gFloat4.lengthSquared = float4_length_squared_impl;
64+
gFloat4.normalize = float4_normalize_impl;
65+
gFloat4.lerp = float4_lerp_impl;
66+
}
67+
}

Engine/source/math/public/float4_dispatch.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
#pragma once
2+
#ifndef _FLOAT4_DISPATCH_H_
3+
#define _FLOAT4_DISPATCH_H_
4+
5+
26
#include <cstdint>
37

48
namespace math_backend::float4::dispatch
@@ -32,3 +36,5 @@ namespace math_backend::float4::dispatch
3236
// Centralized installer (engine calls this once)
3337
void install_preferred();
3438
}
39+
40+
#endif // !_FLOAT4_DISPATCH_H_
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#pragma once
2+
#include "math/public/math_backend.h"
3+
4+
// Picks the most capable SIMD backend that the feature bits in cpu_flags
// advertise, for the architecture this file is being compiled for.
//
// cpu_flags - bitmask tested against the CPU feature constants below.
//
// Returns backend::scalar when none of the tested bits is set, or when the
// target architecture matches neither preprocessor branch.
math_backend::backend math_backend::choose_backend(U32 cpu_flags)
{
   // __i386__ is listed alongside _M_IX86 so that 32-bit x86 builds are
   // recognised on GCC/Clang as well as on MSVC.
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)

   // Checked from the widest instruction set down to the narrowest, so the
   // first hit is the best one available.
   if (cpu_flags & CPU_PROP_AVX2)   return backend::avx2;
   if (cpu_flags & CPU_PROP_AVX)    return backend::avx;
   if (cpu_flags & CPU_PROP_SSE4_1) return backend::sse41;
   if (cpu_flags & CPU_PROP_SSE2)   return backend::sse2;

#elif defined(__aarch64__) || defined(__ARM_NEON)

   // NOTE(review): the x86 branch uses CPU_PROP_* names while this flag is
   // spelled CPU_NEON - confirm the name against the CPU property enum.
   if (cpu_flags & CPU_NEON) return backend::neon;

#endif
   return backend::scalar;
}
20+
21+
void math_backend::install_from_cpu_flags(uint32_t cpu_flags)
22+
{
23+
{
24+
g_backend = choose_backend(cpu_flags);
25+
26+
switch (g_backend)
27+
{
28+
case backend::avx2:
29+
float4::dispatch::install_avx2();
30+
break;
31+
32+
case backend::avx:
33+
//float4::dispatch::install_avx();
34+
break;
35+
36+
case backend::sse41:
37+
float4::dispatch::install_sse41();
38+
break;
39+
40+
case backend::sse2:
41+
float4::dispatch::install_sse2();
42+
break;
43+
44+
case backend::neon:
45+
float4::dispatch::install_neon();
46+
break;
47+
48+
default:
49+
float4::dispatch::install_scalar();
50+
break;
51+
}
52+
}
53+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#pragma once
2+
#ifndef _MCONSTANTS_H_
3+
#include "math/mConstants.h"
4+
#endif
5+
#ifndef _PLATFORMASSERT_H_
6+
#include "platform/platformAssert.h"
7+
#endif
8+
#ifndef _FLOAT4_DISPATCH_H_
9+
#include "math/public/float4_dispatch.h"
10+
#endif
11+
12+
namespace math_backend
13+
{
14+
enum class backend
15+
{
16+
scalar,
17+
sse2,
18+
sse41,
19+
avx,
20+
avx2,
21+
neon
22+
};
23+
24+
static backend g_backend = backend::scalar;
25+
backend choose_backend(U32 cpu_flags);
26+
void install_from_cpu_flags(uint32_t cpu_flags);
27+
}

Tools/CMake/torque_macros.cmake

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,9 @@ function(add_math_backend name compile_defs)
154154

155155
# ISA flags
156156
if(MSVC)
157-
if(name STREQUAL "sse2" OR name STREQUAL "sse41")
157+
if(name STREQUAL "sse2")
158+
target_compile_options(math_${name} PRIVATE /arch:SSE2)
159+
elseif(name STREQUAL "sse41")
158160
target_compile_options(math_${name} PRIVATE /arch:SSE2)
159161
elseif(name STREQUAL "avx")
160162
target_compile_options(math_${name} PRIVATE /arch:AVX)

0 commit comments

Comments
 (0)