Skip to content

Commit bf98c37

Browse files
committed
Added SIMD dot product functions.
Uses the best available implementation based on the instructions available on the target, including the SSE dp instructions. Removed scalar operations for SIMD matrix functions that were used as a fallback when hadd wasn't supported. Length and distance functions now always use SIMD operations when available for dsVector2d and dsVector4f/d.
1 parent e11dbee commit bf98c37

6 files changed

Lines changed: 873 additions & 216 deletions

File tree

Lines changed: 325 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,325 @@
1+
/*
2+
* Copyright 2026 Aaron Barany
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <DeepSea/Core/Config.h>
20+
#include <DeepSea/Math/SIMD/SIMD.h>
21+
22+
#ifdef __cplusplus
23+
extern "C"
24+
{
25+
#endif
26+
27+
/**
28+
* @brief File containing functions to perform dot products with SIMD operations.
29+
*
30+
* These will use the generally best operations available for the given platform, including special
31+
* instructions not available through the generic SIMD functions exposed across platforms.
32+
*/
33+
34+
#if DS_HAS_SIMD
35+
36+
/// @cond
37+
// DS_SIMD_HAS_SSE_DP is 1 when this is an x86 build (DS_X86) and either the compiler reports
// SSE4.1 or AVX as enabled, or DS_X86_ARCH_LEVEL is at least 2. It is 0 otherwise. The functions
// in this file use it to decide whether the _mm_dp_ps/_mm_dp_pd intrinsics may be used directly.
// NOTE(review): DS_X86_ARCH_LEVEL is defined outside this file; level 2 presumably denotes a
// baseline that includes SSE4.1 -- confirm against its definition.
#if DS_X86 && (defined(__SSE4_1__) || defined(__AVX__) || DS_X86_ARCH_LEVEL >= 2)
38+
#define DS_SIMD_HAS_SSE_DP 1
39+
#else
40+
#define DS_SIMD_HAS_SSE_DP 0
41+
#endif
42+
/// @endcond
43+
44+
DS_SIMD_START(DS_SIMD_FLOAT4)
45+
46+
/**
47+
* @brief Performs the dot product between two 4-component vectors.
48+
* @remark This can be used when dsSIMDFeatures_Float4 is available, and will use the most efficient
49+
* implementation based on what is enabled at compile time.
50+
* @param a The first vector.
51+
* @param b The second vector.
52+
* @return A vector with all components set to (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w).
53+
*/
54+
DS_ALWAYS_INLINE dsSIMD4f dsDot4SIMD4f(dsSIMD4f a, dsSIMD4f b)
55+
{
56+
#if DS_SIMD_HAS_SSE_DP
57+
// Mask 0xFF: the high four bits include all four products in the sum, the low four bits write
// the sum to all four components of the result.
return _mm_dp_ps(a, b, 0xFF);
58+
#elif DS_SIMD_ALWAYS_HADD
59+
dsSIMD4f ab = dsSIMD4f_mul(a, b);
60+
// NOTE(review): dsSIMD4f_hadd is defined elsewhere; presumably a pairwise horizontal add, so two
// passes leave the full sum in every component -- confirm against SIMD.h.
ab = dsSIMD4f_hadd(ab, ab);
61+
return dsSIMD4f_hadd(ab, ab);
62+
#elif DS_X86
63+
dsSIMD4f ab = dsSIMD4f_mul(a, b);
64+
// Assume additions are commutative. (should be the case if IEEE compliant)
// The first shuffle swaps the components within each pair, giving (x+y, y+x, z+w, w+z). The
// second shuffle places the opposite pair's sum in each component. The four components therefore
// only differ in operand order, which is why commutativity is relied on.
65+
ab = dsSIMD4f_add(ab, _mm_shuffle_ps(ab, ab, _MM_SHUFFLE(2, 3, 0, 1)));
66+
return dsSIMD4f_add(ab, _mm_shuffle_ps(ab, ab, _MM_SHUFFLE(0, 0, 2, 2)));
67+
#else
68+
dsSIMD4f ab = dsSIMD4f_mul(a, b);
69+
// Generic path: sum the x/y and z/w products separately, then add the two partial sums.
// NOTE(review): dsSIMD4f_set1FromVec presumably broadcasts the indexed component -- confirm.
dsSIMD4f abxy = dsSIMD4f_add(dsSIMD4f_set1FromVec(ab, 0), dsSIMD4f_set1FromVec(ab, 1));
70+
dsSIMD4f abzw = dsSIMD4f_add(dsSIMD4f_set1FromVec(ab, 2), dsSIMD4f_set1FromVec(ab, 3));
71+
return dsSIMD4f_add(abxy, abzw);
72+
#endif
73+
}
74+
75+
/**
76+
* @brief Performs the dot product between two 3-component vectors.
77+
* @remark This can be used when dsSIMDFeatures_Float4 is available, and will use the most efficient
78+
* implementation based on what is enabled at compile time.
* @remark The w components of the inputs do not contribute to the result.
79+
* @param a The first vector.
80+
* @param b The second vector.
81+
* @return A vector with all components set to (a.x*b.x + a.y*b.y) + a.z*b.z.
82+
*/
83+
DS_ALWAYS_INLINE dsSIMD4f dsDot3SIMD4f(dsSIMD4f a, dsSIMD4f b)
84+
{
85+
#if DS_SIMD_HAS_SSE_DP
86+
// Mask 0x7F: the high bits (0111) include only the x, y and z products in the sum, the low four
// bits write the sum to all four components of the result.
return _mm_dp_ps(a, b, 0x7F);
87+
#else
88+
dsSIMD4f ab = dsSIMD4f_mul(a, b);
89+
// Only indices 0, 1 and 2 of the products are read; index 3 (w) is never used.
// NOTE(review): dsSIMD4f_set1FromVec presumably broadcasts the indexed component -- confirm.
dsSIMD4f abxy = dsSIMD4f_add(dsSIMD4f_set1FromVec(ab, 0), dsSIMD4f_set1FromVec(ab, 1));
90+
return dsSIMD4f_add(abxy, dsSIMD4f_set1FromVec(ab, 2));
91+
#endif
92+
}
93+
94+
DS_SIMD_END()
95+
DS_SIMD_START(DS_SIMD_FLOAT4,DS_SIMD_FMA)
96+
97+
/**
97+
* @brief Performs the dot product between two 4-component vectors.
98+
* @remark This is intended when dsSIMDFeatures_Float4 and dsSIMDFeatures_FMA are available, where
99+
* more assumptions can be made about available operations. No FMA operations are performed.
100+
* @param a The first vector.
101+
* @param b The second vector.
102+
* @return A vector with all components set to (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w).
103+
*/
104+
DS_ALWAYS_INLINE dsSIMD4f dsDot4FMA4f(dsSIMD4f a, dsSIMD4f b)
105+
{
106+
#if DS_X86
107+
// The dp intrinsic is used here without checking DS_SIMD_HAS_SSE_DP.
// NOTE(review): presumably the FMA target enabled for this section always provides the
// instruction on x86 -- confirm.
// Mask 0xFF: sum all four products and write the sum to all four components.
return _mm_dp_ps(a, b, 0xFF);
108+
#else
109+
dsSIMD4f ab = dsSIMD4f_mul(a, b);
110+
// Horizontal add is used unconditionally on non-x86 targets in this section.
// NOTE(review): dsSIMD4f_hadd presumably adds adjacent pairs, so two passes leave the full sum
// in every component -- confirm against SIMD.h.
ab = dsSIMD4f_hadd(ab, ab);
111+
return dsSIMD4f_hadd(ab, ab);
112+
#endif
113+
}
115+
116+
/**
116+
* @brief Performs the dot product between two 3-component vectors.
117+
* @remark This is intended when dsSIMDFeatures_Float4 and dsSIMDFeatures_FMA are available, where
118+
* more assumptions can be made about available operations. No FMA operations are performed.
* @remark The w components of the inputs do not contribute to the result.
119+
* @param a The first vector.
120+
* @param b The second vector.
121+
* @return A vector with all components set to (a.x*b.x + a.y*b.y) + a.z*b.z.
122+
*/
123+
DS_ALWAYS_INLINE dsSIMD4f dsDot3FMA4f(dsSIMD4f a, dsSIMD4f b)
124+
{
125+
#if DS_X86
126+
// Used without checking DS_SIMD_HAS_SSE_DP.
// NOTE(review): presumably guaranteed by the FMA target on x86 -- confirm.
// Mask 0x7F: sum only the x, y and z products and write the sum to all four components.
return _mm_dp_ps(a, b, 0x7F);
127+
#else
128+
dsSIMD4f ab = dsSIMD4f_mul(a, b);
129+
// Only indices 0, 1 and 2 of the products are read; index 3 (w) is never used.
dsSIMD4f abxy = dsSIMD4f_add(dsSIMD4f_set1FromVec(ab, 0), dsSIMD4f_set1FromVec(ab, 1));
130+
return dsSIMD4f_add(abxy, dsSIMD4f_set1FromVec(ab, 2));
131+
#endif
132+
}
134+
135+
DS_SIMD_END()
136+
DS_SIMD_START(DS_SIMD_DOUBLE2)
137+
138+
/**
138+
* @brief Performs the dot product between two 4-component vectors.
139+
* @remark This can be used when dsSIMDFeatures_Double2 is available, and will use the most
140+
* efficient implementation based on what is enabled at compile time.
141+
* @param a0 The first two components of the first vector.
142+
* @param a1 The second two components of the first vector.
143+
* @param b0 The first two components of the second vector.
144+
* @param b1 The second two components of the second vector.
145+
* @return A vector with all components set to (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w).
146+
*/
147+
DS_ALWAYS_INLINE dsSIMD2d dsDot4SIMD2d(dsSIMD2d a0, dsSIMD2d a1, dsSIMD2d b0, dsSIMD2d b1)
148+
{
149+
#if DS_SIMD_HAS_SSE_DP
150+
// Mask 0x33: bits 4-5 include both products in the sum, bits 0-1 write the sum to both
// components. One dot product per half, then the two halves are added.
return dsSIMD2d_add(_mm_dp_pd(a0, b0, 0x33), _mm_dp_pd(a1, b1, 0x33));
151+
#elif DS_SIMD_ALWAYS_HADD
152+
dsSIMD2d ab0 = dsSIMD2d_mul(a0, b0);
153+
dsSIMD2d ab1 = dsSIMD2d_mul(a1, b1);
154+
// NOTE(review): dsSIMD2d_hadd is defined elsewhere; presumably it yields (x+y, z+w) here and the
// second pass puts the total in both components -- confirm against SIMD.h.
dsSIMD2d dot = dsSIMD2d_hadd(ab0, ab1);
155+
return dsSIMD2d_hadd(dot, dot);
156+
#elif DS_X86
157+
dsSIMD2d ab0 = dsSIMD2d_mul(a0, b0);
158+
dsSIMD2d ab1 = dsSIMD2d_mul(a1, b1);
159+
// Assume additions are commutative. (should be the case if IEEE compliant)
// Each shuffle swaps the two components, so the sums are (x+y, y+x) and (z+w, w+z). The two
// components differ only in operand order.
160+
dsSIMD2d abxy = dsSIMD2d_add(ab0, _mm_shuffle_pd(ab0, ab0, 0x1));
161+
dsSIMD2d abzw = dsSIMD2d_add(ab1, _mm_shuffle_pd(ab1, ab1, 0x1));
162+
return dsSIMD2d_add(abxy, abzw);
163+
#else
164+
dsSIMD2d ab0 = dsSIMD2d_mul(a0, b0);
165+
dsSIMD2d ab1 = dsSIMD2d_mul(a1, b1);
166+
// Generic path: sum each half's products separately, then add the two partial sums.
// NOTE(review): dsSIMD2d_set1FromVec presumably broadcasts the indexed component -- confirm.
dsSIMD2d abxy = dsSIMD2d_add(dsSIMD2d_set1FromVec(ab0, 0), dsSIMD2d_set1FromVec(ab0, 1));
167+
dsSIMD2d abzw = dsSIMD2d_add(dsSIMD2d_set1FromVec(ab1, 0), dsSIMD2d_set1FromVec(ab1, 1));
168+
return dsSIMD2d_add(abxy, abzw);
169+
#endif
170+
}
172+
173+
/**
173+
* @brief Performs the dot product between two 3-component vectors.
174+
* @remark This can be used when dsSIMDFeatures_Double2 is available, and will use the most
175+
* efficient implementation based on what is enabled at compile time.
* @remark Only the first component of a1 and b1 (the z values) contributes to the result.
176+
* @param a0 The first two components of the first vector.
177+
* @param a1 The second two components of the first vector.
178+
* @param b0 The first two components of the second vector.
179+
* @param b1 The second two components of the second vector.
180+
* @return A vector with all components set to (a.x*b.x + a.y*b.y) + a.z*b.z.
181+
*/
182+
DS_ALWAYS_INLINE dsSIMD2d dsDot3SIMD2d(dsSIMD2d a0, dsSIMD2d a1, dsSIMD2d b0, dsSIMD2d b1)
183+
{
184+
#if DS_SIMD_HAS_SSE_DP
185+
// Dot product of the x/y half (mask 0x33: both products summed, sum written to both components),
// plus index 0 of the a1*b1 product.
// NOTE(review): dsSIMD2d_set1FromVec presumably broadcasts the indexed component -- confirm.
return dsSIMD2d_add(_mm_dp_pd(a0, b0, 0x33), dsSIMD2d_set1FromVec(dsSIMD2d_mul(a1, b1), 0));
186+
#else
187+
dsSIMD2d ab0 = dsSIMD2d_mul(a0, b0);
188+
dsSIMD2d ab1 = dsSIMD2d_mul(a1, b1);
189+
// Index 1 of ab1 is never read.
dsSIMD2d abxy = dsSIMD2d_add(dsSIMD2d_set1FromVec(ab0, 0), dsSIMD2d_set1FromVec(ab0, 1));
190+
return dsSIMD2d_add(abxy, dsSIMD2d_set1FromVec(ab1, 0));
191+
#endif
192+
}
194+
195+
/**
195+
* @brief Performs the dot product between two 2-component vectors.
196+
* @remark This can be used when dsSIMDFeatures_Double2 is available, and will use the most
197+
* efficient implementation based on what is enabled at compile time.
198+
* @param a The first vector.
199+
* @param b The second vector.
200+
* @return A vector with all components set to a.x*b.x + a.y*b.y.
201+
*/
202+
DS_ALWAYS_INLINE dsSIMD2d dsDot2SIMD2d(dsSIMD2d a, dsSIMD2d b)
203+
{
204+
#if DS_SIMD_HAS_SSE_DP
205+
// Mask 0x33: bits 4-5 include both products in the sum, bits 0-1 write the sum to both
// components.
return _mm_dp_pd(a, b, 0x33);
206+
#elif DS_SIMD_ALWAYS_HADD
207+
dsSIMD2d ab = dsSIMD2d_mul(a, b);
208+
// NOTE(review): dsSIMD2d_hadd is defined elsewhere; presumably a pairwise horizontal add, which
// with identical operands leaves x+y in both components -- confirm against SIMD.h.
return dsSIMD2d_hadd(ab, ab);
209+
#else
210+
dsSIMD2d ab = dsSIMD2d_mul(a, b);
211+
// NOTE(review): dsSIMD2d_set1FromVec presumably broadcasts the indexed component -- confirm.
return dsSIMD2d_add(dsSIMD2d_set1FromVec(ab, 0), dsSIMD2d_set1FromVec(ab, 1));
212+
#endif
213+
}
215+
216+
DS_SIMD_END()
217+
DS_SIMD_START(DS_SIMD_DOUBLE2,DS_SIMD_FMA)
218+
219+
/**
219+
* @brief Performs the dot product between two 4-component vectors.
220+
* @remark This is intended when dsSIMDFeatures_Double2 and dsSIMDFeatures_FMA are available, where
221+
* more assumptions can be made about available operations. No FMA operations are performed.
222+
* @param a0 The first two components of the first vector.
223+
* @param a1 The second two components of the first vector.
224+
* @param b0 The first two components of the second vector.
225+
* @param b1 The second two components of the second vector.
226+
* @return A vector with all components set to (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w).
227+
*/
228+
DS_ALWAYS_INLINE dsSIMD2d dsDot4FMA2d(dsSIMD2d a0, dsSIMD2d a1, dsSIMD2d b0, dsSIMD2d b1)
229+
{
230+
#if DS_X86
231+
// Used without checking DS_SIMD_HAS_SSE_DP.
// NOTE(review): presumably guaranteed by the FMA target on x86 -- confirm.
// Mask 0x33: both products summed, sum written to both components; the two halves are then added.
return dsSIMD2d_add(_mm_dp_pd(a0, b0, 0x33), _mm_dp_pd(a1, b1, 0x33));
232+
#else
233+
dsSIMD2d ab0 = dsSIMD2d_mul(a0, b0);
234+
dsSIMD2d ab1 = dsSIMD2d_mul(a1, b1);
235+
// NOTE(review): dsSIMD2d_hadd is defined elsewhere; presumably it yields (x+y, z+w) here and the
// second pass puts the total in both components -- confirm against SIMD.h.
dsSIMD2d dot = dsSIMD2d_hadd(ab0, ab1);
236+
return dsSIMD2d_hadd(dot, dot);
237+
#endif
238+
}
240+
241+
/**
241+
* @brief Performs the dot product between two 3-component vectors.
242+
* @remark This is intended when dsSIMDFeatures_Double2 and dsSIMDFeatures_FMA are available, where
243+
* more assumptions can be made about available operations. No FMA operations are performed.
* @remark Only the first component of a1 and b1 (the z values) contributes to the result.
244+
* @param a0 The first two components of the first vector.
245+
* @param a1 The second two components of the first vector.
246+
* @param b0 The first two components of the second vector.
247+
* @param b1 The second two components of the second vector.
248+
* @return A vector with all components set to (a.x*b.x + a.y*b.y) + a.z*b.z.
249+
*/
250+
DS_ALWAYS_INLINE dsSIMD2d dsDot3FMA2d(dsSIMD2d a0, dsSIMD2d a1, dsSIMD2d b0, dsSIMD2d b1)
251+
{
252+
#if DS_X86
253+
// Used without checking DS_SIMD_HAS_SSE_DP.
// NOTE(review): presumably guaranteed by the FMA target on x86 -- confirm.
// Dot product of the x/y half plus index 0 of the a1*b1 product.
return dsSIMD2d_add(_mm_dp_pd(a0, b0, 0x33), dsSIMD2d_set1FromVec(dsSIMD2d_mul(a1, b1), 0));
254+
#else
255+
dsSIMD2d ab0 = dsSIMD2d_mul(a0, b0);
256+
dsSIMD2d ab1 = dsSIMD2d_mul(a1, b1);
257+
// Index 1 of ab1 is never read.
// NOTE(review): dsSIMD2d_set1FromVec presumably broadcasts the indexed component -- confirm.
dsSIMD2d abxy = dsSIMD2d_add(dsSIMD2d_set1FromVec(ab0, 0), dsSIMD2d_set1FromVec(ab0, 1));
258+
return dsSIMD2d_add(abxy, dsSIMD2d_set1FromVec(ab1, 0));
259+
#endif
260+
}
262+
263+
/**
263+
* @brief Performs the dot product between two 2-component vectors.
264+
* @remark This is intended when dsSIMDFeatures_Double2 and dsSIMDFeatures_FMA are available, where
265+
* more assumptions can be made about available operations. No FMA operations are performed.
266+
* @param a The first vector.
267+
* @param b The second vector.
268+
* @return A vector with all components set to a.x*b.x + a.y*b.y.
269+
*/
270+
DS_ALWAYS_INLINE dsSIMD2d dsDot2FMA2d(dsSIMD2d a, dsSIMD2d b)
271+
{
272+
#if DS_X86
273+
// Used without checking DS_SIMD_HAS_SSE_DP.
// NOTE(review): presumably guaranteed by the FMA target on x86 -- confirm.
// Mask 0x33: both products summed, sum written to both components.
return _mm_dp_pd(a, b, 0x33);
274+
#else
275+
dsSIMD2d ab = dsSIMD2d_mul(a, b);
276+
// NOTE(review): dsSIMD2d_hadd is defined elsewhere; presumably a pairwise horizontal add, which
// with identical operands leaves x+y in both components -- confirm against SIMD.h.
return dsSIMD2d_hadd(ab, ab);
277+
#endif
278+
}
280+
281+
DS_SIMD_END()
282+
DS_SIMD_START(DS_SIMD_DOUBLE4)
283+
284+
/**
284+
* @brief Performs the dot product between two 4-component vectors.
285+
* @remark This can be used when dsSIMDFeatures_Double4 is available, and will use the most efficient
286+
* implementation based on what is enabled at compile time.
287+
* @param a The first vector.
288+
* @param b The second vector.
289+
* @return A vector with all components set to (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w).
290+
*/
291+
DS_ALWAYS_INLINE dsSIMD4d dsDot4SIMD4d(dsSIMD4d a, dsSIMD4d b)
292+
{
293+
dsSIMD4d ab = dsSIMD4d_mul(a, b);
294+
// After this step both branches below read the x/y sum from component 0 and the z/w sum from
// component 2.
// NOTE(review): dsSIMD4d_hadd is defined elsewhere; presumably it produces
// (x+y, x+y, z+w, z+w) for identical operands -- confirm against SIMD.h.
ab = dsSIMD4d_hadd(ab, ab);
295+
#if DS_X86
296+
// Expected to be faster since permutations are fairly slow across 128-bit boundaries.
// A single permute yields (ab[2], ab[2], ab[0], ab[0]), so one add completes the sum in every
// component, versus two separate broadcasts in the generic path.
297+
return dsSIMD4d_add(ab, _mm256_permute4x64_pd(ab, _MM_SHUFFLE(0, 0, 2, 2)));
298+
#else
299+
// NOTE(review): dsSIMD4d_set1FromVec presumably broadcasts the indexed component -- confirm.
return dsSIMD4d_add(dsSIMD4d_set1FromVec(ab, 0), dsSIMD4d_set1FromVec(ab, 2));
300+
#endif
301+
}
303+
304+
/**
304+
* @brief Performs the dot product between two 3-component vectors.
305+
* @remark This can be used when dsSIMDFeatures_Double4 is available, and will use the most efficient
306+
* implementation based on what is enabled at compile time.
* @remark The w components of the inputs do not contribute to the result.
307+
* @param a The first vector.
308+
* @param b The second vector.
309+
* @return A vector with all components set to (a.x*b.x + a.y*b.y) + a.z*b.z.
310+
*/
311+
DS_ALWAYS_INLINE dsSIMD4d dsDot3SIMD4d(dsSIMD4d a, dsSIMD4d b)
312+
{
313+
dsSIMD4d ab = dsSIMD4d_mul(a, b);
314+
// Only indices 0, 1 and 2 of the products are read; index 3 (w) is never used.
// NOTE(review): dsSIMD4d_set1FromVec presumably broadcasts the indexed component -- confirm.
dsSIMD4d abxy = dsSIMD4d_add(dsSIMD4d_set1FromVec(ab, 0), dsSIMD4d_set1FromVec(ab, 1));
315+
return dsSIMD4d_add(abxy, dsSIMD4d_set1FromVec(ab, 2));
316+
}
318+
319+
DS_SIMD_END()
320+
321+
#endif // DS_HAS_SIMD
322+
323+
#ifdef __cplusplus
324+
}
325+
#endif

0 commit comments

Comments
 (0)