2020#include " Common/Data/Convert/SmallDataConvert.h"
2121#include " Common/Common.h"
2222#include " Common/CPUDetect.h"
23-
24- #ifdef _M_SSE
25- #include < emmintrin.h>
26- #include < smmintrin.h>
27- #endif
28-
29- #if PPSSPP_ARCH(ARM_NEON)
30- #if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
31- #include < arm64_neon.h>
32- #else
33- #include < arm_neon.h>
34- #endif
35- #endif
23+ #include " Common/Math/SIMDHeaders.h"
3624
3725void ConvertBGRA8888ToRGBA8888 (u32 *dst, const u32 *src, u32 numPixels) {
38- #ifdef _M_SSE
26+ #if PPSSPP_ARCH(SSE2)
3927 const __m128i maskGA = _mm_set1_epi32 (0xFF00FF00 );
4028
4129 const __m128i *srcp = (const __m128i *)src;
@@ -76,47 +64,44 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
7664 }
7765}
7866
79- #if defined(_M_SSE)
80- #if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
81- [[gnu::target("sse4.1")]]
82- #endif
83- static inline void ConvertRGBA8888ToRGBA5551_SSE4 (__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
84- const __m128i maskAG = _mm_set1_epi32 (0x8000F800 );
67+ #if PPSSPP_ARCH(SSE2)
68+ // fp64's improved SSE2 version, see #19751. SSE4 no longer required here.
69+ static inline void ConvertRGBA8888ToRGBA5551 (__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
8570 const __m128i maskRB = _mm_set1_epi32 (0x00F800F8 );
86- const __m128i mask = _mm_set1_epi32 (0x0000FFFF );
71+ const __m128i maskGA = _mm_set1_epi32 (0x8000F800 );
72+ const __m128i mulRB = _mm_set1_epi32 (0x04000001 );
73+ const __m128i mulGA = _mm_set1_epi32 (0x00400001 );
8774
8875 for (u32 i = 0 ; i < sseChunks; i += 2 ) {
89- __m128i c1 = _mm_load_si128 (&srcp[i + 0 ]);
90- __m128i c2 = _mm_load_si128 (&srcp[i + 1 ]);
91- __m128i ag, rb;
92-
93- ag = _mm_and_si128 (c1, maskAG);
94- ag = _mm_or_si128 (_mm_srli_epi32 (ag, 16 ), _mm_srli_epi32 (ag, 6 ));
95- rb = _mm_and_si128 (c1, maskRB);
96- rb = _mm_or_si128 (_mm_srli_epi32 (rb, 3 ), _mm_srli_epi32 (rb, 9 ));
97- c1 = _mm_and_si128 (_mm_or_si128 (ag, rb), mask);
98-
99- ag = _mm_and_si128 (c2, maskAG);
100- ag = _mm_or_si128 (_mm_srli_epi32 (ag, 16 ), _mm_srli_epi32 (ag, 6 ));
101- rb = _mm_and_si128 (c2, maskRB);
102- rb = _mm_or_si128 (_mm_srli_epi32 (rb, 3 ), _mm_srli_epi32 (rb, 9 ));
103- c2 = _mm_and_si128 (_mm_or_si128 (ag, rb), mask);
104-
105- _mm_store_si128 (&dstp[i / 2 ], _mm_packus_epi32 (c1, c2));
76+ __m128i c0 = _mm_load_si128 (&srcp[i + 0 ]);
77+ __m128i c1 = _mm_load_si128 (&srcp[i + 1 ]);
78+
79+ __m128i rb0 = _mm_and_si128 (c0, maskRB); // 00000000bbbbb00000000000rrrrr000 (each 32-bit lane)
80+ __m128i rb1 = _mm_and_si128 (c1, maskRB); // 00000000bbbbb00000000000rrrrr000
81+ __m128i ga0 = _mm_and_si128 (c0, maskGA); // a000000000000000ggggg00000000000
82+ __m128i ga1 = _mm_and_si128 (c1, maskGA); // a000000000000000ggggg00000000000
83+ rb0 = _mm_madd_epi16 (_mm_srli_epi32 (rb0, 3 ), mulRB); // 00000000000000000bbbbb00000rrrrr
84+ rb1 = _mm_madd_epi16 (_mm_srli_epi32 (rb1, 3 ), mulRB); // 00000000000000000bbbbb00000rrrrr
85+ ga0 = _mm_madd_epi16 (_mm_srli_epi32 (ga0, 11 ), mulGA); // 000000000000000000000a00000ggggg
86+ ga1 = _mm_madd_epi16 (_mm_srli_epi32 (ga1, 11 ), mulGA); // 000000000000000000000a00000ggggg
87+ __m128i rb = _mm_packs_epi32 (rb0, rb1);
88+ __m128i ga = _mm_slli_epi32 (_mm_packs_epi32 (ga0, ga1), 5 );
89+
90+ _mm_store_si128 (&dstp[i / 2 ], _mm_or_si128 (ga, rb));
10691 }
10792}
10893#endif
10994
11095void ConvertRGBA8888ToRGBA5551 (u16 *dst, const u32 *src, u32 numPixels) {
111- #if defined(_M_SSE )
96+ #if PPSSPP_ARCH(SSE2 )
11297 const __m128i *srcp = (const __m128i *)src;
11398 __m128i *dstp = (__m128i *)dst;
11499 u32 sseChunks = (numPixels / 4 ) & ~1 ;
115100 // SSE 4.1 required for _mm_packus_epi32.
116- if (((intptr_t )src & 0xF ) || ((intptr_t )dst & 0xF ) || !cpu_info. bSSE4_1 ) {
101+ if (((intptr_t )src & 0xF ) || ((intptr_t )dst & 0xF )) {
117102 sseChunks = 0 ;
118103 } else {
119- ConvertRGBA8888ToRGBA5551_SSE4 (dstp, srcp, sseChunks);
104+ ConvertRGBA8888ToRGBA5551 (dstp, srcp, sseChunks);
120105 }
121106
122107 // The remainder starts right after those done via SSE.
@@ -129,11 +114,13 @@ void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
129114 }
130115}
131116
132- #if defined(_M_SSE)
117+ #if PPSSPP_ARCH(SSE2)
118+ /*
133119#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
134120[[gnu::target("sse4.1")]]
135121#endif
136- static inline void ConvertBGRA8888ToRGBA5551_SSE4 (__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
122+ */
123+ static inline void ConvertBGRA8888ToRGBA5551 (__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
137124 const __m128i maskAG = _mm_set1_epi32 (0x8000F800 );
138125 const __m128i maskRB = _mm_set1_epi32 (0x00F800F8 );
139126 const __m128i mask = _mm_set1_epi32 (0x0000FFFF );
@@ -155,7 +142,14 @@ static inline void ConvertBGRA8888ToRGBA5551_SSE4(__m128i *dstp, const __m128i *
155142 rb = _mm_or_si128 (_mm_srli_epi32 (rb, 19 ), _mm_slli_epi32 (rb, 7 ));
156143 c2 = _mm_and_si128 (_mm_or_si128 (ag, rb), mask);
157144
145+ // Unfortunately no good SSE2 way to do _mm_packus_epi32.
146+ // We can approximate it with a few shuffles.
147+ #if 0
158148 _mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2));
149+ #else
150+ // SSE2 path.
151+ _mm_store_si128 (&dstp[i / 2 ], _mm_packu2_epi32_SSE2 (c1, c2));
152+ #endif
159153 }
160154}
161155#endif
@@ -165,13 +159,11 @@ void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
165159 const __m128i *srcp = (const __m128i *)src;
166160 __m128i *dstp = (__m128i *)dst;
167161 u32 sseChunks = (numPixels / 4 ) & ~1 ;
168- // SSE 4.1 required for _mm_packus_epi32.
169- if (((intptr_t )src & 0xF ) || ((intptr_t )dst & 0xF ) || !cpu_info.bSSE4_1 ) {
162+ if (((intptr_t )src & 0xF ) || ((intptr_t )dst & 0xF )) {
170163 sseChunks = 0 ;
171164 } else {
172- ConvertBGRA8888ToRGBA5551_SSE4 (dstp, srcp, sseChunks);
165+ ConvertBGRA8888ToRGBA5551 (dstp, srcp, sseChunks);
173166 }
174-
175167 // The remainder starts right after those done via SSE.
176168 u32 i = sseChunks * 4 ;
177169#else
@@ -439,7 +431,7 @@ void ConvertRGB565ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {
439431}
440432
441433void ConvertRGBA4444ToABGR4444 (u16 *dst, const u16 *src, u32 numPixels) {
442- #ifdef _M_SSE
434+ #if PPSSPP_ARCH(SSE2)
443435 const __m128i mask0040 = _mm_set1_epi16 (0x00F0 );
444436
445437 const __m128i *srcp = (const __m128i *)src;
@@ -505,7 +497,7 @@ void ConvertRGBA4444ToABGR4444(u16 *dst, const u16 *src, u32 numPixels) {
505497}
506498
507499void ConvertRGBA5551ToABGR1555 (u16 *dst, const u16 *src, u32 numPixels) {
508- #ifdef _M_SSE
500+ #if PPSSPP_ARCH(SSE2)
509501 const __m128i maskB = _mm_set1_epi16 (0x003E );
510502 const __m128i maskG = _mm_set1_epi16 (0x07C0 );
511503
@@ -573,7 +565,7 @@ void ConvertRGBA5551ToABGR1555(u16 *dst, const u16 *src, u32 numPixels) {
573565}
574566
575567void ConvertRGB565ToBGR565 (u16 *dst, const u16 *src, u32 numPixels) {
576- #ifdef _M_SSE
568+ #if PPSSPP_ARCH(SSE2)
577569 const __m128i maskG = _mm_set1_epi16 (0x07E0 );
578570
579571 const __m128i *srcp = (const __m128i *)src;
0 commit comments