Skip to content

Commit 726ff84

Browse files
authored
Merge branch 'hrydgard:master' into master
2 parents a1e77b6 + 4b4d30e commit 726ff84

272 files changed

Lines changed: 3600 additions & 2510 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

CMakeLists.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -781,7 +781,8 @@ add_library(Common STATIC
781781
Common/Input/InputState.cpp
782782
Common/Input/InputState.h
783783
Common/Math/fast/fast_matrix.c
784-
Common/Math/CrossSIMD.h
784+
Common/Math/SIMDHeaders.h
785+
Common/Math/SIMDHeaders.h
785786
Common/Math/curves.cpp
786787
Common/Math/curves.h
787788
Common/Math/expression_parser.cpp
@@ -891,8 +892,6 @@ add_library(Common STATIC
891892
Common/Log.cpp
892893
Common/Log/ConsoleListener.cpp
893894
Common/Log/ConsoleListener.h
894-
Common/Log/StdioListener.cpp
895-
Common/Log/StdioListener.h
896895
Common/Log/LogManager.cpp
897896
Common/Log/LogManager.h
898897
Common/LogReporting.cpp
@@ -1907,6 +1906,8 @@ set(GPU_SOURCES
19071906
GPU/Common/Draw2D.cpp
19081907
GPU/Common/Draw2D.h
19091908
GPU/Common/DepthBufferCommon.cpp
1909+
GPU/Common/DepthRaster.cpp
1910+
GPU/Common/DepthRaster.h
19101911
GPU/Common/TextureShaderCommon.cpp
19111912
GPU/Common/TextureShaderCommon.h
19121913
GPU/Common/DepalettizeShaderCommon.cpp

Common/CPUDetect.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@
3232
#include <sys/sysctl.h>
3333
#endif
3434

35-
#include <algorithm>
3635
#include <cstdint>
3736
#include <memory.h>
3837
#include <set>
38+
#include <algorithm>
3939

4040
#include "Common/Common.h"
4141
#include "Common/CPUDetect.h"

Common/Common.h

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -87,17 +87,3 @@
8787

8888
#define __forceinline inline __attribute__((always_inline))
8989
#endif
90-
91-
#if defined __SSE4_2__
92-
# define _M_SSE 0x402
93-
#elif defined __SSE4_1__
94-
# define _M_SSE 0x401
95-
#elif defined __SSSE3__
96-
# define _M_SSE 0x301
97-
#elif defined __SSE3__
98-
# define _M_SSE 0x300
99-
#elif defined __SSE2__
100-
# define _M_SSE 0x200
101-
#elif !defined(__GNUC__) && (defined(_M_X64) || defined(_M_IX86))
102-
# define _M_SSE 0x402
103-
#endif

Common/Common.vcxproj

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,7 @@
541541
<ClInclude Include="Math\lin\matrix4x4.h" />
542542
<ClInclude Include="Math\lin\vec3.h" />
543543
<ClInclude Include="Math\math_util.h" />
544+
<ClInclude Include="Math\SIMDHeaders.h" />
544545
<ClInclude Include="Math\Statistics.h" />
545546
<ClInclude Include="Net\HTTPNaettRequest.h" />
546547
<ClInclude Include="Net\NetBuffer.h" />
@@ -586,7 +587,6 @@
586587
<ClInclude Include="CommonTypes.h" />
587588
<ClInclude Include="CommonWindows.h" />
588589
<ClInclude Include="Log\ConsoleListener.h" />
589-
<ClInclude Include="Log\StdioListener.h" />
590590
<ClInclude Include="CPUDetect.h" />
591591
<ClInclude Include="Crypto\md5.h" />
592592
<ClInclude Include="Crypto\sha1.h" />
@@ -1060,7 +1060,6 @@
10601060
<ClCompile Include="Serialize\Serializer.cpp" />
10611061
<ClCompile Include="Data\Convert\ColorConv.cpp" />
10621062
<ClCompile Include="Log\ConsoleListener.cpp" />
1063-
<ClCompile Include="Log\StdioListener.cpp" />
10641063
<ClCompile Include="CPUDetect.cpp" />
10651064
<ClCompile Include="MipsCPUDetect.cpp">
10661065
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>

Common/Common.vcxproj.filters

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,6 @@
559559
<ClInclude Include="Render\Text\draw_text_cocoa.h">
560560
<Filter>Render\Text</Filter>
561561
</ClInclude>
562-
<ClInclude Include="Log\StdioListener.h" />
563562
<ClInclude Include="Log\ConsoleListener.h">
564563
<Filter>Log</Filter>
565564
</ClInclude>
@@ -677,10 +676,12 @@
677676
<ClInclude Include="Data\Collections\LinkedList.h">
678677
<Filter>Data\Collections</Filter>
679678
</ClInclude>
679+
<ClInclude Include="Math\SIMDHeaders.h">
680+
<Filter>Math</Filter>
681+
</ClInclude>
680682
</ItemGroup>
681683
<ItemGroup>
682684
<ClCompile Include="ABI.cpp" />
683-
<ClCompile Include="Log\StdioListener.cpp" />
684685
<ClCompile Include="CPUDetect.cpp" />
685686
<ClCompile Include="FakeCPUDetect.cpp" />
686687
<ClCompile Include="MipsCPUDetect.cpp" />

Common/Data/Convert/ColorConv.cpp

Lines changed: 42 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,10 @@
2020
#include "Common/Data/Convert/SmallDataConvert.h"
2121
#include "Common/Common.h"
2222
#include "Common/CPUDetect.h"
23-
24-
#ifdef _M_SSE
25-
#include <emmintrin.h>
26-
#include <smmintrin.h>
27-
#endif
28-
29-
#if PPSSPP_ARCH(ARM_NEON)
30-
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
31-
#include <arm64_neon.h>
32-
#else
33-
#include <arm_neon.h>
34-
#endif
35-
#endif
23+
#include "Common/Math/SIMDHeaders.h"
3624

3725
void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, u32 numPixels) {
38-
#ifdef _M_SSE
26+
#if PPSSPP_ARCH(SSE2)
3927
const __m128i maskGA = _mm_set1_epi32(0xFF00FF00);
4028

4129
const __m128i *srcp = (const __m128i *)src;
@@ -76,47 +64,44 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
7664
}
7765
}
7866

79-
#if defined(_M_SSE)
80-
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
81-
[[gnu::target("sse4.1")]]
82-
#endif
83-
static inline void ConvertRGBA8888ToRGBA5551_SSE4(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
84-
const __m128i maskAG = _mm_set1_epi32(0x8000F800);
67+
#if PPSSPP_ARCH(SSE2)
68+
// fp64's improved SSE2 version, see #19751. SSE4 no longer required here.
69+
static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
8570
const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
86-
const __m128i mask = _mm_set1_epi32(0x0000FFFF);
71+
const __m128i maskGA = _mm_set1_epi32(0x8000F800);
72+
const __m128i mulRB = _mm_set1_epi32(0x04000001);
73+
const __m128i mulGA = _mm_set1_epi32(0x00400001);
8774

8875
for (u32 i = 0; i < sseChunks; i += 2) {
89-
__m128i c1 = _mm_load_si128(&srcp[i + 0]);
90-
__m128i c2 = _mm_load_si128(&srcp[i + 1]);
91-
__m128i ag, rb;
92-
93-
ag = _mm_and_si128(c1, maskAG);
94-
ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
95-
rb = _mm_and_si128(c1, maskRB);
96-
rb = _mm_or_si128(_mm_srli_epi32(rb, 3), _mm_srli_epi32(rb, 9));
97-
c1 = _mm_and_si128(_mm_or_si128(ag, rb), mask);
98-
99-
ag = _mm_and_si128(c2, maskAG);
100-
ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
101-
rb = _mm_and_si128(c2, maskRB);
102-
rb = _mm_or_si128(_mm_srli_epi32(rb, 3), _mm_srli_epi32(rb, 9));
103-
c2 = _mm_and_si128(_mm_or_si128(ag, rb), mask);
104-
105-
_mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2));
76+
__m128i c0 = _mm_load_si128(&srcp[i + 0]);
77+
__m128i c1 = _mm_load_si128(&srcp[i + 1]);
78+
79+
__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000 (each 32-bit lane)
80+
__m128i rb1 = _mm_and_si128(c1, maskRB); // 00000000bbbbb00000000000rrrrr000
81+
__m128i ga0 = _mm_and_si128(c0, maskGA); // a000000000000000ggggg00000000000
82+
__m128i ga1 = _mm_and_si128(c1, maskGA); // a000000000000000ggggg00000000000
83+
rb0 = _mm_madd_epi16(_mm_srli_epi32(rb0, 3), mulRB); // 00000000000000000bbbbb00000rrrrr
84+
rb1 = _mm_madd_epi16(_mm_srli_epi32(rb1, 3), mulRB); // 00000000000000000bbbbb00000rrrrr
85+
ga0 = _mm_madd_epi16(_mm_srli_epi32(ga0, 11), mulGA); // 000000000000000000000a00000ggggg
86+
ga1 = _mm_madd_epi16(_mm_srli_epi32(ga1, 11), mulGA); // 000000000000000000000a00000ggggg
87+
__m128i rb = _mm_packs_epi32(rb0, rb1);
88+
__m128i ga = _mm_slli_epi32(_mm_packs_epi32(ga0, ga1), 5);
89+
90+
_mm_store_si128(&dstp[i / 2], _mm_or_si128(ga, rb));
10691
}
10792
}
10893
#endif
10994

11095
void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
111-
#if defined(_M_SSE)
96+
#if PPSSPP_ARCH(SSE2)
11297
const __m128i *srcp = (const __m128i *)src;
11398
__m128i *dstp = (__m128i *)dst;
11499
u32 sseChunks = (numPixels / 4) & ~1;
115100
// Only SSE2 is needed: the vectorized path no longer uses _mm_packus_epi32 (SSE 4.1), so only alignment is checked.
116-
if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
101+
if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
117102
sseChunks = 0;
118103
} else {
119-
ConvertRGBA8888ToRGBA5551_SSE4(dstp, srcp, sseChunks);
104+
ConvertRGBA8888ToRGBA5551(dstp, srcp, sseChunks);
120105
}
121106

122107
// The remainder starts right after those done via SSE.
@@ -129,11 +114,13 @@ void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
129114
}
130115
}
131116

132-
#if defined(_M_SSE)
117+
#if PPSSPP_ARCH(SSE2)
118+
/*
133119
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
134120
[[gnu::target("sse4.1")]]
135121
#endif
136-
static inline void ConvertBGRA8888ToRGBA5551_SSE4(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
122+
*/
123+
static inline void ConvertBGRA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
137124
const __m128i maskAG = _mm_set1_epi32(0x8000F800);
138125
const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
139126
const __m128i mask = _mm_set1_epi32(0x0000FFFF);
@@ -155,7 +142,14 @@ static inline void ConvertBGRA8888ToRGBA5551_SSE4(__m128i *dstp, const __m128i *
155142
rb = _mm_or_si128(_mm_srli_epi32(rb, 19), _mm_slli_epi32(rb, 7));
156143
c2 = _mm_and_si128(_mm_or_si128(ag, rb), mask);
157144

145+
// Unfortunately no good SSE2 way to do _mm_packus_epi32.
146+
// We can approximate it with a few shuffles.
147+
#if 0
158148
_mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2));
149+
#else
150+
// SSE2 path.
151+
_mm_store_si128(&dstp[i / 2], _mm_packu2_epi32_SSE2(c1, c2));
152+
#endif
159153
}
160154
}
161155
#endif
@@ -165,13 +159,11 @@ void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
165159
const __m128i *srcp = (const __m128i *)src;
166160
__m128i *dstp = (__m128i *)dst;
167161
u32 sseChunks = (numPixels / 4) & ~1;
168-
// SSE 4.1 required for _mm_packus_epi32.
169-
if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
162+
if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
170163
sseChunks = 0;
171164
} else {
172-
ConvertBGRA8888ToRGBA5551_SSE4(dstp, srcp, sseChunks);
165+
ConvertBGRA8888ToRGBA5551(dstp, srcp, sseChunks);
173166
}
174-
175167
// The remainder starts right after those done via SSE.
176168
u32 i = sseChunks * 4;
177169
#else
@@ -439,7 +431,7 @@ void ConvertRGB565ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {
439431
}
440432

441433
void ConvertRGBA4444ToABGR4444(u16 *dst, const u16 *src, u32 numPixels) {
442-
#ifdef _M_SSE
434+
#if PPSSPP_ARCH(SSE2)
443435
const __m128i mask0040 = _mm_set1_epi16(0x00F0);
444436

445437
const __m128i *srcp = (const __m128i *)src;
@@ -505,7 +497,7 @@ void ConvertRGBA4444ToABGR4444(u16 *dst, const u16 *src, u32 numPixels) {
505497
}
506498

507499
void ConvertRGBA5551ToABGR1555(u16 *dst, const u16 *src, u32 numPixels) {
508-
#ifdef _M_SSE
500+
#if PPSSPP_ARCH(SSE2)
509501
const __m128i maskB = _mm_set1_epi16(0x003E);
510502
const __m128i maskG = _mm_set1_epi16(0x07C0);
511503

@@ -573,7 +565,7 @@ void ConvertRGBA5551ToABGR1555(u16 *dst, const u16 *src, u32 numPixels) {
573565
}
574566

575567
void ConvertRGB565ToBGR565(u16 *dst, const u16 *src, u32 numPixels) {
576-
#ifdef _M_SSE
568+
#if PPSSPP_ARCH(SSE2)
577569
const __m128i maskG = _mm_set1_epi16(0x07E0);
578570

579571
const __m128i *srcp = (const __m128i *)src;

Common/Data/Convert/SmallDataConvert.h

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,8 @@
66

77
#include "Common/Common.h"
88
#include "ppsspp_config.h"
9+
#include "Common/Math/SIMDHeaders.h"
910

10-
#ifdef _M_SSE
11-
#include <emmintrin.h>
12-
#endif
13-
#if PPSSPP_ARCH(ARM_NEON)
14-
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
15-
#include <arm64_neon.h>
16-
#else
17-
#include <arm_neon.h>
18-
#endif
19-
#endif
2011

2112
extern const float one_over_255_x4[4];
2213
extern const float exactly_255_x4[4];

Common/Data/Encoding/Compression.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,6 @@
44

55
#include <string>
66
#include <stdexcept>
7-
#include <iostream>
8-
#include <iomanip>
9-
#include <sstream>
107
#include <cstring>
118

129
#include <zlib.h>
@@ -48,8 +45,7 @@ bool compress_string(const std::string& str, std::string *dest, int compressionl
4845
deflateEnd(&zs);
4946

5047
if (ret != Z_STREAM_END) { // an error occurred that was not EOF
51-
std::ostringstream oss;
52-
oss << "Exception during zlib compression: (" << ret << ") " << zs.msg;
48+
ERROR_LOG(Log::IO, "Exception during zlib compression: (%d): %s", ret, zs.msg);
5349
return false;
5450
}
5551

Common/Data/Encoding/Utf8.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
#include <cstring>
2323
#include <cstdarg>
2424
#include <cstdint>
25-
2625
#include <algorithm>
2726
#include <string>
2827

Common/Data/Format/IniFile.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include <cstdlib>
66
#include <cstdio>
7+
#include <algorithm> // for sort
78

89
#include <inttypes.h>
910

@@ -12,7 +13,6 @@
1213
#include <strings.h>
1314
#endif
1415

15-
#include <algorithm>
1616
#include <iostream>
1717
#include <fstream>
1818
#include <sstream>

0 commit comments

Comments
 (0)