11// Copyright (c) Six Labors.
22// Licensed under the Six Labors Split License.
33
4+ using System . Diagnostics ;
5+ using System . Diagnostics . CodeAnalysis ;
6+ using System . Runtime . CompilerServices ;
47using System . Runtime . InteropServices ;
58using System . Runtime . Intrinsics ;
69using System . Runtime . Intrinsics . X86 ;
@@ -17,11 +20,11 @@ internal static partial class ZigZag
1720#pragma warning restore SA1309
1821
1922 /// <summary>
20- /// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingSsse3 "/>
23+ /// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingVector128 "/>
2124 /// zig zag implementation.
2225 /// </summary>
23- private static ReadOnlySpan < byte > SseShuffleMasks => new byte [ ]
24- {
26+ private static ReadOnlySpan < byte > SseShuffleMasks =>
27+ [
2528#pragma warning disable SA1515
2629 /* row0 - A0 B0 A1 A2 B1 C0 D0 C1 */
2730 // A
@@ -83,14 +86,14 @@ internal static partial class ZigZag
8386 // H
8487 _ , _ , _ , _ , _ , _ , _ , _ , 10 , 11 , 12 , 13 , _ , _ , 14 , 15 ,
8588#pragma warning restore SA1515
86- } ;
89+ ] ;
8790
8891 /// <summary>
8992 /// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingAvx2"/>
9093 /// zig zag implementation.
9194 /// </summary>
92- private static ReadOnlySpan < byte > AvxShuffleMasks => new byte [ ]
93- {
95+ private static ReadOnlySpan < byte > AvxShuffleMasks =>
96+ [
9497#pragma warning disable SA1515
9598 /* 01 */
9699 // [cr] crln_01_AB_CD
@@ -138,15 +141,15 @@ internal static partial class ZigZag
138141 // (in) GH
139142 _ , _ , _ , _ , _ , _ , _ , _ , 0 , 1 , 10 , 11 , 12 , 13 , 2 , 3 , _ , _ , _ , _ , _ , _ , 0 , 1 , 6 , 7 , 8 , 9 , 2 , 3 , 10 , 11 ,
140143#pragma warning restore SA1515
141- } ;
144+ ] ;
142145
143146 /// <summary>
144- /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
147+ /// Applies zig zag ordering for given 8x8 matrix using <see cref="Vector128{T}"/> cpu intrinsics.
145148 /// </summary>
146149 /// <param name="block">Input matrix.</param>
147- public static unsafe void ApplyTransposingZigZagOrderingSsse3 ( ref Block8x8 block )
150+ public static unsafe void ApplyTransposingZigZagOrderingVector128 ( ref Block8x8 block )
148151 {
149- DebugGuard . IsTrue ( Ssse3 . IsSupported , "Ssse3 support is required to run this operation!" ) ;
152+ DebugGuard . IsTrue ( Vector128 . IsHardwareAccelerated , "Vector128 support is required to run this operation!" ) ;
150153
151154 fixed ( byte * shuffleVectorsPtr = & MemoryMarshal . GetReference ( SseShuffleMasks ) )
152155 {
@@ -160,68 +163,68 @@ public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block
160163 Vector128 < byte > rowH = block . V7 . AsByte ( ) ;
161164
162165 // row0 - A0 B0 A1 A2 B1 C0 D0 C1
163- Vector128 < short > row0_A = Ssse3 . Shuffle ( rowA , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 0 ) ) ) . AsInt16 ( ) ;
164- Vector128 < short > row0_B = Ssse3 . Shuffle ( rowB , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 1 ) ) ) . AsInt16 ( ) ;
165- Vector128 < short > row0_C = Ssse3 . Shuffle ( rowC , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 2 ) ) ) . AsInt16 ( ) ;
166- Vector128 < short > row0 = Sse2 . Or ( Sse2 . Or ( row0_A , row0_B ) , row0_C ) ;
167- row0 = Sse2 . Insert ( row0 . AsUInt16 ( ) , Sse2 . Extract ( rowD . AsUInt16 ( ) , 0 ) , 6 ) . AsInt16 ( ) ;
166+ Vector128 < short > row0_A = ZShuffle ( rowA , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 0 ) ) ) . AsInt16 ( ) ;
167+ Vector128 < short > row0_B = ZShuffle ( rowB , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 1 ) ) ) . AsInt16 ( ) ;
168+ Vector128 < short > row0_C = ZShuffle ( rowC , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 2 ) ) ) . AsInt16 ( ) ;
169+ Vector128 < short > row0 = row0_A | row0_B | row0_C ;
170+ row0 = row0 . AsUInt16 ( ) . WithElement ( 6 , rowD . AsUInt16 ( ) . GetElement ( 0 ) ) . AsInt16 ( ) ;
168171
169172 // row1 - B2 A3 A4 B3 C2 D1 E0 F0
170- Vector128 < short > row1_A = Ssse3 . Shuffle ( rowA , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 3 ) ) ) . AsInt16 ( ) ;
171- Vector128 < short > row1_B = Ssse3 . Shuffle ( rowB , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 4 ) ) ) . AsInt16 ( ) ;
172- Vector128 < short > row1 = Sse2 . Or ( row1_A , row1_B ) ;
173- row1 = Sse2 . Insert ( row1 . AsUInt16 ( ) , Sse2 . Extract ( rowC . AsUInt16 ( ) , 2 ) , 4 ) . AsInt16 ( ) ;
174- row1 = Sse2 . Insert ( row1 . AsUInt16 ( ) , Sse2 . Extract ( rowD . AsUInt16 ( ) , 1 ) , 5 ) . AsInt16 ( ) ;
175- row1 = Sse2 . Insert ( row1 . AsUInt16 ( ) , Sse2 . Extract ( rowE . AsUInt16 ( ) , 0 ) , 6 ) . AsInt16 ( ) ;
176- row1 = Sse2 . Insert ( row1 . AsUInt16 ( ) , Sse2 . Extract ( rowF . AsUInt16 ( ) , 0 ) , 7 ) . AsInt16 ( ) ;
173+ Vector128 < short > row1_A = ZShuffle ( rowA , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 3 ) ) ) . AsInt16 ( ) ;
174+ Vector128 < short > row1_B = ZShuffle ( rowB , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 4 ) ) ) . AsInt16 ( ) ;
175+ Vector128 < short > row1 = row1_A | row1_B ;
176+ row1 = row1 . AsUInt16 ( ) . WithElement ( 4 , rowC . AsUInt16 ( ) . GetElement ( 2 ) ) . AsInt16 ( ) ;
177+ row1 = row1 . AsUInt16 ( ) . WithElement ( 5 , rowD . AsUInt16 ( ) . GetElement ( 1 ) ) . AsInt16 ( ) ;
178+ row1 = row1 . AsUInt16 ( ) . WithElement ( 6 , rowE . AsUInt16 ( ) . GetElement ( 0 ) ) . AsInt16 ( ) ;
179+ row1 = row1 . AsUInt16 ( ) . WithElement ( 7 , rowF . AsUInt16 ( ) . GetElement ( 0 ) ) . AsInt16 ( ) ;
177180
178181 // row2 - E1 D2 C3 B4 A5 A6 B5 C4
179- Vector128 < short > row2_A = Ssse3 . Shuffle ( rowA , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 5 ) ) ) . AsInt16 ( ) ;
180- Vector128 < short > row2_B = Ssse3 . Shuffle ( rowB , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 6 ) ) ) . AsInt16 ( ) ;
181- Vector128 < short > row2_C = Ssse3 . Shuffle ( rowC , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 7 ) ) ) . AsInt16 ( ) ;
182- Vector128 < short > row2 = Sse2 . Or ( Sse2 . Or ( row2_A , row2_B ) , row2_C ) ;
183- row2 = Sse2 . Insert ( row2 . AsUInt16 ( ) , Sse2 . Extract ( rowD . AsUInt16 ( ) , 2 ) , 1 ) . AsInt16 ( ) ;
184- row2 = Sse2 . Insert ( row2 . AsUInt16 ( ) , Sse2 . Extract ( rowE . AsUInt16 ( ) , 1 ) , 0 ) . AsInt16 ( ) ;
182+ Vector128 < short > row2_A = ZShuffle ( rowA , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 5 ) ) ) . AsInt16 ( ) ;
183+ Vector128 < short > row2_B = ZShuffle ( rowB , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 6 ) ) ) . AsInt16 ( ) ;
184+ Vector128 < short > row2_C = ZShuffle ( rowC , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 7 ) ) ) . AsInt16 ( ) ;
185+ Vector128 < short > row2 = row2_A | row2_B | row2_C ;
186+ row2 = row2 . AsUInt16 ( ) . WithElement ( 1 , rowD . AsUInt16 ( ) . GetElement ( 2 ) ) . AsInt16 ( ) ;
187+ row2 = row2 . AsUInt16 ( ) . WithElement ( 0 , rowE . AsUInt16 ( ) . GetElement ( 1 ) ) . AsInt16 ( ) ;
185188
186189 // row3 - D3 E2 F1 G0 H0 G1 F2 E3
187- Vector128 < short > row3_E = Ssse3 . Shuffle ( rowE , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 8 ) ) ) . AsInt16 ( ) ;
188- Vector128 < short > row3_F = Ssse3 . Shuffle ( rowF , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 9 ) ) ) . AsInt16 ( ) ;
189- Vector128 < short > row3_G = Ssse3 . Shuffle ( rowG , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 10 ) ) ) . AsInt16 ( ) ;
190- Vector128 < short > row3 = Sse2 . Or ( Sse2 . Or ( row3_E , row3_F ) , row3_G ) ;
191- row3 = Sse2 . Insert ( row3 . AsUInt16 ( ) , Sse2 . Extract ( rowD . AsUInt16 ( ) , 3 ) , 0 ) . AsInt16 ( ) ;
192- row3 = Sse2 . Insert ( row3 . AsUInt16 ( ) , Sse2 . Extract ( rowH . AsUInt16 ( ) , 0 ) , 4 ) . AsInt16 ( ) ;
190+ Vector128 < short > row3_E = ZShuffle ( rowE , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 8 ) ) ) . AsInt16 ( ) ;
191+ Vector128 < short > row3_F = ZShuffle ( rowF , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 9 ) ) ) . AsInt16 ( ) ;
192+ Vector128 < short > row3_G = ZShuffle ( rowG , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 10 ) ) ) . AsInt16 ( ) ;
193+ Vector128 < short > row3 = row3_E | row3_F | row3_G ;
194+ row3 = row3 . AsUInt16 ( ) . WithElement ( 0 , rowD . AsUInt16 ( ) . GetElement ( 3 ) ) . AsInt16 ( ) ;
195+ row3 = row3 . AsUInt16 ( ) . WithElement ( 4 , rowH . AsUInt16 ( ) . GetElement ( 0 ) ) . AsInt16 ( ) ;
193196
194197 // row4 - D4 C5 B6 A7 B7 C6 D5 E4
195- Vector128 < short > row4_B = Ssse3 . Shuffle ( rowB , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 11 ) ) ) . AsInt16 ( ) ;
196- Vector128 < short > row4_C = Ssse3 . Shuffle ( rowC , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 12 ) ) ) . AsInt16 ( ) ;
197- Vector128 < short > row4_D = Ssse3 . Shuffle ( rowD , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 13 ) ) ) . AsInt16 ( ) ;
198- Vector128 < short > row4 = Sse2 . Or ( Sse2 . Or ( row4_B , row4_C ) , row4_D ) ;
199- row4 = Sse2 . Insert ( row4 . AsUInt16 ( ) , Sse2 . Extract ( rowA . AsUInt16 ( ) , 7 ) , 3 ) . AsInt16 ( ) ;
200- row4 = Sse2 . Insert ( row4 . AsUInt16 ( ) , Sse2 . Extract ( rowE . AsUInt16 ( ) , 4 ) , 7 ) . AsInt16 ( ) ;
198+ Vector128 < short > row4_B = ZShuffle ( rowB , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 11 ) ) ) . AsInt16 ( ) ;
199+ Vector128 < short > row4_C = ZShuffle ( rowC , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 12 ) ) ) . AsInt16 ( ) ;
200+ Vector128 < short > row4_D = ZShuffle ( rowD , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 13 ) ) ) . AsInt16 ( ) ;
201+ Vector128 < short > row4 = row4_B | row4_C | row4_D ;
202+ row4 = row4 . AsUInt16 ( ) . WithElement ( 3 , rowA . AsUInt16 ( ) . GetElement ( 7 ) ) . AsInt16 ( ) ;
203+ row4 = row4 . AsUInt16 ( ) . WithElement ( 7 , rowE . AsUInt16 ( ) . GetElement ( 4 ) ) . AsInt16 ( ) ;
201204
202205 // row5 - F3 G2 H1 H2 G3 F4 E5 D6
203- Vector128 < short > row5_F = Ssse3 . Shuffle ( rowF , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 14 ) ) ) . AsInt16 ( ) ;
204- Vector128 < short > row5_G = Ssse3 . Shuffle ( rowG , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 15 ) ) ) . AsInt16 ( ) ;
205- Vector128 < short > row5_H = Ssse3 . Shuffle ( rowH , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 16 ) ) ) . AsInt16 ( ) ;
206- Vector128 < short > row5 = Sse2 . Or ( Sse2 . Or ( row5_F , row5_G ) , row5_H ) ;
207- row5 = Sse2 . Insert ( row5 . AsUInt16 ( ) , Sse2 . Extract ( rowD . AsUInt16 ( ) , 6 ) , 7 ) . AsInt16 ( ) ;
208- row5 = Sse2 . Insert ( row5 . AsUInt16 ( ) , Sse2 . Extract ( rowE . AsUInt16 ( ) , 5 ) , 6 ) . AsInt16 ( ) ;
206+ Vector128 < short > row5_F = ZShuffle ( rowF , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 14 ) ) ) . AsInt16 ( ) ;
207+ Vector128 < short > row5_G = ZShuffle ( rowG , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 15 ) ) ) . AsInt16 ( ) ;
208+ Vector128 < short > row5_H = ZShuffle ( rowH , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 16 ) ) ) . AsInt16 ( ) ;
209+ Vector128 < short > row5 = row5_F | row5_G | row5_H ;
210+ row5 = row5 . AsUInt16 ( ) . WithElement ( 7 , rowD . AsUInt16 ( ) . GetElement ( 6 ) ) . AsInt16 ( ) ;
211+ row5 = row5 . AsUInt16 ( ) . WithElement ( 6 , rowE . AsUInt16 ( ) . GetElement ( 5 ) ) . AsInt16 ( ) ;
209212
210213 // row6 - C7 D7 E6 F5 G4 H3 H4 G5
211- Vector128 < short > row6_G = Ssse3 . Shuffle ( rowG , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 17 ) ) ) . AsInt16 ( ) ;
212- Vector128 < short > row6_H = Ssse3 . Shuffle ( rowH , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 18 ) ) ) . AsInt16 ( ) ;
213- Vector128 < short > row6 = Sse2 . Or ( row6_G , row6_H ) ;
214- row6 = Sse2 . Insert ( row6 . AsUInt16 ( ) , Sse2 . Extract ( rowC . AsUInt16 ( ) , 7 ) , 0 ) . AsInt16 ( ) ;
215- row6 = Sse2 . Insert ( row6 . AsUInt16 ( ) , Sse2 . Extract ( rowD . AsUInt16 ( ) , 7 ) , 1 ) . AsInt16 ( ) ;
216- row6 = Sse2 . Insert ( row6 . AsUInt16 ( ) , Sse2 . Extract ( rowE . AsUInt16 ( ) , 6 ) , 2 ) . AsInt16 ( ) ;
217- row6 = Sse2 . Insert ( row6 . AsUInt16 ( ) , Sse2 . Extract ( rowF . AsUInt16 ( ) , 5 ) , 3 ) . AsInt16 ( ) ;
214+ Vector128 < short > row6_G = ZShuffle ( rowG , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 17 ) ) ) . AsInt16 ( ) ;
215+ Vector128 < short > row6_H = ZShuffle ( rowH , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 18 ) ) ) . AsInt16 ( ) ;
216+ Vector128 < short > row6 = row6_G | row6_H ;
217+ row6 = row6 . AsUInt16 ( ) . WithElement ( 0 , rowC . AsUInt16 ( ) . GetElement ( 7 ) ) . AsInt16 ( ) ;
218+ row6 = row6 . AsUInt16 ( ) . WithElement ( 1 , rowD . AsUInt16 ( ) . GetElement ( 7 ) ) . AsInt16 ( ) ;
219+ row6 = row6 . AsUInt16 ( ) . WithElement ( 2 , rowE . AsUInt16 ( ) . GetElement ( 6 ) ) . AsInt16 ( ) ;
220+ row6 = row6 . AsUInt16 ( ) . WithElement ( 3 , rowF . AsUInt16 ( ) . GetElement ( 5 ) ) . AsInt16 ( ) ;
218221
219222 // row7 - F6 E7 F7 G6 H5 H6 G7 H7
220- Vector128 < short > row7_F = Ssse3 . Shuffle ( rowF , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 19 ) ) ) . AsInt16 ( ) ;
221- Vector128 < short > row7_G = Ssse3 . Shuffle ( rowG , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 20 ) ) ) . AsInt16 ( ) ;
222- Vector128 < short > row7_H = Ssse3 . Shuffle ( rowH , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 21 ) ) ) . AsInt16 ( ) ;
223- Vector128 < short > row7 = Sse2 . Or ( Sse2 . Or ( row7_F , row7_G ) , row7_H ) ;
224- row7 = Sse2 . Insert ( row7 . AsUInt16 ( ) , Sse2 . Extract ( rowE . AsUInt16 ( ) , 7 ) , 1 ) . AsInt16 ( ) ;
223+ Vector128 < short > row7_F = ZShuffle ( rowF , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 19 ) ) ) . AsInt16 ( ) ;
224+ Vector128 < short > row7_G = ZShuffle ( rowG , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 20 ) ) ) . AsInt16 ( ) ;
225+ Vector128 < short > row7_H = ZShuffle ( rowH , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 21 ) ) ) . AsInt16 ( ) ;
226+ Vector128 < short > row7 = row7_F | row7_G | row7_H ;
227+ row7 = row7 . AsUInt16 ( ) . WithElement ( 1 , rowE . AsUInt16 ( ) . GetElement ( 7 ) ) . AsInt16 ( ) ;
225228
226229 block . V0 = row0 ;
227230 block . V1 = row1 ;
@@ -300,4 +303,20 @@ public static unsafe void ApplyTransposingZigZagOrderingAvx2(ref Block8x8 block)
300303 block . V67 = row67 . AsInt16 ( ) ;
301304 }
302305 }
306+
307+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
308+ private static Vector128 < byte > ZShuffle ( Vector128 < byte > source , Vector128 < byte > mask )
309+ {
310+ // For x64 we use the SSSE3 shuffle intrinsic to avoid additional instructions. 3 vs 1.
311+ if ( Ssse3 . IsSupported )
312+ {
313+ return Ssse3 . Shuffle ( source , mask ) ;
314+ }
315+
316+ // For ARM and WASM, codegen will be optimal.
317+ return Vector128 . Shuffle ( source , mask ) ;
318+ }
319+
320+ [ DoesNotReturn ]
321+ private static void ThrowUnreachableException ( ) => throw new UnreachableException ( ) ;
303322}
0 commit comments