44using System . Runtime . CompilerServices ;
55using System . Runtime . InteropServices ;
66using System . Runtime . Intrinsics ;
7+ using System . Runtime . Intrinsics . Arm ;
78using System . Runtime . Intrinsics . X86 ;
89using Lagrange . Proto . Serialization ;
910
@@ -193,6 +194,15 @@ private static T ExtractFromVector<T>(ulong varintPart0, ulong varintPart1) wher
193194
194195 return T . CreateTruncating ( pt1 | ( varintPart1 & 0x0000000000000100 ) << 56 | ( varintPart1 & 0x000000000000007f ) << 56 ) ;
195196 }
197+ else if ( AdvSimd . Arm64 . IsSupported )
198+ {
199+ var b = Vector128 . Create ( varintPart0 , varintPart0 ) ;
200+ var d = AdvSimd . ShiftLogical ( b & Mask1 , - Shift1 . AsInt64 ( ) ) | AdvSimd . ShiftLogical ( b & Mask2 , - Shift2 . AsInt64 ( ) ) | AdvSimd . ShiftLogical ( b & Mask3 , - Shift3 . AsInt64 ( ) ) | AdvSimd . ShiftLogical ( b & Mask4 , - Shift4 . AsInt64 ( ) ) ;
201+ var e = d | Vector128 . Create ( d . GetElement ( 1 ) , 0ul ) ;
202+ ulong pt1 = e . ToScalar ( ) ;
203+
204+ return T . CreateTruncating ( pt1 | ( ( varintPart1 & 0x0000000000000100 ) << 55 ) | ( ( varintPart1 & 0x000000000000007f ) << 56 ) ) ;
205+ }
196206 else
197207 {
198208 return T . CreateTruncating ( ( varintPart0 & 0x000000000000007f ) | ( ( varintPart0 & 0x7f00000000000000 ) >> 7 ) | ( ( varintPart0 & 0x007f000000000000 ) >> 6 ) | ( ( varintPart0 & 0x00007f0000000000 ) >> 5 ) | ( ( varintPart0 & 0x0000007f00000000 ) >> 4 ) | ( ( varintPart0 & 0x000000007f000000 ) >> 3 ) | ( ( varintPart0 & 0x00000000007f0000 ) >> 2 ) | ( ( varintPart0 & 0x0000000000007f00 ) >> 1 ) | ( ( varintPart1 & 0x0000000000000100 ) << 55 ) | ( ( varintPart1 & 0x000000000000007f ) << 56 ) ) ;
@@ -204,45 +214,37 @@ public unsafe (TT, TU) DecodeVarIntUnsafe<TT, TU>(ReadOnlySpan<byte> src)
204214 where TT : unmanaged, INumber < TT >
205215 where TU : unmanaged, INumber < TU >
206216 {
207- if ( ! Ssse3 . X64 . IsSupported ) throw new PlatformNotSupportedException ( ) ;
217+ if ( ! Ssse3 . X64 . IsSupported && ! AdvSimd . Arm64 . IsSupported ) throw new PlatformNotSupportedException ( ) ;
208218
209219 if ( sizeof ( TT ) + sizeof ( TU ) > 12 ) throw new NotSupportedException ( ) ;
210220
211221 if ( sizeof ( TT ) <= 4 && sizeof ( TU ) <= 4 ) return DecodeTwo32VarIntUnsafe < TT , TU > ( src ) ; // try to use fast path of lookup table
212222
213223 var b = Unsafe . As < byte , Vector128 < sbyte > > ( ref MemoryMarshal . GetReference ( src ) ) ;
214- uint bitmask = ( uint ) Sse2 . MoveMask ( b ) ;
224+ uint bitmask = b . AsByte ( ) . ExtractMostSignificantBits ( ) ;
215225 uint maskNot = ~ bitmask ;
216226 int firstLen = BitOperations . TrailingZeroCount ( maskNot ) + 1 ;
217227 uint maskNot2 = maskNot >> firstLen ;
218228 int secondLen = BitOperations . TrailingZeroCount ( maskNot2 ) + 1 ;
219-
229+
220230 var firstLenVec = Vector128 . Create ( ( sbyte ) firstLen ) ;
221- var firstMask = Sse2 . CompareLessThan ( Ascend , firstLenVec ) ;
222- var first = Sse2 . And ( b , firstMask ) ;
223-
224- var secondShuf = Sse2 . Add ( Ascend , firstLenVec ) ;
225- var secondShuffled = Ssse3 . Shuffle ( b , secondShuf ) ;
226- var secondMask = Sse2 . CompareLessThan ( Ascend , Vector128 . Create ( ( sbyte ) secondLen ) ) ;
227- var second = Sse2 . And ( secondShuffled , secondMask ) ;
231+ var firstMask = Vector128 . LessThan ( Ascend , firstLenVec ) ;
232+ var first = b & firstMask ;
233+
234+ var secondShuf = Ascend + firstLenVec ;
235+ var secondShuffled = Ssse3 . IsSupported ? Ssse3 . Shuffle ( b , secondShuf ) : AdvSimd . Arm64 . VectorTableLookup ( b . AsByte ( ) , secondShuf . AsByte ( ) ) . AsSByte ( ) ;
236+ var secondMask = Vector128 . LessThan ( Ascend , Vector128 . Create ( ( sbyte ) secondLen ) ) ;
237+ var second = secondShuffled & secondMask ;
228238
229239 TT firstNum ;
230240 TU secondNum ;
231241 if ( sizeof ( TT ) <= 4 && sizeof ( TU ) <= 4 && ! Bmi2 . X64 . IsSupported )
232242 {
233- var comb = Sse2 . Or ( first , Sse2 . ShiftLeftLogical128BitLane ( second , 8 ) ) . AsUInt64 ( ) ;
243+ var shifted = Sse2 . IsSupported ? Sse2 . ShiftLeftLogical128BitLane ( second , 8 ) : Vector128 . Create ( 0L , second . AsInt64 ( ) . ToScalar ( ) ) . AsSByte ( ) ;
244+ var comb = ( first | shifted ) . AsUInt64 ( ) ;
234245 var x = sizeof ( TT ) <= 1 && sizeof ( TU ) <= 1 ? DualU8Stage2 ( comb ) : sizeof ( TT ) <= 2 && sizeof ( TU ) <= 2 ? DualU16Stage2 ( comb ) : DualU32Stage2 ( comb ) ;
235- if ( Sse41 . X64 . IsSupported )
236- {
237- firstNum = TT . CreateTruncating ( Sse41 . X64 . Extract ( x , 0 ) ) ;
238- secondNum = TU . CreateTruncating ( Sse41 . X64 . Extract ( x , 1 ) ) ;
239- }
240- else
241- {
242- var x32 = x . AsUInt32 ( ) ;
243- firstNum = TT . CreateTruncating ( x32 [ 0 ] ) ;
244- secondNum = TU . CreateTruncating ( x32 [ 2 ] ) ;
245- }
246+ firstNum = TT . CreateTruncating ( x . GetElement ( 0 ) ) ;
247+ secondNum = TU . CreateTruncating ( x . GetElement ( 1 ) ) ;
246248 }
247249 else
248250 {
@@ -258,40 +260,26 @@ public unsafe (TT, TU) DecodeVarIntUnsafe<TT, TU>(ReadOnlySpan<byte> src)
/// <summary>
/// Second stage of the dual-varint decode for targets of at most 8 bits each.
/// Operates independently on both 64-bit lanes of <paramref name="comb"/>.
/// </summary>
/// <param name="comb">
/// Vector whose two 64-bit lanes each hold the raw bytes of one varint
/// (byte 0 in the lowest 8 bits), as assembled by the callers in this file.
/// </param>
/// <returns>
/// A vector whose lanes contain, per lane, the 7 payload bits of byte 0 in
/// bits 0-6 and the lowest payload bit of byte 1 in bit 7.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<ulong> DualU8Stage2(Vector128<ulong> comb)
{
    // Byte 0: keep the 7 payload bits and drop the continuation flag (bit 7).
    var low7 = comb & Vector128.Create(0x000000000000007ful, 0x000000000000007ful);

    // Byte 1: only its lowest bit fits into an 8-bit result; shifting it right by
    // one places it in bit 7. The previous code masked with 0x7f here as well,
    // which folded byte 0 onto itself (e.g. 2 decoded as 3) and ignored byte 1.
    var high1 = Vector128.ShiftRightLogical(
        comb & Vector128.Create(0x0000000000000100ul, 0x0000000000000100ul), 1);

    return low7 | high1;
}
266266
/// <summary>
/// Second stage of the dual-varint decode for targets of at most 16 bits each.
/// Operates independently on both 64-bit lanes of <paramref name="comb"/>.
/// </summary>
/// <param name="comb">
/// Vector whose two 64-bit lanes each hold the raw bytes of one varint
/// (byte 0 in the lowest 8 bits).
/// </param>
/// <returns>
/// Per lane: 7 bits of byte 0 in bits 0-6, 7 bits of byte 1 in bits 7-13 and
/// the low 2 bits of byte 2 in bits 14-15.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<ulong> DualU16Stage2(Vector128<ulong> comb)
{
    // Each group is masked in place and then shifted right by its byte index,
    // which removes the continuation flags that sit between the payload groups.
    var group0 = comb & Vector128.Create(0x000000000000007ful);
    var group1 = Vector128.ShiftRightLogical(comb & Vector128.Create(0x0000000000007f00ul), 1);
    var group2 = Vector128.ShiftRightLogical(comb & Vector128.Create(0x0000000000030000ul), 2);

    return group0 | group2 | group1;
}
278274
/// <summary>
/// Second stage of the dual-varint decode for targets of at most 32 bits each.
/// Operates independently on both 64-bit lanes of <paramref name="comb"/>.
/// </summary>
/// <param name="comb">
/// Vector whose two 64-bit lanes each hold the raw bytes of one varint
/// (byte 0 in the lowest 8 bits).
/// </param>
/// <returns>
/// Per lane: the 7-bit payloads of bytes 0-3 packed contiguously into bits 0-27
/// and the low 4 bits of byte 4 in bits 28-31.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<ulong> DualU32Stage2(Vector128<ulong> comb)
{
    // Each group is masked in place and then shifted right by its byte index,
    // which removes the continuation flags that sit between the payload groups.
    var group0 = comb & Vector128.Create(0x000000000000007ful);
    var group1 = Vector128.ShiftRightLogical(comb & Vector128.Create(0x0000000000007f00ul), 1);
    var group2 = Vector128.ShiftRightLogical(comb & Vector128.Create(0x00000000007f0000ul), 2);
    var group3 = Vector128.ShiftRightLogical(comb & Vector128.Create(0x000000007f000000ul), 3);
    var group4 = Vector128.ShiftRightLogical(comb & Vector128.Create(0x0000000f00000000ul), 4);

    var lowHalf = group0 | group4;
    var middle = group3 | group2 | group1;
    return lowHalf | middle;
}
296284
297285
@@ -301,35 +289,31 @@ private unsafe (TT, TU) DecodeTwo32VarIntUnsafe<TT, TU>(ReadOnlySpan<byte> src)
301289 where TU : unmanaged, INumber < TU >
302290 {
303291 var b = Unsafe . As < byte , Vector128 < sbyte > > ( ref MemoryMarshal . GetReference ( src ) ) ;
304- uint bitmask = ( uint ) Sse2 . MoveMask ( b ) & 0b1111111111 ;
292+ uint bitmask = b . AsByte ( ) . ExtractMostSignificantBits ( ) & 0b1111111111 ;
305293 var ( lookup , firstLen , secondLen ) = Lookup . DoubleStep1 [ ( int ) bitmask ] ;
306294 var shuf = Unsafe . Add ( ref MemoryMarshal . GetReference ( Lookup . DoubleVec ) , lookup ) ;
307- var comb = Ssse3 . Shuffle ( b , shuf ) . AsUInt64 ( ) ;
308-
295+
296+ Vector128 < ulong > comb ;
297+ if ( Ssse3 . IsSupported )
298+ comb = Ssse3 . Shuffle ( b , shuf ) . AsUInt64 ( ) ;
299+ else
300+ comb = AdvSimd . Arm64 . VectorTableLookup ( b . AsByte ( ) , shuf . AsByte ( ) ) . AsUInt64 ( ) ;
301+
309302 TT firstNum ;
310303 TU secondNum ;
311304
312305 if ( Bmi2 . X64 . IsSupported )
313306 {
314307 var shift = Sse2 . ShiftRightLogical128BitLane ( comb , 8 ) ;
315-
308+
316309 firstNum = ExtractFromVector < TT > ( comb [ 0 ] , comb [ 1 ] ) ;
317310 secondNum = ExtractFromVector < TU > ( shift [ 0 ] , shift [ 1 ] ) ;
318311 }
319312 else
320313 {
321314 var x = sizeof ( TT ) <= 1 && sizeof ( TU ) <= 1 ? DualU8Stage2 ( comb ) : sizeof ( TT ) <= 2 && sizeof ( TU ) <= 2 ? DualU16Stage2 ( comb ) : DualU32Stage2 ( comb ) ;
322- if ( Sse41 . X64 . IsSupported )
323- {
324- firstNum = TT . CreateTruncating ( Sse41 . X64 . Extract ( x , 0 ) ) ;
325- secondNum = TU . CreateTruncating ( Sse41 . X64 . Extract ( x , 1 ) ) ;
326- }
327- else
328- {
329- var x32 = x . AsUInt32 ( ) ;
330- firstNum = TT . CreateTruncating ( x32 [ 0 ] ) ;
331- secondNum = TU . CreateTruncating ( x32 [ 2 ] ) ;
332- }
315+ firstNum = TT . CreateTruncating ( x . GetElement ( 0 ) ) ;
316+ secondNum = TU . CreateTruncating ( x . GetElement ( 1 ) ) ;
333317 }
334318
335319 _offset += ( firstLen + secondLen ) >> 3 ; // in bits
0 commit comments