@@ -66,9 +66,9 @@ public static void Shuffle4Reduce(
6666 ref Span < float > destination ,
6767 [ ConstantExpected ] byte control )
6868 {
69- if ( ( Vector512 . IsHardwareAccelerated && Vector512_ . SupportsShuffleNativeFloat ) ||
70- ( Vector256 . IsHardwareAccelerated && Vector256_ . SupportsShuffleNativeFloat ) ||
71- Vector128 . IsHardwareAccelerated )
69+ if ( Vector512 . IsHardwareAccelerated ||
70+ Vector256 . IsHardwareAccelerated ||
71+ Vector128 . IsHardwareAccelerated )
7272 {
7373 int remainder = 0 ;
7474 if ( Vector512 . IsHardwareAccelerated )
@@ -112,9 +112,9 @@ public static void Shuffle4Reduce(
112112 ref Span < byte > destination ,
113113 [ ConstantExpected ] byte control )
114114 {
115- if ( ( Vector512 . IsHardwareAccelerated && Vector512_ . SupportsShuffleNativeByte ) ||
116- ( Vector256 . IsHardwareAccelerated && Vector256_ . SupportsShuffleNativeByte ) ||
117- ( Vector128 . IsHardwareAccelerated && Vector128_ . SupportsShuffleNativeByte ) )
115+ if ( Vector512 . IsHardwareAccelerated ||
116+ Vector256 . IsHardwareAccelerated ||
117+ Vector128 . IsHardwareAccelerated )
118118 {
119119 int remainder = 0 ;
120120 if ( Vector512 . IsHardwareAccelerated )
@@ -158,7 +158,7 @@ public static void Shuffle3Reduce(
158158 ref Span < byte > destination ,
159159 [ ConstantExpected ] byte control )
160160 {
161- if ( Vector128 . IsHardwareAccelerated && Vector128_ . SupportsShuffleNativeByte && Vector128_ . SupportsAlignRight )
161+ if ( Vector128 . IsHardwareAccelerated )
162162 {
163163 int remainder = source . Length % ( Vector128 < byte > . Count * 3 ) ;
164164
@@ -190,7 +190,7 @@ public static void Pad3Shuffle4Reduce(
190190 ref Span < byte > destination ,
191191 [ ConstantExpected ] byte control )
192192 {
193- if ( Vector128 . IsHardwareAccelerated && Vector128_ . SupportsShuffleNativeByte && Vector128_ . SupportsShiftByte )
193+ if ( Vector128 . IsHardwareAccelerated )
194194 {
195195 int remainder = source . Length % ( Vector128 < byte > . Count * 3 ) ;
196196
@@ -223,7 +223,7 @@ public static void Shuffle4Slice3Reduce(
223223 ref Span < byte > destination ,
224224 [ ConstantExpected ] byte control )
225225 {
226- if ( Vector128 . IsHardwareAccelerated && Vector128_ . SupportsShuffleNativeByte && Vector128_ . SupportsShiftByte )
226+ if ( Vector128 . IsHardwareAccelerated )
227227 {
228228 int remainder = source . Length & ( ( Vector128 < byte > . Count * 4 ) - 1 ) ; // bit-hack for modulo
229229
@@ -249,7 +249,7 @@ private static void Shuffle4(
249249 Span < float > destination ,
250250 [ ConstantExpected ] byte control )
251251 {
252- if ( Vector512 . IsHardwareAccelerated && Vector512_ . SupportsShuffleNativeFloat )
252+ if ( Vector512 . IsHardwareAccelerated )
253253 {
254254 ref Vector512 < float > sourceBase = ref Unsafe . As < float , Vector512 < float > > ( ref MemoryMarshal . GetReference ( source ) ) ;
255255 ref Vector512 < float > destinationBase = ref Unsafe . As < float , Vector512 < float > > ( ref MemoryMarshal . GetReference ( destination ) ) ;
@@ -277,7 +277,7 @@ private static void Shuffle4(
277277 }
278278 }
279279 }
280- else if ( Vector256 . IsHardwareAccelerated && Vector256_ . SupportsShuffleNativeFloat )
280+ else if ( Vector256 . IsHardwareAccelerated )
281281 {
282282 ref Vector256 < float > sourceBase = ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( source ) ) ;
283283 ref Vector256 < float > destinationBase = ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( destination ) ) ;
@@ -341,7 +341,7 @@ private static void Shuffle4(
341341 Span < byte > destination ,
342342 [ ConstantExpected ] byte control )
343343 {
344- if ( Vector512 . IsHardwareAccelerated && Vector512_ . SupportsShuffleNativeByte )
344+ if ( Vector512 . IsHardwareAccelerated )
345345 {
346346 Span < byte > temp = stackalloc byte [ Vector512 < byte > . Count ] ;
347347 Shuffle . MMShuffleSpan ( ref temp , control ) ;
@@ -373,8 +373,13 @@ private static void Shuffle4(
373373 }
374374 }
375375 }
376- else if ( Vector256 . IsHardwareAccelerated && Vector256_ . SupportsShuffleNativeByte )
376+ else if ( Vector256 . IsHardwareAccelerated )
377377 {
378+ // ShufflePerLane shuffles each 128-bit lane independently using Avx2.Shuffle (vpshufb).
379+ // MMShuffleSpan generates indices in the range [0, 31] and never sets bit 7 in any byte,
380+ // so the shuffle never zeroes an element. vpshufb indexes within each lane using only the
381+ // low 4 bits of each control byte (b[i] & 0x0F); bits 4-6 are ignored and bit 7 requests zeroing,
382+ // so every index resolves to a byte inside its own lane and no output byte is zeroed.
378383 Span < byte > temp = stackalloc byte [ Vector256 < byte > . Count ] ;
379384 Shuffle . MMShuffleSpan ( ref temp , control ) ;
380385 Vector256 < byte > mask = Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( temp ) ) ;
@@ -391,21 +396,21 @@ private static void Shuffle4(
391396 ref Vector256 < byte > vs0 = ref Unsafe . Add ( ref sourceBase , i ) ;
392397 ref Vector256 < byte > vd0 = ref Unsafe . Add ( ref destinationBase , i ) ;
393398
394- vd0 = Vector256_ . ShuffleNative ( vs0 , mask ) ;
395- Unsafe . Add ( ref vd0 , ( nuint ) 1 ) = Vector256_. ShuffleNative ( Unsafe . Add ( ref vs0 , ( nuint ) 1 ) , mask ) ;
396- Unsafe . Add ( ref vd0 , ( nuint ) 2 ) = Vector256_. ShuffleNative ( Unsafe . Add ( ref vs0 , ( nuint ) 2 ) , mask ) ;
397- Unsafe . Add ( ref vd0 , ( nuint ) 3 ) = Vector256_. ShuffleNative ( Unsafe . Add ( ref vs0 , ( nuint ) 3 ) , mask ) ;
399+ vd0 = Vector256_ . ShufflePerLane ( vs0 , mask ) ;
400+ Unsafe . Add ( ref vd0 , ( nuint ) 1 ) = Vector256_. ShufflePerLane ( Unsafe . Add ( ref vs0 , ( nuint ) 1 ) , mask ) ;
401+ Unsafe . Add ( ref vd0 , ( nuint ) 2 ) = Vector256_. ShufflePerLane ( Unsafe . Add ( ref vs0 , ( nuint ) 2 ) , mask ) ;
402+ Unsafe . Add ( ref vd0 , ( nuint ) 3 ) = Vector256_. ShufflePerLane ( Unsafe . Add ( ref vs0 , ( nuint ) 3 ) , mask ) ;
398403 }
399404
400405 if ( m > 0 )
401406 {
402407 for ( nuint i = u ; i < n ; i ++ )
403408 {
404- Unsafe . Add ( ref destinationBase , i ) = Vector256_. ShuffleNative ( Unsafe . Add ( ref sourceBase , i ) , mask ) ;
409+ Unsafe . Add ( ref destinationBase , i ) = Vector256_. ShufflePerLane ( Unsafe . Add ( ref sourceBase , i ) , mask ) ;
405410 }
406411 }
407412 }
408- else if ( Vector128 . IsHardwareAccelerated && Vector128_ . SupportsShuffleNativeByte )
413+ else if ( Vector128 . IsHardwareAccelerated )
409414 {
410415 Span < byte > temp = stackalloc byte [ Vector128 < byte > . Count ] ;
411416 Shuffle . MMShuffleSpan ( ref temp , control ) ;
@@ -445,9 +450,7 @@ private static void Shuffle3(
445450 Span < byte > destination ,
446451 [ ConstantExpected ] byte control )
447452 {
448- if ( Vector128 . IsHardwareAccelerated &&
449- Vector128_ . SupportsShuffleNativeByte &&
450- Vector128_ . SupportsAlignRight )
453+ if ( Vector128 . IsHardwareAccelerated )
451454 {
452455 Vector128 < byte > maskPad4Nx16 = ShuffleMaskPad4Nx16 ( ) ;
453456 Vector128 < byte > maskSlice4Nx16 = ShuffleMaskSlice4Nx16 ( ) ;
@@ -507,10 +510,7 @@ private static void Pad3Shuffle4(
507510 Span < byte > destination ,
508511 [ ConstantExpected ] byte control )
509512 {
510- if ( Vector128 . IsHardwareAccelerated &&
511- Vector128_ . SupportsShuffleNativeByte &&
512- Vector128_ . SupportsShiftByte &&
513- Vector128_ . SupportsAlignRight )
513+ if ( Vector128 . IsHardwareAccelerated )
514514 {
515515 Vector128 < byte > maskPad4Nx16 = ShuffleMaskPad4Nx16 ( ) ;
516516 Vector128 < byte > fill = Vector128 . Create ( 0xff000000ff000000ul ) . AsByte ( ) ;
@@ -553,10 +553,7 @@ private static void Shuffle4Slice3(
553553 Span < byte > destination ,
554554 [ ConstantExpected ] byte control )
555555 {
556- if ( Vector128 . IsHardwareAccelerated &&
557- Vector128_ . SupportsShuffleNativeByte &&
558- Vector128_ . SupportsShiftByte &&
559- Vector128_ . SupportsAlignRight )
556+ if ( Vector128 . IsHardwareAccelerated )
560557 {
561558 Vector128 < byte > maskSlice4Nx16 = ShuffleMaskSlice4Nx16 ( ) ;
562559 Vector128 < byte > maskE = Vector128_ . AlignRight ( maskSlice4Nx16 , maskSlice4Nx16 , 12 ) ;
0 commit comments