@@ -47,8 +47,10 @@ public static Vector128<byte> Average(Vector128<byte> left, Vector128<byte> righ
4747 return AdvSimd . FusedAddRoundedHalving ( left , right ) ;
4848 }
4949
50- // Portable fallback: (a + b + 1) >> 1
51- return ( left + right + Vector128 . Create ( ( byte ) 1 ) ) >> 1 ;
50+ // Account for potential 9th bit to ensure correct rounded result.
51+ return Vector128 . Narrow (
52+ ( Vector128 . WidenLower ( left ) + Vector128 . WidenLower ( right ) + Vector128 < ushort > . One ) >> 1 ,
53+ ( Vector128 . WidenUpper ( left ) + Vector128 . WidenUpper ( right ) + Vector128 < ushort > . One ) >> 1 ) ;
5254 }
5355
5456 /// <summary>
@@ -117,13 +119,17 @@ public static Vector128<short> ShuffleHigh(Vector128<short> value, [ConstantExpe
117119 }
118120
119121 // Don't use InverseMMShuffle here as we want to avoid the cast.
120- Vector64 < short > indices = Vector64 . Create (
121- ( short ) ( control & 0x3 ) ,
122- ( short ) ( ( control >> 2 ) & 0x3 ) ,
123- ( short ) ( ( control >> 4 ) & 0x3 ) ,
124- ( short ) ( ( control >> 6 ) & 0x3 ) ) ;
125-
126- return Vector128 . Create ( value . GetLower ( ) , Vector64 . Shuffle ( value . GetUpper ( ) , indices ) ) ;
122+ Vector128 < short > indices = Vector128 . Create (
123+ 0 ,
124+ 1 ,
125+ 2 ,
126+ 3 ,
127+ ( short ) ( ( control & 0x3 ) + 4 ) ,
128+ ( short ) ( ( ( control >> 2 ) & 0x3 ) + 4 ) ,
129+ ( short ) ( ( ( control >> 4 ) & 0x3 ) + 4 ) ,
130+ ( short ) ( ( ( control >> 6 ) & 0x3 ) + 4 ) ) ;
131+
132+ return Vector128 . Shuffle ( value , indices ) ;
127133 }
128134
129135 /// <summary>
@@ -144,13 +150,17 @@ public static Vector128<short> ShuffleLow(Vector128<short> value, [ConstantExpec
144150 }
145151
146152 // Don't use InverseMMShuffle here as we want to avoid the cast.
147- Vector64 < short > indices = Vector64 . Create (
148- ( short ) ( control & 0x3 ) ,
149- ( short ) ( ( control >> 2 ) & 0x3 ) ,
150- ( short ) ( ( control >> 4 ) & 0x3 ) ,
151- ( short ) ( ( control >> 6 ) & 0x3 ) ) ;
152-
153- return Vector128 . Create ( Vector64 . Shuffle ( value . GetLower ( ) , indices ) , value . GetUpper ( ) ) ;
153+ Vector128 < short > indices = Vector128 . Create (
154+ ( short ) ( control & 0x3 ) ,
155+ ( short ) ( ( control >> 2 ) & 0x3 ) ,
156+ ( short ) ( ( control >> 4 ) & 0x3 ) ,
157+ ( short ) ( ( control >> 6 ) & 0x3 ) ,
158+ 4 ,
159+ 5 ,
160+ 6 ,
161+ 7 ) ;
162+
163+ return Vector128 . Shuffle ( value , indices ) ;
154164 }
155165
156166 /// <summary>
@@ -237,28 +247,13 @@ public static Vector128<byte> ShiftLeftBytesInVector(Vector128<byte> value, [Con
// Shifts every 16-bit lane of value left by count bits; a count of 16 or more yields an all-zero vector.
// In this diff the lines marked '-' (per-platform Sse2 / AdvSimd / PackedSimd branches) are removed,
// leaving one shared path: the explicit range guard followed by the vector shift operator.
237247 [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
238248 public static Vector128 < short > ShiftLeftLogical ( Vector128 < short > value , [ ConstantExpected ] byte count )
239249 {
240- if ( Sse2 . IsSupported )
241- {
242- return Sse2 . ShiftLeftLogical ( value , count ) ;
243- }
244-
245250 // Zero lanes where count >= 16 to match SSE2
246251 if ( count >= 16 )
247252 {
248253 return Vector128 < short > . Zero ;
249254 }
250255
251- if ( AdvSimd . IsSupported )
252- {
253- return AdvSimd . ShiftLogical ( value , Vector128 . Create ( ( short ) count ) ) ;
254- }
255-
256- if ( PackedSimd . IsSupported )
257- {
258- return PackedSimd . ShiftLeft ( value , count ) ;
259- }
260-
261- return Vector128 . ShiftLeft ( value , count ) ;
// The guard above already handled count >= 16, so the shift below only ever sees counts in [0, 15].
256+ return value << count ;
262257 }
263258
264259 /// <summary>
@@ -536,6 +531,11 @@ public static Vector128<int> MultiplyAddAdjacent(Vector128<short> left, Vector12
536531 Vector128 < int > prodLo = AdvSimd . MultiplyWideningLower ( left . GetLower ( ) , right . GetLower ( ) ) ;
537532 Vector128 < int > prodHi = AdvSimd . MultiplyWideningLower ( left . GetUpper ( ) , right . GetUpper ( ) ) ;
538533
534+ if ( AdvSimd . Arm64 . IsSupported )
535+ {
536+ return AdvSimd . Arm64 . AddPairwise ( prodLo , prodHi ) ;
537+ }
538+
539539 Vector128 < long > v0 = AdvSimd . AddPairwiseWidening ( prodLo ) ;
540540 Vector128 < long > v1 = AdvSimd . AddPairwiseWidening ( prodHi ) ;
541541
@@ -587,50 +587,26 @@ public static Vector128<short> HorizontalAdd(Vector128<short> left, Vector128<sh
587587 return AdvSimd . Arm64 . AddPairwise ( left , right ) ;
588588 }
589589
590- // Extract the low and high parts of the products shuffling them to form a result we can add together.
591- // Use out-of-bounds to zero out the unused lanes.
592- Vector128 < short > even = Vector128 . Create ( 0 , 2 , 4 , 6 , 8 , 8 , 8 , 8 ) ;
593- Vector128 < short > odd = Vector128 . Create ( 1 , 3 , 5 , 7 , 8 , 8 , 8 , 8 ) ;
594- Vector128 < short > v0 = Vector128 . Shuffle ( right , even ) ;
595- Vector128 < short > v1 = Vector128 . Shuffle ( right , odd ) ;
596- Vector128 < short > v2 = Vector128 . Shuffle ( left , even ) ;
597- Vector128 < short > v3 = Vector128 . Shuffle ( left , odd ) ;
598-
599- return v0 + v1 + v2 + v3 ;
600- }
601-
602- /// <summary>
603- /// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
604- /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result.
605- /// </summary>
606- /// <param name="left">
607- /// The first vector containing packed 16-bit integers to multiply.
608- /// </param>
609- /// <param name="right">
610- /// The second vector containing packed 16-bit integers to multiply.
611- /// </param>
612- /// <returns>
613- /// A vector containing the low 16 bits of the products of the packed 16-bit integers
614- /// from <paramref name="left"/> and <paramref name="right"/>.
615- /// </returns>
616- [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
617- public static Vector128 < short > MultiplyLow ( Vector128 < short > left , Vector128 < short > right )
618- {
619- if ( Sse2 . IsSupported )
590+ if ( AdvSimd . IsSupported )
620591 {
621- return Sse2 . MultiplyLow ( left , right ) ;
622- }
592+ Vector128 < int > v0 = AdvSimd . AddPairwiseWidening ( left ) ;
593+ Vector128 < int > v1 = AdvSimd . AddPairwiseWidening ( right ) ;
623594
624- // Widen each half of the short vectors into two int vectors
625- ( Vector128 < int > leftLo , Vector128 < int > leftHi ) = Vector128 . Widen ( left ) ;
626- ( Vector128 < int > rightLo , Vector128 < int > rightHi ) = Vector128 . Widen ( right ) ;
595+ return Vector128 . Narrow ( v0 , v1 ) ;
596+ }
627597
628- // Elementwise multiply: each int lane now holds the full 32-bit product
629- Vector128 < int > prodLo = leftLo * rightLo ;
630- Vector128 < int > prodHi = leftHi * rightHi ;
598+ {
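599+ // Gather the even and odd lanes of each input so adjacent pairs can be summed; the out-of-bounds index 8 zeroes lanes 4-7.
600+ // NOTE(review): this puts the left and right pair sums together in lanes 0-3, whereas the AdvSimd path above yields left pair sums in lanes 0-3 and right pair sums in lanes 4-7 - confirm which layout is intended.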
601+ Vector128 < short > even = Vector128 . Create ( 0 , 2 , 4 , 6 , 8 , 8 , 8 , 8 ) ;
602+ Vector128 < short > odd = Vector128 . Create ( 1 , 3 , 5 , 7 , 8 , 8 , 8 , 8 ) ;
603+ Vector128 < short > v0 = Vector128 . Shuffle ( right , even ) ;
604+ Vector128 < short > v1 = Vector128 . Shuffle ( right , odd ) ;
605+ Vector128 < short > v2 = Vector128 . Shuffle ( left , even ) ;
606+ Vector128 < short > v3 = Vector128 . Shuffle ( left , odd ) ;
631607
632- // Narrow the two int vectors back into one short vector
633- return Vector128 . Narrow ( prodLo , prodHi ) ;
608+ return v0 + v1 + v2 + v3 ;
609+ }
634610 }
635611
636612 /// <summary>
@@ -655,20 +631,33 @@ public static Vector128<short> MultiplyHigh(Vector128<short> left, Vector128<sho
655631 return Sse2 . MultiplyHigh ( left , right ) ;
656632 }
657633
658- // Widen each half of the short vectors into two int vectors
659- ( Vector128 < int > leftLo , Vector128 < int > leftHi ) = Vector128 . Widen ( left ) ;
660- ( Vector128 < int > rightLo , Vector128 < int > rightHi ) = Vector128 . Widen ( right ) ;
634+ if ( AdvSimd . IsSupported )
635+ {
636+ Vector128 < int > prodLo = AdvSimd . MultiplyWideningLower ( left . GetLower ( ) , right . GetLower ( ) ) ;
637+ Vector128 < int > prodHi = AdvSimd . MultiplyWideningUpper ( left , right ) ;
638+
639+ prodLo >>= 16 ;
640+ prodHi >>= 16 ;
641+
642+ return Vector128 . Narrow ( prodLo , prodHi ) ;
643+ }
644+
645+ {
646+ // Widen each half of the short vectors into two int vectors
647+ ( Vector128 < int > leftLo , Vector128 < int > leftHi ) = Vector128 . Widen ( left ) ;
648+ ( Vector128 < int > rightLo , Vector128 < int > rightHi ) = Vector128 . Widen ( right ) ;
661649
662- // Elementwise multiply: each int lane now holds the full 32-bit product
663- Vector128 < int > prodLo = leftLo * rightLo ;
664- Vector128 < int > prodHi = leftHi * rightHi ;
650+ // Elementwise multiply: each int lane now holds the full 32-bit product
651+ Vector128 < int > prodLo = leftLo * rightLo ;
652+ Vector128 < int > prodHi = leftHi * rightHi ;
665653
666- // Arithmetic shift right by 16 bits to extract the high word
667- prodLo >>= 16 ;
668- prodHi >>= 16 ;
654+ // Arithmetic shift right by 16 bits to extract the high word
655+ prodLo >>= 16 ;
656+ prodHi >>= 16 ;
669657
670- // Narrow the two int vectors back into one short vector
671- return Vector128 . Narrow ( prodLo , prodHi ) ;
658+ // Narrow the two int vectors back into one short vector
659+ return Vector128 . Narrow ( prodLo , prodHi ) ;
660+ }
672661 }
673662
674663 /// <summary>
@@ -693,20 +682,33 @@ public static Vector128<ushort> MultiplyHigh(Vector128<ushort> left, Vector128<u
693682 return Sse2 . MultiplyHigh ( left , right ) ;
694683 }
695684
696- // Widen each half of the short vectors into two uint vectors
697- ( Vector128 < uint > leftLo , Vector128 < uint > leftHi ) = Vector128 . Widen ( left ) ;
698- ( Vector128 < uint > rightLo , Vector128 < uint > rightHi ) = Vector128 . Widen ( right ) ;
685+ if ( AdvSimd . IsSupported )
686+ {
687+ Vector128 < uint > prodLo = AdvSimd . MultiplyWideningLower ( left . GetLower ( ) , right . GetLower ( ) ) ;
688+ Vector128 < uint > prodHi = AdvSimd . MultiplyWideningUpper ( left , right ) ;
699689
700- // Elementwise multiply: each int lane now holds the full 32-bit product
701- Vector128 < uint > prodLo = leftLo * rightLo ;
702- Vector128 < uint > prodHi = leftHi * rightHi ;
690+ prodLo >>= 16 ;
691+ prodHi >>= 16 ;
703692
704- // Arithmetic shift right by 16 bits to extract the high word
705- prodLo >>= 16 ;
706- prodHi >>= 16 ;
693+ return Vector128 . Narrow ( prodLo , prodHi ) ;
694+ }
695+
696+ {
697+ // Widen each half of the ushort vectors into two uint vectors
698+ ( Vector128 < uint > leftLo , Vector128 < uint > leftHi ) = Vector128 . Widen ( left ) ;
699+ ( Vector128 < uint > rightLo , Vector128 < uint > rightHi ) = Vector128 . Widen ( right ) ;
707700
708- // Narrow the two int vectors back into one short vector
709- return Vector128 . Narrow ( prodLo , prodHi ) ;
701+ // Elementwise multiply: each uint lane now holds the full 32-bit product
702+ Vector128 < uint > prodLo = leftLo * rightLo ;
703+ Vector128 < uint > prodHi = leftHi * rightHi ;
704+
705+ // Logical shift right by 16 bits (the lanes are unsigned) to extract the high word
706+ prodLo >>= 16 ;
707+ prodHi >>= 16 ;
708+
709+ // Narrow the two uint vectors back into one ushort vector
710+ return Vector128 . Narrow ( prodLo , prodHi ) ;
711+ }
710712 }
711713
712714 /// <summary>
@@ -1363,90 +1365,4 @@ public static Vector128<sbyte> SubtractSaturate(Vector128<sbyte> left, Vector128
13631365 // Narrow back to signed bytes
13641366 return Vector128 . Narrow ( diffLo , diffHi ) ;
13651367 }
1366-
1367- /// <summary>
1368- /// Create mask from the most significant bit of each 8-bit element in <paramref name="value"/>, and store the result.
1369- /// </summary>
1370- /// <param name="value">
1371- /// The vector containing packed 8-bit integers from which to create the mask.
1372- /// </param>
1373- /// <returns>
1374- /// A 16-bit integer mask where each bit corresponds to the most significant bit of each 8-bit element
1375- /// in <paramref name="value"/>.
1376- /// </returns>
1377- [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
1378- public static int MoveMask ( Vector128 < byte > value )
1379- {
1380- if ( Sse2 . IsSupported )
1381- {
1382- return Sse2 . MoveMask ( value ) ;
1383- }
1384-
1385- // AdvSimd versions ported from Stack Overflow answer:
1386- // https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
1387- if ( AdvSimd . Arm64 . IsSupported )
1388- {
1389- // Shift values to align each MSB to its corresponding bit in the output
1390- Vector128 < sbyte > shift = Vector128 . Create ( - 7 , - 6 , - 5 , - 4 , - 3 , - 2 , - 1 , 0 , - 7 , - 6 , - 5 , - 4 , - 3 , - 2 , - 1 , 0 ) ;
1391-
1392- // Mask to isolate MSBs
1393- Vector128 < byte > msbMask = Vector128 . Create ( ( byte ) 0x80 ) ;
1394- Vector128 < byte > masked = value & msbMask ;
1395-
1396- // Shift each MSB into the correct bit position
1397- Vector128 < byte > shifted = AdvSimd . ShiftLogical ( masked . AsSByte ( ) , shift ) . AsByte ( ) ;
1398-
1399- // Sum lanes: lower 8 go into bits 0–7, upper 8 go into bits 8–15
1400- byte lo = AdvSimd . Arm64 . AddAcross ( shifted . GetLower ( ) ) . ToScalar ( ) ;
1401- byte hi = AdvSimd . Arm64 . AddAcross ( shifted . GetUpper ( ) ) . ToScalar ( ) ;
1402-
1403- return lo + ( hi << 8 ) ;
1404- }
1405-
1406- if ( AdvSimd . IsSupported )
1407- {
1408- Vector128 < byte > powers = Vector128 . Create ( 1 , 2 , 4 , 8 , 16 , 32 , 64 , 128 , 1 , 2 , 4 , 8 , 16 , 32 , 64 , 128 ) ;
1409- Vector128 < byte > msbMask = Vector128 . Create ( ( byte ) 0x80 ) ;
1410- Vector128 < byte > normalized = AdvSimd . CompareEqual ( value & msbMask , msbMask ) ; // 0xFF or 0x00
1411- Vector128 < byte > masked = normalized & powers ;
1412-
1413- Vector128 < ushort > sum8 = AdvSimd . AddPairwiseWidening ( masked ) ;
1414- Vector128 < uint > sum16 = AdvSimd . AddPairwiseWidening ( sum8 ) ;
1415- Vector128 < ulong > sum32 = AdvSimd . AddPairwiseWidening ( sum16 ) ;
1416-
1417- // Extract lower 8 bits of each 64-bit lane
1418- byte lo = sum32 . AsByte ( ) . GetElement ( 0 ) ;
1419- byte hi = sum32 . AsByte ( ) . GetElement ( 8 ) ;
1420-
1421- return ( hi << 8 ) | lo ;
1422- }
1423-
1424- {
1425- // Step 1: isolate MSBs
1426- Vector128 < byte > msbMask = Vector128 . Create ( ( byte ) 0x80 ) ;
1427- Vector128 < byte > masked = value & msbMask ;
1428-
1429- // Step 2: shift each byte so MSB lands in bit position [0..15]
1430- // i.e. convert: 0x80 → 1 << i
1431- Vector128 < ushort > bitShifts = Vector128 . Create ( ( ushort ) 1 , 2 , 4 , 8 , 16 , 32 , 64 , 128 ) ;
1432- Vector128 < ushort > bitShiftsHigh = Vector128 . Create ( 256 , 512 , 1024 , 2048 , 4096 , 8192 , 16384 , 32768 ) ;
1433-
1434- // Step 3: widen to ushort
1435- ( Vector128 < ushort > lo , Vector128 < ushort > hi ) = Vector128 . Widen ( masked ) ;
1436-
1437- // Step 4: compare > 0 to get 0xFFFF where MSB was set
1438- lo = Vector128 . ConditionalSelect ( Vector128 . Equals ( lo , Vector128 < ushort > . Zero ) , Vector128 < ushort > . Zero , bitShifts ) ;
1439- hi = Vector128 . ConditionalSelect ( Vector128 . Equals ( hi , Vector128 < ushort > . Zero ) , Vector128 < ushort > . Zero , bitShiftsHigh ) ;
1440-
1441- // Step 5: bitwise OR the two halves
1442- Vector128 < ushort > maskVector = lo | hi ;
1443-
1444- // Step 6: horizontal OR reduction via shuffles
1445- maskVector |= Vector128 . Shuffle ( maskVector , Vector128 . Create ( ( ushort ) 4 , 5 , 6 , 7 , 0 , 1 , 2 , 3 ) ) ;
1446- maskVector |= Vector128 . Shuffle ( maskVector , Vector128 . Create ( ( ushort ) 2 , 3 , 0 , 1 , 6 , 7 , 4 , 5 ) ) ;
1447- maskVector |= Vector128 . Shuffle ( maskVector , Vector128 . Create ( ( ushort ) 1 , 0 , 3 , 2 , 5 , 4 , 7 , 6 ) ) ;
1448-
1449- return maskVector . ToScalar ( ) ;
1450- }
1451- }
14521368}
0 commit comments