Skip to content

Commit 9a8bcd2

Browse files
committed
[Proto] Add ARMv8a SIMD Support for ProtoReader and ProtoWriter
1 parent bdac571 commit 9a8bcd2

4 files changed

Lines changed: 78 additions & 79 deletions

File tree

Lagrange.Proto.Test/EncoderTest.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System.Buffers;
2+
using System.Runtime.Intrinsics.Arm;
23
using System.Runtime.Intrinsics.X86;
34
using Lagrange.Proto.Primitives;
45

@@ -73,7 +74,7 @@ public void TestLongInt()
7374
[Test]
7475
public void TestReadTwoInt()
7576
{
76-
if (!Sse3.IsSupported) Assert.Ignore("SSE3 is not supported on this platform.");
77+
if (!Sse3.IsSupported && !AdvSimd.Arm64.IsSupported) Assert.Ignore("SSSE3/NEON is not supported on this platform.");
7778

7879
var reader = new ProtoReader(_twoInt);
7980
var (number1, number2) = reader.DecodeVarIntUnsafe<int, int>(_twoInt);
@@ -88,7 +89,7 @@ public void TestReadTwoInt()
8889
[Test]
8990
public void TestReadLongInt()
9091
{
91-
if (!Sse3.IsSupported) Assert.Ignore("SSE3 is not supported on this platform.");
92+
if (!Sse3.IsSupported && !AdvSimd.Arm64.IsSupported) Assert.Ignore("SSSE3/NEON is not supported on this platform.");
9293

9394
var reader = new ProtoReader(_longInt);
9495
var (number1, number2) = reader.DecodeVarIntUnsafe<long, int>(_longInt);
@@ -103,7 +104,7 @@ public void TestReadLongInt()
103104
[Test]
104105
public void TestUnsafeRead()
105106
{
106-
if (!Sse3.IsSupported) Assert.Ignore("SSE3 is not supported on this platform.");
107+
if (!Sse3.IsSupported && !AdvSimd.Arm64.IsSupported) Assert.Ignore("SSSE3/NEON is not supported on this platform.");
107108

108109
Span<byte> longerBuffer = stackalloc byte[256];
109110
_longInt.AsSpan().CopyTo(longerBuffer);

Lagrange.Proto.Test/ProtoWriterReaderTest.cs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System.Buffers;
2+
using System.Runtime.Intrinsics.Arm;
23
using System.Runtime.Intrinsics.X86;
34
using System.Text;
45
using Lagrange.Proto.Primitives;
@@ -114,9 +115,9 @@ public void TestVarInt_NegativeNumbers()
114115
[Test]
115116
public void TestDecodeVarIntUnsafe_DualValues()
116117
{
117-
if (!Sse3.IsSupported)
118+
if (!Sse3.IsSupported && !AdvSimd.Arm64.IsSupported)
118119
{
119-
Assert.Ignore("SSE3 is not supported on this platform.");
120+
Assert.Ignore("SSSE3/NEON is not supported on this platform.");
120121
return;
121122
}
122123

@@ -140,9 +141,9 @@ public void TestDecodeVarIntUnsafe_DualValues()
140141
[Test]
141142
public void TestDecodeVarIntUnsafe_MixedTypes()
142143
{
143-
if (!Sse3.IsSupported)
144+
if (!Sse3.IsSupported && !AdvSimd.Arm64.IsSupported)
144145
{
145-
Assert.Ignore("SSE3 is not supported on this platform.");
146+
Assert.Ignore("SSSE3/NEON is not supported on this platform.");
146147
return;
147148
}
148149

@@ -372,9 +373,9 @@ public void TestEncodeTwo32VarIntUnsafe_WithSsse3Fallback()
372373
[Test]
373374
public void TestEncodeTwo32VarIntUnsafe_RoundTripWithDecodeUnsafe()
374375
{
375-
if (!Ssse3.IsSupported)
376+
if (!Ssse3.IsSupported && !AdvSimd.Arm64.IsSupported)
376377
{
377-
Assert.Ignore("SSSE3 is not supported on this platform.");
378+
Assert.Ignore("SSSE3/NEON is not supported on this platform.");
378379
return;
379380
}
380381

Lagrange.Proto/Primitives/ProtoReader.cs

Lines changed: 45 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using System.Runtime.CompilerServices;
55
using System.Runtime.InteropServices;
66
using System.Runtime.Intrinsics;
7+
using System.Runtime.Intrinsics.Arm;
78
using System.Runtime.Intrinsics.X86;
89
using Lagrange.Proto.Serialization;
910

@@ -193,6 +194,15 @@ private static T ExtractFromVector<T>(ulong varintPart0, ulong varintPart1) wher
193194

194195
return T.CreateTruncating(pt1 | (varintPart1 & 0x0000000000000100) << 56 | (varintPart1 & 0x000000000000007f) << 56);
195196
}
197+
else if (AdvSimd.Arm64.IsSupported)
198+
{
199+
var b = Vector128.Create(varintPart0, varintPart0);
200+
var d = AdvSimd.ShiftLogical(b & Mask1, -Shift1.AsInt64()) | AdvSimd.ShiftLogical(b & Mask2, -Shift2.AsInt64()) | AdvSimd.ShiftLogical(b & Mask3, -Shift3.AsInt64()) | AdvSimd.ShiftLogical(b & Mask4, -Shift4.AsInt64());
201+
var e = d | Vector128.Create(d.GetElement(1), 0ul);
202+
ulong pt1 = e.ToScalar();
203+
204+
return T.CreateTruncating(pt1 | ((varintPart1 & 0x0000000000000100) << 55) | ((varintPart1 & 0x000000000000007f) << 56));
205+
}
196206
else
197207
{
198208
return T.CreateTruncating((varintPart0 & 0x000000000000007f) | ((varintPart0 & 0x7f00000000000000) >> 7) | ((varintPart0 & 0x007f000000000000) >> 6) | ((varintPart0 & 0x00007f0000000000) >> 5) | ((varintPart0 & 0x0000007f00000000) >> 4) | ((varintPart0 & 0x000000007f000000) >> 3) | ((varintPart0 & 0x00000000007f0000) >> 2) | ((varintPart0 & 0x0000000000007f00) >> 1) | ((varintPart1 & 0x0000000000000100) << 55) | ((varintPart1 & 0x000000000000007f) << 56));
@@ -204,45 +214,37 @@ public unsafe (TT, TU) DecodeVarIntUnsafe<TT, TU>(ReadOnlySpan<byte> src)
204214
where TT : unmanaged, INumber<TT>
205215
where TU : unmanaged, INumber<TU>
206216
{
207-
if (!Ssse3.X64.IsSupported) throw new PlatformNotSupportedException();
217+
if (!Ssse3.X64.IsSupported && !AdvSimd.Arm64.IsSupported) throw new PlatformNotSupportedException();
208218

209219
if (sizeof(TT) + sizeof(TU) > 12) throw new NotSupportedException();
210220

211221
if (sizeof(TT) <= 4 && sizeof(TU) <= 4) return DecodeTwo32VarIntUnsafe<TT, TU>(src); // try to use fast path of lookup table
212222

213223
var b = Unsafe.As<byte, Vector128<sbyte>>(ref MemoryMarshal.GetReference(src));
214-
uint bitmask = (uint)Sse2.MoveMask(b);
224+
uint bitmask = b.AsByte().ExtractMostSignificantBits();
215225
uint maskNot = ~bitmask;
216226
int firstLen = BitOperations.TrailingZeroCount(maskNot) + 1;
217227
uint maskNot2 = maskNot >> firstLen;
218228
int secondLen = BitOperations.TrailingZeroCount(maskNot2) + 1;
219-
229+
220230
var firstLenVec = Vector128.Create((sbyte)firstLen);
221-
var firstMask = Sse2.CompareLessThan(Ascend, firstLenVec);
222-
var first = Sse2.And(b, firstMask);
223-
224-
var secondShuf = Sse2.Add(Ascend, firstLenVec);
225-
var secondShuffled = Ssse3.Shuffle(b, secondShuf);
226-
var secondMask = Sse2.CompareLessThan(Ascend, Vector128.Create((sbyte)secondLen));
227-
var second = Sse2.And(secondShuffled, secondMask);
231+
var firstMask = Vector128.LessThan(Ascend, firstLenVec);
232+
var first = b & firstMask;
233+
234+
var secondShuf = Ascend + firstLenVec;
235+
var secondShuffled = Ssse3.IsSupported ? Ssse3.Shuffle(b, secondShuf) : AdvSimd.Arm64.VectorTableLookup(b.AsByte(), secondShuf.AsByte()).AsSByte();
236+
var secondMask = Vector128.LessThan(Ascend, Vector128.Create((sbyte)secondLen));
237+
var second = secondShuffled & secondMask;
228238

229239
TT firstNum;
230240
TU secondNum;
231241
if (sizeof(TT) <= 4 && sizeof(TU) <= 4 && !Bmi2.X64.IsSupported)
232242
{
233-
var comb = Sse2.Or(first, Sse2.ShiftLeftLogical128BitLane(second, 8)).AsUInt64();
243+
var shifted = Sse2.IsSupported ? Sse2.ShiftLeftLogical128BitLane(second, 8) : Vector128.Create(0L, second.AsInt64().ToScalar()).AsSByte();
244+
var comb = (first | shifted).AsUInt64();
234245
var x = sizeof(TT) <= 1 && sizeof(TU) <= 1 ? DualU8Stage2(comb) : sizeof(TT) <= 2 && sizeof(TU) <= 2 ? DualU16Stage2(comb) : DualU32Stage2(comb);
235-
if (Sse41.X64.IsSupported)
236-
{
237-
firstNum = TT.CreateTruncating(Sse41.X64.Extract(x, 0));
238-
secondNum = TU.CreateTruncating(Sse41.X64.Extract(x, 1));
239-
}
240-
else
241-
{
242-
var x32 = x.AsUInt32();
243-
firstNum = TT.CreateTruncating(x32[0]);
244-
secondNum = TU.CreateTruncating(x32[2]);
245-
}
246+
firstNum = TT.CreateTruncating(x.GetElement(0));
247+
secondNum = TU.CreateTruncating(x.GetElement(1));
246248
}
247249
else
248250
{
@@ -258,40 +260,26 @@ public unsafe (TT, TU) DecodeVarIntUnsafe<TT, TU>(ReadOnlySpan<byte> src)
258260
// Second decode stage for two values of at most 8 bits each, one per 64-bit lane of `comb`.
// Each lane is processed independently: the low 7 bits of byte 0 become result bits 0-6 and the
// lowest bit of byte 1 becomes result bit 7. All other input bits (including the top bit of
// byte 0) are discarded.
// Returns a vector whose two lanes hold the reassembled 8-bit values, zero-extended to 64 bits.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<ulong> DualU8Stage2(Vector128<ulong> comb)
{
    // Byte 0: keep the 7 payload bits, drop the top bit.
    var low = comb & Vector128.Create(0x000000000000007ful, 0x000000000000007ful);

    // Byte 1: only its lowest bit (lane bit 8) can belong to an 8-bit value; move it down to bit 7.
    // Masking with 0x7f here as well would OR the low bits with a shifted copy of themselves
    // and never read byte 1 at all, which is inconsistent with DualU16Stage2/DualU32Stage2.
    var high = Vector128.ShiftRightLogical(comb & Vector128.Create(0x0000000000000100ul, 0x0000000000000100ul), 1);

    return low | high;
}
266266

267267
// Second decode stage for two values of at most 16 bits each, one per 64-bit lane of `comb`.
// Each lane is processed independently: the low 7 bits of bytes 0 and 1 and the low 2 bits of
// byte 2 are kept, then shifted down so the kept groups become contiguous (7 + 7 + 2 = 16 bits).
// Returns a vector whose two lanes hold the reassembled values, zero-extended to 64 bits.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<ulong> DualU16Stage2(Vector128<ulong> comb)
{
    // Result bits 0-6: byte 0 without its top bit.
    var group0 = comb & Vector128.Create(0x000000000000007ful, 0x000000000000007ful);

    // Result bits 7-13: byte 1 without its top bit, moved down past the one dropped bit.
    var group1 = Vector128.ShiftRightLogical(comb & Vector128.Create(0x0000000000007f00ul, 0x0000000000007f00ul), 1);

    // Result bits 14-15: the two lowest bits of byte 2, moved down past the two dropped bits.
    var group2 = Vector128.ShiftRightLogical(comb & Vector128.Create(0x0000000000030000ul, 0x0000000000030000ul), 2);

    return group0 | group1 | group2;
}
278274

279275
// Second decode stage for two values of at most 32 bits each, one per 64-bit lane of `comb`.
// Each lane is processed independently: the low 7 bits of bytes 0-3 and the low 4 bits of
// byte 4 are kept, then shifted down so the kept groups become contiguous (4 * 7 + 4 = 32 bits).
// Returns a vector whose two lanes hold the reassembled values, zero-extended to 64 bits.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<ulong> DualU32Stage2(Vector128<ulong> comb)
{
    // Result bits 0-6: byte 0 without its top bit.
    var group0 = comb & Vector128.Create(0x000000000000007ful, 0x000000000000007ful);

    // Result bits 7-13, 14-20 and 21-27: bytes 1-3 without their top bits; each group moves
    // down by the number of top bits dropped below it.
    var group1 = Vector128.ShiftRightLogical(comb & Vector128.Create(0x0000000000007f00ul, 0x0000000000007f00ul), 1);
    var group2 = Vector128.ShiftRightLogical(comb & Vector128.Create(0x00000000007f0000ul, 0x00000000007f0000ul), 2);
    var group3 = Vector128.ShiftRightLogical(comb & Vector128.Create(0x000000007f000000ul, 0x000000007f000000ul), 3);

    // Result bits 28-31: the four lowest bits of byte 4.
    var group4 = Vector128.ShiftRightLogical(comb & Vector128.Create(0x0000000f00000000ul, 0x0000000f00000000ul), 4);

    return group0 | group1 | group2 | group3 | group4;
}
296284

297285

@@ -301,35 +289,31 @@ private unsafe (TT, TU) DecodeTwo32VarIntUnsafe<TT, TU>(ReadOnlySpan<byte> src)
301289
where TU : unmanaged, INumber<TU>
302290
{
303291
var b = Unsafe.As<byte, Vector128<sbyte>>(ref MemoryMarshal.GetReference(src));
304-
uint bitmask = (uint)Sse2.MoveMask(b) & 0b1111111111;
292+
uint bitmask = b.AsByte().ExtractMostSignificantBits() & 0b1111111111;
305293
var (lookup, firstLen, secondLen) = Lookup.DoubleStep1[(int)bitmask];
306294
var shuf = Unsafe.Add(ref MemoryMarshal.GetReference(Lookup.DoubleVec), lookup);
307-
var comb = Ssse3.Shuffle(b, shuf).AsUInt64();
308-
295+
296+
Vector128<ulong> comb;
297+
if (Ssse3.IsSupported)
298+
comb = Ssse3.Shuffle(b, shuf).AsUInt64();
299+
else
300+
comb = AdvSimd.Arm64.VectorTableLookup(b.AsByte(), shuf.AsByte()).AsUInt64();
301+
309302
TT firstNum;
310303
TU secondNum;
311304

312305
if (Bmi2.X64.IsSupported)
313306
{
314307
var shift = Sse2.ShiftRightLogical128BitLane(comb, 8);
315-
308+
316309
firstNum = ExtractFromVector<TT>(comb[0], comb[1]);
317310
secondNum = ExtractFromVector<TU>(shift[0], shift[1]);
318311
}
319312
else
320313
{
321314
var x = sizeof(TT) <= 1 && sizeof(TU) <= 1 ? DualU8Stage2(comb) : sizeof(TT) <= 2 && sizeof(TU) <= 2 ? DualU16Stage2(comb) : DualU32Stage2(comb);
322-
if (Sse41.X64.IsSupported)
323-
{
324-
firstNum = TT.CreateTruncating(Sse41.X64.Extract(x, 0));
325-
secondNum = TU.CreateTruncating(Sse41.X64.Extract(x, 1));
326-
}
327-
else
328-
{
329-
var x32 = x.AsUInt32();
330-
firstNum = TT.CreateTruncating(x32[0]);
331-
secondNum = TU.CreateTruncating(x32[2]);
332-
}
315+
firstNum = TT.CreateTruncating(x.GetElement(0));
316+
secondNum = TU.CreateTruncating(x.GetElement(1));
333317
}
334318

335319
_offset += (firstLen + secondLen) >> 3; // in bits

Lagrange.Proto/Primitives/ProtoWriter.cs

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Runtime.CompilerServices;
66
using System.Runtime.InteropServices;
77
using System.Runtime.Intrinsics;
8+
using System.Runtime.Intrinsics.Arm;
89
using System.Runtime.Intrinsics.X86;
910
using System.Text;
1011
using Lagrange.Proto.Utility;
@@ -189,19 +190,19 @@ private unsafe void EncodeVarIntUnsafe<T>(T value) where T : unmanaged, INumberB
189190
}
190191
else
191192
{
192-
if (Sse2.IsSupported)
193+
if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
193194
{
194195
var stage1 = PackVector<T>(v).AsSByte();
195196
var minimum = Vector128.Create(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
196-
var exists = Sse2.Or(Sse2.CompareGreaterThan(stage1, Vector128<sbyte>.Zero), minimum);
197-
uint bits = (uint)Sse2.MoveMask(exists);
197+
var exists = Vector128.GreaterThan(stage1, Vector128<sbyte>.Zero) | minimum;
198+
uint bits = exists.AsByte().ExtractMostSignificantBits();
198199

199200
byte bytes = (byte)(32 - BitOperations.LeadingZeroCount(bits));
200-
var mask = Sse2.CompareLessThan(Ascend, Vector128.Create((sbyte)bytes));
201+
var mask = Vector128.LessThan(Ascend, Vector128.Create((sbyte)bytes));
201202

202-
var shift = Sse2.ShiftRightLogical128BitLane(mask, 1);
203-
var msbmask = Sse2.And(shift, Vector128.Create((sbyte)-128));
204-
var merged = Sse2.Or(stage1, msbmask);
203+
var shift = Sse2.IsSupported ? Sse2.ShiftRightLogical128BitLane(mask, 1) : AdvSimd.ExtractVector128(mask.AsByte(), Vector128<byte>.Zero, 1).AsSByte();
204+
var msbmask = shift & Vector128.Create((sbyte)-128);
205+
var merged = stage1 | msbmask;
205206

206207
ref byte destination = ref MemoryMarshal.GetReference(_memory.Span);
207208
Unsafe.As<byte, Vector128<sbyte>>(ref Unsafe.Add(ref destination, BytesPending)) = merged;
@@ -278,6 +279,14 @@ private static unsafe Vector128<ulong> PackVector<T>(ulong v) where T : unmanage
278279
x = Sse41.X64.Extract(d, 0);
279280
y = (v & 0x7f00000000000000) >> 56 | (v & 0x8000000000000000) >> 55;
280281
}
282+
else if (AdvSimd.Arm64.IsSupported)
283+
{
284+
var b = Vector128.Create(v, v);
285+
var c = (AdvSimd.ShiftLogical(b & Vector128.Create(0x00000007f0000000ul, 0x000003f800000000ul), Vector128.Create(4L, 5L)) | AdvSimd.ShiftLogical(b & Vector128.Create(0x0001fc0000000000ul, 0x00fe000000000000ul), Vector128.Create(6L, 7L))) | (AdvSimd.ShiftLogical(b & Vector128.Create(0x000000000000007ful, 0x0000000000003f80ul), Vector128.Create(0L, 1L)) | AdvSimd.ShiftLogical(b & Vector128.Create(0x00000000001fc000ul, 0x000000000fe00000ul), Vector128.Create(2L, 3L)));
286+
var d = c | Vector128.Create(c.GetElement(1), 0ul);
287+
x = d.ToScalar();
288+
y = (v & 0x7f00000000000000) >> 56 | (v & 0x8000000000000000) >> 55;
289+
}
281290
else
282291
{
283292
x = (v & 0x000000000000007f) | ((v & 0x0000000000003f80) << 1) | ((v & 0x00000000001fc000) << 2) | ((v & 0x000000000fe00000) << 3) | ((v & 0x00000007f0000000) << 4) | ((v & 0x000003f800000000) << 5) | ((v & 0x0001fc0000000000) << 6) | ((v & 0x00fe000000000000) << 7);
@@ -310,7 +319,7 @@ public unsafe void EncodeTwo32VarIntUnsafe<TT, TU>(TT first, TU second)
310319
return;
311320
}
312321

313-
if (Ssse3.IsSupported)
322+
if (Ssse3.IsSupported || AdvSimd.Arm64.IsSupported)
314323
{
315324
EncodeTwo32VarIntSimd<TT, TU>(v1, v2);
316325
}
@@ -343,7 +352,11 @@ private unsafe void EncodeTwo32VarIntSimd<TT, TU>(ulong v1, ulong v2)
343352

344353
var vec = Vector128.Create(merged1, merged2).AsByte();
345354
var indices = GetCompactShuffleVector(bytes1, bytes2);
346-
var result = Ssse3.Shuffle(vec, indices);
355+
Vector128<byte> result;
356+
if (Ssse3.IsSupported)
357+
result = Ssse3.Shuffle(vec, indices);
358+
else
359+
result = AdvSimd.Arm64.VectorTableLookup(vec, indices);
347360

348361
ref byte destination = ref MemoryMarshal.GetReference(_memory.Span);
349362
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref destination, BytesPending)) = result;

0 commit comments

Comments
 (0)