-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Expand file tree
/
Copy pathAsciiHash.cs
More file actions
340 lines (301 loc) · 12.3 KB
/
AsciiHash.cs
File metadata and controls
340 lines (301 loc) · 12.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
using System.Buffers.Binary;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace RESPite;
/// <summary>
/// This type is intended to provide fast hashing functions for small ASCII strings, for example well-known
/// RESP literals that are usually identifiable by their length and initial bytes; it is not intended
/// for general purpose hashing, and the behavior is undefined for non-ASCII literals.
/// All matches must also perform a sequence equality check.
/// </summary>
/// <remarks>See HastHashGenerator.md for more information and intended usage.</remarks>
[AttributeUsage(
AttributeTargets.Class | AttributeTargets.Method | AttributeTargets.Field | AttributeTargets.Enum,
AllowMultiple = false,
Inherited = false)]
[Conditional("DEBUG")] // evaporate in release
[Experimental(Experiments.Respite, UrlFormat = Experiments.UrlFormat)]
public sealed partial class AsciiHashAttribute(string token = "") : Attribute
{
/// <summary>
/// The token expected when parsing data, if different from the implied value. The implied
/// value is the name, replacing underscores for hyphens, so: 'a_b' becomes 'a-b'.
/// </summary>
public string Token => token;
/// <summary>
/// Indicates whether a parse operation is case-sensitive. Not used in other contexts.
/// </summary>
public bool CaseSensitive { get; set; } = true;
}
// note: instance members are in AsciiHash.Instance.cs.
[Experimental(Experiments.Respite, UrlFormat = Experiments.UrlFormat)]
public readonly partial struct AsciiHash
{
/// <summary>
/// In-place ASCII upper-case conversion.
/// </summary>
public static void ToUpper(Span<byte> span)
{
foreach (ref var b in span)
{
if (b >= 'a' && b <= 'z')
b = (byte)(b & ~0x20);
}
}
/// <summary>
/// In-place ASCII lower-case conversion.
/// </summary>
public static void ToLower(Span<byte> span)
{
foreach (ref var b in span)
{
if (b >= 'a' && b <= 'z')
b |= (byte)(b & ~0x20);
}
}
internal const int MaxBytesHashed = sizeof(long);
public static bool EqualsCS(ReadOnlySpan<byte> first, ReadOnlySpan<byte> second)
{
var len = first.Length;
if (len != second.Length) return false;
// for very short values, the CS hash performs CS equality
return len <= MaxBytesHashed ? HashCS(first) == HashCS(second) : first.SequenceEqual(second);
}
public static bool SequenceEqualsCS(ReadOnlySpan<byte> first, ReadOnlySpan<byte> second)
=> first.SequenceEqual(second);
public static bool EqualsCI(ReadOnlySpan<byte> first, ReadOnlySpan<byte> second)
{
var len = first.Length;
if (len != second.Length) return false;
// for very short values, the UC hash performs CI equality
return len <= MaxBytesHashed ? HashUC(first) == HashUC(second) : SequenceEqualsCI(first, second);
}
public static bool EqualsCI(ReadOnlySpan<byte> first, ReadOnlySpan<char> second)
=> EqualsCI(second, first);
public static unsafe bool SequenceEqualsCI(ReadOnlySpan<byte> first, ReadOnlySpan<byte> second)
{
var len = first.Length;
if (len != second.Length) return false;
// OK, don't be clever (SIMD, etc); the purpose of FashHash is to compare RESP key tokens, which are
// typically relatively short, think 3-20 bytes. That wouldn't even touch a SIMD vector, so:
// just loop (the exact thing we'd need to do *anyway* in a SIMD implementation, to mop up the non-SIMD
// trailing bytes).
fixed (byte* firstPtr = &MemoryMarshal.GetReference(first))
{
fixed (byte* secondPtr = &MemoryMarshal.GetReference(second))
{
const int CS_MASK = 0b0101_1111;
for (int i = 0; i < len; i++)
{
byte x = firstPtr[i];
var xCI = x & CS_MASK;
if (xCI >= 'A' & xCI <= 'Z')
{
// alpha mismatch
if (xCI != (secondPtr[i] & CS_MASK)) return false;
}
else if (x != secondPtr[i])
{
// non-alpha mismatch
return false;
}
}
return true;
}
}
}
public static bool SequenceEqualsCI(ReadOnlySpan<byte> first, ReadOnlySpan<char> second)
=> SequenceEqualsCI(second, first);
public static bool EqualsCS(ReadOnlySpan<char> first, ReadOnlySpan<char> second)
{
var len = first.Length;
if (len != second.Length) return false;
// for very short values, the CS hash performs CS equality
return len <= MaxBytesHashed ? HashCS(first) == HashCS(second) : first.SequenceEqual(second);
}
public static bool SequenceEqualsCS(ReadOnlySpan<char> first, ReadOnlySpan<char> second)
=> first.SequenceEqual(second);
public static bool EqualsCI(ReadOnlySpan<char> first, ReadOnlySpan<char> second)
{
var len = first.Length;
if (len != second.Length) return false;
// for very short values, the CS hash performs CS equality; check that first
return len <= MaxBytesHashed ? HashUC(first) == HashUC(second) : SequenceEqualsCI(first, second);
}
public static bool EqualsCI(ReadOnlySpan<char> first, ReadOnlySpan<byte> second)
{
var len = first.Length;
if (len != second.Length) return false;
// for very short values, the UC hash performs CI equality
return len <= MaxBytesHashed ? HashUC(first) == HashUC(second) : SequenceEqualsCI(first, second);
}
public static unsafe bool SequenceEqualsCI(ReadOnlySpan<char> first, ReadOnlySpan<char> second)
{
var len = first.Length;
if (len != second.Length) return false;
// OK, don't be clever (SIMD, etc); the purpose of FashHash is to compare RESP key tokens, which are
// typically relatively short, think 3-20 bytes. That wouldn't even touch a SIMD vector, so:
// just loop (the exact thing we'd need to do *anyway* in a SIMD implementation, to mop up the non-SIMD
// trailing bytes).
fixed (char* firstPtr = &MemoryMarshal.GetReference(first))
{
fixed (char* secondPtr = &MemoryMarshal.GetReference(second))
{
const int CS_MASK = 0b0101_1111;
for (int i = 0; i < len; i++)
{
int x = (byte)firstPtr[i];
var xCI = x & CS_MASK;
if (xCI >= 'A' & xCI <= 'Z')
{
// alpha mismatch
if (xCI != (secondPtr[i] & CS_MASK)) return false;
}
else if (x != (byte)secondPtr[i])
{
// non-alpha mismatch
return false;
}
}
return true;
}
}
}
public static unsafe bool SequenceEqualsCI(ReadOnlySpan<char> first, ReadOnlySpan<byte> second)
{
var len = first.Length;
if (len != second.Length) return false;
// OK, don't be clever (SIMD, etc); the purpose of FashHash is to compare RESP key tokens, which are
// typically relatively short, think 3-20 bytes. That wouldn't even touch a SIMD vector, so:
// just loop (the exact thing we'd need to do *anyway* in a SIMD implementation, to mop up the non-SIMD
// trailing bytes).
fixed (char* firstPtr = &MemoryMarshal.GetReference(first))
{
fixed (byte* secondPtr = &MemoryMarshal.GetReference(second))
{
const int CS_MASK = 0b0101_1111;
for (int i = 0; i < len; i++)
{
int x = (byte)firstPtr[i];
var xCI = x & CS_MASK;
if (xCI >= 'A' & xCI <= 'Z')
{
// alpha mismatch
if (xCI != (secondPtr[i] & CS_MASK)) return false;
}
else if (x != secondPtr[i])
{
// non-alpha mismatch
return false;
}
}
return true;
}
}
}
public static void Hash(scoped ReadOnlySpan<byte> value, out long cs, out long uc)
{
cs = HashCS(value);
uc = ToUC(cs);
}
public static void Hash(scoped ReadOnlySpan<char> value, out long cs, out long uc)
{
cs = HashCS(value);
uc = ToUC(cs);
}
public static long HashUC(scoped ReadOnlySpan<byte> value) => ToUC(HashCS(value));
public static long HashUC(scoped ReadOnlySpan<char> value) => ToUC(HashCS(value));
internal static long ToUC(long hashCS)
{
const long LC_MASK = 0x2020_2020_2020_2020;
// check whether there are any possible lower-case letters;
// this would be anything with the 0x20 bit set
if ((hashCS & LC_MASK) == 0) return hashCS;
// Something looks possibly lower-case; we can't just mask it off,
// because there are other non-alpha characters in that range.
#if NET
ToUpper(MemoryMarshal.CreateSpan(ref Unsafe.As<long, byte>(ref hashCS), sizeof(long)));
return hashCS;
#else
Span<byte> buffer = stackalloc byte[8];
BinaryPrimitives.WriteInt64LittleEndian(buffer, hashCS);
ToUpper(buffer);
return BinaryPrimitives.ReadInt64LittleEndian(buffer);
#endif
}
public static long HashCS(scoped ReadOnlySpan<byte> value)
{
// at least 8? we can blit
if ((value.Length >> 3) != 0) return BinaryPrimitives.ReadInt64LittleEndian(value);
// small (<7); manual loop
// note: profiling with unsafe code to pick out elements: much slower
// note: profiling with overstamping a local: 3x slower
ulong tally = 0;
for (int i = 0; i < value.Length; i++)
{
tally |= ((ulong)value[i]) << (i << 3);
}
return (long)tally;
}
public static long HashCS(scoped ReadOnlySpan<char> value)
{
// note: BDN profiling with Vector64.Narrow showed no benefit
if ((value.Length >> 3) != 0)
{
// slice if necessary, so we can use bounds-elided foreach
if (value.Length != 8) value = value.Slice(0, 8);
}
ulong tally = 0;
for (int i = 0; i < value.Length; i++)
{
tally |= ((ulong)value[i]) << (i << 3);
}
return (long)tally;
}
public static void HashCS(scoped ReadOnlySpan<byte> value, out long cs0, out long cs1)
{
cs0 = HashCS(value);
cs1 = value.Length > MaxBytesHashed ? HashCS(value.Slice(start: MaxBytesHashed)) : 0;
}
public static void HashCS(scoped ReadOnlySpan<char> value, out long cs0, out long cs1)
{
cs0 = HashCS(value);
cs1 = value.Length > MaxBytesHashed ? HashCS(value.Slice(start: MaxBytesHashed)) : 0;
}
public static void HashUC(scoped ReadOnlySpan<byte> value, out long cs0, out long cs1)
{
cs0 = HashUC(value);
cs1 = value.Length > MaxBytesHashed ? HashUC(value.Slice(start: MaxBytesHashed)) : 0;
}
public static void HashUC(scoped ReadOnlySpan<char> value, out long cs0, out long cs1)
{
cs0 = HashUC(value);
cs1 = value.Length > MaxBytesHashed ? HashUC(value.Slice(start: MaxBytesHashed)) : 0;
}
public static void Hash(scoped ReadOnlySpan<byte> value, out long cs0, out long uc0, out long cs1, out long uc1)
{
Hash(value, out cs0, out uc0);
if (value.Length > MaxBytesHashed)
{
Hash(value.Slice(start: MaxBytesHashed), out cs1, out uc1);
}
else
{
cs1 = uc1 = 0;
}
}
public static void Hash(scoped ReadOnlySpan<char> value, out long cs0, out long uc0, out long cs1, out long uc1)
{
Hash(value, out cs0, out uc0);
if (value.Length > MaxBytesHashed)
{
Hash(value.Slice(start: MaxBytesHashed), out cs1, out uc1);
}
else
{
cs1 = uc1 = 0;
}
}
}