Skip to content

Commit f39673e

Browse files
committed
Optimize search hot paths
1 parent 5f5f669 commit f39673e

14 files changed

Lines changed: 869 additions & 295 deletions

README.md

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1285,7 +1285,7 @@ Markdown links, wikilinks, and arrow assertions are not implicitly converted int
12851285

12861286
## Algorithm References
12871287

1288-
- Optional fuzzy lexical matching is shared by BM25 typo-tolerant ranking and Tiktoken fuzzy query correction. It uses bounded edit distance with portable SIMD common-affix trimming, a single-word bit-vector dynamic-programming path for short residual tokens, and a bounded banded dynamic-programming fallback for longer residual tokens. It is not a naive full-matrix Levenshtein implementation and does not use platform-specific SIMD intrinsics.
1288+
- Optional fuzzy lexical matching is shared by BM25 typo-tolerant ranking and Tiktoken fuzzy query correction. It uses bounded edit distance with portable SIMD common-affix trimming, stack-backed bit-vector masks for short residual tokens, and a pooled bounded banded dynamic-programming fallback for longer residual tokens. It is not a naive full-matrix Levenshtein implementation and does not use platform-specific SIMD intrinsics.
12891289
- The bit-vector path is guided by Gene Myers, "A fast bit-vector algorithm for approximate string matching based on dynamic programming", Journal of the ACM, 1999, DOI: <https://doi.org/10.1145/316542.316550>.
12901290
- The bounded-threshold behavior is guided by Esko Ukkonen, "Algorithms for approximate string matching", Information and Control, 1985, DOI: <https://doi.org/10.1016/S0019-9958(85)80046-2>.
12911291
- Thanks to `biegehydra/MyersBitParallelDotnet` for inspiring the practical direction we took for fast short-token typo matching.
@@ -1363,28 +1363,28 @@ Graph search exact-query mean time:
13631363

13641364
| Profile | Ranked graph | BM25 | BM25 fuzzy | Focused | Schema SPARQL | Local federated |
13651365
| --- | ---: | ---: | ---: | ---: | ---: | ---: |
1366-
| `ShortDocuments` | 1.200 ms | 2.018 ms | 2.627 ms | 2.053 ms | 46.034 ms | 49.615 ms |
1367-
| `LongDocuments` | 0.480 ms | 3.577 ms | 3.574 ms | 0.642 ms | 12.819 ms | 14.561 ms |
1368-
| `FederatedRunbooks` | 1.334 ms | 2.723 ms | 2.720 ms | 2.271 ms | 45.981 ms | 55.269 ms |
1366+
| `ShortDocuments` | 1.198 ms | 1.673 ms | 1.988 ms | 2.016 ms | 48.157 ms | 51.551 ms |
1367+
| `LongDocuments` | 0.449 ms | 1.987 ms | 1.975 ms | 0.638 ms | 12.698 ms | 15.186 ms |
1368+
| `FederatedRunbooks` | 1.327 ms | 2.024 ms | 2.038 ms | 2.255 ms | 41.309 ms | 61.614 ms |
13691369

13701370
Graph search exact-query allocated memory per operation:
13711371

13721372
| Profile | Ranked graph | BM25 | BM25 fuzzy | Focused | Schema SPARQL | Local federated |
13731373
| --- | ---: | ---: | ---: | ---: | ---: | ---: |
1374-
| `ShortDocuments` | 2.37 MB | 4.83 MB | 7.22 MB | 3.27 MB | 60.34 MB | 62.33 MB |
1375-
| `LongDocuments` | 1.91 MB | 10.67 MB | 10.67 MB | 1.21 MB | 20.22 MB | 22.21 MB |
1376-
| `FederatedRunbooks` | 2.54 MB | 6.80 MB | 6.80 MB | 3.48 MB | 60.75 MB | 62.61 MB |
1374+
| `ShortDocuments` | 2.37 MB | 3.07 MB | 3.06 MB | 3.27 MB | 60.47 MB | 62.32 MB |
1375+
| `LongDocuments` | 1.91 MB | 3.46 MB | 3.46 MB | 1.21 MB | 20.26 MB | 22.21 MB |
1376+
| `FederatedRunbooks` | 2.53 MB | 3.53 MB | 3.53 MB | 3.48 MB | 60.75 MB | 62.75 MB |
13771377

13781378
The `ShortDocuments` exact-query diagnostic slice shows the current hot paths:
13791379

13801380
| Method | Mean | Allocated | Alloc ratio | Gen0 | Gen1 | Gen2 | Work items | Lock contentions |
13811381
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
1382-
| Ranked graph | 1.200 ms | 2.37 MB | 1.00x | 296.8750 | 107.4219 | 0 | 0 | 0 |
1383-
| BM25 | 2.018 ms | 4.83 MB | 2.04x | 601.5625 | 210.9375 | 0 | 0 | 0 |
1384-
| BM25 fuzzy | 2.627 ms | 7.22 MB | 3.04x | 902.3438 | 230.4688 | 0 | 0 | 0 |
1385-
| Focused | 2.053 ms | 3.27 MB | 1.38x | 406.2500 | 179.6875 | 0 | 0 | 0 |
1386-
| Schema SPARQL | 46.034 ms | 60.34 MB | 25.44x | 8500.0000 | 1833.3333 | 500.0000 | 551 | 325 |
1387-
| Local federated | 49.615 ms | 62.33 MB | 26.27x | 8666.6667 | 2166.6667 | 500.0000 | 552 | 315.1667 |
1382+
| Ranked graph | 1.198 ms | 2.37 MB | 1.00x | 296.8750 | 107.4219 | 0 | 0 | 0 |
1383+
| BM25 | 1.673 ms | 3.07 MB | 1.29x | 384.7656 | 142.5781 | 0 | 0 | 0 |
1384+
| BM25 fuzzy | 1.988 ms | 3.06 MB | 1.29x | 375.0000 | 156.2500 | 0 | 0 | 0 |
1385+
| Focused | 2.016 ms | 3.27 MB | 1.38x | 406.2500 | 179.6875 | 0 | 0 | 0 |
1386+
| Schema SPARQL | 48.157 ms | 60.47 MB | 25.49x | 8400.0000 | 1800.0000 | 400.0000 | 551 | 305.2000 |
1387+
| Local federated | 51.551 ms | 62.32 MB | 26.27x | 8500.0000 | 2000.0000 | 333.3333 | 552 | 314.5000 |
13881388

13891389
Allocation, GC, work-item, and lock-contention columns come directly from BenchmarkDotNet diagnosers. Treat the ratios and relative pressure inside the same run as the useful signal; ShortRun is a fast diagnostic pass, not a release-grade SLA measurement.
13901390

@@ -1404,18 +1404,22 @@ Tiktoken token-distance search:
14041404

14051405
| Profile | Query | Exact | Fuzzy-corrected | Exact allocated | Fuzzy allocated |
14061406
| --- | --- | ---: | ---: | ---: | ---: |
1407-
| `LongDocuments` | Exact | 955.1 us | 952.7 us | 2.38 MB | 2.38 MB |
1408-
| `LongDocuments` | Typo | 1.112 ms | 1.291 ms | 2.78 MB | 3.73 MB |
1409-
| `TokenizedMultilingual` | Exact | 680.8 us | 690.5 us | 1.81 MB | 1.81 MB |
1410-
| `TokenizedMultilingual` | Typo | 811.3 us | 861.2 us | 1.81 MB | 1.82 MB |
1407+
| `LongDocuments` | Exact | 298.1 us | 301.9 us | 212.24 KB | 212.99 KB |
1408+
| `LongDocuments` | Typo | 350.4 us | 393.0 us | 212.88 KB | 216.13 KB |
1409+
| `LongDocuments` | NoMatch | 254.3 us | 257.7 us | 212.19 KB | 213.41 KB |
1410+
| `TokenizedMultilingual` | Exact | 219.4 us | 220.5 us | 139.18 KB | 140.13 KB |
1411+
| `TokenizedMultilingual` | Typo | 246.2 us | 267.8 us | 139.59 KB | 142.02 KB |
1412+
| `TokenizedMultilingual` | NoMatch | 200.3 us | 184.3 us | 138.91 KB | 140.06 KB |
14111413

14121414
Fuzzy edit-distance mean time:
14131415

14141416
| Scenario | Bounded bit-vector/banded | Naive Levenshtein | Speedup vs naive | Bounded allocation | Naive allocation |
14151417
| --- | ---: | ---: | ---: | ---: | ---: |
1416-
| Short deletion | 6.778 ns | 91.780 ns | 13.54x | 0 B | 112 B |
1417-
| Short substitution | 31.011 ns | 82.948 ns | 2.67x | 216 B | 112 B |
1418-
| Long insertion | 21.980 ns | 7,990.146 ns | 363.53x | 0 B | 640 B |
1419-
| Long no-match | 70.283 ns | 8,990.700 ns | 127.92x | 328 B | 672 B |
1418+
| Short deletion | 6.793 ns | 94.900 ns | 13.97x | 0 B | 112 B |
1419+
| Short substitution | 32.973 ns | 83.927 ns | 2.55x | 0 B | 112 B |
1420+
| Long insertion | 22.062 ns | 8,261.735 ns | 374.48x | 0 B | 640 B |
1421+
| Long no-match | 53.873 ns | 9,292.649 ns | 172.49x | 0 B | 672 B |
1422+
1423+
This run reflects the allocation-focused search hot-path pass: BM25 now uses the shared allocation-aware tokenizer, direct scoring loops, and bounded top-N match retention; fuzzy edit distance uses stack-backed bit-vector masks for short residual tokens and pooled rows for the long-token fallback; and Tiktoken search keeps only bounded top-N candidates while TF-IDF weighting updates dictionary values without temporary key arrays.
14201424

14211425
These numbers are local measurements, not a cross-machine performance contract. The README keeps compact slices only; [Performance Benchmarks](docs/Features/PerformanceBenchmarks.md) and the full Markdown, CSV, and JSON BenchmarkDotNet reports remain the source for detailed diagnostics.

docs/Features/HybridGraphSearch.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ flowchart LR
4040
- `schema:keywords` are excluded from canonical ranking.
4141
- BM25 mode does not require an embedding provider, semantic index, Lucene index, or database.
4242
- Build-result BM25 can find body-only terms that are absent from title, summary, and front matter.
43-
- Fuzzy BM25 token matching is opt-in through `KnowledgeGraphRankedSearchOptions.EnableFuzzyTokenMatching`, `MaxFuzzyEditDistance`, and `MinimumFuzzyTokenLength`. It handles insertion, deletion, and substitution typos with portable SIMD common-affix trimming, a single-word bit-vector path for short residual tokens, and a bounded banded dynamic-programming fallback for longer residual tokens. It does not use platform-specific SIMD intrinsics.
43+
- Fuzzy BM25 token matching is opt-in through `KnowledgeGraphRankedSearchOptions.EnableFuzzyTokenMatching`, `MaxFuzzyEditDistance`, and `MinimumFuzzyTokenLength`. It handles insertion, deletion, and substitution typos with portable SIMD common-affix trimming, stack-backed bit-vector masks for short residual tokens, and a pooled bounded banded dynamic-programming fallback for longer residual tokens. It does not use platform-specific SIMD intrinsics.
4444
- A hit present in both graph and semantic ranking is marked as merged and keeps its graph-first position.
4545
- Semantic-only hits never outrank canonical graph hits in hybrid mode.
4646
- `KnowledgeGraphHybridFusionStrategy.ReciprocalRank` is opt-in. It applies reciprocal rank fusion across graph and semantic result lists while preserving the canonical and semantic component scores for diagnostics.

docs/Features/PerformanceBenchmarks.md

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -109,28 +109,28 @@ Graph search exact-query mean time:
109109

110110
| Profile | Ranked graph | BM25 | BM25 fuzzy | Focused | Schema SPARQL | Local federated |
111111
| --- | ---: | ---: | ---: | ---: | ---: | ---: |
112-
| `ShortDocuments` | 1.200 ms | 2.018 ms | 2.627 ms | 2.053 ms | 46.034 ms | 49.615 ms |
113-
| `LongDocuments` | 0.480 ms | 3.577 ms | 3.574 ms | 0.642 ms | 12.819 ms | 14.561 ms |
114-
| `FederatedRunbooks` | 1.334 ms | 2.723 ms | 2.720 ms | 2.271 ms | 45.981 ms | 55.269 ms |
112+
| `ShortDocuments` | 1.198 ms | 1.673 ms | 1.988 ms | 2.016 ms | 48.157 ms | 51.551 ms |
113+
| `LongDocuments` | 0.449 ms | 1.987 ms | 1.975 ms | 0.638 ms | 12.698 ms | 15.186 ms |
114+
| `FederatedRunbooks` | 1.327 ms | 2.024 ms | 2.038 ms | 2.255 ms | 41.309 ms | 61.614 ms |
115115

116116
Graph search exact-query allocated memory per operation:
117117

118118
| Profile | Ranked graph | BM25 | BM25 fuzzy | Focused | Schema SPARQL | Local federated |
119119
| --- | ---: | ---: | ---: | ---: | ---: | ---: |
120-
| `ShortDocuments` | 2.37 MB | 4.83 MB | 7.22 MB | 3.27 MB | 60.34 MB | 62.33 MB |
121-
| `LongDocuments` | 1.91 MB | 10.67 MB | 10.67 MB | 1.21 MB | 20.22 MB | 22.21 MB |
122-
| `FederatedRunbooks` | 2.54 MB | 6.80 MB | 6.80 MB | 3.48 MB | 60.75 MB | 62.61 MB |
120+
| `ShortDocuments` | 2.37 MB | 3.07 MB | 3.06 MB | 3.27 MB | 60.47 MB | 62.32 MB |
121+
| `LongDocuments` | 1.91 MB | 3.46 MB | 3.46 MB | 1.21 MB | 20.26 MB | 22.21 MB |
122+
| `FederatedRunbooks` | 2.53 MB | 3.53 MB | 3.53 MB | 3.48 MB | 60.75 MB | 62.75 MB |
123123

124124
The `ShortDocuments` exact-query diagnostic slice shows the current hot paths:
125125

126126
| Method | Mean | Allocated | Alloc ratio | Gen0 | Gen1 | Gen2 | Work items | Lock contentions |
127127
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
128-
| Ranked graph | 1.200 ms | 2.37 MB | 1.00x | 296.8750 | 107.4219 | 0 | 0 | 0 |
129-
| BM25 | 2.018 ms | 4.83 MB | 2.04x | 601.5625 | 210.9375 | 0 | 0 | 0 |
130-
| BM25 fuzzy | 2.627 ms | 7.22 MB | 3.04x | 902.3438 | 230.4688 | 0 | 0 | 0 |
131-
| Focused | 2.053 ms | 3.27 MB | 1.38x | 406.2500 | 179.6875 | 0 | 0 | 0 |
132-
| Schema SPARQL | 46.034 ms | 60.34 MB | 25.44x | 8500.0000 | 1833.3333 | 500.0000 | 551 | 325 |
133-
| Local federated | 49.615 ms | 62.33 MB | 26.27x | 8666.6667 | 2166.6667 | 500.0000 | 552 | 315.1667 |
128+
| Ranked graph | 1.198 ms | 2.37 MB | 1.00x | 296.8750 | 107.4219 | 0 | 0 | 0 |
129+
| BM25 | 1.673 ms | 3.07 MB | 1.29x | 384.7656 | 142.5781 | 0 | 0 | 0 |
130+
| BM25 fuzzy | 1.988 ms | 3.06 MB | 1.29x | 375.0000 | 156.2500 | 0 | 0 | 0 |
131+
| Focused | 2.016 ms | 3.27 MB | 1.38x | 406.2500 | 179.6875 | 0 | 0 | 0 |
132+
| Schema SPARQL | 48.157 ms | 60.47 MB | 25.49x | 8400.0000 | 1800.0000 | 400.0000 | 551 | 305.2000 |
133+
| Local federated | 51.551 ms | 62.32 MB | 26.27x | 8500.0000 | 2000.0000 | 333.3333 | 552 | 314.5000 |
134134

135135
Allocation, GC, work-item, and lock-contention columns come directly from BenchmarkDotNet diagnosers. Treat ratios and relative pressure inside the same run as the useful signal; ShortRun is a fast diagnostic pass, not a release-grade SLA measurement.
136136

@@ -152,13 +152,13 @@ Tiktoken token-distance search over the semantic profiles:
152152

153153
| Profile | Query | Exact | Fuzzy-corrected | Exact allocated | Fuzzy allocated |
154154
| --- | --- | ---: | ---: | ---: | ---: |
155-
| `LongDocuments` | Exact | 955.1 us | 952.7 us | 2.38 MB | 2.38 MB |
156-
| `LongDocuments` | Typo | 1.112 ms | 1.291 ms | 2.78 MB | 3.73 MB |
157-
| `LongDocuments` | NoMatch | 891.0 us | 902.5 us | 2.25 MB | 2.27 MB |
158-
| `TokenizedMultilingual` | Exact | 680.8 us | 690.5 us | 1.81 MB | 1.81 MB |
159-
| `TokenizedMultilingual` | Typo | 811.3 us | 861.2 us | 1.81 MB | 1.82 MB |
160-
| `TokenizedMultilingual` | NoMatch | 634.1 us | 634.7 us | 1.81 MB | 1.82 MB |
155+
| `LongDocuments` | Exact | 298.1 us | 301.9 us | 212.24 KB | 212.99 KB |
156+
| `LongDocuments` | Typo | 350.4 us | 393.0 us | 212.88 KB | 216.13 KB |
157+
| `LongDocuments` | NoMatch | 254.3 us | 257.7 us | 212.19 KB | 213.41 KB |
158+
| `TokenizedMultilingual` | Exact | 219.4 us | 220.5 us | 139.18 KB | 140.13 KB |
159+
| `TokenizedMultilingual` | Typo | 246.2 us | 267.8 us | 139.59 KB | 142.02 KB |
160+
| `TokenizedMultilingual` | NoMatch | 200.3 us | 184.3 us | 138.91 KB | 140.06 KB |
161161

162-
Interpretation: ranked graph, BM25, BM25 fuzzy, and focused search are the low-latency retrieval paths. BM25 fuzzy deliberately spends more time and allocation on typo-heavy queries and should stay opt-in. Schema-aware SPARQL and local federation are explainable RDF query paths, but dotNetRDF query-plan execution keeps them materially heavier for repeated low-latency calls. JSON-LD load is the highest persistence cost in the current local run; Turtle load and snapshot/serialization are cheaper. Use ranked graph or BM25 search when the caller needs low-latency retrieval, and use schema/federation when caller-visible evidence and graph-shape constraints matter more than raw latency.
162+
Interpretation: ranked graph, BM25, BM25 fuzzy, focused search, and Tiktoken token-distance search are the low-latency retrieval paths. The current BM25 implementation keeps exact and fuzzy allocation close by sharing the same tokenizer, dictionary shape, bounded top-N match retention, stack-backed short-token edit-distance masks, and pooled long-token fallback rows. Tiktoken search keeps bounded top-N candidates and updates TF-IDF dictionary values without temporary key arrays. Fuzzy BM25 still costs more CPU on typo-heavy queries and should stay opt-in. Schema-aware SPARQL and local federation are explainable RDF query paths, but dotNetRDF query-plan execution keeps them materially heavier for repeated low-latency calls. JSON-LD load is the highest persistence cost in the current local run; Turtle load and snapshot/serialization are cheaper. Use ranked graph or BM25 search when the caller needs low-latency retrieval, and use schema/federation when caller-visible evidence and graph-shape constraints matter more than raw latency.
163163

164-
The fuzzy edit-distance suite measured the bounded bit-vector/banded path faster than the naive Levenshtein baseline in every measured scenario, including 363.53x faster for the long-insertion case and 127.92x faster for the long no-match case.
164+
In every measured scenario of the fuzzy edit-distance suite, the bounded bit-vector/banded path allocated zero bytes and ran faster than the naive Levenshtein baseline, including 374.48x faster for the long-insertion case and 172.49x faster for the long no-match case.
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
using System.Buffers;
2+
3+
namespace ManagedCode.MarkdownLd.Kb.Pipeline;
4+
5+
/// <summary>
/// Threshold-bounded edit distance (insertion, deletion, substitution; unit cost each) computed with
/// two rolling rows that are only filled inside a diagonal band of half-width <c>maxDistance</c>.
/// </summary>
internal static class KnowledgeGraphBandedEditDistance
{
    private const int ExactDistance = 0;
    private const int UnitDistance = 1;

    /// <summary>
    /// Computes the edit distance between <paramref name="left"/> and <paramref name="right"/>,
    /// giving up as soon as the distance is known to exceed <paramref name="maxDistance"/>.
    /// </summary>
    /// <param name="left">First character sequence.</param>
    /// <param name="right">Second character sequence.</param>
    /// <param name="maxDistance">Largest distance that still counts as a match.</param>
    /// <returns>
    /// The edit distance when it is at most <paramref name="maxDistance"/>; otherwise
    /// <c>KnowledgeGraphBoundedEditDistance.NoMatchDistance</c>.
    /// </returns>
    internal static int Compute(ReadOnlySpan<char> left, ReadOnlySpan<char> right, int maxDistance)
    {
        // The length difference is a lower bound for the edit distance, so anything beyond the
        // threshold (including every negative threshold) is rejected before buffers are rented.
        var lengthGap = Math.Abs(left.Length - right.Length);
        if (lengthGap > maxDistance)
        {
            return KnowledgeGraphBoundedEditDistance.NoMatchDistance;
        }

        // With one side empty the distance is exactly the other side's length. Answering here also
        // keeps the banded loop from seeing an empty right side, where the band has no columns and
        // the row would be reported as a mismatch even though the distance is within the threshold.
        if (left.IsEmpty || right.IsEmpty)
        {
            return lengthGap;
        }

        // The distance can never exceed the longer operand, so clamping the threshold leaves the
        // result unchanged while keeping "threshold + 1" from overflowing for very large thresholds.
        var bound = Math.Min(maxDistance, Math.Max(left.Length, right.Length));
        var sentinel = bound + UnitDistance;
        var rowLength = right.Length + 1;
        var previous = ArrayPool<int>.Shared.Rent(rowLength);
        var current = ArrayPool<int>.Shared.Rent(rowLength);

        try
        {
            return ComputeWithRows(left, right, bound, previous, current, sentinel);
        }
        finally
        {
            ArrayPool<int>.Shared.Return(previous);
            ArrayPool<int>.Shared.Return(current);
        }
    }

    /// <summary>
    /// Runs the row-by-row dynamic program over the supplied buffers.
    /// </summary>
    /// <param name="left">Sequence that drives the rows.</param>
    /// <param name="right">Sequence that drives the columns.</param>
    /// <param name="maxDistance">Largest distance that still counts as a match.</param>
    /// <param name="previous">Buffer with at least <c>right.Length + 1</c> elements.</param>
    /// <param name="current">Buffer with at least <c>right.Length + 1</c> elements.</param>
    /// <param name="sentinel">Value written to cells that lie outside the band.</param>
    private static int ComputeWithRows(
        ReadOnlySpan<char> left,
        ReadOnlySpan<char> right,
        int maxDistance,
        int[] previous,
        int[] current,
        int sentinel)
    {
        // Rented arrays may be longer than requested; only the first rowLength cells are used.
        var rowLength = right.Length + 1;
        var previousRow = previous.AsSpan(0, rowLength);
        var currentRow = current.AsSpan(0, rowLength);
        FillInitialRow(previousRow, right.Length, maxDistance, sentinel);

        for (var row = 1; row <= left.Length; row++)
        {
            currentRow.Fill(sentinel);
            if (row <= maxDistance)
            {
                // Column 0 is the cost of deleting the first `row` characters of left.
                currentRow[0] = row;
            }

            if (!TryFillRow(left, right, maxDistance, row, previousRow, currentRow))
            {
                return KnowledgeGraphBoundedEditDistance.NoMatchDistance;
            }

            // Swap the roles of the two rows for the next iteration.
            var rowSwap = previousRow;
            previousRow = currentRow;
            currentRow = rowSwap;
        }

        return previousRow[right.Length] <= maxDistance
            ? previousRow[right.Length]
            : KnowledgeGraphBoundedEditDistance.NoMatchDistance;
    }

    /// <summary>
    /// Initializes row 0: column <c>c</c> holds <c>c</c> while it is within the threshold, and every
    /// other cell holds <paramref name="sentinel"/>.
    /// </summary>
    private static void FillInitialRow(Span<int> previous, int rightLength, int maxDistance, int sentinel)
    {
        previous.Fill(sentinel);
        for (var column = 0; column <= Math.Min(rightLength, maxDistance); column++)
        {
            previous[column] = column;
        }
    }

    /// <summary>
    /// Fills the in-band cells of one row.
    /// </summary>
    /// <returns>
    /// <c>false</c> when the band is empty or every filled cell (including column 0) exceeds
    /// <paramref name="maxDistance"/>; otherwise <c>true</c>.
    /// </returns>
    private static bool TryFillRow(
        ReadOnlySpan<char> left,
        ReadOnlySpan<char> right,
        int maxDistance,
        int row,
        ReadOnlySpan<int> previous,
        Span<int> current)
    {
        var startColumn = Math.Max(1, row - maxDistance);
        var endColumn = Math.Min(right.Length, row + maxDistance);
        if (startColumn > endColumn)
        {
            return false;
        }

        var bestInRow = current[0];
        for (var column = startColumn; column <= endColumn; column++)
        {
            var cost = left[row - 1] == right[column - 1] ? ExactDistance : UnitDistance;

            // Cheapest of: step from the left cell, step from the cell above, or the diagonal step
            // (free when the two characters are equal).
            current[column] = Math.Min(
                Math.Min(current[column - 1] + UnitDistance, previous[column] + UnitDistance),
                previous[column - 1] + cost);
            bestInRow = Math.Min(bestInRow, current[column]);
        }

        return bestInRow <= maxDistance;
    }
}

0 commit comments

Comments
 (0)