Skip to content

Commit 5f44642

Browse files
committed
optimization
1 parent c7968c5 commit 5f44642

14 files changed

Lines changed: 588 additions & 141 deletions

src/MarkdownLd.Kb/Graph/Runtime/KnowledgeGraph.RankedSearch.cs

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -58,7 +58,6 @@ internal static async Task<IReadOnlyList<KnowledgeGraphRankedSearchMatch>> Searc
5858
var filteredCandidates = FilterSearchCandidates(
5959
candidates,
6060
effectiveOptions.CandidateNodeIds);
61-
var candidatesById = filteredCandidates.ToDictionary(static candidate => candidate.NodeId, StringComparer.Ordinal);
6261
var canonicalMatches = effectiveOptions.Mode is KnowledgeGraphSearchMode.Graph or KnowledgeGraphSearchMode.Hybrid
6362
? SearchCanonical(filteredCandidates, query, effectiveOptions.MaxResults)
6463
: [];
@@ -77,6 +76,7 @@ internal static async Task<IReadOnlyList<KnowledgeGraphRankedSearchMatch>> Searc
7776
throw new InvalidOperationException(SemanticSearchRequiresIndexMessage);
7877
}
7978

79+
var candidatesById = filteredCandidates.ToDictionary(static candidate => candidate.NodeId, StringComparer.Ordinal);
8080
var semanticMatches = (await semanticIndex
8181
.SearchAsync(
8282
query,
@@ -99,13 +99,19 @@ private static IReadOnlyList<KnowledgeGraphRankedSearchMatch> SearchCanonical(
9999
int limit)
100100
{
101101
var normalizedQuery = query.Trim();
102-
return candidates
103-
.Select(candidate => CreateCanonicalMatch(candidate, normalizedQuery))
104-
.Where(match => match.Score > ZeroConfidence)
105-
.OrderByDescending(static match => match.Score)
106-
.ThenBy(static match => match.Label, StringComparer.OrdinalIgnoreCase)
107-
.Take(limit)
108-
.ToArray();
102+
var matches = new List<KnowledgeGraphRankedSearchMatch>(Math.Min(candidates.Count, limit));
103+
foreach (var candidate in candidates)
104+
{
105+
var match = CreateCanonicalMatch(candidate, normalizedQuery);
106+
if (match.Score <= ZeroConfidence)
107+
{
108+
continue;
109+
}
110+
111+
KnowledgeGraphBm25SearchResults.AddBoundedMatch(matches, match, limit);
112+
}
113+
114+
return KnowledgeGraphBm25SearchResults.ToArray(matches);
109115
}
110116

111117
private static void ValidateCandidateNodeIds(IReadOnlyCollection<string>? candidateNodeIds)

src/MarkdownLd.Kb/Graph/Runtime/KnowledgeGraphBandedEditDistance.cs

Lines changed: 23 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -6,36 +6,49 @@ internal static class KnowledgeGraphBandedEditDistance
66
{
77
private const int ExactDistance = 0;
88
private const int UnitDistance = 1;
9+
private const int StackRowLengthLimit = 256;
910

1011
internal static int Compute(ReadOnlySpan<char> left, ReadOnlySpan<char> right, int maxDistance)
1112
{
1213
var sentinel = maxDistance + UnitDistance;
1314
var rowLength = right.Length + 1;
14-
var previous = ArrayPool<int>.Shared.Rent(rowLength);
15-
var current = ArrayPool<int>.Shared.Rent(rowLength);
15+
if (rowLength <= StackRowLengthLimit)
16+
{
17+
Span<int> stackPrevious = stackalloc int[rowLength];
18+
Span<int> stackCurrent = stackalloc int[rowLength];
19+
return ComputeWithRows(left, right, maxDistance, stackPrevious, stackCurrent, sentinel);
20+
}
21+
22+
var pooledPrevious = ArrayPool<int>.Shared.Rent(rowLength);
23+
var pooledCurrent = ArrayPool<int>.Shared.Rent(rowLength);
1624

1725
try
1826
{
19-
return ComputeWithRows(left, right, maxDistance, previous, current, sentinel);
27+
return ComputeWithRows(
28+
left,
29+
right,
30+
maxDistance,
31+
pooledPrevious.AsSpan(0, rowLength),
32+
pooledCurrent.AsSpan(0, rowLength),
33+
sentinel);
2034
}
2135
finally
2236
{
23-
ArrayPool<int>.Shared.Return(previous);
24-
ArrayPool<int>.Shared.Return(current);
37+
ArrayPool<int>.Shared.Return(pooledPrevious);
38+
ArrayPool<int>.Shared.Return(pooledCurrent);
2539
}
2640
}
2741

2842
private static int ComputeWithRows(
2943
ReadOnlySpan<char> left,
3044
ReadOnlySpan<char> right,
3145
int maxDistance,
32-
int[] previous,
33-
int[] current,
46+
Span<int> previous,
47+
Span<int> current,
3448
int sentinel)
3549
{
36-
var rowLength = right.Length + 1;
37-
var previousRow = previous.AsSpan(0, rowLength);
38-
var currentRow = current.AsSpan(0, rowLength);
50+
var previousRow = previous;
51+
var currentRow = current;
3952
FillInitialRow(previousRow, right.Length, maxDistance, sentinel);
4053

4154
for (var row = 1; row <= left.Length; row++)
Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
using static ManagedCode.MarkdownLd.Kb.Pipeline.PipelineConstants;
2+
3+
namespace ManagedCode.MarkdownLd.Kb.Pipeline;
4+
5+
/// <summary>
/// Holds the per-term BM25 scoring formula and its tuning constants.
/// </summary>
internal static class KnowledgeGraphBm25Scoring
{
    // Multiplier applied to the length-normalisation factor in the denominator.
    private const double K1 = 1.2d;

    // Weight of the document-length / average-length ratio inside the normalisation factor.
    private const double B = 0.75d;

    // Smoothing term added to both sides of the IDF ratio.
    private const double Half = 0.5d;

    // The literal 1 used inside the logarithm and in the (K1 + 1) and (1 - B) terms.
    private const double IdfOffset = 1d;

    /// <summary>
    /// Scores one query term against one document.
    /// </summary>
    /// <param name="documentLength">Length of the document being scored.</param>
    /// <param name="frequency">Frequency of the term within the document.</param>
    /// <param name="documentFrequency">Number of documents that contain the term.</param>
    /// <param name="documentCount">Total number of documents.</param>
    /// <param name="averageDocumentLength">Average document length; used as a divisor.</param>
    /// <returns>
    /// <c>ZeroConfidence</c> when <paramref name="documentFrequency"/> or
    /// <paramref name="documentLength"/> is zero, or when <paramref name="frequency"/> is not above
    /// <c>ZeroConfidence</c>; otherwise the IDF weight multiplied by the saturated term frequency.
    /// </returns>
    public static double ScoreTerm(
        int documentLength,
        double frequency,
        int documentFrequency,
        int documentCount,
        double averageDocumentLength)
    {
        var nothingToScore = documentFrequency == 0
            || documentLength == 0
            || frequency <= ZeroConfidence;
        if (nothingToScore)
        {
            return ZeroConfidence;
        }

        // Inverse document frequency: log(1 + (N - n + 0.5) / (n + 0.5)).
        var documentRatio = (documentCount - documentFrequency + Half) / (documentFrequency + Half);
        var inverseDocumentFrequency = Math.Log(IdfOffset + documentRatio);

        // Length normalisation: 1 - B + B * (documentLength / averageDocumentLength).
        var lengthNormalization = IdfOffset - B + (B * documentLength / averageDocumentLength);

        // Saturated term frequency: f * (K1 + 1) / (f + K1 * normalisation).
        var saturatedFrequency = (frequency * (K1 + IdfOffset)) / (frequency + K1 * lengthNormalization);

        return inverseDocumentFrequency * saturatedFrequency;
    }
}

src/MarkdownLd.Kb/Graph/Runtime/KnowledgeGraphBm25Search.cs

Lines changed: 39 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,6 @@ namespace ManagedCode.MarkdownLd.Kb.Pipeline;
44

55
internal static class KnowledgeGraphBm25Search
66
{
7-
private const double K1 = 1.2d;
8-
private const double B = 0.75d;
9-
private const double Half = 0.5d;
10-
private const double IdfOffset = 1d;
11-
127
public static IReadOnlyList<KnowledgeGraphRankedSearchMatch> Search(
138
IReadOnlyList<KnowledgeGraphSearchCandidate> candidates,
149
string query,
@@ -20,16 +15,18 @@ public static IReadOnlyList<KnowledgeGraphRankedSearchMatch> Search(
2015
return [];
2116
}
2217

23-
var documents = CreateDocuments(candidates, out var averageDocumentLength);
2418
var fuzzyOptions = KnowledgeGraphFuzzyTokenMatchingOptions.FromRankedSearch(options);
25-
var documentFrequency = CreateDocumentFrequency(documents, queryTerms, fuzzyOptions);
26-
return CreateMatches(
27-
documents,
28-
queryTerms,
29-
documentFrequency,
30-
averageDocumentLength,
31-
options.MaxResults,
32-
fuzzyOptions);
19+
if (!fuzzyOptions.Enabled)
20+
{
21+
return KnowledgeGraphExactBm25Search.Search(
22+
candidates,
23+
queryTerms,
24+
options.MaxResults);
25+
}
26+
27+
var documents = CreateDocuments(candidates, out var averageDocumentLength);
28+
using var statistics = CreateTermStatistics(documents, queryTerms, fuzzyOptions);
29+
return CreateMatches(documents, queryTerms, statistics, averageDocumentLength, options.MaxResults);
3330
}
3431

3532
private static Bm25Document[] CreateDocuments(
@@ -55,44 +52,47 @@ private static Bm25Document CreateDocument(KnowledgeGraphSearchCandidate candida
5552
return new Bm25Document(candidate, frequencies, length);
5653
}
5754

58-
private static Dictionary<string, int> CreateDocumentFrequency(
55+
private static KnowledgeGraphBm25TermStatistics CreateTermStatistics(
5956
IReadOnlyList<Bm25Document> documents,
6057
IReadOnlyList<string> queryTerms,
6158
KnowledgeGraphFuzzyTokenMatchingOptions fuzzyOptions)
6259
{
63-
var frequency = new Dictionary<string, int>(StringComparer.Ordinal);
64-
foreach (var term in queryTerms)
60+
var statistics = KnowledgeGraphBm25TermStatistics.Rent(documents.Count, queryTerms.Count);
61+
for (var termIndex = 0; termIndex < queryTerms.Count; termIndex++)
6562
{
63+
var term = queryTerms[termIndex];
6664
var matchingDocuments = 0;
67-
foreach (var document in documents)
65+
for (var documentIndex = 0; documentIndex < documents.Count; documentIndex++)
6866
{
69-
matchingDocuments += TryFindTermFrequency(document, term, fuzzyOptions, out _) ? 1 : 0;
67+
var matched = TryFindTermFrequency(documents[documentIndex], term, fuzzyOptions, out var frequency);
68+
statistics.SetTermFrequency(documentIndex, termIndex, matched ? frequency : ZeroConfidence);
69+
matchingDocuments += matched ? 1 : 0;
7070
}
7171

72-
frequency[term] = matchingDocuments;
72+
statistics.SetDocumentFrequency(termIndex, matchingDocuments);
7373
}
7474

75-
return frequency;
75+
return statistics;
7676
}
7777

7878
private static KnowledgeGraphRankedSearchMatch[] CreateMatches(
7979
IReadOnlyList<Bm25Document> documents,
8080
IReadOnlyList<string> queryTerms,
81-
IReadOnlyDictionary<string, int> documentFrequency,
81+
KnowledgeGraphBm25TermStatistics statistics,
8282
double averageDocumentLength,
83-
int maxResults,
84-
KnowledgeGraphFuzzyTokenMatchingOptions fuzzyOptions)
83+
int maxResults)
8584
{
8685
var matches = new List<KnowledgeGraphRankedSearchMatch>(Math.Min(documents.Count, maxResults));
87-
foreach (var document in documents)
86+
for (var documentIndex = 0; documentIndex < documents.Count; documentIndex++)
8887
{
88+
var document = documents[documentIndex];
8989
var score = ScoreDocument(
9090
document,
91-
queryTerms,
92-
documentFrequency,
91+
documentIndex,
92+
queryTerms.Count,
93+
statistics,
9394
documents.Count,
94-
averageDocumentLength,
95-
fuzzyOptions);
95+
averageDocumentLength);
9696
if (score <= ZeroConfidence)
9797
{
9898
continue;
@@ -114,50 +114,26 @@ private static KnowledgeGraphRankedSearchMatch[] CreateMatches(
114114

115115
private static double ScoreDocument(
116116
Bm25Document document,
117-
IReadOnlyList<string> queryTerms,
118-
IReadOnlyDictionary<string, int> documentFrequency,
117+
int documentIndex,
118+
int termCount,
119+
KnowledgeGraphBm25TermStatistics statistics,
119120
int documentCount,
120-
double averageDocumentLength,
121-
KnowledgeGraphFuzzyTokenMatchingOptions fuzzyOptions)
121+
double averageDocumentLength)
122122
{
123123
var score = ZeroConfidence;
124-
foreach (var term in queryTerms)
124+
for (var termIndex = 0; termIndex < termCount; termIndex++)
125125
{
126-
score += ScoreTerm(
127-
document,
128-
term,
129-
documentFrequency.GetValueOrDefault(term),
126+
score += KnowledgeGraphBm25Scoring.ScoreTerm(
127+
document.Length,
128+
statistics.GetTermFrequency(documentIndex, termIndex),
129+
statistics.GetDocumentFrequency(termIndex),
130130
documentCount,
131-
averageDocumentLength,
132-
fuzzyOptions);
131+
averageDocumentLength);
133132
}
134133

135134
return score;
136135
}
137136

138-
private static double ScoreTerm(
139-
Bm25Document document,
140-
string term,
141-
int documentFrequency,
142-
int documentCount,
143-
double averageDocumentLength,
144-
KnowledgeGraphFuzzyTokenMatchingOptions fuzzyOptions)
145-
{
146-
if (documentFrequency == 0 || document.Length == 0)
147-
{
148-
return ZeroConfidence;
149-
}
150-
151-
if (!TryFindTermFrequency(document, term, fuzzyOptions, out var frequency))
152-
{
153-
return ZeroConfidence;
154-
}
155-
156-
var idf = Math.Log(IdfOffset + ((documentCount - documentFrequency + Half) / (documentFrequency + Half)));
157-
var denominator = frequency + K1 * (IdfOffset - B + (B * document.Length / averageDocumentLength));
158-
return idf * ((frequency * (K1 + IdfOffset)) / denominator);
159-
}
160-
161137
private static bool TryFindTermFrequency(
162138
Bm25Document document,
163139
string term,
Lines changed: 80 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,80 @@
1+
using System.Buffers;
2+
3+
namespace ManagedCode.MarkdownLd.Kb.Pipeline;
4+
5+
/// <summary>
/// Pooled storage for BM25 statistics: one document-frequency counter per query term and a
/// flattened table of term frequencies laid out as <c>documentIndex * termCount + termIndex</c>.
/// </summary>
/// <remarks>
/// Both backing arrays are rented from <see cref="ArrayPool{T}.Shared"/> and handed back by
/// <see cref="Dispose"/>. <see cref="Rent"/> does not zero the buffers itself; call
/// <see cref="Clear"/> or write every cell before reading it.
/// </remarks>
internal readonly struct KnowledgeGraphBm25TermStatistics : IDisposable
{
    private readonly int _termCount;
    private readonly int _termFrequencyLength;
    private readonly int[] _documentFrequency;
    private readonly double[] _termFrequency;

    private KnowledgeGraphBm25TermStatistics(
        int termCount,
        int termFrequencyLength,
        int[] documentFrequency,
        double[] termFrequency)
    {
        _termCount = termCount;
        _termFrequencyLength = termFrequencyLength;
        _documentFrequency = documentFrequency;
        _termFrequency = termFrequency;
    }

    /// <summary>
    /// Rents buffers sized for <paramref name="documentCount"/> documents and
    /// <paramref name="termCount"/> terms.
    /// </summary>
    /// <exception cref="OverflowException">
    /// <paramref name="documentCount"/> * <paramref name="termCount"/> does not fit in an <see cref="int"/>.
    /// </exception>
    public static KnowledgeGraphBm25TermStatistics Rent(int documentCount, int termCount)
    {
        var cellCount = checked(documentCount * termCount);
        var perTermCounters = ArrayPool<int>.Shared.Rent(termCount);
        var frequencyTable = ArrayPool<double>.Shared.Rent(cellCount);
        return new KnowledgeGraphBm25TermStatistics(termCount, cellCount, perTermCounters, frequencyTable);
    }

    /// <summary>Reads the document-frequency counter stored for <paramref name="termIndex"/>.</summary>
    public int GetDocumentFrequency(int termIndex) => _documentFrequency[termIndex];

    /// <summary>Reads the frequency stored for the given document/term pair.</summary>
    public double GetTermFrequency(int documentIndex, int termIndex) =>
        _termFrequency[ToFlatIndex(documentIndex, termIndex)];

    /// <summary>
    /// Returns a writable view over the table cells of one document, one cell per term.
    /// </summary>
    public Span<double> GetDocumentTermFrequencies(int documentIndex) =>
        _termFrequency.AsSpan(ToFlatIndex(documentIndex, 0), _termCount);

    /// <summary>Zeroes the used portion of both buffers.</summary>
    public void Clear()
    {
        // Rented arrays may be longer than requested, so only the logical lengths are cleared.
        Array.Clear(_documentFrequency, 0, _termCount);
        Array.Clear(_termFrequency, 0, _termFrequencyLength);
    }

    /// <summary>Adds one to the document-frequency counter of <paramref name="termIndex"/>.</summary>
    public void IncrementDocumentFrequency(int termIndex) => _documentFrequency[termIndex] += 1;

    /// <summary>Overwrites the document-frequency counter of <paramref name="termIndex"/>.</summary>
    public void SetDocumentFrequency(int termIndex, int frequency) => _documentFrequency[termIndex] = frequency;

    /// <summary>Overwrites the frequency stored for the given document/term pair.</summary>
    public void SetTermFrequency(int documentIndex, int termIndex, double frequency) =>
        _termFrequency[ToFlatIndex(documentIndex, termIndex)] = frequency;

    /// <summary>Returns both rented buffers to their shared pools.</summary>
    public void Dispose()
    {
        ArrayPool<int>.Shared.Return(_documentFrequency);
        ArrayPool<double>.Shared.Return(_termFrequency);
    }

    // Maps a (document, term) pair to its position in the flattened table, with overflow checking.
    private int ToFlatIndex(int documentIndex, int termIndex)
    {
        checked
        {
            var rowStart = documentIndex * _termCount;
            return rowStart + termIndex;
        }
    }
}

0 commit comments

Comments
 (0)