@@ -4,11 +4,6 @@ namespace ManagedCode.MarkdownLd.Kb.Pipeline;
44
55internal static class KnowledgeGraphBm25Search
66{
7- private const double K1 = 1.2d ;
8- private const double B = 0.75d ;
9- private const double Half = 0.5d ;
10- private const double IdfOffset = 1d ;
11-
127 public static IReadOnlyList < KnowledgeGraphRankedSearchMatch > Search (
138 IReadOnlyList < KnowledgeGraphSearchCandidate > candidates ,
149 string query ,
@@ -20,16 +15,18 @@ public static IReadOnlyList<KnowledgeGraphRankedSearchMatch> Search(
2015 return [ ] ;
2116 }
2217
23- var documents = CreateDocuments ( candidates , out var averageDocumentLength ) ;
2418 var fuzzyOptions = KnowledgeGraphFuzzyTokenMatchingOptions . FromRankedSearch ( options ) ;
25- var documentFrequency = CreateDocumentFrequency ( documents , queryTerms , fuzzyOptions ) ;
26- return CreateMatches (
27- documents ,
28- queryTerms ,
29- documentFrequency ,
30- averageDocumentLength ,
31- options . MaxResults ,
32- fuzzyOptions ) ;
19+ if ( ! fuzzyOptions . Enabled )
20+ {
21+ return KnowledgeGraphExactBm25Search . Search (
22+ candidates ,
23+ queryTerms ,
24+ options . MaxResults ) ;
25+ }
26+
27+ var documents = CreateDocuments ( candidates , out var averageDocumentLength ) ;
28+ using var statistics = CreateTermStatistics ( documents , queryTerms , fuzzyOptions ) ;
29+ return CreateMatches ( documents , queryTerms , statistics , averageDocumentLength , options . MaxResults ) ;
3330 }
3431
3532 private static Bm25Document [ ] CreateDocuments (
@@ -55,44 +52,47 @@ private static Bm25Document CreateDocument(KnowledgeGraphSearchCandidate candida
5552 return new Bm25Document ( candidate , frequencies , length ) ;
5653 }
5754
58- private static Dictionary < string , int > CreateDocumentFrequency (
55+ private static KnowledgeGraphBm25TermStatistics CreateTermStatistics (
5956 IReadOnlyList < Bm25Document > documents ,
6057 IReadOnlyList < string > queryTerms ,
6158 KnowledgeGraphFuzzyTokenMatchingOptions fuzzyOptions )
6259 {
63- var frequency = new Dictionary < string , int > ( StringComparer . Ordinal ) ;
64- foreach ( var term in queryTerms )
60+ var statistics = KnowledgeGraphBm25TermStatistics . Rent ( documents . Count , queryTerms . Count ) ;
61+ for ( var termIndex = 0 ; termIndex < queryTerms . Count ; termIndex ++ )
6562 {
63+ var term = queryTerms [ termIndex ] ;
6664 var matchingDocuments = 0 ;
67- foreach ( var document in documents )
65+ for ( var documentIndex = 0 ; documentIndex < documents . Count ; documentIndex ++ )
6866 {
69- matchingDocuments += TryFindTermFrequency ( document , term , fuzzyOptions , out _ ) ? 1 : 0 ;
67+ var matched = TryFindTermFrequency ( documents [ documentIndex ] , term , fuzzyOptions , out var frequency ) ;
68+ statistics . SetTermFrequency ( documentIndex , termIndex , matched ? frequency : ZeroConfidence ) ;
69+ matchingDocuments += matched ? 1 : 0 ;
7070 }
7171
72- frequency [ term ] = matchingDocuments ;
72+ statistics . SetDocumentFrequency ( termIndex , matchingDocuments ) ;
7373 }
7474
75- return frequency ;
75+ return statistics ;
7676 }
7777
7878 private static KnowledgeGraphRankedSearchMatch [ ] CreateMatches (
7979 IReadOnlyList < Bm25Document > documents ,
8080 IReadOnlyList < string > queryTerms ,
81- IReadOnlyDictionary < string , int > documentFrequency ,
81+ KnowledgeGraphBm25TermStatistics statistics ,
8282 double averageDocumentLength ,
83- int maxResults ,
84- KnowledgeGraphFuzzyTokenMatchingOptions fuzzyOptions )
83+ int maxResults )
8584 {
8685 var matches = new List < KnowledgeGraphRankedSearchMatch > ( Math . Min ( documents . Count , maxResults ) ) ;
87- foreach ( var document in documents )
86+ for ( var documentIndex = 0 ; documentIndex < documents . Count ; documentIndex ++ )
8887 {
88+ var document = documents [ documentIndex ] ;
8989 var score = ScoreDocument (
9090 document ,
91- queryTerms ,
92- documentFrequency ,
91+ documentIndex ,
92+ queryTerms . Count ,
93+ statistics ,
9394 documents . Count ,
94- averageDocumentLength ,
95- fuzzyOptions ) ;
95+ averageDocumentLength ) ;
9696 if ( score <= ZeroConfidence )
9797 {
9898 continue ;
@@ -114,50 +114,26 @@ private static KnowledgeGraphRankedSearchMatch[] CreateMatches(
114114
115115 private static double ScoreDocument (
116116 Bm25Document document ,
117- IReadOnlyList < string > queryTerms ,
118- IReadOnlyDictionary < string , int > documentFrequency ,
117+ int documentIndex ,
118+ int termCount ,
119+ KnowledgeGraphBm25TermStatistics statistics ,
119120 int documentCount ,
120- double averageDocumentLength ,
121- KnowledgeGraphFuzzyTokenMatchingOptions fuzzyOptions )
121+ double averageDocumentLength )
122122 {
123123 var score = ZeroConfidence ;
124- foreach ( var term in queryTerms )
124+ for ( var termIndex = 0 ; termIndex < termCount ; termIndex ++ )
125125 {
126- score += ScoreTerm (
127- document ,
128- term ,
129- documentFrequency . GetValueOrDefault ( term ) ,
126+ score += KnowledgeGraphBm25Scoring . ScoreTerm (
127+ document . Length ,
128+ statistics . GetTermFrequency ( documentIndex , termIndex ) ,
129+ statistics . GetDocumentFrequency ( termIndex ) ,
130130 documentCount ,
131- averageDocumentLength ,
132- fuzzyOptions ) ;
131+ averageDocumentLength ) ;
133132 }
134133
135134 return score ;
136135 }
137136
138- private static double ScoreTerm (
139- Bm25Document document ,
140- string term ,
141- int documentFrequency ,
142- int documentCount ,
143- double averageDocumentLength ,
144- KnowledgeGraphFuzzyTokenMatchingOptions fuzzyOptions )
145- {
146- if ( documentFrequency == 0 || document . Length == 0 )
147- {
148- return ZeroConfidence ;
149- }
150-
151- if ( ! TryFindTermFrequency ( document , term , fuzzyOptions , out var frequency ) )
152- {
153- return ZeroConfidence ;
154- }
155-
156- var idf = Math . Log ( IdfOffset + ( ( documentCount - documentFrequency + Half ) / ( documentFrequency + Half ) ) ) ;
157- var denominator = frequency + K1 * ( IdfOffset - B + ( B * document . Length / averageDocumentLength ) ) ;
158- return idf * ( ( frequency * ( K1 + IdfOffset ) ) / denominator ) ;
159- }
160-
161137 private static bool TryFindTermFrequency (
162138 Bm25Document document ,
163139 string term ,
0 commit comments