11import math
2- from typing import Callable , Dict , Any , Optional , List
2+ from typing import Callable , Dict , Any , Optional , List , Iterable
33import re
44from addcorpus .models import CorpusConfiguration , Field
55from datetime import datetime
@@ -90,10 +90,9 @@ def get_match_count(es_client, es_query, corpus, size, fieldnames):
9090 terms = simple_query_string .collect_terms (query_text )
9191 prefix_query = ' ' .join (filter (requires_termvectors_analysis , terms ))
9292
93- matches = [
94- count_matches_in_document (hit , prefix_query , fieldnames , es_client )
95- for hit in found_hits
96- ]
93+ matches = list (
94+ count_matches_in_documents (found_hits , prefix_query , fieldnames , es_client )
95+ )
9796
9897 n_matches = sum (matches )
9998 skipped_docs = total_results - len (matches )
@@ -115,9 +114,11 @@ def estimate_skipped_count(matches, skipped_docs: int) -> int:
115114 return estimate_skipped
116115
117116
118- def count_matches_in_document (hit , prefix_query : Optional [str ], search_fields , es_client ):
117+ def count_matches_in_documents (
118+ hits : Iterable [Dict ], prefix_query : Optional [str ], search_fields , es_client
119+ ) -> Iterable [int ]:
119120 '''
120- Count matches of a query in a document.
121+ Count matches of a query per document.
121122
122123 Will use the explain API if possible, which is faster.
123124
@@ -128,15 +129,20 @@ def count_matches_in_document(hit, prefix_query: Optional[str], search_fields, e
128129 if prefix_query :
129130 # If the query contains a prefix query, use termvectors to get matches
130131 # for it.
131- prefix_matches = count_matches_from_termvectors (
132- hit [ '_id' ], hit [ '_index' ], search_fields , prefix_query , es_client
132+ docs = termvectors . request_termvectors_batched (
133+ hits , es_client , False , search_fields
133134 )
134- # Use explanation for other terms in the query (this is faster and more
135- # accurate if the query includes certain other operators)
136- rest_matches = count_matches_from_explanation (hit )
137- return prefix_matches + rest_matches
138-
139- return count_matches_from_explanation (hit )
135+ for hit , vectors in docs :
136+ prefix_matches = count_matches_from_termvectors (
137+ vectors , search_fields , prefix_query , es_client
138+ )
139+ # Use explanation for other terms in the query (this is faster and more
140+ # accurate if the query includes certain other operators)
141+ rest_matches = count_matches_from_explanation (hit )
142+ yield prefix_matches + rest_matches
143+ else :
144+ for hit in hits :
145+ yield count_matches_from_explanation (hit )
140146
141147
142148def count_matches_from_explanation (hit ) -> int :
@@ -169,19 +175,18 @@ def find_recursive(doc: Any, predicate: Callable):
169175 yield match
170176
171177
def count_matches_from_termvectors(doc: Dict, fieldnames, query_text, es_client):
    '''
    Count matches of a query in a document, based on its term vectors.

    Parameters:
        doc: term vectors result for a single document; it is passed to
            `termvectors.get_terms` and must provide an `'_index'` key.
        fieldnames: names of the fields in which matches are counted.
        query_text: query text that the tokens are matched against.
        es_client: client that is forwarded to `termvectors.token_matches`.

    Returns:
        The number of matches yielded by `termvectors.token_matches`, summed
        over all fields in `fieldnames`.
    '''

    def count_in_field(field) -> int:
        # Tokens are requested unsorted: only the number of matches is used.
        field_terms = termvectors.get_terms(doc, field)
        field_tokens = termvectors.get_tokens(field_terms, sort=False)
        field_matches = termvectors.token_matches(
            field_tokens, query_text, doc['_index'], field, es_client
        )
        return sum(1 for _ in field_matches)

    return sum(count_in_field(field) for field in fieldnames)
187192
0 commit comments