Skip to content

Commit 482ee42

Browse files
authored
Merge pull request #2010 from CentreForDigitalHumanities/feature/batch-termvectors-requests
Batch termvectors requests
2 parents f54eaae + 93c5459 commit 482ee42

4 files changed

Lines changed: 61 additions & 36 deletions

File tree

backend/visualization/ngram.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,12 @@ def tokens_by_time_interval(
120120
download_size=max_size_per_interval,
121121
)
122122
bin_ngrams = Counter()
123-
for hit in search_results:
123+
docs = termvectors.request_termvectors_batched(
124+
search_results, client, freq_compensation, [field]
125+
)
126+
for _, vectors in docs:
124127
tokens, ttfs = _count_tokens_in_document(
125-
hit, client, field, query_text,
128+
vectors, client, field, query_text,
126129
term_positions, ngram_size,
127130
freq_compensation=freq_compensation,
128131
mode=mode,
@@ -140,7 +143,7 @@ def tokens_by_time_interval(
140143

141144

142145
def _count_tokens_in_document(
143-
hit: Dict,
146+
termvector_result: Dict,
144147
client: Elasticsearch,
145148
field: str,
146149
query_text: str,
@@ -155,16 +158,10 @@ def _count_tokens_in_document(
155158
tokens = Counter()
156159
ttfs = dict()
157160
# get the term vectors for the hit
158-
result = client.termvectors(
159-
index=hit['_index'],
160-
id=hit['_id'],
161-
term_statistics=freq_compensation,
162-
fields=[field]
163-
)
164-
terms = termvectors.get_terms(result, field)
161+
terms = termvectors.get_terms(termvector_result, field)
165162
if terms:
166163
sorted_tokens = termvectors.get_tokens(terms, sort=True)
167-
matches = termvectors.token_matches(sorted_tokens, query_text, hit['_index'], field, client)
164+
matches = termvectors.token_matches(sorted_tokens, query_text, termvector_result['_index'], field, client)
168165
token_ranges = _token_ranges(
169166
matches, term_positions, ngram_size, len(sorted_tokens), mode=mode
170167
)

backend/visualization/term_frequency.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import math
2-
from typing import Callable, Dict, Any, Optional, List
2+
from typing import Callable, Dict, Any, Optional, List, Iterable
33
import re
44
from addcorpus.models import CorpusConfiguration, Field
55
from datetime import datetime
@@ -90,10 +90,9 @@ def get_match_count(es_client, es_query, corpus, size, fieldnames):
9090
terms = simple_query_string.collect_terms(query_text)
9191
prefix_query = ' '.join(filter(requires_termvectors_analysis, terms))
9292

93-
matches = [
94-
count_matches_in_document(hit, prefix_query, fieldnames, es_client)
95-
for hit in found_hits
96-
]
93+
matches = list(
94+
count_matches_in_documents(found_hits, prefix_query, fieldnames, es_client)
95+
)
9796

9897
n_matches = sum(matches)
9998
skipped_docs = total_results - len(matches)
@@ -115,9 +114,11 @@ def estimate_skipped_count(matches, skipped_docs: int) -> int:
115114
return estimate_skipped
116115

117116

118-
def count_matches_in_document(hit, prefix_query: Optional[str], search_fields, es_client):
117+
def count_matches_in_documents(
118+
hits: Iterable[Dict], prefix_query: Optional[str], search_fields, es_client
119+
) -> Iterable[int]:
119120
'''
120-
Count matches of a query in a document.
121+
Count matches of a query per document.
121122
122123
Will use the explain API if possible, which is faster.
123124
@@ -128,15 +129,20 @@ def count_matches_in_document(hit, prefix_query: Optional[str], search_fields, e
128129
if prefix_query:
129130
# If the query contains a prefix query, use termvectors to get matches
130131
# for it.
131-
prefix_matches = count_matches_from_termvectors(
132-
hit['_id'], hit['_index'], search_fields, prefix_query, es_client
132+
docs = termvectors.request_termvectors_batched(
133+
hits, es_client, False, search_fields
133134
)
134-
# Use explanation for other terms in the query (this is faster and more
135-
# accurate if the query includes certain other operators)
136-
rest_matches = count_matches_from_explanation(hit)
137-
return prefix_matches + rest_matches
138-
139-
return count_matches_from_explanation(hit)
135+
for hit, vectors in docs:
136+
prefix_matches = count_matches_from_termvectors(
137+
vectors, search_fields, prefix_query, es_client
138+
)
139+
# Use explanation for other terms in the query (this is faster and more
140+
# accurate if the query includes certain other operators)
141+
rest_matches = count_matches_from_explanation(hit)
142+
yield prefix_matches + rest_matches
143+
else:
144+
for hit in hits:
145+
yield count_matches_from_explanation(hit)
140146

141147

142148
def count_matches_from_explanation(hit) -> int:
@@ -169,19 +175,18 @@ def find_recursive(doc: Any, predicate: Callable):
169175
yield match
170176

171177

172-
def count_matches_from_termvectors(id, index, fieldnames, query_text, es_client):
178+
def count_matches_from_termvectors(doc: Dict, fieldnames, query_text, es_client):
173179
'''
174180
Count matches of a query in a document using the termvectors API
175181
'''
176182
# get the term vectors for the hit
177-
result = es_client.termvectors(index=index, id=id, fields = fieldnames)
178183

179184
matches = 0
180185

181186
for field in fieldnames:
182-
terms = termvectors.get_terms(result, field)
187+
terms = termvectors.get_terms(doc, field)
183188
tokens = termvectors.get_tokens(terms, sort = False)
184-
matches += sum(1 for _ in termvectors.token_matches(tokens, query_text, index, field, es_client))
189+
matches += sum(1 for _ in termvectors.token_matches(tokens, query_text, doc['_index'], field, es_client))
185190

186191
return matches
187192

backend/visualization/termvectors.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,33 @@
1-
from typing import List, Dict, Any, Optional
2-
3-
from es.client import elasticsearch
1+
from typing import List, Dict, Any, Optional, Iterable
42
import re
3+
import itertools
4+
5+
from elasticsearch import Elasticsearch
56
from textdistance import damerau_levenshtein
7+
from es.client import elasticsearch
68

79
from visualization.simple_query_string import collect_terms
810

11+
def request_termvectors_batched(
12+
hits: Iterable[Dict], client: Elasticsearch, term_statistics: bool,
13+
fields: List[str],
14+
) -> Iterable[Dict]:
15+
'''
16+
Request term vectors for each hit in search results.
17+
Uses mtermvectors endpoint to make batched requests.
18+
'''
19+
batched_hits = itertools.batched(hits, 100)
20+
for batch in batched_hits:
21+
result = client.mtermvectors(
22+
docs=[
23+
{ '_index': doc['_index'], '_id': doc['_id'] }
24+
for doc in batch
25+
],
26+
term_statistics=term_statistics,
27+
fields=fields,
28+
)
29+
yield from zip(batch, result.body['docs'])
30+
931
def get_terms(termvector_result, field: str) -> Optional[Dict[str, Dict]]:
1032
termvectors = termvector_result['term_vectors']
1133
if field in termvectors:

backend/visualization/tests/test_term_frequency.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,11 @@ def test_match_count(small_mock_corpus, es_client, index_small_mock_corpus, quer
6363

6464
@pytest.fixture()
6565
def fix_match_count(monkeypatch):
66-
def count_matches_alt(*args, **kwargs):
67-
return 10
66+
def count_matches_alt(hits, *args, **kwargs):
67+
for hit in hits:
68+
yield 10
6869

69-
monkeypatch.setattr(term_frequency, 'count_matches_in_document', count_matches_alt)
70+
monkeypatch.setattr(term_frequency, 'count_matches_in_documents', count_matches_alt)
7071

7172

7273
def test_match_count_estimate(small_mock_corpus, basic_query, fix_match_count):

0 commit comments

Comments
 (0)