Merge pull request #2010 from CentreForDigitalHumanities/feature/batch-termvectors-requests

lukavdplas · web-flow · commit 482ee420dc7c · 2026-03-26T14:07:49.000+01:00
Batch termvectors requests
diff --git a/backend/visualization/ngram.py b/backend/visualization/ngram.py
@@ -120,9 +120,12 @@ def tokens_by_time_interval(
         download_size=max_size_per_interval,
     )
     bin_ngrams = Counter()
-    for hit in search_results:
+    docs = termvectors.request_termvectors_batched(
+        search_results, client, freq_compensation, [field]
+    )
+    for _, vectors in docs:
         tokens, ttfs = _count_tokens_in_document(
-            hit, client, field, query_text,
+            vectors, client, field, query_text,
             term_positions, ngram_size,
             freq_compensation=freq_compensation,
             mode=mode,
@@ -140,7 +143,7 @@ def tokens_by_time_interval(
 
 
 def _count_tokens_in_document(
-    hit: Dict,
+    termvector_result: Dict,
     client: Elasticsearch,
     field: str,
     query_text: str,
@@ -155,16 +158,10 @@ def _count_tokens_in_document(
     tokens = Counter()
     ttfs = dict()
     # get the term vectors for the hit
-    result = client.termvectors(
-        index=hit['_index'],
-        id=hit['_id'],
-        term_statistics=freq_compensation,
-        fields=[field]
-    )
-    terms = termvectors.get_terms(result, field)
+    terms = termvectors.get_terms(termvector_result, field)
     if terms:
         sorted_tokens = termvectors.get_tokens(terms, sort=True)
-        matches = termvectors.token_matches(sorted_tokens, query_text, hit['_index'], field, client)
+        matches = termvectors.token_matches(sorted_tokens, query_text, termvector_result['_index'], field, client)
         token_ranges = _token_ranges(
             matches, term_positions, ngram_size, len(sorted_tokens), mode=mode
         )
diff --git a/backend/visualization/term_frequency.py b/backend/visualization/term_frequency.py
@@ -1,5 +1,5 @@
 import math
-from typing import Callable, Dict, Any, Optional, List
+from typing import Callable, Dict, Any, Optional, List, Iterable
 import re
 from addcorpus.models import CorpusConfiguration, Field
 from datetime import datetime
@@ -90,10 +90,9 @@ def get_match_count(es_client, es_query, corpus, size, fieldnames):
     terms = simple_query_string.collect_terms(query_text)
     prefix_query = ' '.join(filter(requires_termvectors_analysis, terms))
 
-    matches = [
-        count_matches_in_document(hit, prefix_query, fieldnames, es_client)
-        for hit in found_hits
-    ]
+    matches = list(
+        count_matches_in_documents(found_hits, prefix_query, fieldnames, es_client)
+    )
 
     n_matches = sum(matches)
     skipped_docs = total_results - len(matches)
@@ -115,9 +114,11 @@ def estimate_skipped_count(matches, skipped_docs: int) -> int:
     return estimate_skipped
 
 
-def count_matches_in_document(hit, prefix_query: Optional[str], search_fields, es_client):
+def count_matches_in_documents(
+    hits: Iterable[Dict], prefix_query: Optional[str], search_fields, es_client
+) -> Iterable[int]:
     '''
-    Count matches of a query in a document.
+    Count matches of a query per document.
 
     Will use the explain API if possible, which is faster.
 
@@ -128,15 +129,20 @@ def count_matches_in_document(hit, prefix_query: Optional[str], search_fields, e
     if prefix_query:
         # If the query contains a prefix query, use termvectors to get matches
         # for it.
-        prefix_matches = count_matches_from_termvectors(
-            hit['_id'], hit['_index'], search_fields, prefix_query, es_client
+        docs = termvectors.request_termvectors_batched(
+            hits, es_client, False, search_fields
         )
-        # Use explanation for other terms in the query (this is faster and more
-        # accurate if the query includes certain other operators)
-        rest_matches = count_matches_from_explanation(hit)
-        return prefix_matches + rest_matches
-
-    return count_matches_from_explanation(hit)
+        for hit, vectors in docs:
+            prefix_matches = count_matches_from_termvectors(
+                vectors, search_fields, prefix_query, es_client
+            )
+            # Use explanation for other terms in the query (this is faster and more
+            # accurate if the query includes certain other operators)
+            rest_matches = count_matches_from_explanation(hit)
+            yield prefix_matches + rest_matches
+    else:
+        for hit in hits:
+            yield count_matches_from_explanation(hit)
 
 
 def count_matches_from_explanation(hit) -> int:
@@ -169,19 +175,18 @@ def find_recursive(doc: Any, predicate: Callable):
                 yield match
 
 
-def count_matches_from_termvectors(id, index, fieldnames, query_text, es_client):
+def count_matches_from_termvectors(doc: Dict, fieldnames, query_text, es_client):
     '''
     Count matches of a query in a document using the termvectors API
     '''
     # get the term vectors for the hit
-    result = es_client.termvectors(index=index, id=id, fields = fieldnames)
 
     matches = 0
 
     for field in fieldnames:
-        terms = termvectors.get_terms(result, field)
+        terms = termvectors.get_terms(doc, field)
         tokens = termvectors.get_tokens(terms, sort = False)
-        matches += sum(1 for _ in termvectors.token_matches(tokens, query_text, index, field, es_client))
+        matches += sum(1 for _ in termvectors.token_matches(tokens, query_text, doc['_index'], field, es_client))
 
     return matches
 
diff --git a/backend/visualization/termvectors.py b/backend/visualization/termvectors.py
@@ -1,11 +1,33 @@
-from typing import List, Dict, Any, Optional
-
-from es.client import elasticsearch
+from typing import List, Dict, Any, Optional, Iterable
 import re
+import itertools
+
+from elasticsearch import Elasticsearch
 from textdistance import damerau_levenshtein
+from es.client import elasticsearch
 
 from visualization.simple_query_string import collect_terms
 
+def request_termvectors_batched(
+    hits: Iterable[Dict], client: Elasticsearch, term_statistics: bool,
+    fields: List[str], batch_size: int = 100,
+) -> Iterable[tuple[Dict, Dict]]:
+    '''
+    Yield a (hit, term vectors) pair for each hit in search results.
+    Uses the mtermvectors endpoint, sending up to `batch_size` hits per request.
+    '''
+    for batch in itertools.batched(hits, batch_size):
+        result = client.mtermvectors(
+            docs=[
+                {'_index': doc['_index'], '_id': doc['_id']}
+                for doc in batch
+            ],
+            term_statistics=term_statistics,
+            fields=fields,
+        )
+        # NOTE(review): pairs by position; assumes docs return in request order
+        yield from zip(batch, result.body['docs'])
+
 def get_terms(termvector_result, field: str) -> Optional[Dict[str, Dict]]:
     termvectors = termvector_result['term_vectors']
     if field in termvectors:
diff --git a/backend/visualization/tests/test_term_frequency.py b/backend/visualization/tests/test_term_frequency.py
@@ -63,10 +63,11 @@ def test_match_count(small_mock_corpus, es_client, index_small_mock_corpus, quer
 
 @pytest.fixture()
 def fix_match_count(monkeypatch):
-    def count_matches_alt(*args, **kwargs):
-        return 10
+    def count_matches_alt(hits, *args, **kwargs):
+        for hit in hits:
+            yield 10
 
-    monkeypatch.setattr(term_frequency, 'count_matches_in_document', count_matches_alt)
+    monkeypatch.setattr(term_frequency, 'count_matches_in_documents', count_matches_alt)
 
 
 def test_match_count_estimate(small_mock_corpus, basic_query, fix_match_count):