Merge pull request #2024 from CentreForDigitalHumanities/feature/wordcloud-tokenisation

lukavdplas · web-flow · commit e18eba00a2d6 · 2026-04-08T16:25:30.000+02:00
Wordcloud tokenisation
diff --git a/backend/addcorpus/json_corpora/import_json.py b/backend/addcorpus/json_corpora/import_json.py
@@ -1,7 +1,5 @@
 from typing import List, Dict, Iterable
 
-from langcodes import standardize_tag
-
 from addcorpus.models import Field
 from addcorpus.json_corpora.utils import get_path
 from addcorpus import es_mappings
@@ -95,12 +93,9 @@ def _parse_text_content_field(field_data: Dict) -> Field:
 
     visualize = get_path(field_data, 'options', 'visualize')
     if visualize:
-        # add wordcloud, but make an exception for languages where the wordcloud's
-        # tokenisation does not work.
-        if not (language and standardize_tag(language, macro=True) in ['zh', 'ja', 'ko']):
-            parsed['visualizations'] = [
-                VisualizationType.WORDCLOUD.value
-            ]
+        parsed['visualizations'] = [
+            VisualizationType.WORDCLOUD.value
+        ]
 
     return parsed
 
diff --git a/backend/visualization/termvectors.py b/backend/visualization/termvectors.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Any, Optional, Iterable
+from typing import List, Dict, Any, Optional, Iterable, Tuple
 import re
 import itertools
 
@@ -11,7 +11,7 @@
 def request_termvectors_batched(
     hits: Iterable[Dict], client: Elasticsearch, term_statistics: bool,
     fields: List[str],
-) -> Iterable[Dict]:
+) -> Iterable[Tuple[Dict, Dict]]:
     '''
     Request term vectors for each hit in search results.
     Uses mtermvectors endpoint to make batched requests.
@@ -34,6 +34,10 @@ def get_terms(termvector_result, field: str) -> Optional[Dict[str, Dict]]:
         terms = termvectors[field]['terms']
         return terms
 
+def term_counts(doc: Dict, field: str) -> Dict[str, int]:
+    '''Map each term of `field` in a termvectors result to its `term_freq`; empty dict if `get_terms` finds none.'''
+    return {term: data['term_freq'] for term, data in (get_terms(doc, field) or {}).items()}
+
 def get_tokens(terms: Dict[str, Dict], sort=True) -> List[Dict[str, Any]]:
     if not terms:
         return []
diff --git a/backend/visualization/tests/test_wordcloud.py b/backend/visualization/tests/test_wordcloud.py
@@ -98,32 +98,21 @@ def occurs_in_results(word): return any(
     for word in words_to_exclude:
         assert not occurs_in_results(word)
 
-def test_wordcloud_counts(small_mock_corpus):
-    '''
-    Each non-stopword only occurs once in the mock corpus data, so
-    this test uses some fake texts for counting.
-    '''
-
-    texts = [
-        'Some words',
-        'Even more!',
-        'Words, words, words...',
-        'More words! More!',
-        'That should be enough.',
-    ]
-    docs = [
-        {'_source': {'content': text}}
-        for text in texts
-    ]
-
-    results = wordcloud.make_wordcloud_data(docs, 'content', small_mock_corpus)
+def test_wordcloud_counts(large_mock_corpus, index_large_mock_corpus):
+    result = search.search(
+        corpus_name=large_mock_corpus,
+        query_model=query.MATCH_ALL,
+        size=10
+    )
+    documents = search.hits(result)
+    results = wordcloud.make_wordcloud_data(
+        documents, 'content', large_mock_corpus)
 
     counts = {
         item['key']: item['doc_count']
         for item in results
     }
-
-    assert counts['words'] == 5
+    assert counts['the'] == 20
 
 def test_wordcloud_filters_stopwords(small_mock_corpus, small_mock_corpus_complete_wordcloud):
     stopwords = ['the', 'and', 'of']
diff --git a/backend/visualization/wordcloud.py b/backend/visualization/wordcloud.py
@@ -1,38 +1,29 @@
 from collections import Counter
-from sklearn.feature_extraction.text import CountVectorizer
-
-from addcorpus.models import Corpus
+from typing import Iterable, Dict
+from addcorpus.models import CorpusConfiguration, Field
+from visualization.termvectors import request_termvectors_batched, term_counts
+from es import download as download
+from es.client import elasticsearch
 from es import download as download
-from addcorpus.language_analyzers import get_analyzer
-
-def field_stopwords(corpus_name, field_name):
-    corpus = Corpus.objects.get(name=corpus_name)
-    field = corpus.configuration.fields.get(name=field_name)
 
-    if field.language and field.language != 'dynamic':
-        analyzer = get_analyzer(field.language)
-    else:
-        return []
+def _wordcloud_search_field(corpus_name: str, field_name: str) -> str:
+    '''Return `<field_name>.clean` if the field's es_mapping declares a `clean` subfield, else `field_name`.'''
+    corpus_config = CorpusConfiguration.objects.get(corpus__name=corpus_name)
+    field: Field = corpus_config.fields.get(name=field_name)
+    if 'clean' in field.es_mapping.get('fields', {}):
+        return field_name + '.clean'
+    return field_name
 
-    stopwords = analyzer.stopwords()
-    return stopwords or []
 
+def make_wordcloud_data(hits: Iterable[Dict], field_name, corpus_name):
+    search_field = _wordcloud_search_field(corpus_name, field_name)
 
-def make_wordcloud_data(documents, field, corpus):
-    texts = []
-    for document in documents:
-        content = document['_source'][field]
-        if isinstance(content, str) and len(content):
-            texts.append(content)
-        if isinstance(content, list) and len(content):
-            texts.append('\n'.join(content))
+    counts = Counter()
+    client = elasticsearch(corpus_name)
+    docs = request_termvectors_batched(hits, client, False, [search_field])
+    for _, doc in docs:
+        counts.update(term_counts(doc, search_field))
 
-    stopwords = field_stopwords(corpus, field)
-    cv = CountVectorizer(max_features=100, max_df=0.7, token_pattern=r'(?u)\b[^0-9\s]{3,30}\b', stop_words=stopwords)
-    cvtexts = cv.fit_transform(texts)
-    counts = cvtexts.sum(axis=0).A1
-    words = list(cv.get_feature_names_out())
-    freq_distribution = Counter(dict(zip(words, counts)))
-    output = [{'key': word, 'doc_count': int(freq_distribution[word])} for word in words]
+    output = [{'key': word, 'doc_count': freq} for word, freq in counts.items()]
     return output
 
diff --git a/frontend/src/assets/manual/en-GB/glossary.md b/frontend/src/assets/manual/en-GB/glossary.md
@@ -40,6 +40,10 @@ When you provide a bit of text to search for, we call this a query. Your query c
 
 A kind of text processing where the words in a text are stripped of all inflection. For instance, *walk*, *walks*, and *walking* are all reduced to *walk*. This can be used to ignore irrelevant variation between words. See [stemming (wikipedia)](https://en.wikipedia.org/wiki/Stemming) for more information.
 
+## Stopword
+
+Stopwords are very common words in a language. For example, some English stopwords are "the", "in", and "are". These are typically not interesting when searching, or in quantitative analysis; they do not convey what the text is about. For some text analysis, it makes sense to remove stopwords.
+
 ## Tag
 
 On Textcavator, tags are labels that you can assign to documents. See [tags](/manual/tagging-documents) for more information.
diff --git a/frontend/src/assets/manual/en-GB/wordcloud.md b/frontend/src/assets/manual/en-GB/wordcloud.md
@@ -1,5 +1,5 @@
 The word cloud shows term frequencies of the first 1000 search results corresponding to your query. You can select which fields should be analysed to generate the word cloud.
 
-The wordcloud will count the frequencies of words in search results. Highly frequent words (word occuring in more than 70% of documents) are removed. In fields that offer [stemming](/manual/glossary#stemming), we also use a list of common stopwords to exclude. We also remove words with fewer than 3 characters, or which consist of only numbers or punctuation. After this, the frequency of the remaining words in the text field is counted.
+The wordcloud will count the frequencies of words in search results. If the field supports it, words on a list of [stopwords](/manual/glossary#stopword) are excluded first. After this, the frequency of the remaining words in the text field is counted.
 
 The 50 most frequent words are ordered on the canvas, where the text size indicates the frequency of the words. The colour and orientation of the word are not meaningful.