From a1420bd69a96ff8d1546c3c4d907d29d7cf7a2f0 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 3 Mar 2026 16:01:06 +0000 Subject: [PATCH 1/6] correct output type --- backend/visualization/termvectors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/visualization/termvectors.py b/backend/visualization/termvectors.py index e63608fdb..85e522a86 100644 --- a/backend/visualization/termvectors.py +++ b/backend/visualization/termvectors.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any, Optional, Iterable +from typing import List, Dict, Any, Optional, Iterable, Tuple import re import itertools @@ -11,7 +11,7 @@ def request_termvectors_batched( hits: Iterable[Dict], client: Elasticsearch, term_statistics: bool, fields: List[str], -) -> Iterable[Dict]: +) -> Iterable[Tuple[Dict, Dict]]: ''' Request term vectors for each hit in search results. Uses mtermvectors endpoint to make batched requests. From 31af081bc48f48119d810a5dd228b06e96e9cac0 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 3 Mar 2026 16:06:18 +0000 Subject: [PATCH 2/6] get wordcloud counts from tem vectors --- backend/visualization/termvectors.py | 7 +++++ backend/visualization/wordcloud.py | 38 ++++++++-------------------- 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/backend/visualization/termvectors.py b/backend/visualization/termvectors.py index 85e522a86..46b788d6b 100644 --- a/backend/visualization/termvectors.py +++ b/backend/visualization/termvectors.py @@ -34,6 +34,13 @@ def get_terms(termvector_result, field: str) -> Optional[Dict[str, Dict]]: terms = termvectors[field]['terms'] return terms +def token_counts(doc: Dict, field: str) -> Dict[str, int]: + terms = get_terms(doc, field) + return { + term: data['term_freq'] + for term, data in terms.items() + } + def get_tokens(terms: Dict[str, Dict], sort=True) -> List[Dict[str, Any]]: if not terms: return [] diff --git a/backend/visualization/wordcloud.py b/backend/visualization/wordcloud.py index 4b905fb2c..20190ffc9 100644 --- a/backend/visualization/wordcloud.py +++ b/backend/visualization/wordcloud.py @@ -1,36 +1,18 @@ from collections import Counter -from sklearn.feature_extraction.text import CountVectorizer +from typing import Iterable, Dict -from addcorpus.models import Corpus -from addcorpus.es_settings import get_nltk_stopwords +from visualization.termvectors import request_termvectors_batched, token_counts from es import download as download +from es.client import elasticsearch -def field_stopwords(corpus_name, field_name): - corpus = Corpus.objects.get(name=corpus_name) - field = corpus.configuration.fields.get(name=field_name) - if field.language and field.language != 'dynamic': - try: - return get_nltk_stopwords(field.language) - except: - return [] - else: - return [] +def make_wordcloud_data(hits: Iterable[Dict], field, corpus_name): + counts = Counter() -def make_wordcloud_data(documents, field, corpus): - texts = [] - for document in documents: - content = document['_source'][field] - if isinstance(content, str) and len(content): - texts.append(content) - if isinstance(content, list) and len(content): - texts.append('\n'.join(content)) + client = elasticsearch(corpus_name) + docs = request_termvectors_batched(hits, client, False, [field]) + for _, doc in docs: + counts.update(token_counts(doc, field)) - stopwords = field_stopwords(corpus, field) - cv = CountVectorizer(max_features=100, max_df=0.7, token_pattern=r'(?u)\b[^0-9\s]{3,30}\b', stop_words=stopwords) - cvtexts = cv.fit_transform(texts) - counts = cvtexts.sum(axis=0).A1 - words = list(cv.get_feature_names_out()) - freq_distribution = Counter(dict(zip(words, counts))) - output = [{'key': word, 'doc_count': int(freq_distribution[word])} for word in words] + output = [{'key': word, 'doc_count': freq} for word, freq in counts.items()] return output From 8c5276a61eca86ee9ad494b2636ea7cba0d01d7d Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 3 Mar 2026 16:21:35 +0000 Subject: [PATCH 3/6] use stopwords if available --- backend/visualization/termvectors.py | 7 ++--- backend/visualization/tests/test_wordcloud.py | 31 ++++++------------- backend/visualization/wordcloud.py | 21 ++++++++++--- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/backend/visualization/termvectors.py b/backend/visualization/termvectors.py index 46b788d6b..fd7522e4c 100644 --- a/backend/visualization/termvectors.py +++ b/backend/visualization/termvectors.py @@ -34,12 +34,9 @@ def get_terms(termvector_result, field: str) -> Optional[Dict[str, Dict]]: terms = termvectors[field]['terms'] return terms -def token_counts(doc: Dict, field: str) -> Dict[str, int]: +def term_counts(doc: Dict, field: str) -> Dict[str, int]: terms = get_terms(doc, field) - return { - term: data['term_freq'] - for term, data in terms.items() - } + return {term: data['term_freq'] for term, data in terms.items()} def get_tokens(terms: Dict[str, Dict], sort=True) -> List[Dict[str, Any]]: if not terms: diff --git a/backend/visualization/tests/test_wordcloud.py b/backend/visualization/tests/test_wordcloud.py index d90696100..753abdae7 100644 --- a/backend/visualization/tests/test_wordcloud.py +++ b/backend/visualization/tests/test_wordcloud.py @@ -98,32 +98,21 @@ def occurs_in_results(word): return any( for word in words_to_exclude: assert not occurs_in_results(word) -def test_wordcloud_counts(small_mock_corpus): - ''' - Each non-stopword only occurs once in the mock corpus data, so - this test uses some fake texts for counting. - ''' - - texts = [ - 'Some words', - 'Even more!', - 'Words, words, words...', - 'More words! More!', - 'That should be enough.', - ] - docs = [ - {'_source': {'content': text}} - for text in texts - ] - - results = wordcloud.make_wordcloud_data(docs, 'content', small_mock_corpus) +def test_wordcloud_counts(large_mock_corpus, index_large_mock_corpus): + result = search.search( + corpus_name=large_mock_corpus, + query_model=query.MATCH_ALL, + size=10 + ) + documents = search.hits(result) + results = wordcloud.make_wordcloud_data( + documents, 'content', large_mock_corpus) counts = { item['key']: item['doc_count'] for item in results } - - assert counts['words'] == 5 + assert counts['the'] == 20 def test_wordcloud_filters_stopwords(small_mock_corpus, small_mock_corpus_complete_wordcloud): stopwords = ['the', 'and', 'of'] diff --git a/backend/visualization/wordcloud.py b/backend/visualization/wordcloud.py index 20190ffc9..c8502eae0 100644 --- a/backend/visualization/wordcloud.py +++ b/backend/visualization/wordcloud.py @@ -1,17 +1,28 @@ from collections import Counter from typing import Iterable, Dict -from visualization.termvectors import request_termvectors_batched, token_counts +from addcorpus.models import CorpusConfiguration, Field +from visualization.termvectors import request_termvectors_batched, term_counts from es import download as download from es.client import elasticsearch -def make_wordcloud_data(hits: Iterable[Dict], field, corpus_name): - counts = Counter() +def _wordcloud_search_field(corpus_name: str, field_name: str) -> bool: + corpus_config = CorpusConfiguration.objects.get(corpus__name=corpus_name) + field: Field = corpus_config.fields.get(name=field_name) + has_clean_field = 'clean' in field.es_mapping.get('fields', {}) + if has_clean_field: + return field_name + '.clean' + return field_name + +def make_wordcloud_data(hits: Iterable[Dict], field_name, corpus_name): + search_field = _wordcloud_search_field(corpus_name, field_name) + + counts = Counter() client = elasticsearch(corpus_name) - docs = request_termvectors_batched(hits, client, False, [field]) + docs = request_termvectors_batched(hits, client, False, [search_field]) for _, doc in docs: - counts.update(token_counts(doc, field)) + counts.update(term_counts(doc, search_field)) output = [{'key': word, 'doc_count': freq} for word, freq in counts.items()] return output From 56fac3afdd19654a244f586bd7a2cf2234ec1b31 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 3 Mar 2026 16:44:47 +0000 Subject: [PATCH 4/6] update manual --- frontend/src/assets/manual/en-GB/wordcloud.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/assets/manual/en-GB/wordcloud.md b/frontend/src/assets/manual/en-GB/wordcloud.md index ae7912172..df22a7b6c 100644 --- a/frontend/src/assets/manual/en-GB/wordcloud.md +++ b/frontend/src/assets/manual/en-GB/wordcloud.md @@ -1,5 +1,5 @@ The word cloud shows term frequencies of the first 1000 search results corresponding to your query. You can select which fields should be analysed for generate the wordcloud. -The wordcloud will count the frequencies of words in search results. Highly frequent words (word occuring in more than 70% of documents) are removed. In fields that offer [stemming](/manual/glossary#stemming), we also use a list of common stopwords to exclude. We also remove words with fewer than 3 characters, or which consist of only numbers or punctuation. After this, the frequency of the remaining words in the text field is counted. +The wordcloud will count the frequencies of words in search results. We use a list of common stopwords to exclude, if this is supported for the field. After this, the frequency of the remaining words in the text field is counted. The 50 most frequent words are ordered on the canvas, where the text size indicates the frequency of the words. The colour and orientation of the word are not meaningful. From c7526a68da718968c550d1a50ab2b7de62e9576a Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 3 Mar 2026 16:47:16 +0000 Subject: [PATCH 5/6] remove language exception for wordcloud --- backend/addcorpus/json_corpora/import_json.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/backend/addcorpus/json_corpora/import_json.py b/backend/addcorpus/json_corpora/import_json.py index 44c833cf6..a8837f061 100644 --- a/backend/addcorpus/json_corpora/import_json.py +++ b/backend/addcorpus/json_corpora/import_json.py @@ -1,7 +1,5 @@ from typing import List, Dict, Iterable -from langcodes import standardize_tag - from addcorpus.models import Field from addcorpus.json_corpora.utils import get_path from addcorpus import es_mappings @@ -95,12 +93,9 @@ def _parse_text_content_field(field_data: Dict) -> Field: visualize = get_path(field_data, 'options', 'visualize') if visualize: - # add wordcloud, but make an exception for languages where the wordcloud's - # tokenisation does not work. - if not (language and standardize_tag(language, macro=True) in ['zh', 'ja', 'ko']): - parsed['visualizations'] = [ - VisualizationType.WORDCLOUD.value - ] + parsed['visualizations'] = [ + VisualizationType.WORDCLOUD.value + ] return parsed From c8f83ad50822fc4ff9f3d2619f6e590677c51618 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 3 Mar 2026 17:13:20 +0000 Subject: [PATCH 6/6] explain stopwords in manual --- frontend/src/assets/manual/en-GB/glossary.md | 4 ++++ frontend/src/assets/manual/en-GB/wordcloud.md | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/frontend/src/assets/manual/en-GB/glossary.md b/frontend/src/assets/manual/en-GB/glossary.md index b38b2a952..075e67995 100644 --- a/frontend/src/assets/manual/en-GB/glossary.md +++ b/frontend/src/assets/manual/en-GB/glossary.md @@ -40,6 +40,10 @@ When you provide a bit of text to search for, we call this a query. Your query c A kind of text processing where the words in a text are stripped of all inflection. For instance, *walk*, *walks*, and *walking* are all reduced to *walk*. This can be used to ignore irrevelant variation between words. See [stemming (wikipedia)](/https://en.wikipedia.org/wiki/Stemming) for more information. +## Stopword + +Stopwords are very common words in a language. For example, some English stopwords are "the", "in", and "are". These are typically not interesting when searching, or in quantitative analysis; they do not convey what the text is about. For some text analysis, it makes sense to remove stopwords. + ## Tag On Textcavator, tags are labels that you can assign to documents. See [tags](/manual/tagging-documents) for more information. diff --git a/frontend/src/assets/manual/en-GB/wordcloud.md b/frontend/src/assets/manual/en-GB/wordcloud.md index df22a7b6c..0a96a0eb5 100644 --- a/frontend/src/assets/manual/en-GB/wordcloud.md +++ b/frontend/src/assets/manual/en-GB/wordcloud.md @@ -1,5 +1,5 @@ The word cloud shows term frequencies of the first 1000 search results corresponding to your query. You can select which fields should be analysed for generate the wordcloud. -The wordcloud will count the frequencies of words in search results. We use a list of common stopwords to exclude, if this is supported for the field. After this, the frequency of the remaining words in the text field is counted. +The wordcloud will count the frequencies of words in search results. We use a list of [stopwords](/manual/glossary#stopword) to exclude, if this is supported for the field. After this, the frequency of the remaining words in the text field is counted. The 50 most frequent words are ordered on the canvas, where the text size indicates the frequency of the words. The colour and orientation of the word are not meaningful.