diff --git a/backend/addcorpus/json_corpora/import_json.py b/backend/addcorpus/json_corpora/import_json.py index 44c833cf6..a8837f061 100644 --- a/backend/addcorpus/json_corpora/import_json.py +++ b/backend/addcorpus/json_corpora/import_json.py @@ -1,7 +1,5 @@ from typing import List, Dict, Iterable -from langcodes import standardize_tag - from addcorpus.models import Field from addcorpus.json_corpora.utils import get_path from addcorpus import es_mappings @@ -95,12 +93,9 @@ def _parse_text_content_field(field_data: Dict) -> Field: visualize = get_path(field_data, 'options', 'visualize') if visualize: - # add wordcloud, but make an exception for languages where the wordcloud's - # tokenisation does not work. - if not (language and standardize_tag(language, macro=True) in ['zh', 'ja', 'ko']): - parsed['visualizations'] = [ - VisualizationType.WORDCLOUD.value - ] + parsed['visualizations'] = [ + VisualizationType.WORDCLOUD.value + ] return parsed diff --git a/backend/visualization/termvectors.py b/backend/visualization/termvectors.py index e63608fdb..fd7522e4c 100644 --- a/backend/visualization/termvectors.py +++ b/backend/visualization/termvectors.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any, Optional, Iterable +from typing import List, Dict, Any, Optional, Iterable, Tuple import re import itertools @@ -11,7 +11,7 @@ def request_termvectors_batched( hits: Iterable[Dict], client: Elasticsearch, term_statistics: bool, fields: List[str], -) -> Iterable[Dict]: +) -> Iterable[Tuple[Dict, Dict]]: ''' Request term vectors for each hit in search results. Uses mtermvectors endpoint to make batched requests. 
@@ -34,6 +34,10 @@ def get_terms(termvector_result, field: str) -> Optional[Dict[str, Dict]]: terms = termvectors[field]['terms'] return terms +def term_counts(doc: Dict, field: str) -> Dict[str, int]: + terms = get_terms(doc, field) + return {term: data['term_freq'] for term, data in terms.items()} + def get_tokens(terms: Dict[str, Dict], sort=True) -> List[Dict[str, Any]]: if not terms: return [] diff --git a/backend/visualization/tests/test_wordcloud.py b/backend/visualization/tests/test_wordcloud.py index d90696100..753abdae7 100644 --- a/backend/visualization/tests/test_wordcloud.py +++ b/backend/visualization/tests/test_wordcloud.py @@ -98,32 +98,21 @@ def occurs_in_results(word): return any( for word in words_to_exclude: assert not occurs_in_results(word) -def test_wordcloud_counts(small_mock_corpus): - ''' - Each non-stopword only occurs once in the mock corpus data, so - this test uses some fake texts for counting. - ''' - - texts = [ - 'Some words', - 'Even more!', - 'Words, words, words...', - 'More words! 
More!', - 'That should be enough.', - ] - docs = [ - {'_source': {'content': text}} - for text in texts - ] - - results = wordcloud.make_wordcloud_data(docs, 'content', small_mock_corpus) +def test_wordcloud_counts(large_mock_corpus, index_large_mock_corpus): + result = search.search( + corpus_name=large_mock_corpus, + query_model=query.MATCH_ALL, + size=10 + ) + documents = search.hits(result) + results = wordcloud.make_wordcloud_data( + documents, 'content', large_mock_corpus) counts = { item['key']: item['doc_count'] for item in results } - - assert counts['words'] == 5 + assert counts['the'] == 20 def test_wordcloud_filters_stopwords(small_mock_corpus, small_mock_corpus_complete_wordcloud): stopwords = ['the', 'and', 'of'] diff --git a/backend/visualization/wordcloud.py b/backend/visualization/wordcloud.py index 12f22a959..782601322 100644 --- a/backend/visualization/wordcloud.py +++ b/backend/visualization/wordcloud.py @@ -1,38 +1,29 @@ from collections import Counter -from sklearn.feature_extraction.text import CountVectorizer - -from addcorpus.models import Corpus +from typing import Iterable, Dict +from addcorpus.models import CorpusConfiguration, Field +from visualization.termvectors import request_termvectors_batched, term_counts +from es import download as download +from es.client import elasticsearch from es import download as download -from addcorpus.language_analyzers import get_analyzer - -def field_stopwords(corpus_name, field_name): - corpus = Corpus.objects.get(name=corpus_name) - field = corpus.configuration.fields.get(name=field_name) - if field.language and field.language != 'dynamic': - analyzer = get_analyzer(field.language) - else: - return [] +def _wordcloud_search_field(corpus_name: str, field_name: str) -> bool: + corpus_config = CorpusConfiguration.objects.get(corpus__name=corpus_name) + field: Field = corpus_config.fields.get(name=field_name) + has_clean_field = 'clean' in field.es_mapping.get('fields', {}) + if has_clean_field: + 
return field_name + '.clean' + return field_name - stopwords = analyzer.stopwords() - return stopwords or [] +def make_wordcloud_data(hits: Iterable[Dict], field_name, corpus_name): + search_field = _wordcloud_search_field(corpus_name, field_name) -def make_wordcloud_data(documents, field, corpus): - texts = [] - for document in documents: - content = document['_source'][field] - if isinstance(content, str) and len(content): - texts.append(content) - if isinstance(content, list) and len(content): - texts.append('\n'.join(content)) + counts = Counter() + client = elasticsearch(corpus_name) + docs = request_termvectors_batched(hits, client, False, [search_field]) + for _, doc in docs: + counts.update(term_counts(doc, search_field)) - stopwords = field_stopwords(corpus, field) - cv = CountVectorizer(max_features=100, max_df=0.7, token_pattern=r'(?u)\b[^0-9\s]{3,30}\b', stop_words=stopwords) - cvtexts = cv.fit_transform(texts) - counts = cvtexts.sum(axis=0).A1 - words = list(cv.get_feature_names_out()) - freq_distribution = Counter(dict(zip(words, counts))) - output = [{'key': word, 'doc_count': int(freq_distribution[word])} for word in words] + output = [{'key': word, 'doc_count': freq} for word, freq in counts.items()] return output diff --git a/frontend/src/assets/manual/en-GB/glossary.md b/frontend/src/assets/manual/en-GB/glossary.md index b38b2a952..075e67995 100644 --- a/frontend/src/assets/manual/en-GB/glossary.md +++ b/frontend/src/assets/manual/en-GB/glossary.md @@ -40,6 +40,10 @@ When you provide a bit of text to search for, we call this a query. Your query c A kind of text processing where the words in a text are stripped of all inflection. For instance, *walk*, *walks*, and *walking* are all reduced to *walk*. This can be used to ignore irrevelant variation between words. See [stemming (wikipedia)](/https://en.wikipedia.org/wiki/Stemming) for more information. +## Stopword + +Stopwords are very common words in a language. 
For example, some English stopwords are "the", "in", and "are". These are typically not interesting when searching, or in quantitative analysis; they do not convey what the text is about. For some text analysis, it makes sense to remove stopwords. + ## Tag On Textcavator, tags are labels that you can assign to documents. See [tags](/manual/tagging-documents) for more information. diff --git a/frontend/src/assets/manual/en-GB/wordcloud.md b/frontend/src/assets/manual/en-GB/wordcloud.md index ae7912172..0a96a0eb5 100644 --- a/frontend/src/assets/manual/en-GB/wordcloud.md +++ b/frontend/src/assets/manual/en-GB/wordcloud.md @@ -1,5 +1,5 @@ The word cloud shows term frequencies of the first 1000 search results corresponding to your query. You can select which fields should be analysed for generate the wordcloud. -The wordcloud will count the frequencies of words in search results. Highly frequent words (word occuring in more than 70% of documents) are removed. In fields that offer [stemming](/manual/glossary#stemming), we also use a list of common stopwords to exclude. We also remove words with fewer than 3 characters, or which consist of only numbers or punctuation. After this, the frequency of the remaining words in the text field is counted. +The wordcloud will count the frequencies of words in search results. If the selected field supports it, [stopwords](/manual/glossary#stopword) are excluded first. After this, the frequency of the remaining words in the text field is counted. The 50 most frequent words are ordered on the canvas, where the text size indicates the frequency of the words. The colour and orientation of the word are not meaningful.