Skip to content

Commit e18eba0

Browse files
authored
Merge pull request #2024 from CentreForDigitalHumanities/feature/wordcloud-tokenisation
Wordcloud tokenisation
2 parents 46c66f3 + 033272a commit e18eba0

6 files changed

Lines changed: 44 additions & 61 deletions

File tree

backend/addcorpus/json_corpora/import_json.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
from typing import List, Dict, Iterable
22

3-
from langcodes import standardize_tag
4-
53
from addcorpus.models import Field
64
from addcorpus.json_corpora.utils import get_path
75
from addcorpus import es_mappings
@@ -95,12 +93,9 @@ def _parse_text_content_field(field_data: Dict) -> Field:
9593

9694
visualize = get_path(field_data, 'options', 'visualize')
9795
if visualize:
98-
# add wordcloud, but make an exception for languages where the wordcloud's
99-
# tokenisation does not work.
100-
if not (language and standardize_tag(language, macro=True) in ['zh', 'ja', 'ko']):
101-
parsed['visualizations'] = [
102-
VisualizationType.WORDCLOUD.value
103-
]
96+
parsed['visualizations'] = [
97+
VisualizationType.WORDCLOUD.value
98+
]
10499

105100
return parsed
106101

backend/visualization/termvectors.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Dict, Any, Optional, Iterable
1+
from typing import List, Dict, Any, Optional, Iterable, Tuple
22
import re
33
import itertools
44

@@ -11,7 +11,7 @@
1111
def request_termvectors_batched(
1212
hits: Iterable[Dict], client: Elasticsearch, term_statistics: bool,
1313
fields: List[str],
14-
) -> Iterable[Dict]:
14+
) -> Iterable[Tuple[Dict, Dict]]:
1515
'''
1616
Request term vectors for each hit in search results.
1717
Uses mtermvectors endpoint to make batched requests.
@@ -34,6 +34,10 @@ def get_terms(termvector_result, field: str) -> Optional[Dict[str, Dict]]:
3434
terms = termvectors[field]['terms']
3535
return terms
3636

37+
def term_counts(doc: Dict, field: str) -> Dict[str, int]:
38+
terms = get_terms(doc, field)
39+
return {term: data['term_freq'] for term, data in terms.items()}
40+
3741
def get_tokens(terms: Dict[str, Dict], sort=True) -> List[Dict[str, Any]]:
3842
if not terms:
3943
return []

backend/visualization/tests/test_wordcloud.py

Lines changed: 10 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -98,32 +98,21 @@ def occurs_in_results(word): return any(
9898
for word in words_to_exclude:
9999
assert not occurs_in_results(word)
100100

101-
def test_wordcloud_counts(small_mock_corpus):
102-
'''
103-
Each non-stopword only occurs once in the mock corpus data, so
104-
this test uses some fake texts for counting.
105-
'''
106-
107-
texts = [
108-
'Some words',
109-
'Even more!',
110-
'Words, words, words...',
111-
'More words! More!',
112-
'That should be enough.',
113-
]
114-
docs = [
115-
{'_source': {'content': text}}
116-
for text in texts
117-
]
118-
119-
results = wordcloud.make_wordcloud_data(docs, 'content', small_mock_corpus)
101+
def test_wordcloud_counts(large_mock_corpus, index_large_mock_corpus):
102+
result = search.search(
103+
corpus_name=large_mock_corpus,
104+
query_model=query.MATCH_ALL,
105+
size=10
106+
)
107+
documents = search.hits(result)
108+
results = wordcloud.make_wordcloud_data(
109+
documents, 'content', large_mock_corpus)
120110

121111
counts = {
122112
item['key']: item['doc_count']
123113
for item in results
124114
}
125-
126-
assert counts['words'] == 5
115+
assert counts['the'] == 20
127116

128117
def test_wordcloud_filters_stopwords(small_mock_corpus, small_mock_corpus_complete_wordcloud):
129118
stopwords = ['the', 'and', 'of']

backend/visualization/wordcloud.py

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,29 @@
11
from collections import Counter
2-
from sklearn.feature_extraction.text import CountVectorizer
3-
4-
from addcorpus.models import Corpus
2+
from typing import Iterable, Dict
3+
from addcorpus.models import CorpusConfiguration, Field
4+
from visualization.termvectors import request_termvectors_batched, term_counts
5+
from es import download as download
6+
from es.client import elasticsearch
57
from es import download as download
6-
from addcorpus.language_analyzers import get_analyzer
7-
8-
def field_stopwords(corpus_name, field_name):
9-
corpus = Corpus.objects.get(name=corpus_name)
10-
field = corpus.configuration.fields.get(name=field_name)
118

12-
if field.language and field.language != 'dynamic':
13-
analyzer = get_analyzer(field.language)
14-
else:
15-
return []
9+
def _wordcloud_search_field(corpus_name: str, field_name: str) -> bool:
10+
corpus_config = CorpusConfiguration.objects.get(corpus__name=corpus_name)
11+
field: Field = corpus_config.fields.get(name=field_name)
12+
has_clean_field = 'clean' in field.es_mapping.get('fields', {})
13+
if has_clean_field:
14+
return field_name + '.clean'
15+
return field_name
1616

17-
stopwords = analyzer.stopwords()
18-
return stopwords or []
1917

18+
def make_wordcloud_data(hits: Iterable[Dict], field_name, corpus_name):
19+
search_field = _wordcloud_search_field(corpus_name, field_name)
2020

21-
def make_wordcloud_data(documents, field, corpus):
22-
texts = []
23-
for document in documents:
24-
content = document['_source'][field]
25-
if isinstance(content, str) and len(content):
26-
texts.append(content)
27-
if isinstance(content, list) and len(content):
28-
texts.append('\n'.join(content))
21+
counts = Counter()
22+
client = elasticsearch(corpus_name)
23+
docs = request_termvectors_batched(hits, client, False, [search_field])
24+
for _, doc in docs:
25+
counts.update(term_counts(doc, search_field))
2926

30-
stopwords = field_stopwords(corpus, field)
31-
cv = CountVectorizer(max_features=100, max_df=0.7, token_pattern=r'(?u)\b[^0-9\s]{3,30}\b', stop_words=stopwords)
32-
cvtexts = cv.fit_transform(texts)
33-
counts = cvtexts.sum(axis=0).A1
34-
words = list(cv.get_feature_names_out())
35-
freq_distribution = Counter(dict(zip(words, counts)))
36-
output = [{'key': word, 'doc_count': int(freq_distribution[word])} for word in words]
27+
output = [{'key': word, 'doc_count': freq} for word, freq in counts.items()]
3728
return output
3829

frontend/src/assets/manual/en-GB/glossary.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ When you provide a bit of text to search for, we call this a query. Your query c
4040

4141
A kind of text processing where the words in a text are stripped of all inflection. For instance, *walk*, *walks*, and *walking* are all reduced to *walk*. This can be used to ignore irrelevant variation between words. See [stemming (wikipedia)](https://en.wikipedia.org/wiki/Stemming) for more information.
4242

43+
## Stopword
44+
45+
Stopwords are very common words in a language. For example, some English stopwords are "the", "in", and "are". These are typically not interesting when searching, or in quantitative analysis; they do not convey what the text is about. For some text analysis, it makes sense to remove stopwords.
46+
4347
## Tag
4448

4549
On Textcavator, tags are labels that you can assign to documents. See [tags](/manual/tagging-documents) for more information.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
The word cloud shows term frequencies of the first 1000 search results corresponding to your query. You can select which fields should be analysed to generate the wordcloud.
22

3-
The wordcloud will count the frequencies of words in search results. Highly frequent words (word occuring in more than 70% of documents) are removed. In fields that offer [stemming](/manual/glossary#stemming), we also use a list of common stopwords to exclude. We also remove words with fewer than 3 characters, or which consist of only numbers or punctuation. After this, the frequency of the remaining words in the text field is counted.
3+
The wordcloud will count the frequencies of words in search results. We use a list of [stopwords](/manual/glossary#stopword) to exclude, if this is supported for the field. After this, the frequency of the remaining words in the text field is counted.
44

55
The 50 most frequent words are ordered on the canvas, where the text size indicates the frequency of the words. The colour and orientation of the word are not meaningful.

0 commit comments

Comments
 (0)