Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 3 additions & 8 deletions backend/addcorpus/json_corpora/import_json.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from typing import List, Dict, Iterable

from langcodes import standardize_tag

from addcorpus.models import Field
from addcorpus.json_corpora.utils import get_path
from addcorpus import es_mappings
Expand Down Expand Up @@ -95,12 +93,9 @@ def _parse_text_content_field(field_data: Dict) -> Field:

visualize = get_path(field_data, 'options', 'visualize')
if visualize:
# add wordcloud, but make an exception for languages where the wordcloud's
# tokenisation does not work.
if not (language and standardize_tag(language, macro=True) in ['zh', 'ja', 'ko']):
parsed['visualizations'] = [
VisualizationType.WORDCLOUD.value
]
parsed['visualizations'] = [
VisualizationType.WORDCLOUD.value
]

return parsed

Expand Down
8 changes: 6 additions & 2 deletions backend/visualization/termvectors.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Dict, Any, Optional, Iterable
from typing import List, Dict, Any, Optional, Iterable, Tuple
import re
import itertools

Expand All @@ -11,7 +11,7 @@
def request_termvectors_batched(
hits: Iterable[Dict], client: Elasticsearch, term_statistics: bool,
fields: List[str],
) -> Iterable[Dict]:
) -> Iterable[Tuple[Dict, Dict]]:
'''
Request term vectors for each hit in search results.
Uses mtermvectors endpoint to make batched requests.
Expand All @@ -34,6 +34,10 @@ def get_terms(termvector_result, field: str) -> Optional[Dict[str, Dict]]:
terms = termvectors[field]['terms']
return terms

def term_counts(doc: Dict, field: str) -> Dict[str, int]:
    '''
    Count how often each term occurs in one field of a term vectors result.

    Parameters:
        doc: a single term vectors result, passed on to `get_terms`.
        field: name of the field to extract term frequencies for.

    Returns:
        A dict mapping each term to its `term_freq` value. The dict is empty
        when `get_terms` yields no terms for the field.
    '''
    # `get_terms` is annotated as Optional: fall back to an empty dict so that
    # a document without terms for this field contributes nothing, rather than
    # raising AttributeError on `None.items()`.
    terms = get_terms(doc, field) or {}
    return {term: data['term_freq'] for term, data in terms.items()}

def get_tokens(terms: Dict[str, Dict], sort=True) -> List[Dict[str, Any]]:
if not terms:
return []
Expand Down
31 changes: 10 additions & 21 deletions backend/visualization/tests/test_wordcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,32 +98,21 @@ def occurs_in_results(word): return any(
for word in words_to_exclude:
assert not occurs_in_results(word)

def test_wordcloud_counts(small_mock_corpus):
'''
Each non-stopword only occurs once in the mock corpus data, so
this test uses some fake texts for counting.
'''

texts = [
'Some words',
'Even more!',
'Words, words, words...',
'More words! More!',
'That should be enough.',
]
docs = [
{'_source': {'content': text}}
for text in texts
]

results = wordcloud.make_wordcloud_data(docs, 'content', small_mock_corpus)
def test_wordcloud_counts(large_mock_corpus, index_large_mock_corpus):
result = search.search(
corpus_name=large_mock_corpus,
query_model=query.MATCH_ALL,
size=10
)
documents = search.hits(result)
results = wordcloud.make_wordcloud_data(
documents, 'content', large_mock_corpus)

counts = {
item['key']: item['doc_count']
for item in results
}

assert counts['words'] == 5
assert counts['the'] == 20

def test_wordcloud_filters_stopwords(small_mock_corpus, small_mock_corpus_complete_wordcloud):
stopwords = ['the', 'and', 'of']
Expand Down
49 changes: 20 additions & 29 deletions backend/visualization/wordcloud.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,29 @@
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

from addcorpus.models import Corpus
from typing import Iterable, Dict
from addcorpus.models import CorpusConfiguration, Field
from visualization.termvectors import request_termvectors_batched, term_counts
from es import download as download
from es.client import elasticsearch
from es import download as download
from addcorpus.language_analyzers import get_analyzer

def field_stopwords(corpus_name, field_name):
corpus = Corpus.objects.get(name=corpus_name)
field = corpus.configuration.fields.get(name=field_name)

if field.language and field.language != 'dynamic':
analyzer = get_analyzer(field.language)
else:
return []
def _wordcloud_search_field(corpus_name: str, field_name: str) -> str:
    '''
    Select the Elasticsearch field name to request term vectors from.

    Parameters:
        corpus_name: name of the corpus; used to look up its configuration.
        field_name: name of the field in that corpus configuration.

    Returns:
        `field_name + '.clean'` if the field's `es_mapping` declares a `clean`
        entry under `fields`, otherwise `field_name` unchanged.

    Errors raised by the two `get` lookups are not caught here.
    '''
    corpus_config = CorpusConfiguration.objects.get(corpus__name=corpus_name)
    field: Field = corpus_config.fields.get(name=field_name)
    # A mapping without a 'fields' key simply has no subfields.
    has_clean_field = 'clean' in field.es_mapping.get('fields', {})
    if has_clean_field:
        return field_name + '.clean'
    return field_name

stopwords = analyzer.stopwords()
return stopwords or []

def make_wordcloud_data(hits: Iterable[Dict], field_name, corpus_name):
search_field = _wordcloud_search_field(corpus_name, field_name)

def make_wordcloud_data(documents, field, corpus):
texts = []
for document in documents:
content = document['_source'][field]
if isinstance(content, str) and len(content):
texts.append(content)
if isinstance(content, list) and len(content):
texts.append('\n'.join(content))
counts = Counter()
client = elasticsearch(corpus_name)
docs = request_termvectors_batched(hits, client, False, [search_field])
for _, doc in docs:
counts.update(term_counts(doc, search_field))

stopwords = field_stopwords(corpus, field)
cv = CountVectorizer(max_features=100, max_df=0.7, token_pattern=r'(?u)\b[^0-9\s]{3,30}\b', stop_words=stopwords)
cvtexts = cv.fit_transform(texts)
counts = cvtexts.sum(axis=0).A1
words = list(cv.get_feature_names_out())
freq_distribution = Counter(dict(zip(words, counts)))
output = [{'key': word, 'doc_count': int(freq_distribution[word])} for word in words]
output = [{'key': word, 'doc_count': freq} for word, freq in counts.items()]
return output

4 changes: 4 additions & 0 deletions frontend/src/assets/manual/en-GB/glossary.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ When you provide a bit of text to search for, we call this a query. Your query c

A kind of text processing where the words in a text are stripped of all inflection. For instance, *walk*, *walks*, and *walking* are all reduced to *walk*. This can be used to ignore irrelevant variation between words. See [stemming (wikipedia)](https://en.wikipedia.org/wiki/Stemming) for more information.

## Stopword

Stopwords are very common words in a language. For example, some English stopwords are "the", "in", and "are". These are typically not interesting when searching, or in quantitative analysis; they do not convey what the text is about. For some text analysis, it makes sense to remove stopwords.

## Tag

On Textcavator, tags are labels that you can assign to documents. See [tags](/manual/tagging-documents) for more information.
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/assets/manual/en-GB/wordcloud.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
The word cloud shows term frequencies of the first 1000 search results corresponding to your query. You can select which fields should be analysed to generate the wordcloud.

The wordcloud will count the frequencies of words in search results. Highly frequent words (word occuring in more than 70% of documents) are removed. In fields that offer [stemming](/manual/glossary#stemming), we also use a list of common stopwords to exclude. We also remove words with fewer than 3 characters, or which consist of only numbers or punctuation. After this, the frequency of the remaining words in the text field is counted.
The wordcloud will count the frequencies of words in search results. We use a list of [stopwords](/manual/glossary#stopword) to exclude, if this is supported for the field. After this, the frequency of the remaining words in the text field is counted.

The 50 most frequent words are ordered on the canvas, where the text size indicates the frequency of the words. The colour and orientation of the word are not meaningful.
Loading