-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtest_wordcloud.py
More file actions
122 lines (101 loc) · 3.86 KB
/
test_wordcloud.py
File metadata and controls
122 lines (101 loc) · 3.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import visualization.wordcloud as wordcloud
from es import search
import pytest
import visualization.query as query
from datetime import datetime
def make_filtered_query():
    """Build a query carrying a date filter with a maximum date of 1820-12-31.

    The filter is produced by ``query.make_date_filter`` and attached, via
    ``query.add_filter``, to a query whose ``bool`` clause starts out with an
    empty ``filter`` list.

    Returns:
        Whatever ``query.add_filter`` returns for that query and filter.
    """
    latest_date = datetime(year=1820, month=12, day=31)
    date_filter = query.make_date_filter(max_date=latest_date)
    base_query = {"query": {"bool": {"filter": []}}}
    return query.add_filter(base_query, date_filter)
@pytest.fixture()
def small_mock_corpus_complete_wordcloud(small_mock_corpus, index_small_mock_corpus):
    """Word cloud data for an unfiltered search on the small mock corpus.

    Runs a ``query.MATCH_ALL`` search (``size=10``) against ``small_mock_corpus``
    and passes the hits to ``wordcloud.make_wordcloud_data`` for the
    ``'content'`` field.

    ``index_small_mock_corpus`` is requested but not used in the body --
    presumably it makes sure the corpus is indexed first; confirm in conftest.
    """
    search_result = search.search(
        corpus_name=small_mock_corpus,
        query_model=query.MATCH_ALL,
        size=10,
    )
    hits = search.hits(search_result)
    return wordcloud.make_wordcloud_data(hits, 'content', small_mock_corpus)
def test_wordcloud(small_mock_corpus, small_mock_corpus_complete_wordcloud):
target_unfiltered = [
{'key': 'wife', 'doc_count': 1},
{'key': 'universally', 'doc_count': 1},
{'key': 'truth', 'doc_count': 1},
{'key': 'tired', 'doc_count': 1},
{'key': 'sitting', 'doc_count': 1},
{'key': 'sister', 'doc_count': 1},
{'key': 'single', 'doc_count': 1},
{'key': 'rejoice', 'doc_count': 1},
{'key': 'regarded', 'doc_count': 1},
{'key': 'possession', 'doc_count': 1},
{'key': 'nothing', 'doc_count': 1},
{'key': 'man', 'doc_count': 1},
{'key': 'hear', 'doc_count': 1},
{'key': 'good', 'doc_count': 1},
{'key': 'fortune', 'doc_count': 1},
{'key': 'forebodings', 'doc_count': 1},
{'key': 'evil', 'doc_count': 1},
{'key': 'enterprise', 'doc_count': 1},
{'key': 'disaster', 'doc_count': 1},
{'key': 'commencement', 'doc_count': 1},
{'key': 'beginning', 'doc_count': 1},
{'key': 'bank', 'doc_count': 1},
{'key': 'alice', 'doc_count': 1},
{'key': 'acknowledged', 'doc_count': 1},
{'key': 'accompanied', 'doc_count': 1}
]
for item in target_unfiltered:
term = item['key']
doc_count = item['doc_count']
match = next(
hit for hit in small_mock_corpus_complete_wordcloud if hit['key'] == term)
assert match
assert doc_count == match['doc_count']
def test_wordcloud_filtered(small_mock_corpus, es_client, index_small_mock_corpus):
    """Check the word cloud built from a search that uses the date-filtered query.

    The search uses the query from ``make_filtered_query`` and the supplied
    ``es_client``; the resulting word cloud must contain some terms and must
    not contain others.
    """
    search_result = search.search(
        corpus_name=small_mock_corpus,
        query_model=make_filtered_query(),
        size=10,
        client=es_client,
    )
    cloud = wordcloud.make_wordcloud_data(
        search.hits(search_result), 'content', small_mock_corpus
    )

    def occurs_in_results(word):
        # True as soon as one word cloud entry has this word as its key.
        for entry in cloud:
            if entry['key'] == word:
                return True
        return False

    # from frankenstein + pride & prejudice
    expected_present = ['accompanied', 'acknowledged', 'truth', 'universally']
    # from alice in wonderland
    expected_absent = ['alice', 'beginning']

    for word in expected_present:
        assert occurs_in_results(word)

    for word in expected_absent:
        assert not occurs_in_results(word)
def test_wordcloud_counts(large_mock_corpus, index_large_mock_corpus):
    """Check the count reported for 'the' in the large mock corpus word cloud.

    Runs a ``query.MATCH_ALL`` search with ``size=10``, builds the word cloud
    for the ``'content'`` field and expects the entry with key ``'the'`` to
    have a ``doc_count`` of 20.
    """
    search_result = search.search(
        corpus_name=large_mock_corpus,
        query_model=query.MATCH_ALL,
        size=10,
    )
    cloud = wordcloud.make_wordcloud_data(
        search.hits(search_result), 'content', large_mock_corpus
    )

    # Map each term to its reported count for direct lookup.
    counts = {}
    for entry in cloud:
        counts[entry['key']] = entry['doc_count']

    assert counts['the'] == 20
def test_wordcloud_filters_stopwords(small_mock_corpus, small_mock_corpus_complete_wordcloud):
stopwords = ['the', 'and', 'of']
for stopword in stopwords:
match = any(
item['key'] == stopword for item in small_mock_corpus_complete_wordcloud)
assert not match