|
1 | 1 | from collections import Counter |
2 | | -from sklearn.feature_extraction.text import CountVectorizer |
3 | | - |
4 | | -from addcorpus.models import Corpus |
| 2 | +from typing import Iterable, Dict |
| 3 | +from addcorpus.models import CorpusConfiguration, Field |
| 4 | +from visualization.termvectors import request_termvectors_batched, term_counts |
| 5 | +from es import download as download |
| 6 | +from es.client import elasticsearch |
5 | 7 | from es import download as download |
6 | | -from addcorpus.language_analyzers import get_analyzer |
7 | | - |
8 | | -def field_stopwords(corpus_name, field_name): |
9 | | - corpus = Corpus.objects.get(name=corpus_name) |
10 | | - field = corpus.configuration.fields.get(name=field_name) |
11 | 8 |
|
12 | | - if field.language and field.language != 'dynamic': |
13 | | - analyzer = get_analyzer(field.language) |
14 | | - else: |
15 | | - return [] |
def _wordcloud_search_field(corpus_name: str, field_name: str) -> str:
    """
    Return the name of the field to request term vectors from for a word cloud.

    Looks up the configuration of the corpus named `corpus_name` and, within it,
    the field named `field_name`. If that field's `es_mapping` lists a `clean`
    entry under its `fields` key, the result is `field_name + '.clean'`;
    otherwise it is `field_name` unchanged.

    Both lookups use `.get()`, so a missing corpus or field presumably raises the
    ORM's "does not exist" error rather than returning None - confirm with callers.
    """
    corpus_config = CorpusConfiguration.objects.get(corpus__name=corpus_name)
    field: Field = corpus_config.fields.get(name=field_name)
    # 'fields' in the mapping presumably holds the Elasticsearch multi-fields
    # (sub-fields) of this field; a missing key is treated as "no sub-fields".
    subfields = field.es_mapping.get('fields', {})
    if 'clean' in subfields:
        return field_name + '.clean'
    return field_name
16 | 16 |
|
17 | | - stopwords = analyzer.stopwords() |
18 | | - return stopwords or [] |
19 | 17 |
|
def make_wordcloud_data(hits: Iterable[Dict], field_name, corpus_name):
    """
    Build word cloud data for `field_name` from a set of search hits.

    Term vectors are requested in batches for `hits`, restricted to the search
    field chosen by `_wordcloud_search_field`. Whatever `term_counts` returns
    for each document is accumulated into a single `Counter`.

    Returns a list with one `{'key': <term>, 'doc_count': <count>}` dict per
    entry in the accumulated counter.
    """
    search_field = _wordcloud_search_field(corpus_name, field_name)

    # The third positional argument is passed as False; its meaning is defined
    # by request_termvectors_batched.
    termvectors = request_termvectors_batched(
        hits, elasticsearch(corpus_name), False, [search_field]
    )

    totals = Counter()
    for _, termvector in termvectors:
        totals.update(term_counts(termvector, search_field))

    return [
        {'key': term, 'doc_count': frequency}
        for term, frequency in totals.items()
    ]
38 | 29 |
|
0 commit comments