|
1 | | -import os |
2 | | -import warnings |
3 | | -from typing import Dict |
| 1 | +from typing import Dict, Iterable |
| 2 | +import operator |
| 3 | +from functools import reduce |
4 | 4 |
|
5 | | -from django.conf import settings |
6 | | -from langcodes import Language, standardize_tag |
7 | | -import nltk |
| 5 | +from addcorpus.language_analyzers import get_analyzer |
8 | 6 |
|
9 | | -# available Elasticsearch stemmers [https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stemmer-tokenfilter.html] |
10 | | -AVAILABLE_ES_STEMMERS = ['arabic', 'armenian', 'basque', 'bengali', 'brazilian', |
11 | | - 'bulgarian', 'catalan', 'cjk', 'czech', 'danish', 'dutch', |
12 | | - 'english', 'estonian', 'finnish', 'french', 'galician', |
13 | | - 'german', 'greek', 'hindi', 'hungarian', 'indonesian', |
14 | | - 'irish', 'italian', 'latvian', 'lithuanian', 'norwegian', |
15 | | - 'persian', 'portuguese', 'romanian', 'russian', 'sorani', |
16 | | - 'spanish', 'swedish', 'turkish', 'thai'] |
17 | | - |
18 | | -def get_language_key(language_code): |
19 | | - ''' |
20 | | - Get the nltk stopwords file / elasticsearch stemmer name for a language code |
21 | | -
|
22 | | - E.g. 'en' -> 'english' |
23 | | - ''' |
24 | | - |
25 | | - return Language.make(standardize_tag(language_code)).display_name().lower() |
26 | | - |
27 | | -def _stopwords_directory() -> str: |
28 | | - stopwords_dir = os.path.join(settings.NLTK_DATA_PATH, 'corpora', 'stopwords') |
29 | | - if not os.path.exists(stopwords_dir): |
30 | | - nltk.download('stopwords', settings.NLTK_DATA_PATH) |
31 | | - return stopwords_dir |
32 | | - |
33 | | -def _stopwords_path(language_code: str): |
34 | | - dir = _stopwords_directory() |
35 | | - language = get_language_key(language_code) |
36 | | - return os.path.join(dir, language) |
37 | | - |
38 | | -def stopwords_available(language_code: str) -> bool: |
39 | | - if not language_code: |
40 | | - return False |
41 | | - path = _stopwords_path(language_code) |
42 | | - return os.path.exists(path) |
43 | | - |
44 | | -def get_nltk_stopwords(language_code): |
45 | | - path = _stopwords_path(language_code) |
46 | | - |
47 | | - if os.path.exists(path): |
48 | | - with open(path) as infile: |
49 | | - words = [line.strip() for line in infile.readlines()] |
50 | | - return words |
51 | | - else: |
52 | | - raise NotImplementedError('language {} has no nltk stopwords list'.format(language_code)) |
53 | | - |
54 | | -def add_language_string(name, language): |
55 | | - return '{}_{}'.format(name, language) if language else name |
56 | | - |
57 | | -def stemming_available(language_code: str) -> bool: |
58 | | - ''' |
59 | | - Check whether stemming is supported for a language. |
60 | | -
|
61 | | - Parameters: |
62 | | - language: an ISO-639 language code |
63 | | -
|
64 | | - Returns: |
65 | | - whether elasticsearch supports stemming analysis in this language. |
66 | | - ''' |
67 | | - if not language_code: |
68 | | - return False |
69 | | - return get_language_key(language_code) in AVAILABLE_ES_STEMMERS |
70 | | - |
71 | | -def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False): |
| 7 | +def es_settings(languages=[]): |
72 | 8 | ''' |
73 | 9 | Make elasticsearch settings json for a corpus index. Options: |
74 | | - - `languages`: array of language codes. See addcorpus.constants for options, and which languages support stopwords/stemming |
75 | | - - `stopword_analysis`: set to True to add an analyzer that removes stopwords. |
76 | | - - `stemming_analysis`: set to True to add an analyzer that removes stopwords and performs stemming. |
| 10 | + - `languages`: array of language codes (IETF tags) |
77 | 11 | ''' |
78 | | - settings = {'index': {'number_of_shards': 1, 'number_of_replicas': 1}} |
79 | | - stopword_filter_name = 'stopwords' |
80 | | - clean_analyzer_name = 'clean' |
81 | | - stemmer_filter_name = 'stemmer' |
82 | | - stemmed_analyzer_name = 'stemmed' |
83 | | - |
84 | | - set_char_filter(settings) |
85 | | - |
86 | | - for language in languages: |
87 | | - # do not attach language isocodes if there is just one language |
88 | | - |
89 | | - tag = standardize_tag(language, macro=True) |
90 | | - |
91 | | - if stopword_analysis or stemming_analysis: |
92 | | - if not set_stopword_filter(settings, add_language_string(stopword_filter_name, tag), tag): |
93 | | - continue # skip languages for which we do not have a stopword list |
94 | | - |
95 | | - if stopword_analysis: |
96 | | - set_clean_analyzer( |
97 | | - settings, |
98 | | - tag, |
99 | | - add_language_string(stopword_filter_name, tag), |
100 | | - add_language_string(clean_analyzer_name, tag), |
101 | | - ) |
102 | | - if stemming_analysis: |
103 | | - if not stemming_available(tag): |
104 | | - warnings.warn('You specified `stemming_analysis=True`, but \ |
105 | | - there is no stemmer available for this language') |
106 | | - continue |
107 | | - set_stemmed_analyzer( |
108 | | - settings, |
109 | | - tag, |
110 | | - add_language_string(stopword_filter_name, tag), |
111 | | - add_language_string(stemmer_filter_name, tag), |
112 | | - add_language_string(stemmed_analyzer_name, tag), |
113 | | - ) |
114 | | - |
115 | | - return settings |
116 | | - |
117 | | -def number_filter(): |
118 | | - return { |
119 | | - "type":"pattern_replace", |
120 | | - "pattern":"\\d+", |
121 | | - "replacement":"" |
| 12 | + analyzers = [get_analyzer(lang) for lang in languages] |
| 13 | + analysis = { |
| 14 | + 'char_filter': _join_dicts(analyzer.char_filters() for analyzer in analyzers), |
| 15 | + 'filter': _join_dicts(analyzer.token_filters() for analyzer in analyzers), |
| 16 | + 'analyzer': _join_dicts(analyzer.analyzers() for analyzer in analyzers), |
122 | 17 | } |
123 | 18 |
|
124 | | -def make_stopword_filter(language): |
125 | | - try: |
126 | | - stopwords = get_nltk_stopwords(language) |
127 | | - return { |
128 | | - "type": "stop", |
129 | | - 'stopwords': stopwords |
130 | | - } |
131 | | - except: |
132 | | - return None |
133 | | - |
134 | | -def _standard_analyzer(language: str): |
135 | | - ''' |
136 | | - Basic analyzer for a language. |
137 | | - ''' |
138 | | - if language in ['zh', 'ja', 'ko']: |
139 | | - return { |
140 | | - 'tokenizer': 'standard', |
141 | | - 'filter': [ |
142 | | - 'cjk_width', |
143 | | - 'lowercase', |
144 | | - ] |
145 | | - } |
146 | | - else: |
147 | | - return { |
148 | | - 'tokenizer': 'standard', |
149 | | - 'char_filter': ['number_filter'], |
150 | | - 'filter': ['lowercase'] |
151 | | - } |
152 | | - |
153 | | -def make_clean_analyzer(language: str, stopword_filter_name: str) -> Dict: |
154 | | - analyzer = _standard_analyzer(language) |
155 | | - analyzer['filter'].append(stopword_filter_name) |
156 | | - return analyzer |
157 | | - |
158 | | - |
159 | | -def make_stemmer_filter(language): |
160 | | - stemmer_language = get_language_key(language) |
161 | | - return { |
162 | | - "type": "stemmer", |
163 | | - "language": stemmer_language |
| 19 | + settings = { |
| 20 | + 'index': {'number_of_shards': 1, 'number_of_replicas': 1}, |
| 21 | + 'analysis': analysis, |
164 | 22 | } |
| 23 | + return settings |
165 | 24 |
|
166 | | -def make_stemmed_analyzer( |
167 | | - language: str, stopword_filter_name: str, stemmer_filter_name: str |
168 | | -) -> Dict: |
169 | | - analyzer = make_clean_analyzer(language, stopword_filter_name) |
170 | | - analyzer['filter'].append(stemmer_filter_name) |
171 | | - return analyzer |
172 | | - |
173 | | - |
174 | | -def get_stopwords_from_settings(es_settings, analyzer): |
175 | | - try: |
176 | | - # the name of the stopword filter is second in the list, after "lowercase" |
177 | | - stopword_filter_name = es_settings['analysis']['analyzer'].get( |
178 | | - analyzer).get('filter')[-1] |
179 | | - token_filter = es_settings["analysis"]['filter'][stopword_filter_name] |
180 | | - return token_filter['stopwords'] |
181 | | - except: |
182 | | - return [] |
183 | | - |
184 | | -def set_stemmed_analyzer( |
185 | | - settings: Dict, |
186 | | - language: str, |
187 | | - stopword_filter_name: str, |
188 | | - stemmer_filter_name: str, |
189 | | - stemmed_analyzer_name: str, |
190 | | -) -> None: |
191 | | - filters = settings['analysis'].get('filter', {}) |
192 | | - filters.update({stemmer_filter_name: make_stemmer_filter(language)}) |
193 | | - settings['analysis']['filter'] = filters |
194 | | - analyzers = settings['analysis'].get('analyzer') |
195 | | - analyzers.update({stemmed_analyzer_name: make_stemmed_analyzer( |
196 | | - language, stopword_filter_name, stemmer_filter_name)}) |
197 | | - settings['analysis']['analyzer'] = analyzers |
198 | | - |
199 | | -def set_char_filter(settings): |
200 | | - settings["analysis"] = { |
201 | | - "char_filter": { "number_filter": number_filter() } |
202 | | - } |
203 | | - |
204 | | -def set_stopword_filter(settings, stopword_filter_name, language): |
205 | | - stopword_filter = make_stopword_filter(language) |
206 | | - if not stopword_filter: |
207 | | - return False |
208 | | - filters = settings['analysis'].get('filter', {}) |
209 | | - filters.update({ |
210 | | - stopword_filter_name: stopword_filter |
211 | | - }) |
212 | | - settings['analysis']['filter'] = filters |
213 | | - return True |
214 | | - |
215 | | -def set_clean_analyzer( |
216 | | - settings: Dict, |
217 | | - language: str, |
218 | | - stopword_filter_name: str, |
219 | | - clean_analyzer_name: str, |
220 | | -) -> None: |
221 | | - clean_analyzer = make_clean_analyzer(language, stopword_filter_name) |
222 | | - analyzers = settings['analysis'].get('analyzer', {}) |
223 | | - analyzers.update({clean_analyzer_name: clean_analyzer}) |
224 | | - settings["analysis"]['analyzer'] = analyzers |
| 25 | +def _join_dicts(dicts: Iterable[Dict]) -> Dict: |
| 26 | + return reduce(operator.or_, dicts) |
0 commit comments