Skip to content

Commit efdd53f

Browse files
committed
Merge branch 'develop' into feature/wordcloud-tokenisation
2 parents c8f83ad + 482ee42 commit efdd53f

75 files changed

Lines changed: 5199 additions & 4341 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CITATION.cff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,5 @@ keywords:
3535
- elasticsearch
3636
- natural language processing
3737
license: MIT
38-
version: 5.28.0
39-
date-released: '2026-02-20'
38+
version: 5.29.0
39+
date-released: '2026-03-11'

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ For corpora included in Textcavator, the backend includes a definition file that
2222

2323
## Usage
2424

25-
If you are interested in using Textcavator, the most straightforward way to get started is to visit [textcavator.hum.uu.nl](https://textcavator.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament](https://people-and-parliament.hum.uu.nl/).
25+
If you are interested in using Textcavator, the most straightforward way to get started is to visit [textcavator.hum.uu.nl](https://textcavator.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [JYU People & Parliament](https://people-and-parliament.hum.uu.nl/).
2626

2727
Textcavator does not have an "upload data" option (yet!). If you are interested in using Textcavator as a way to publish your dataset, or to make it easier to search and analyse, you can go about this two ways:
2828

backend/addcorpus/es_mappings.py

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,69 @@
1-
from typing import Dict
2-
from addcorpus.es_settings import add_language_string, stopwords_available, stemming_available
3-
from langcodes import standardize_tag
1+
from typing import Dict, Optional
2+
from addcorpus.language_analyzers import get_analyzer
43

54
def primary_mapping_type(es_mapping: Dict) -> str:
65
return es_mapping.get('type', None)
76

87

98
def main_content_mapping(
10-
token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None
9+
token_counts=True, stopword_analysis=False, stemming_analysis=False,
10+
language: Optional[str] = None
1111
):
1212
'''
1313
Mapping for the main content field. Options:
1414
1515
- `token_counts`: enables aggregations for the total number of words. Used for relative term frequencies.
1616
- `stopword_analysis`: enables analysis using stopword removal, if available for the language.
1717
- `stemming_analysis`: enables analysis using stemming, if available for the language.
18-
- `updated_highlighting`: enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'.
18+
- `language`: language (IETF tag) of the field contents
1919
'''
2020

21-
mapping = {"type": "text", "term_vector": "with_positions_offsets"}
21+
analyzer = get_analyzer(language)
22+
mapping = {
23+
'type': 'text',
24+
'analyzer': analyzer.standard_analyzer_name,
25+
'term_vector': 'with_positions_offsets'
26+
}
2227

2328
if any([token_counts, stopword_analysis, stemming_analysis]):
2429
multifields = {}
2530
if token_counts:
2631
multifields['length'] = {
27-
"type": "token_count",
28-
"analyzer": "standard"
32+
'type': 'token_count',
33+
'analyzer': analyzer.standard_analyzer_name
2934
}
3035

31-
if not language:
32-
return mapping
33-
tag = standardize_tag(language, macro=True)
34-
35-
if stopword_analysis and stopwords_available(tag):
36+
if stopword_analysis and analyzer.has_stopwords:
3637
multifields['clean'] = {
37-
"type": "text",
38-
"analyzer": add_language_string('clean', tag),
39-
"term_vector": "with_positions_offsets" # include character positions for highlighting
38+
'type': 'text',
39+
'analyzer': analyzer.clean_analyzer_name,
40+
'term_vector': 'with_positions_offsets' # include character positions for highlighting
4041
}
41-
if stemming_analysis and stemming_available(tag):
42+
if stemming_analysis and analyzer.has_stemming:
4243
multifields['stemmed'] = {
43-
"type": "text",
44-
"analyzer": add_language_string('stemmed', tag),
45-
"term_vector": "with_positions_offsets",
44+
'type': 'text',
45+
'analyzer': analyzer.stemmed_analyzer_name,
46+
'term_vector': 'with_positions_offsets',
4647
}
4748
mapping['fields'] = multifields
4849

4950
return mapping
5051

5152

52-
def text_mapping():
53+
def text_mapping(language: Optional[str] = None):
5354
'''
54-
Mapping for text fields that are not the main content. Performs tokenisation and lowercasing for full-text
55+
Mapping for text fields that are not the main content. Performs standard analysis for full-text
5556
search, but does not support other analysis options.
5657
'''
5758

59+
analyzer = get_analyzer(language)
60+
5861
return {
59-
'type': 'text'
62+
'type': 'text',
63+
'analyzer': analyzer.standard_analyzer_name,
6064
}
6165

62-
def keyword_mapping(enable_full_text_search = False):
66+
def keyword_mapping(enable_full_text_search = False, language: Optional[str] = None):
6367
'''
6468
Mapping for keyword fields. Keyword fields allow filtering and histogram visualisations.
6569
@@ -71,7 +75,7 @@ def keyword_mapping(enable_full_text_search = False):
7175
}
7276
if enable_full_text_search:
7377
mapping['fields'] = {
74-
'text': { 'type': 'text' },
78+
'text': text_mapping(language)
7579
}
7680

7781
return mapping

backend/addcorpus/es_settings.py

Lines changed: 17 additions & 215 deletions
Original file line numberDiff line numberDiff line change
@@ -1,224 +1,26 @@
1-
import os
2-
import warnings
3-
from typing import Dict
1+
from typing import Dict, Iterable
2+
import operator
3+
from functools import reduce
44

5-
from django.conf import settings
6-
from langcodes import Language, standardize_tag
7-
import nltk
5+
from addcorpus.language_analyzers import get_analyzer
86

9-
# available Elasticsearch stemmers [https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stemmer-tokenfilter.html]
10-
AVAILABLE_ES_STEMMERS = ['arabic', 'armenian', 'basque', 'bengali', 'brazilian',
11-
'bulgarian', 'catalan', 'cjk', 'czech', 'danish', 'dutch',
12-
'english', 'estonian', 'finnish', 'french', 'galician',
13-
'german', 'greek', 'hindi', 'hungarian', 'indonesian',
14-
'irish', 'italian', 'latvian', 'lithuanian', 'norwegian',
15-
'persian', 'portuguese', 'romanian', 'russian', 'sorani',
16-
'spanish', 'swedish', 'turkish', 'thai']
17-
18-
def get_language_key(language_code):
19-
'''
20-
Get the nltk stopwords file / elasticsearch stemmer name for a language code
21-
22-
E.g. 'en' -> 'english'
23-
'''
24-
25-
return Language.make(standardize_tag(language_code)).display_name().lower()
26-
27-
def _stopwords_directory() -> str:
28-
stopwords_dir = os.path.join(settings.NLTK_DATA_PATH, 'corpora', 'stopwords')
29-
if not os.path.exists(stopwords_dir):
30-
nltk.download('stopwords', settings.NLTK_DATA_PATH)
31-
return stopwords_dir
32-
33-
def _stopwords_path(language_code: str):
34-
dir = _stopwords_directory()
35-
language = get_language_key(language_code)
36-
return os.path.join(dir, language)
37-
38-
def stopwords_available(language_code: str) -> bool:
39-
if not language_code:
40-
return False
41-
path = _stopwords_path(language_code)
42-
return os.path.exists(path)
43-
44-
def get_nltk_stopwords(language_code):
45-
path = _stopwords_path(language_code)
46-
47-
if os.path.exists(path):
48-
with open(path) as infile:
49-
words = [line.strip() for line in infile.readlines()]
50-
return words
51-
else:
52-
raise NotImplementedError('language {} has no nltk stopwords list'.format(language_code))
53-
54-
def add_language_string(name, language):
55-
return '{}_{}'.format(name, language) if language else name
56-
57-
def stemming_available(language_code: str) -> bool:
58-
'''
59-
Check whether stemming is supported for a language.
60-
61-
Parameters:
62-
language: an ISO-639 language code
63-
64-
Returns:
65-
whether elasticsearch supports stemming analysis in this language.
66-
'''
67-
if not language_code:
68-
return False
69-
return get_language_key(language_code) in AVAILABLE_ES_STEMMERS
70-
71-
def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False):
7+
def es_settings(languages=[]):
728
'''
739
Make elasticsearch settings json for a corpus index. Options:
74-
- `languages`: array of language codes. See addcorpus.constants for options, and which languages support stopwords/stemming
75-
- `stopword_analysis`: set to True to add an analyzer that removes stopwords.
76-
- `stemming_analysis`: set to True to add an analyzer that removes stopwords and performs stemming.
10+
- `languages`: array of language codes (IETF tags)
7711
'''
78-
settings = {'index': {'number_of_shards': 1, 'number_of_replicas': 1}}
79-
stopword_filter_name = 'stopwords'
80-
clean_analyzer_name = 'clean'
81-
stemmer_filter_name = 'stemmer'
82-
stemmed_analyzer_name = 'stemmed'
83-
84-
set_char_filter(settings)
85-
86-
for language in languages:
87-
# do not attach language isocodes if there is just one language
88-
89-
tag = standardize_tag(language, macro=True)
90-
91-
if stopword_analysis or stemming_analysis:
92-
if not set_stopword_filter(settings, add_language_string(stopword_filter_name, tag), tag):
93-
continue # skip languages for which we do not have a stopword list
94-
95-
if stopword_analysis:
96-
set_clean_analyzer(
97-
settings,
98-
tag,
99-
add_language_string(stopword_filter_name, tag),
100-
add_language_string(clean_analyzer_name, tag),
101-
)
102-
if stemming_analysis:
103-
if not stemming_available(tag):
104-
warnings.warn('You specified `stemming_analysis=True`, but \
105-
there is no stemmer available for this language')
106-
continue
107-
set_stemmed_analyzer(
108-
settings,
109-
tag,
110-
add_language_string(stopword_filter_name, tag),
111-
add_language_string(stemmer_filter_name, tag),
112-
add_language_string(stemmed_analyzer_name, tag),
113-
)
114-
115-
return settings
116-
117-
def number_filter():
118-
return {
119-
"type":"pattern_replace",
120-
"pattern":"\\d+",
121-
"replacement":""
12+
analyzers = [get_analyzer(lang) for lang in languages]
13+
analysis = {
14+
'char_filter': _join_dicts(analyzer.char_filters() for analyzer in analyzers),
15+
'filter': _join_dicts(analyzer.token_filters() for analyzer in analyzers),
16+
'analyzer': _join_dicts(analyzer.analyzers() for analyzer in analyzers),
12217
}
12318

124-
def make_stopword_filter(language):
125-
try:
126-
stopwords = get_nltk_stopwords(language)
127-
return {
128-
"type": "stop",
129-
'stopwords': stopwords
130-
}
131-
except:
132-
return None
133-
134-
def _standard_analyzer(language: str):
135-
'''
136-
Basic analyzer for a language.
137-
'''
138-
if language in ['zh', 'ja', 'ko']:
139-
return {
140-
'tokenizer': 'standard',
141-
'filter': [
142-
'cjk_width',
143-
'lowercase',
144-
]
145-
}
146-
else:
147-
return {
148-
'tokenizer': 'standard',
149-
'char_filter': ['number_filter'],
150-
'filter': ['lowercase']
151-
}
152-
153-
def make_clean_analyzer(language: str, stopword_filter_name: str) -> Dict:
154-
analyzer = _standard_analyzer(language)
155-
analyzer['filter'].append(stopword_filter_name)
156-
return analyzer
157-
158-
159-
def make_stemmer_filter(language):
160-
stemmer_language = get_language_key(language)
161-
return {
162-
"type": "stemmer",
163-
"language": stemmer_language
19+
settings = {
20+
'index': {'number_of_shards': 1, 'number_of_replicas': 1},
21+
'analysis': analysis,
16422
}
23+
return settings
16524

166-
def make_stemmed_analyzer(
167-
language: str, stopword_filter_name: str, stemmer_filter_name: str
168-
) -> Dict:
169-
analyzer = make_clean_analyzer(language, stopword_filter_name)
170-
analyzer['filter'].append(stemmer_filter_name)
171-
return analyzer
172-
173-
174-
def get_stopwords_from_settings(es_settings, analyzer):
175-
try:
176-
# the name of the stopword filter is second in the list, after "lowercase"
177-
stopword_filter_name = es_settings['analysis']['analyzer'].get(
178-
analyzer).get('filter')[-1]
179-
token_filter = es_settings["analysis"]['filter'][stopword_filter_name]
180-
return token_filter['stopwords']
181-
except:
182-
return []
183-
184-
def set_stemmed_analyzer(
185-
settings: Dict,
186-
language: str,
187-
stopword_filter_name: str,
188-
stemmer_filter_name: str,
189-
stemmed_analyzer_name: str,
190-
) -> None:
191-
filters = settings['analysis'].get('filter', {})
192-
filters.update({stemmer_filter_name: make_stemmer_filter(language)})
193-
settings['analysis']['filter'] = filters
194-
analyzers = settings['analysis'].get('analyzer')
195-
analyzers.update({stemmed_analyzer_name: make_stemmed_analyzer(
196-
language, stopword_filter_name, stemmer_filter_name)})
197-
settings['analysis']['analyzer'] = analyzers
198-
199-
def set_char_filter(settings):
200-
settings["analysis"] = {
201-
"char_filter": { "number_filter": number_filter() }
202-
}
203-
204-
def set_stopword_filter(settings, stopword_filter_name, language):
205-
stopword_filter = make_stopword_filter(language)
206-
if not stopword_filter:
207-
return False
208-
filters = settings['analysis'].get('filter', {})
209-
filters.update({
210-
stopword_filter_name: stopword_filter
211-
})
212-
settings['analysis']['filter'] = filters
213-
return True
214-
215-
def set_clean_analyzer(
216-
settings: Dict,
217-
language: str,
218-
stopword_filter_name: str,
219-
clean_analyzer_name: str,
220-
) -> None:
221-
clean_analyzer = make_clean_analyzer(language, stopword_filter_name)
222-
analyzers = settings['analysis'].get('analyzer', {})
223-
analyzers.update({clean_analyzer_name: clean_analyzer})
224-
settings["analysis"]['analyzer'] = analyzers
25+
def _join_dicts(dicts: Iterable[Dict]) -> Dict:
    '''
    Merge an iterable of dictionaries into a single new dictionary.

    Dictionaries are merged left to right with the `|` operator, so when the
    same key occurs more than once, the value from the later dictionary wins.

    Parameters:
        dicts: the dictionaries to merge; may be a list, generator, or any
            other iterable, and may be empty.

    Returns:
        a new dictionary with the combined items; `{}` if `dicts` is empty.
        None of the input dictionaries is modified or returned as-is.
    '''
    # The explicit `{}` initial value matters: without it, `reduce` raises a
    # TypeError on an empty iterable, which is what happens when settings are
    # built for an empty list of languages.
    return reduce(operator.or_, dicts, {})

0 commit comments

Comments
 (0)