CentreForDigitalHumanities
diff --git a/‎CITATION.cff‎
Lines changed: 2 additions & 2 deletions b/‎CITATION.cff‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backend/addcorpus/es_mappings.py‎
Lines changed: 29 additions & 25 deletions b/‎backend/addcorpus/es_mappings.py‎
Lines changed: 29 additions & 25 deletions
diff --git a/‎backend/addcorpus/es_settings.py‎
Lines changed: 17 additions & 215 deletions b/‎backend/addcorpus/es_settings.py‎
Lines changed: 17 additions & 215 deletions
@@ -35,5 +35,5 @@ keywords:
   - elasticsearch
   - natural language processing
 license: MIT
-version: 5.28.0
-date-released: '2026-02-20'
+version: 5.29.0
+date-released: '2026-03-11'
@@ -22,7 +22,7 @@ For corpora included in Textcavator, the backend includes a definition file that
 
 ## Usage
 
-If you are interested in using Textcavator, the most straightforward way to get started is to visit [textcavator.hum.uu.nl](https://textcavator.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [People & Parliament](https://people-and-parliament.hum.uu.nl/).
+If you are interested in using Textcavator, the most straightforward way to get started is to visit [textcavator.hum.uu.nl](https://textcavator.hum.uu.nl/). This server is maintained by the Research Software Lab and contains corpora focused on a variety of fields. We also maintain more specialised collections at [PEACE portal](https://peace.sites.uu.nl/epigraphy/search/) and [JYU People & Parliament](https://people-and-parliament.hum.uu.nl/).
 
 Textcavator does not have an "upload data" option (yet!). If you are interested in using Textcavator as a way to publish your dataset, or to make it easier to search and analyse, you can go about this two ways:
 
 
@@ -1,65 +1,69 @@
-from typing import Dict
-from addcorpus.es_settings import add_language_string, stopwords_available, stemming_available
-from langcodes import standardize_tag
+from typing import Dict, Optional
+from addcorpus.language_analyzers import get_analyzer
 
 def primary_mapping_type(es_mapping: Dict) -> str:
     return es_mapping.get('type', None)
 
 
 def main_content_mapping(
-    token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None
+    token_counts=True, stopword_analysis=False, stemming_analysis=False,
+    language: Optional[str] = None
 ):
     '''
     Mapping for the main content field. Options:
 
     - `token_counts`: enables aggregations for the total number of words. Used for relative term frequencies.
     - `stopword_analysis`: enables analysis using stopword removal, if available for the language.
     - `stemming_analysis`: enables analysis using stemming, if available for the language.
-    - `updated_highlighting`: enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'.
+    - `language`: language (IETF tag) of the field contents
     '''
 
-    mapping = {"type": "text", "term_vector": "with_positions_offsets"}
+    analyzer = get_analyzer(language)
+    mapping = {
+        'type': 'text',
+        'analyzer': analyzer.standard_analyzer_name,
+        'term_vector': 'with_positions_offsets'
+    }
 
     if any([token_counts, stopword_analysis, stemming_analysis]):
         multifields = {}
         if token_counts:
             multifields['length'] = {
-                "type":     "token_count",
-                "analyzer": "standard"
+                'type':     'token_count',
+                'analyzer': analyzer.standard_analyzer_name
             }
 
-        if not language:
-            return mapping
-        tag = standardize_tag(language, macro=True)
-
-        if stopword_analysis and stopwords_available(tag):
+        if stopword_analysis and analyzer.has_stopwords:
             multifields['clean'] = {
-                "type": "text",
-                "analyzer": add_language_string('clean', tag),
-                "term_vector": "with_positions_offsets" # include character positions for highlighting
+                'type': 'text',
+                'analyzer': analyzer.clean_analyzer_name,
+                'term_vector': 'with_positions_offsets' # include character positions for highlighting
             }
-        if stemming_analysis and stemming_available(tag):
+        if stemming_analysis and analyzer.has_stemming:
             multifields['stemmed'] = {
-                "type": "text",
-                "analyzer": add_language_string('stemmed', tag),
-                "term_vector": "with_positions_offsets",
+                'type': 'text',
+                'analyzer': analyzer.stemmed_analyzer_name,
+                'term_vector': 'with_positions_offsets',
             }
         mapping['fields'] = multifields
 
     return mapping
 
 
-def text_mapping():
+def text_mapping(language: Optional[str] = None):
     '''
-    Mapping for text fields that are not the main content. Performs tokenisation and lowercasing for full-text
+    Mapping for text fields that are not the main content. Performs standard analysis for full-text
     search, but does not support other analysis options.
     '''
 
+    analyzer = get_analyzer(language)
+
     return {
-        'type': 'text'
+        'type': 'text',
+        'analyzer': analyzer.standard_analyzer_name,
     }
 
-def keyword_mapping(enable_full_text_search = False):
+def keyword_mapping(enable_full_text_search = False, language: Optional[str] = None):
     '''
     Mapping for keyword fields. Keyword fields allow filtering and histogram visualisations.
 
@@ -71,7 +75,7 @@ def keyword_mapping(enable_full_text_search = False):
     }
     if enable_full_text_search:
         mapping['fields'] = {
-            'text': { 'type': 'text' },
+            'text': text_mapping(language)
         }
 
     return mapping
 
@@ -1,224 +1,26 @@
-import os
-import warnings
-from typing import Dict
+from typing import Dict, Iterable
+import operator
+from functools import reduce
 
-from django.conf import settings
-from langcodes import Language, standardize_tag
-import nltk
+from addcorpus.language_analyzers import get_analyzer
 
-# available Elasticsearch stemmers [https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stemmer-tokenfilter.html]
-AVAILABLE_ES_STEMMERS = ['arabic', 'armenian', 'basque', 'bengali', 'brazilian',
-                         'bulgarian', 'catalan', 'cjk', 'czech', 'danish', 'dutch',
-                         'english', 'estonian', 'finnish', 'french', 'galician',
-                         'german', 'greek', 'hindi', 'hungarian', 'indonesian',
-                         'irish', 'italian', 'latvian', 'lithuanian', 'norwegian',
-                         'persian', 'portuguese', 'romanian', 'russian', 'sorani',
-                         'spanish', 'swedish', 'turkish', 'thai']
-
-def get_language_key(language_code):
-    '''
-    Get the nltk stopwords file / elasticsearch stemmer name for a language code
-
-    E.g. 'en' -> 'english'
-    '''
-
-    return Language.make(standardize_tag(language_code)).display_name().lower()
-
-def _stopwords_directory() -> str:
-    stopwords_dir = os.path.join(settings.NLTK_DATA_PATH, 'corpora', 'stopwords')
-    if not os.path.exists(stopwords_dir):
-        nltk.download('stopwords', settings.NLTK_DATA_PATH)
-    return stopwords_dir
-
-def _stopwords_path(language_code: str):
-    dir = _stopwords_directory()
-    language = get_language_key(language_code)
-    return os.path.join(dir, language)
-
-def stopwords_available(language_code: str) -> bool:
-    if not language_code:
-        return False
-    path = _stopwords_path(language_code)
-    return os.path.exists(path)
-
-def get_nltk_stopwords(language_code):
-    path = _stopwords_path(language_code)
-
-    if os.path.exists(path):
-        with open(path) as infile:
-            words = [line.strip() for line in infile.readlines()]
-            return words
-    else:
-        raise NotImplementedError('language {} has no nltk stopwords list'.format(language_code))
-
-def add_language_string(name, language):
-    return '{}_{}'.format(name, language) if language else name
-
-def stemming_available(language_code: str) -> bool:
-    '''
-    Check whether stemming is supported for a language.
-
-    Parameters:
-        language: an ISO-639 language code
-
-    Returns:
-        whether elasticsearch supports stemming analysis in this language.
-    '''
-    if not language_code:
-        return False
-    return get_language_key(language_code) in AVAILABLE_ES_STEMMERS
-
-def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False):
+def es_settings(languages=[]):
     '''
     Make elasticsearch settings json for a corpus index. Options:
-    - `languages`: array of language codes. See addcorpus.constants for options, and which languages support stopwords/stemming
-    - `stopword_analysis`: set to True to add an analyzer that removes stopwords.
-    - `stemming_analysis`: set to True to add an analyzer that removes stopwords and performs stemming.
+    - `languages`: array of language codes (IETF tags)
     '''
-    settings = {'index': {'number_of_shards': 1, 'number_of_replicas': 1}}
-    stopword_filter_name = 'stopwords'
-    clean_analyzer_name = 'clean'
-    stemmer_filter_name = 'stemmer'
-    stemmed_analyzer_name = 'stemmed'
-
-    set_char_filter(settings)
-
-    for language in languages:
-        # do not attach language isocodes if there is just one language
-
-        tag = standardize_tag(language, macro=True)
-
-        if stopword_analysis or stemming_analysis:
-            if not set_stopword_filter(settings, add_language_string(stopword_filter_name, tag), tag):
-                continue # skip languages for which we do not have a stopword list
-
-            if stopword_analysis:
-                set_clean_analyzer(
-                    settings,
-                    tag,
-                    add_language_string(stopword_filter_name, tag),
-                    add_language_string(clean_analyzer_name, tag),
-                )
-            if stemming_analysis:
-                if not stemming_available(tag):
-                    warnings.warn('You specified `stemming_analysis=True`, but \
-                                      there is no stemmer available for this language')
-                    continue
-                set_stemmed_analyzer(
-                    settings,
-                    tag,
-                    add_language_string(stopword_filter_name, tag),
-                    add_language_string(stemmer_filter_name, tag),
-                    add_language_string(stemmed_analyzer_name, tag),
-                )
-
-    return settings
-
-def number_filter():
-    return {
-        "type":"pattern_replace",
-        "pattern":"\\d+",
-        "replacement":""
+    analyzers = [get_analyzer(lang) for lang in languages]
+    analysis = {
+        'char_filter': _join_dicts(analyzer.char_filters() for analyzer in analyzers),
+        'filter': _join_dicts(analyzer.token_filters() for analyzer in analyzers),
+        'analyzer': _join_dicts(analyzer.analyzers() for analyzer in analyzers),
     }
 
-def make_stopword_filter(language):
-    try:
-        stopwords = get_nltk_stopwords(language)
-        return {
-            "type": "stop",
-            'stopwords': stopwords
-        }
-    except:
-        return None
-
-def _standard_analyzer(language: str):
-    '''
-    Basic analyzer for a language.
-    '''
-    if language in ['zh', 'ja', 'ko']:
-        return {
-            'tokenizer': 'standard',
-            'filter': [
-                'cjk_width',
-                'lowercase',
-            ]
-        }
-    else:
-        return {
-            'tokenizer': 'standard',
-            'char_filter': ['number_filter'],
-            'filter': ['lowercase']
-        }
-
-def make_clean_analyzer(language: str, stopword_filter_name: str) -> Dict:
-    analyzer = _standard_analyzer(language)
-    analyzer['filter'].append(stopword_filter_name)
-    return analyzer
-
-
-def make_stemmer_filter(language):
-    stemmer_language = get_language_key(language)
-    return {
-        "type": "stemmer",
-        "language": stemmer_language
+    settings = {
+        'index': {'number_of_shards': 1, 'number_of_replicas': 1},
+        'analysis': analysis,
     }
+    return settings
 
-def make_stemmed_analyzer(
-    language: str, stopword_filter_name: str, stemmer_filter_name: str
-) -> Dict:
-    analyzer = make_clean_analyzer(language, stopword_filter_name)
-    analyzer['filter'].append(stemmer_filter_name)
-    return analyzer
-
-
-def get_stopwords_from_settings(es_settings, analyzer):
-    try:
-        # the name of the stopword filter is second in the list, after "lowercase"
-        stopword_filter_name = es_settings['analysis']['analyzer'].get(
-            analyzer).get('filter')[-1]
-        token_filter = es_settings["analysis"]['filter'][stopword_filter_name]
-        return token_filter['stopwords']
-    except:
-        return []
-
-def set_stemmed_analyzer(
-        settings: Dict,
-        language: str,
-        stopword_filter_name: str,
-        stemmer_filter_name: str,
-        stemmed_analyzer_name: str,
-) -> None:
-    filters = settings['analysis'].get('filter', {})
-    filters.update({stemmer_filter_name: make_stemmer_filter(language)})
-    settings['analysis']['filter'] = filters
-    analyzers = settings['analysis'].get('analyzer')
-    analyzers.update({stemmed_analyzer_name: make_stemmed_analyzer(
-        language, stopword_filter_name, stemmer_filter_name)})
-    settings['analysis']['analyzer'] = analyzers
-
-def set_char_filter(settings):
-    settings["analysis"] = {
-        "char_filter": { "number_filter": number_filter() }
-    }
-
-def set_stopword_filter(settings, stopword_filter_name, language):
-    stopword_filter = make_stopword_filter(language)
-    if not stopword_filter:
-        return False
-    filters = settings['analysis'].get('filter', {})
-    filters.update({
-        stopword_filter_name: stopword_filter
-    })
-    settings['analysis']['filter'] = filters
-    return True
-
-def set_clean_analyzer(
-    settings: Dict,
-    language: str,
-    stopword_filter_name: str,
-    clean_analyzer_name: str,
-) -> None:
-    clean_analyzer = make_clean_analyzer(language, stopword_filter_name)
-    analyzers = settings['analysis'].get('analyzer', {})
-    analyzers.update({clean_analyzer_name: clean_analyzer})
-    settings["analysis"]['analyzer'] = analyzers
+def _join_dicts(dicts: Iterable[Dict]) -> Dict:
+    '''
+    Merge an iterable of dicts into one new dict. On duplicate keys, later dicts win.
+
+    Returns an empty dict when `dicts` is empty, e.g. when `es_settings` is called
+    without any languages.
+    '''
+    # The `{}` initial value is required: without it, `reduce` raises a TypeError on
+    # an empty iterable. It also ensures that no input dict is returned by reference.
+    return reduce(operator.or_, dicts, {})