CentreForDigitalHumanities · lukavdplas · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/backend/addcorpus/python_corpora/tests/test_corpusimport.py b/backend/addcorpus/python_corpora/tests/test_corpusimport.py
@@ -34,5 +34,5 @@ def test_import_error(db, settings):
 def test_corpus_dir(db, settings, basic_mock_corpus):
     path = load_corpus.corpus_dir(basic_mock_corpus)
     assert os.path.isabs(path)
-    assert 'mock_csv_corpus.py' in os.listdir(path)
+    assert 'corpus.py' in os.listdir(path)
     assert 'source_data' in os.listdir(path)
diff --git a/backend/addcorpus/python_corpora/tests/test_save_corpus.py b/backend/addcorpus/python_corpora/tests/test_save_corpus.py
@@ -1,7 +1,7 @@
 import sys
 import pytest
 from django.conf import settings
-from corpora_test.basic.mock_csv_corpus import MockCSVCorpus
+from corpora_test.basic.corpus import ExampleCorpus
 from addcorpus.models import Corpus, CorpusConfiguration
 from addcorpus.python_corpora.save_corpus import (_save_field_in_database,
     load_and_save_all_corpora, _save_or_skip_corpus
@@ -48,7 +48,7 @@ def test_no_errors_when_saving_corpora(db, capsys):
 
 def test_saving_broken_corpus(db, basic_mock_corpus):
     corpus = Corpus.objects.get(name=basic_mock_corpus)
-    corpus_def = MockCSVCorpus()
+    corpus_def = ExampleCorpus()
 
     corpus_def.min_date = 'Not a valid date'
 
@@ -90,7 +90,7 @@ def deactivated_corpus(basic_mock_corpus):
 def test_save_field_definition(db, basic_mock_corpus, deactivated_corpus):
     corpus = Corpus.objects.get(name=basic_mock_corpus)
     corpus_conf = corpus.configuration
-    corpus_def = MockCSVCorpus()
+    corpus_def = ExampleCorpus()
 
     corpus_conf.fields.all().delete()
 
@@ -107,7 +107,7 @@ def test_save_corpus_purity(db, basic_mock_corpus):
     '''
 
     corpus = Corpus.objects.get(name=basic_mock_corpus)
-    corpus_def = MockCSVCorpus()
+    corpus_def = ExampleCorpus()
 
     corpus_def.es_alias = 'test'
     _save_or_skip_corpus(basic_mock_corpus, corpus_def)

diff --git a/backend/addcorpus/tests/test_csvcorpus.py b/backend/addcorpus/tests/test_csvcorpus.py
@@ -1,4 +1,4 @@
-from corpora_test.basic.mock_csv_corpus import MockCSVCorpus
+from corpora_test.basic.corpus import ExampleCorpus
 import os
 
 here = os.path.abspath(os.path.dirname(__file__))
@@ -49,7 +49,7 @@
 
 
 def test_csv():
-    corpus = MockCSVCorpus()
+    corpus = ExampleCorpus()
 
     sources = list(corpus.sources(start=corpus.min_date, end=corpus.max_date))
     assert len(sources) == 1 and sources[0][1] == {'filename': 'example.csv'}

diff --git a/backend/addcorpus/tests/test_word_models_present.py b/backend/addcorpus/tests/test_word_models_present.py
@@ -5,7 +5,7 @@
 
 here = os.path.abspath(os.path.dirname(__file__))
 
-class ExampleCorpus(CorpusDefinition):
+class ExampleNoModels(CorpusDefinition):
     """Example corpus class for testing"""
 
     title = "Example"
@@ -17,12 +17,12 @@ class ExampleCorpus(CorpusDefinition):
 
     fields = []
 
-class ExampleCorpusWithWordModels(ExampleCorpus):
+class ExampleWithModels(ExampleNoModels):
     word_model_path = here
 
 def test_word_models_present():
-    corpus = ExampleCorpus()
+    corpus = ExampleNoModels()
     assert corpus.word_models_present == False
 
-    corpus_with_word_models = ExampleCorpusWithWordModels()
+    corpus_with_word_models = ExampleWithModels()
     assert corpus_with_word_models.word_models_present == True
diff --git a/backend/conftest.py b/backend/conftest.py
@@ -244,7 +244,7 @@ def index_json_mock_corpus(db, es_client: Elasticsearch, json_mock_corpus: Corpu
 
 @pytest.fixture()
 def json_corpus_definition():
-    path = os.path.join(settings.BASE_DIR, 'corpora_test', 'basic', 'mock_corpus.json')
+    path = os.path.join(settings.BASE_DIR, 'corpora_test', 'basic', 'corpus.json')
     with open(path) as f:
         return json.load(f)
 

diff --git a/backend/corpora_test/basic/mock_corpus.json → backend/corpora_test/basic/corpus.json b/backend/corpora_test/basic/mock_corpus.json → backend/corpora_test/basic/corpus.json
diff --git a/...end/corpora_test/basic/mock_csv_corpus.py → backend/corpora_test/basic/corpus.py b/...end/corpora_test/basic/mock_csv_corpus.py → backend/corpora_test/basic/corpus.py
@@ -7,7 +7,7 @@
 
 here = os.path.abspath(os.path.dirname(__file__))
 
-class MockCSVCorpus(CSVCorpusDefinition):
+class ExampleCorpus(CSVCorpusDefinition):
     '''
     Basic CSV corpus.
 

diff --git a/backend/corpora_test/media/media_mock_corpus.py b/backend/corpora_test/media/media_mock_corpus.py
@@ -1,11 +1,11 @@
 import os
 
-from corpora_test.basic.mock_csv_corpus import MockCSVCorpus
+from corpora_test.basic.corpus import ExampleCorpus
 from media.media_url import media_url
 
 here = os.path.abspath(os.path.dirname(__file__))
 
-class MediaMockCorpus(MockCSVCorpus):
+class MediaMockCorpus(ExampleCorpus):
     '''
     Test corpus that includes image attachments to documents.
     '''

diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py
@@ -9,7 +9,7 @@ def mock_corpus_path(*path):
     'multilingual-mock-corpus': 'corpora_test.mixed_language.multilingual_mock_corpus.MultilingualMockCorpus',
     'times': 'corpora.times.times.Times',
     'media-mock-corpus': 'corpora_test.media.media_mock_corpus.MediaMockCorpus',
-    'mock-csv-corpus': 'corpora_test.basic.mock_csv_corpus.MockCSVCorpus',
+    'mock-csv-corpus': 'corpora_test.basic.corpus.ExampleCorpus',
     'wordmodels-mock-corpus': 'corpora_test.wordmodels.wm_mock_corpus.WordmodelsMockCorpus',
     'tagging-mock-corpus': 'corpora_test.tag.tag_mock_corpus.TaggingMockCorpus',
     'annotated-mock-corpus': 'corpora_test.named_entities.annotated_mock_corpus.AnnotatedMockCorpus',

diff --git a/documentation/Adding-existing-corpora.md b/documentation/Adding-existing-corpora.md
@@ -0,0 +1,19 @@
+## Adding existing corpora
+
+These instructions are for adding *already defined* corpora to your own environment. This means you would be working with a corpus that is already used in Textcavator or by other developers.
+
+Documentation on creating *new* corpus definitions is in [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md), or in the user manual (for creating corpora through the corpus form).
+
+### Python corpora
+
+Currently, all corpora that are used in production are *Python corpora*, meaning they are defined in the source code. To include these corpora in your environment, you need to add them to your local settings and create an index in Elasticsearch.
+
+The source files of a corpus are not included in this directory; ask another developer about their availability. If you have (a sample of) the source files for a corpus, you can add the corpus to your own environment as follows:
+
+1. Add the corpus to the `CORPORA` dictionary in your local settings file. See [CORPORA settings documentation](./Django-project-settings.md#corpora).
+2. Set custom settings for your corpus. Check the definition file to see which variables it expects to find in the Django settings. Some of these may be optional.
+3. Activate your Python virtual environment. Run the `loadcorpora` admin command (`yarn django loadcorpora`) to register the new corpus in the SQL database. Then create an Elasticsearch index from the source files by running, e.g., `yarn django index mycorpus`. See [Indexing](./Indexing-corpora.md) for more information.
+
+### Database-only corpora
+
+Database-only corpora are still in development, but we plan to support making copies of your created corpora by downloading the configuration. This feature should be documented in the user manual.
diff --git a/documentation/Django-project-settings.md b/documentation/Django-project-settings.md
@@ -73,7 +73,7 @@ Unit tests for the backend will assume that there is a default server configured
 
 A dictionary that specifies Python corpus definitions that should be imported in your project.
 
-Each key must be the import path to a corpus class (see [Django module loading](https://docs.djangoproject.com/en/5.2/ref/utils/#module-django.utils.module_loading)). For example:
+Each value must be the import path to a corpus class (see [Django module loading](https://docs.djangoproject.com/en/5.2/ref/utils/#module-django.utils.module_loading)). For example:
 
 ```python
 CORPORA = {

diff --git a/documentation/First-time-setup.md b/documentation/First-time-setup.md
@@ -1,6 +1,6 @@
 # First time setup (for developers)
 
-These are instructions to set up an Textcavator server. If you are going to develop Textcavator, start by following these instructions.
+These are instructions to set up a Textcavator development server. If you are going to develop Textcavator, start by following these instructions.
 
 ## Prerequisites
 
@@ -50,34 +50,24 @@ DATA_DIR=where/corpus/data/is/located/on/your/machine
 Note: you can also call the .env file .myenv and specify this during startup:
 `docker-compose --env-file .myenv up`
 
+## Add a test corpus
 
-## Adding corpora
+These instructions will add a tiny example corpus to your environment. Use this to verify that everything is working correctly. Open the file `/backend/ianalyzer/settings_local.py` and add the following:
 
-These instructions are for adding *already defined* corpora to your own environment. This means you would be working with a corpus that is already used in Textcavator or by other developers.
-
-In a first-time setup, it is recommended that you add at least one existing corpus before creating your own. Documentation on creating new corpus definitions is in [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md) / [Writing a corpus definition in JSON](./Writing-a-corpus-definition-in-JSON.md).
-
-### Python corpora
-
-Currently, all corpora that are used in production are *Python corpora*, meaning they are defined in the source code. To include these corpora in your environment, you need to add them to your local settings and create an index in Elasticsearch.
-
-The source files of a corpus are not included in this directory; ask another developer about their availability. If you have (a sample of) the source files for a corpus, you can add the corpus your our environment as follows:
-
-1. Add the corpus to the `CORPORA` dictionary in your local settings file. See [CORPORA settings documentation](/documentation/Django-project-settings.md#corpora).
-2. Set configurations for your corpus. Check the definition file to see which variables it expects to find in the configuration. Some of these may be optional, but you will at least need to define the (absolute) path to your source files.
-3. Activate your python virtual environment. Run the `loadcorpora` admin command (`yarn django loadcorpora`) to register the new corpus in the SQL database. Then create an ElasticSearch index from the source files by running, e.g., `yarn django index dutchannualreports`, for indexing the Dutch Annual Reports corpus in a development environment. See [Indexing](documentation/Indexing-corpora.md) for more information.
-
-### Database-only corpora
-
-Note: database-only corpora are still in development and not yet recommended for first-time users.
+```py
+CORPORA = {
+    'example': 'corpora_test.basic.corpus.ExampleCorpus',
+}
+```
 
-To add a database-only corpus, you will need a JSON definition of the corpus, and a directory with (a sample of) the pre-processed source data. To retrieve a JSON definition from a running Textcavator server, log in as a staff user and visit `/corpus-definitions/`. Open the corpus you want to import and click "Download JSON".
+Save the file and close it. For the next step, PostgreSQL and Elasticsearch must be running. Then run the following in the terminal:
 
-1. Start up your Textcavator server and log in as a staff user. Go to `localhost:4200/corpus-definitions/new`. Upload the JSON definition file and save.
-2. Visit the admin menu (`localhost:4200/admin`). Go to "corpus configurations" and select your corpus. In the "data directory" field, add the path to your source data directory.
-3. Activate your python virtual environment. Create an ElasticSearch index from the source files by running `yarn django index {corpusname}`. See [Indexing](documentation/Indexing-corpora.md) for more information.
-4. Visit the admin menu again. Go to "corpora" and select te corpus. Set "active" to true and save.
+```sh
+yarn django loadcorpora
+yarn django index example
+```
 
+This will save the corpus configuration in the database and index the corpus data in Elasticsearch.
 
 ## Running a dev environment
 
@@ -93,6 +83,8 @@ Now that you have a working Textcavator environment, here are some common next s
 
 Configure your environment -> [Django project settings](./Django-project-settings.md) / [Frontend environment settings](./Frontend-environment-settings.md)
 
+Add an existing corpus -> [Adding existing corpora](./Adding-existing-corpora.md)
+
 Create a new Python corpus -> [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md)
 
 Add SAML intergration in your environment -> [SAML](./SAML.md)
diff --git a/documentation/README.md b/documentation/README.md
@@ -10,6 +10,7 @@ This directory contains documentation for developers.
 
 ## Adding corpora
 
+- [Adding existing corpora](./Adding-existing-corpora.md)
 - [Corpus definitions](./Corpus-definitions.md)
 - [Corpus database models](/Corpus-database-models.md)
 - [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md)

diff --git a/documentation/Writing-a-corpus-definition-in-JSON.md b/documentation/Writing-a-corpus-definition-in-JSON.md