From d482503eb53d00310ab5bd151e7a75b1ed961e87 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 26 Mar 2026 16:44:16 +0000 Subject: [PATCH 1/5] rename example corpus --- .../addcorpus/python_corpora/tests/test_save_corpus.py | 8 ++++---- backend/addcorpus/tests/test_csvcorpus.py | 4 ++-- backend/addcorpus/tests/test_word_models_present.py | 8 ++++---- backend/corpora_test/basic/mock_csv_corpus.py | 2 +- backend/corpora_test/media/media_mock_corpus.py | 4 ++-- backend/ianalyzer/settings_test.py | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/backend/addcorpus/python_corpora/tests/test_save_corpus.py b/backend/addcorpus/python_corpora/tests/test_save_corpus.py index ebf935ad1..b7dd2a547 100644 --- a/backend/addcorpus/python_corpora/tests/test_save_corpus.py +++ b/backend/addcorpus/python_corpora/tests/test_save_corpus.py @@ -1,7 +1,7 @@ import sys import pytest from django.conf import settings -from corpora_test.basic.mock_csv_corpus import MockCSVCorpus +from corpora_test.basic.mock_csv_corpus import ExampleCorpus from addcorpus.models import Corpus, CorpusConfiguration from addcorpus.python_corpora.save_corpus import (_save_field_in_database, load_and_save_all_corpora, _save_or_skip_corpus @@ -48,7 +48,7 @@ def test_no_errors_when_saving_corpora(db, capsys): def test_saving_broken_corpus(db, basic_mock_corpus): corpus = Corpus.objects.get(name=basic_mock_corpus) - corpus_def = MockCSVCorpus() + corpus_def = ExampleCorpus() corpus_def.min_date = 'Not a valid date' @@ -90,7 +90,7 @@ def deactivated_corpus(basic_mock_corpus): def test_save_field_definition(db, basic_mock_corpus, deactivated_corpus): corpus = Corpus.objects.get(name=basic_mock_corpus) corpus_conf = corpus.configuration - corpus_def = MockCSVCorpus() + corpus_def = ExampleCorpus() corpus_conf.fields.all().delete() @@ -107,7 +107,7 @@ def test_save_corpus_purity(db, basic_mock_corpus): ''' corpus = Corpus.objects.get(name=basic_mock_corpus) - corpus_def = MockCSVCorpus() 
+ corpus_def = ExampleCorpus() corpus_def.es_alias = 'test' _save_or_skip_corpus(basic_mock_corpus, corpus_def) diff --git a/backend/addcorpus/tests/test_csvcorpus.py b/backend/addcorpus/tests/test_csvcorpus.py index 834640ace..0606905f7 100644 --- a/backend/addcorpus/tests/test_csvcorpus.py +++ b/backend/addcorpus/tests/test_csvcorpus.py @@ -1,4 +1,4 @@ -from corpora_test.basic.mock_csv_corpus import MockCSVCorpus +from corpora_test.basic.mock_csv_corpus import ExampleCorpus import os here = os.path.abspath(os.path.dirname(__file__)) @@ -49,7 +49,7 @@ def test_csv(): - corpus = MockCSVCorpus() + corpus = ExampleCorpus() sources = list(corpus.sources(start=corpus.min_date, end=corpus.max_date)) assert len(sources) == 1 and sources[0][1] == {'filename': 'example.csv'} diff --git a/backend/addcorpus/tests/test_word_models_present.py b/backend/addcorpus/tests/test_word_models_present.py index 690ada72f..0be295676 100644 --- a/backend/addcorpus/tests/test_word_models_present.py +++ b/backend/addcorpus/tests/test_word_models_present.py @@ -5,7 +5,7 @@ here = os.path.abspath(os.path.dirname(__file__)) -class ExampleCorpus(CorpusDefinition): +class ExampleNoModels(CorpusDefinition): """Example corpus class for testing""" title = "Example" @@ -17,12 +17,12 @@ class ExampleCorpus(CorpusDefinition): fields = [] -class ExampleCorpusWithWordModels(ExampleCorpus): +class ExampleWithModels(ExampleNoModels): word_model_path = here def test_word_models_present(): - corpus = ExampleCorpus() + corpus = ExampleNoModels() assert corpus.word_models_present == False - corpus_with_word_models = ExampleCorpusWithWordModels() + corpus_with_word_models = ExampleWithModels() assert corpus_with_word_models.word_models_present == True diff --git a/backend/corpora_test/basic/mock_csv_corpus.py b/backend/corpora_test/basic/mock_csv_corpus.py index 30ee7e7d6..6a5deb16b 100644 --- a/backend/corpora_test/basic/mock_csv_corpus.py +++ b/backend/corpora_test/basic/mock_csv_corpus.py @@ -7,7 +7,7 @@ 
here = os.path.abspath(os.path.dirname(__file__)) -class MockCSVCorpus(CSVCorpusDefinition): +class ExampleCorpus(CSVCorpusDefinition): ''' Basic CSV corpus. diff --git a/backend/corpora_test/media/media_mock_corpus.py b/backend/corpora_test/media/media_mock_corpus.py index 85f869210..d30bd6693 100644 --- a/backend/corpora_test/media/media_mock_corpus.py +++ b/backend/corpora_test/media/media_mock_corpus.py @@ -1,11 +1,11 @@ import os -from corpora_test.basic.mock_csv_corpus import MockCSVCorpus +from corpora_test.basic.mock_csv_corpus import ExampleCorpus from media.media_url import media_url here = os.path.abspath(os.path.dirname(__file__)) -class MediaMockCorpus(MockCSVCorpus): +class MediaMockCorpus(ExampleCorpus): ''' Test corpus that includes image attachments to documents. ''' diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py index eff740cf3..1e9578e74 100644 --- a/backend/ianalyzer/settings_test.py +++ b/backend/ianalyzer/settings_test.py @@ -9,7 +9,7 @@ def mock_corpus_path(*path): 'multilingual-mock-corpus': 'corpora_test.mixed_language.multilingual_mock_corpus.MultilingualMockCorpus', 'times': 'corpora.times.times.Times', 'media-mock-corpus': 'corpora_test.media.media_mock_corpus.MediaMockCorpus', - 'mock-csv-corpus': 'corpora_test.basic.mock_csv_corpus.MockCSVCorpus', + 'mock-csv-corpus': 'corpora_test.basic.mock_csv_corpus.ExampleCorpus', 'wordmodels-mock-corpus': 'corpora_test.wordmodels.wm_mock_corpus.WordmodelsMockCorpus', 'tagging-mock-corpus': 'corpora_test.tag.tag_mock_corpus.TaggingMockCorpus', 'annotated-mock-corpus': 'corpora_test.named_entities.annotated_mock_corpus.AnnotatedMockCorpus', From f9c356ac7c2a5def476cd9d2bc2975cfcee24050 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 26 Mar 2026 16:47:50 +0000 Subject: [PATCH 2/5] rename example corpus files/class --- backend/addcorpus/python_corpora/tests/test_save_corpus.py | 2 +- backend/addcorpus/tests/test_csvcorpus.py | 2 +- backend/conftest.py 
| 2 +- backend/corpora_test/basic/{mock_corpus.json => corpus.json} | 0 backend/corpora_test/basic/{mock_csv_corpus.py => corpus.py} | 0 backend/corpora_test/media/media_mock_corpus.py | 2 +- backend/ianalyzer/settings_test.py | 2 +- documentation/Writing-a-corpus-definition-in-JSON.md | 2 +- 8 files changed, 6 insertions(+), 6 deletions(-) rename backend/corpora_test/basic/{mock_corpus.json => corpus.json} (100%) rename backend/corpora_test/basic/{mock_csv_corpus.py => corpus.py} (100%) diff --git a/backend/addcorpus/python_corpora/tests/test_save_corpus.py b/backend/addcorpus/python_corpora/tests/test_save_corpus.py index b7dd2a547..143310a24 100644 --- a/backend/addcorpus/python_corpora/tests/test_save_corpus.py +++ b/backend/addcorpus/python_corpora/tests/test_save_corpus.py @@ -1,7 +1,7 @@ import sys import pytest from django.conf import settings -from corpora_test.basic.mock_csv_corpus import ExampleCorpus +from corpora_test.basic.corpus import ExampleCorpus from addcorpus.models import Corpus, CorpusConfiguration from addcorpus.python_corpora.save_corpus import (_save_field_in_database, load_and_save_all_corpora, _save_or_skip_corpus diff --git a/backend/addcorpus/tests/test_csvcorpus.py b/backend/addcorpus/tests/test_csvcorpus.py index 0606905f7..ec8e14bf0 100644 --- a/backend/addcorpus/tests/test_csvcorpus.py +++ b/backend/addcorpus/tests/test_csvcorpus.py @@ -1,4 +1,4 @@ -from corpora_test.basic.mock_csv_corpus import ExampleCorpus +from corpora_test.basic.corpus import ExampleCorpus import os here = os.path.abspath(os.path.dirname(__file__)) diff --git a/backend/conftest.py b/backend/conftest.py index efbf75906..df5b4d4ce 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -244,7 +244,7 @@ def index_json_mock_corpus(db, es_client: Elasticsearch, json_mock_corpus: Corpu @pytest.fixture() def json_corpus_definition(): - path = os.path.join(settings.BASE_DIR, 'corpora_test', 'basic', 'mock_corpus.json') + path = os.path.join(settings.BASE_DIR, 
'corpora_test', 'basic', 'corpus.json') with open(path) as f: return json.load(f) diff --git a/backend/corpora_test/basic/mock_corpus.json b/backend/corpora_test/basic/corpus.json similarity index 100% rename from backend/corpora_test/basic/mock_corpus.json rename to backend/corpora_test/basic/corpus.json diff --git a/backend/corpora_test/basic/mock_csv_corpus.py b/backend/corpora_test/basic/corpus.py similarity index 100% rename from backend/corpora_test/basic/mock_csv_corpus.py rename to backend/corpora_test/basic/corpus.py diff --git a/backend/corpora_test/media/media_mock_corpus.py b/backend/corpora_test/media/media_mock_corpus.py index d30bd6693..50cb2e52e 100644 --- a/backend/corpora_test/media/media_mock_corpus.py +++ b/backend/corpora_test/media/media_mock_corpus.py @@ -1,6 +1,6 @@ import os -from corpora_test.basic.mock_csv_corpus import ExampleCorpus +from corpora_test.basic.corpus import ExampleCorpus from media.media_url import media_url here = os.path.abspath(os.path.dirname(__file__)) diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py index 1e9578e74..c2d4526e8 100644 --- a/backend/ianalyzer/settings_test.py +++ b/backend/ianalyzer/settings_test.py @@ -9,7 +9,7 @@ def mock_corpus_path(*path): 'multilingual-mock-corpus': 'corpora_test.mixed_language.multilingual_mock_corpus.MultilingualMockCorpus', 'times': 'corpora.times.times.Times', 'media-mock-corpus': 'corpora_test.media.media_mock_corpus.MediaMockCorpus', - 'mock-csv-corpus': 'corpora_test.basic.mock_csv_corpus.ExampleCorpus', + 'mock-csv-corpus': 'corpora_test.basic.corpus.ExampleCorpus', 'wordmodels-mock-corpus': 'corpora_test.wordmodels.wm_mock_corpus.WordmodelsMockCorpus', 'tagging-mock-corpus': 'corpora_test.tag.tag_mock_corpus.TaggingMockCorpus', 'annotated-mock-corpus': 'corpora_test.named_entities.annotated_mock_corpus.AnnotatedMockCorpus', diff --git a/documentation/Writing-a-corpus-definition-in-JSON.md 
b/documentation/Writing-a-corpus-definition-in-JSON.md index ecbd5650b..c797d7b36 100644 --- a/documentation/Writing-a-corpus-definition-in-JSON.md +++ b/documentation/Writing-a-corpus-definition-in-JSON.md @@ -2,7 +2,7 @@ Database-only corpora support a JSON format for creating or editing corpus definitions. Like Python definitions, a JSON definition can be used to store and share a configuration for a corpus. -The format is defined in [corpus.schema.json](/backend/addcorpus/schemas/corpus.schema.json). You can find an example in the [test JSON definition](../backend/corpora_test/basic/mock_corpus.json). +The format is defined in [corpus.schema.json](/backend/addcorpus/schemas/corpus.schema.json). You can find an example in the [test JSON definition](../backend/corpora_test/basic/corpus.json). We do not (currently) have a guide to writing JSON definitions, though the JSON schema includes descriptions for each field. From ed8aed9795d30b5edb67d3a827d81d95937f69e1 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 26 Mar 2026 17:01:50 +0000 Subject: [PATCH 3/5] use example corpus in first-time setup --- documentation/Adding-existing-corpora.md | 27 ++++++++++++++++ documentation/First-time-setup.md | 40 ++++++++++-------------- documentation/README.md | 1 + 3 files changed, 44 insertions(+), 24 deletions(-) create mode 100644 documentation/Adding-existing-corpora.md diff --git a/documentation/Adding-existing-corpora.md b/documentation/Adding-existing-corpora.md new file mode 100644 index 000000000..be16ecad7 --- /dev/null +++ b/documentation/Adding-existing-corpora.md @@ -0,0 +1,27 @@ +## Adding existing corpora + +These instructions are for adding *already defined* corpora to your own environment. This means you would be working with a corpus that is already used in Textcavator or by other developers. 
+ +Documentation on creating *new* corpus definitions is in [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md) / [Writing a corpus definition in JSON](./Writing-a-corpus-definition-in-JSON.md). + +### Python corpora + +Currently, all corpora that are used in production are *Python corpora*, meaning they are defined in the source code. To include these corpora in your environment, you need to add them to your local settings and create an index in Elasticsearch. + +The source files of a corpus are not included in this directory; ask another developer about their availability. If you have (a sample of) the source files for a corpus, you can add the corpus your our environment as follows: + +1. Add the corpus to the `CORPORA` dictionary in your local settings file. See [CORPORA settings documentation](/documentation/Django-project-settings.md#corpora). +2. Set configurations for your corpus. Check the definition file to see which variables it expects to find in the configuration. Some of these may be optional, but you will at least need to define the (absolute) path to your source files. +3. Activate your python virtual environment. Run the `loadcorpora` admin command (`yarn django loadcorpora`) to register the new corpus in the SQL database. Then create an ElasticSearch index from the source files by running, e.g., `yarn django index dutchannualreports`, for indexing the Dutch Annual Reports corpus in a development environment. See [Indexing](documentation/Indexing-corpora.md) for more information. + +### Database-only corpora + +Note: database-only corpora are still in development. + +To add a database-only corpus, you will need a JSON definition of the corpus, and a CSV file with the source data. To retrieve a JSON definition from a running Textcavator server, log in as a staff user and visit `/corpus-definitions/`. Open the corpus you want to import and click "Download JSON". + +1. 
Start up your Textcavator server and log in as a superuser. Go to `localhost:4200/corpus-definitions/new`. Upload the JSON definition file and save. +2. Open the editing form for the corpus. In step 1, you can upload an image, but this is optional. In step 2, upload your source data and save. +3. Continue to step 4 of the form and index the corpus. When indexing is complete, click "activate". +4. Visit the admin site at `/admin`. Go to "corpora" and select te corpus. Set "active" to true and save. + diff --git a/documentation/First-time-setup.md b/documentation/First-time-setup.md index 23319fe45..69feda9c1 100644 --- a/documentation/First-time-setup.md +++ b/documentation/First-time-setup.md @@ -1,6 +1,6 @@ # First time setup (for developers) -These are instructions to set up an Textcavator server. If you are going to develop Textcavator, start by following these instructions. +These are instructions to set up a Textcavator development server. If you are going to develop Textcavator, start by following these instructions. ## Prerequisites @@ -50,34 +50,24 @@ DATA_DIR=where/corpus/data/is/located/on/your/machine Note: you can also call the .env file .myenv and specify this during startup: `docker-compose --env-file .myenv up` +## Add a test corpus -## Adding corpora +These instructions will add a tiny example corpus to your environment. Use this to verify that everything is working correctly. Open the file `/backend/ianalyzer/settings_local.py`. Copy-paste: -These instructions are for adding *already defined* corpora to your own environment. This means you would be working with a corpus that is already used in Textcavator or by other developers. -In a first-time setup, it is recommended that you add at least one existing corpus before creating your own. 
Documentation on creating new corpus definitions is in [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md) / [Writing a corpus definition in JSON](./Writing-a-corpus-definition-in-JSON.md). - -### Python corpora - -Currently, all corpora that are used in production are *Python corpora*, meaning they are defined in the source code. To include these corpora in your environment, you need to add them to your local settings and create an index in Elasticsearch. - -The source files of a corpus are not included in this directory; ask another developer about their availability. If you have (a sample of) the source files for a corpus, you can add the corpus your our environment as follows: - -1. Add the corpus to the `CORPORA` dictionary in your local settings file. See [CORPORA settings documentation](/documentation/Django-project-settings.md#corpora). -2. Set configurations for your corpus. Check the definition file to see which variables it expects to find in the configuration. Some of these may be optional, but you will at least need to define the (absolute) path to your source files. -3. Activate your python virtual environment. Run the `loadcorpora` admin command (`yarn django loadcorpora`) to register the new corpus in the SQL database. Then create an ElasticSearch index from the source files by running, e.g., `yarn django index dutchannualreports`, for indexing the Dutch Annual Reports corpus in a development environment. See [Indexing](documentation/Indexing-corpora.md) for more information. - -### Database-only corpora - -Note: database-only corpora are still in development and not yet recommended for first-time users. +```py +CORPORA = { + 'example': 'corpora_test.basic.corpus.ExampleCorpus', +} +``` -To add a database-only corpus, you will need a JSON definition of the corpus, and a directory with (a sample of) the pre-processed source data. 
To retrieve a JSON definition from a running Textcavator server, log in as a staff user and visit `/corpus-definitions/`. Open the corpus you want to import and click "Download JSON". +Save the file and close. For the next step, PostgreSQL and Elasticsearch must be running. Run in the terminal: -1. Start up your Textcavator server and log in as a staff user. Go to `localhost:4200/corpus-definitions/new`. Upload the JSON definition file and save. -2. Visit the admin menu (`localhost:4200/admin`). Go to "corpus configurations" and select your corpus. In the "data directory" field, add the path to your source data directory. -3. Activate your python virtual environment. Create an ElasticSearch index from the source files by running `yarn django index {corpusname}`. See [Indexing](documentation/Indexing-corpora.md) for more information. -4. Visit the admin menu again. Go to "corpora" and select te corpus. Set "active" to true and save. +```sh +yarn django loadcorpora +yarn django index example +``` +This will save the corpus configuration in the database and index the corpus data in Elasticsearch. ## Running a dev environment @@ -93,6 +83,8 @@ Now that you have a working Textcavator environment, here are some common next s Configure your environment -> [Django project settings](./Django-project-settings.md) / [Frontend environment settings](./Frontend-environment-settings.md) +Add an existing corpus -> [Adding existing corpora](./Adding-existing-corpora.md) + Create a new Python corpus -> [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md) Add SAML intergration in your environment -> [SAML](./SAML.md) diff --git a/documentation/README.md b/documentation/README.md index 0920bcd67..f1a3e96e9 100644 --- a/documentation/README.md +++ b/documentation/README.md @@ -10,6 +10,7 @@ This directory contains documentation for developers. 
## Adding corpora +- [Adding existing corpora](./Adding-existing-corpora.md) - [Corpus definitions](./Corpus-definitions.md) - [Corpus database models](/Corpus-database-models.md) - [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md) From 11fa5445b3659bccaaaa911953e74c94d0f73b0b Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 26 Mar 2026 17:13:33 +0000 Subject: [PATCH 4/5] remove unnecessary documentation on json corpora --- documentation/Adding-existing-corpora.md | 18 +++++------------- documentation/Django-project-settings.md | 2 +- .../Writing-a-corpus-definition-in-JSON.md | 16 ---------------- 3 files changed, 6 insertions(+), 30 deletions(-) delete mode 100644 documentation/Writing-a-corpus-definition-in-JSON.md diff --git a/documentation/Adding-existing-corpora.md b/documentation/Adding-existing-corpora.md index be16ecad7..418aafea4 100644 --- a/documentation/Adding-existing-corpora.md +++ b/documentation/Adding-existing-corpora.md @@ -2,7 +2,7 @@ These instructions are for adding *already defined* corpora to your own environment. This means you would be working with a corpus that is already used in Textcavator or by other developers. -Documentation on creating *new* corpus definitions is in [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md) / [Writing a corpus definition in JSON](./Writing-a-corpus-definition-in-JSON.md). +Documentation on creating *new* corpus definitions is in [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md), or in the user manual (for creating corpora through the corpus form). ### Python corpora @@ -10,18 +10,10 @@ Currently, all corpora that are used in production are *Python corpora*, meaning The source files of a corpus are not included in this directory; ask another developer about their availability. If you have (a sample of) the source files for a corpus, you can add the corpus your our environment as follows: -1. 
Add the corpus to the `CORPORA` dictionary in your local settings file. See [CORPORA settings documentation](/documentation/Django-project-settings.md#corpora). -2. Set configurations for your corpus. Check the definition file to see which variables it expects to find in the configuration. Some of these may be optional, but you will at least need to define the (absolute) path to your source files. -3. Activate your python virtual environment. Run the `loadcorpora` admin command (`yarn django loadcorpora`) to register the new corpus in the SQL database. Then create an ElasticSearch index from the source files by running, e.g., `yarn django index dutchannualreports`, for indexing the Dutch Annual Reports corpus in a development environment. See [Indexing](documentation/Indexing-corpora.md) for more information. +1. Add the corpus to the `CORPORA` dictionary in your local settings file. See [CORPORA settings documentation](./Django-project-settings.md#corpora). +2. Set custom settings for your corpus. Check the definition file to see which variables it expects to find in the Django settings. Some of these may be optional. +3. Activate your python virtual environment. Run the `loadcorpora` admin command (`yarn django loadcorpora`) to register the new corpus in the SQL database. Then create an ElasticSearch index from the source files by running, e.g., `yarn django index mycorpus`. See [Indexing](./Indexing-corpora.md) for more information. ### Database-only corpora -Note: database-only corpora are still in development. - -To add a database-only corpus, you will need a JSON definition of the corpus, and a CSV file with the source data. To retrieve a JSON definition from a running Textcavator server, log in as a staff user and visit `/corpus-definitions/`. Open the corpus you want to import and click "Download JSON". - -1. Start up your Textcavator server and log in as a superuser. Go to `localhost:4200/corpus-definitions/new`. Upload the JSON definition file and save. 
-2. Open the editing form for the corpus. In step 1, you can upload an image, but this is optional. In step 2, upload your source data and save. -3. Continue to step 4 of the form and index the corpus. When indexing is complete, click "activate". -4. Visit the admin site at `/admin`. Go to "corpora" and select te corpus. Set "active" to true and save. - +Database-only corpora are still in development, but we plan to support making copies of your created corpora by downloading the configuration. This feature should be documented in the user manual. diff --git a/documentation/Django-project-settings.md b/documentation/Django-project-settings.md index 72b9b0218..2f03e7828 100644 --- a/documentation/Django-project-settings.md +++ b/documentation/Django-project-settings.md @@ -73,7 +73,7 @@ Unit tests for the backend will assume that there is a default server configured A dictionary that specifies Python corpus definitions that should be imported in your project. -Each key must be the import path to a corpus class (see [Django module loading](https://docs.djangoproject.com/en/5.2/ref/utils/#module-django.utils.module_loading)). For example: +Each value must be the import path to a corpus class (see [Django module loading](https://docs.djangoproject.com/en/5.2/ref/utils/#module-django.utils.module_loading)). For example: ```python CORPORA = { diff --git a/documentation/Writing-a-corpus-definition-in-JSON.md b/documentation/Writing-a-corpus-definition-in-JSON.md deleted file mode 100644 index c797d7b36..000000000 --- a/documentation/Writing-a-corpus-definition-in-JSON.md +++ /dev/null @@ -1,16 +0,0 @@ -# Writing a corpus definition in JSON - -Database-only corpora support a JSON format for creating or editing corpus definitions. Like Python definitions, a JSON definition can be used to store and share a configuration for a corpus. - -The format is defined in [corpus.schema.json](/backend/addcorpus/schemas/corpus.schema.json). 
You can find an example in the [test JSON definition](../backend/corpora_test/basic/corpus.json). - -We do not (currently) have a guide to writing JSON definitions, though the JSON schema includes descriptions for each field. - -## Importing and exporting definitions - -You can import and export JSON definitions through the frontend. Visit `/corpus-definitions/` to do so. - -Some notes on importing and exporting JSON definitions: - -- A JSON definition is less detailed than the database model. This is because the database model must also support Python corpora (which offer more customisation) and legacy options. If you edit a corpus through the admin, exporting it to JSON and importing it again may include some normalisation. -- Some properties of the corpus are not handled through the JSON interface, though they are supported in database-only corpora. Currently, these can only be configured in the admin. These are the corpus image, documentation pages, and data directory. You an edit these properties in the admin site once you have uploaded the JSON definition. From 476f0e516300b925649d670b77b2cdfc3528b107 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 26 Mar 2026 17:33:55 +0000 Subject: [PATCH 5/5] fix test --- backend/addcorpus/python_corpora/tests/test_corpusimport.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/addcorpus/python_corpora/tests/test_corpusimport.py b/backend/addcorpus/python_corpora/tests/test_corpusimport.py index 3f572a321..06ae54171 100644 --- a/backend/addcorpus/python_corpora/tests/test_corpusimport.py +++ b/backend/addcorpus/python_corpora/tests/test_corpusimport.py @@ -34,5 +34,5 @@ def test_import_error(db, settings): def test_corpus_dir(db, settings, basic_mock_corpus): path = load_corpus.corpus_dir(basic_mock_corpus) assert os.path.isabs(path) - assert 'mock_csv_corpus.py' in os.listdir(path) + assert 'corpus.py' in os.listdir(path) assert 'source_data' in os.listdir(path)