Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,5 @@ def test_import_error(db, settings):
def test_corpus_dir(db, settings, basic_mock_corpus):
path = load_corpus.corpus_dir(basic_mock_corpus)
assert os.path.isabs(path)
assert 'mock_csv_corpus.py' in os.listdir(path)
assert 'corpus.py' in os.listdir(path)
assert 'source_data' in os.listdir(path)
8 changes: 4 additions & 4 deletions backend/addcorpus/python_corpora/tests/test_save_corpus.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import sys
import pytest
from django.conf import settings
from corpora_test.basic.mock_csv_corpus import MockCSVCorpus
from corpora_test.basic.corpus import ExampleCorpus
from addcorpus.models import Corpus, CorpusConfiguration
from addcorpus.python_corpora.save_corpus import (_save_field_in_database,
load_and_save_all_corpora, _save_or_skip_corpus
Expand Down Expand Up @@ -48,7 +48,7 @@ def test_no_errors_when_saving_corpora(db, capsys):

def test_saving_broken_corpus(db, basic_mock_corpus):
corpus = Corpus.objects.get(name=basic_mock_corpus)
corpus_def = MockCSVCorpus()
corpus_def = ExampleCorpus()

corpus_def.min_date = 'Not a valid date'

Expand Down Expand Up @@ -90,7 +90,7 @@ def deactivated_corpus(basic_mock_corpus):
def test_save_field_definition(db, basic_mock_corpus, deactivated_corpus):
corpus = Corpus.objects.get(name=basic_mock_corpus)
corpus_conf = corpus.configuration
corpus_def = MockCSVCorpus()
corpus_def = ExampleCorpus()

corpus_conf.fields.all().delete()

Expand All @@ -107,7 +107,7 @@ def test_save_corpus_purity(db, basic_mock_corpus):
'''

corpus = Corpus.objects.get(name=basic_mock_corpus)
corpus_def = MockCSVCorpus()
corpus_def = ExampleCorpus()

corpus_def.es_alias = 'test'
_save_or_skip_corpus(basic_mock_corpus, corpus_def)
Expand Down
4 changes: 2 additions & 2 deletions backend/addcorpus/tests/test_csvcorpus.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from corpora_test.basic.mock_csv_corpus import MockCSVCorpus
from corpora_test.basic.corpus import ExampleCorpus
import os

here = os.path.abspath(os.path.dirname(__file__))
Expand Down Expand Up @@ -49,7 +49,7 @@


def test_csv():
corpus = MockCSVCorpus()
corpus = ExampleCorpus()

sources = list(corpus.sources(start=corpus.min_date, end=corpus.max_date))
assert len(sources) == 1 and sources[0][1] == {'filename': 'example.csv'}
Expand Down
8 changes: 4 additions & 4 deletions backend/addcorpus/tests/test_word_models_present.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

here = os.path.abspath(os.path.dirname(__file__))

class ExampleCorpus(CorpusDefinition):
class ExampleNoModels(CorpusDefinition):
"""Example corpus class for testing"""

title = "Example"
Expand All @@ -17,12 +17,12 @@ class ExampleCorpus(CorpusDefinition):

fields = []

class ExampleCorpusWithWordModels(ExampleCorpus):
class ExampleWithModels(ExampleNoModels):
word_model_path = here

def test_word_models_present():
corpus = ExampleCorpus()
corpus = ExampleNoModels()
assert corpus.word_models_present == False

corpus_with_word_models = ExampleCorpusWithWordModels()
corpus_with_word_models = ExampleWithModels()
assert corpus_with_word_models.word_models_present == True
2 changes: 1 addition & 1 deletion backend/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def index_json_mock_corpus(db, es_client: Elasticsearch, json_mock_corpus: Corpu

@pytest.fixture()
def json_corpus_definition():
path = os.path.join(settings.BASE_DIR, 'corpora_test', 'basic', 'mock_corpus.json')
path = os.path.join(settings.BASE_DIR, 'corpora_test', 'basic', 'corpus.json')
with open(path) as f:
return json.load(f)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

here = os.path.abspath(os.path.dirname(__file__))

class MockCSVCorpus(CSVCorpusDefinition):
class ExampleCorpus(CSVCorpusDefinition):
'''
Basic CSV corpus.

Expand Down
4 changes: 2 additions & 2 deletions backend/corpora_test/media/media_mock_corpus.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import os

from corpora_test.basic.mock_csv_corpus import MockCSVCorpus
from corpora_test.basic.corpus import ExampleCorpus
from media.media_url import media_url

here = os.path.abspath(os.path.dirname(__file__))

class MediaMockCorpus(MockCSVCorpus):
class MediaMockCorpus(ExampleCorpus):
'''
Test corpus that includes image attachments to documents.
'''
Expand Down
2 changes: 1 addition & 1 deletion backend/ianalyzer/settings_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def mock_corpus_path(*path):
'multilingual-mock-corpus': 'corpora_test.mixed_language.multilingual_mock_corpus.MultilingualMockCorpus',
'times': 'corpora.times.times.Times',
'media-mock-corpus': 'corpora_test.media.media_mock_corpus.MediaMockCorpus',
'mock-csv-corpus': 'corpora_test.basic.mock_csv_corpus.MockCSVCorpus',
'mock-csv-corpus': 'corpora_test.basic.corpus.ExampleCorpus',
'wordmodels-mock-corpus': 'corpora_test.wordmodels.wm_mock_corpus.WordmodelsMockCorpus',
'tagging-mock-corpus': 'corpora_test.tag.tag_mock_corpus.TaggingMockCorpus',
'annotated-mock-corpus': 'corpora_test.named_entities.annotated_mock_corpus.AnnotatedMockCorpus',
Expand Down
19 changes: 19 additions & 0 deletions documentation/Adding-existing-corpora.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
## Adding existing corpora

These instructions are for adding *already defined* corpora to your own environment. This means you would be working with a corpus that is already used in Textcavator or by other developers.

Documentation on creating *new* corpus definitions is in [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md), or in the user manual (for creating corpora through the corpus form).

### Python corpora

Currently, all corpora that are used in production are *Python corpora*, meaning they are defined in the source code. To include these corpora in your environment, you need to add them to your local settings and create an index in Elasticsearch.

The source files of a corpus are not included in this directory; ask another developer about their availability. If you have (a sample of) the source files for a corpus, you can add the corpus to your own environment as follows:

1. Add the corpus to the `CORPORA` dictionary in your local settings file. See [CORPORA settings documentation](./Django-project-settings.md#corpora).
2. Set custom settings for your corpus. Check the definition file to see which variables it expects to find in the Django settings. Some of these may be optional.
3. Activate your Python virtual environment. Run the `loadcorpora` admin command (`yarn django loadcorpora`) to register the new corpus in the SQL database. Then create an Elasticsearch index from the source files by running, e.g., `yarn django index mycorpus`. See [Indexing](./Indexing-corpora.md) for more information.

### Database-only corpora

Database-only corpora are still in development, but we plan to support making copies of your created corpora by downloading the configuration. This feature should be documented in the user manual.
2 changes: 1 addition & 1 deletion documentation/Django-project-settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ Unit tests for the backend will assume that there is a default server configured

A dictionary that specifies Python corpus definitions that should be imported in your project.

Each key must be the import path to a corpus class (see [Django module loading](https://docs.djangoproject.com/en/5.2/ref/utils/#module-django.utils.module_loading)). For example:
Each value must be the import path to a corpus class (see [Django module loading](https://docs.djangoproject.com/en/5.2/ref/utils/#module-django.utils.module_loading)). For example:

```python
CORPORA = {
Expand Down
40 changes: 16 additions & 24 deletions documentation/First-time-setup.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# First time setup (for developers)

These are instructions to set up an Textcavator server. If you are going to develop Textcavator, start by following these instructions.
These are instructions to set up a Textcavator development server. If you are going to develop Textcavator, start by following these instructions.

## Prerequisites

Expand Down Expand Up @@ -50,34 +50,24 @@ DATA_DIR=where/corpus/data/is/located/on/your/machine
Note: you can also call the .env file .myenv and specify this during startup:
`docker-compose --env-file .myenv up`

## Add a test corpus

## Adding corpora
These instructions will add a tiny example corpus to your environment. Use this to verify that everything is working correctly. Open the file `/backend/ianalyzer/settings_local.py`. Copy-paste:

These instructions are for adding *already defined* corpora to your own environment. This means you would be working with a corpus that is already used in Textcavator or by other developers.

In a first-time setup, it is recommended that you add at least one existing corpus before creating your own. Documentation on creating new corpus definitions is in [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md) / [Writing a corpus definition in JSON](./Writing-a-corpus-definition-in-JSON.md).

### Python corpora

Currently, all corpora that are used in production are *Python corpora*, meaning they are defined in the source code. To include these corpora in your environment, you need to add them to your local settings and create an index in Elasticsearch.

The source files of a corpus are not included in this directory; ask another developer about their availability. If you have (a sample of) the source files for a corpus, you can add the corpus your our environment as follows:

1. Add the corpus to the `CORPORA` dictionary in your local settings file. See [CORPORA settings documentation](/documentation/Django-project-settings.md#corpora).
2. Set configurations for your corpus. Check the definition file to see which variables it expects to find in the configuration. Some of these may be optional, but you will at least need to define the (absolute) path to your source files.
3. Activate your python virtual environment. Run the `loadcorpora` admin command (`yarn django loadcorpora`) to register the new corpus in the SQL database. Then create an ElasticSearch index from the source files by running, e.g., `yarn django index dutchannualreports`, for indexing the Dutch Annual Reports corpus in a development environment. See [Indexing](documentation/Indexing-corpora.md) for more information.

### Database-only corpora

Note: database-only corpora are still in development and not yet recommended for first-time users.
```py
CORPORA = {
'example': 'corpora_test.basic.corpus.ExampleCorpus',
}
```

To add a database-only corpus, you will need a JSON definition of the corpus, and a directory with (a sample of) the pre-processed source data. To retrieve a JSON definition from a running Textcavator server, log in as a staff user and visit `/corpus-definitions/`. Open the corpus you want to import and click "Download JSON".
Save the file and close. For the next step, PostgreSQL and Elasticsearch must be running. Run in the terminal:

1. Start up your Textcavator server and log in as a staff user. Go to `localhost:4200/corpus-definitions/new`. Upload the JSON definition file and save.
2. Visit the admin menu (`localhost:4200/admin`). Go to "corpus configurations" and select your corpus. In the "data directory" field, add the path to your source data directory.
3. Activate your python virtual environment. Create an ElasticSearch index from the source files by running `yarn django index {corpusname}`. See [Indexing](documentation/Indexing-corpora.md) for more information.
4. Visit the admin menu again. Go to "corpora" and select te corpus. Set "active" to true and save.
```sh
yarn django loadcorpora
yarn django index example
```

This will save the corpus configuration in the database and index the corpus data in Elasticsearch.

## Running a dev environment

Expand All @@ -93,6 +83,8 @@ Now that you have a working Textcavator environment, here are some common next s

Configure your environment -> [Django project settings](./Django-project-settings.md) / [Frontend environment settings](./Frontend-environment-settings.md)

Add an existing corpus -> [Adding existing corpora](./Adding-existing-corpora.md)

Create a new Python corpus -> [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md)

Add SAML integration in your environment -> [SAML](./SAML.md)
1 change: 1 addition & 0 deletions documentation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ This directory contains documentation for developers.

## Adding corpora

- [Adding existing corpora](./Adding-existing-corpora.md)
- [Corpus definitions](./Corpus-definitions.md)
- [Corpus database models](./Corpus-database-models.md)
- [Writing a corpus definition in Python](./Writing-a-corpus-definition-in-Python.md)
Expand Down
16 changes: 0 additions & 16 deletions documentation/Writing-a-corpus-definition-in-JSON.md

This file was deleted.

Loading