From 2b9553c6372859fc8fe80f8ba4d00329cafa1eb6 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 17 Jun 2026 14:26:35 +0000 Subject: [PATCH] remove 4.x migration guide + script close #1924 --- backend/ianalyzer/flask_data_transfer.py | 237 ------------------ .../tests/test_flask_data_transfer.py | 172 ------------- documentation/Migration-from-flask.md | 107 -------- 3 files changed, 516 deletions(-) delete mode 100644 backend/ianalyzer/flask_data_transfer.py delete mode 100644 backend/ianalyzer/tests/test_flask_data_transfer.py delete mode 100644 documentation/Migration-from-flask.md diff --git a/backend/ianalyzer/flask_data_transfer.py b/backend/ianalyzer/flask_data_transfer.py deleted file mode 100644 index ca0e99023..000000000 --- a/backend/ianalyzer/flask_data_transfer.py +++ /dev/null @@ -1,237 +0,0 @@ -import csv -import os -import base64 -from django.contrib.auth.models import Group -from users.models import CustomUser -from django.db import connection -from django.db.utils import IntegrityError -from addcorpus.models import Corpus -from api.models import Query -from download.models import Download -import json -from django.conf import settings -import warnings -from allauth.account.models import EmailAddress -from api.migration_utils.query_model_to_es_query import query_model_to_es_query - - -def adapt_password_encoding(flask_encoded): - '''Adapt encoded password hash from flask to django format''' - description, salt, hashed = flask_encoded.split('$', 3) - alg, hash_alg, iteration = description.split(':', 3) - raw_hash = base64.b16decode(hashed.strip().encode('ascii').upper()) - hashed = base64.b64encode(raw_hash).decode('ascii').strip() - rest = '$'.join([iteration, salt, hashed]) - return f'{alg}_{hash_alg}${rest}' - - -flask_table_columns = { - 'user': [ - 'id', 'username', 'password', 'email', 'active', 'authenticated', - 'download_limit', 'role_id', 'saml', - ], - 'role': ['id', 'name', 'description'], - 'corpus': ['id', 'name', 'description'], - 'corpora_roles': ['role_id', 'corpus_id'], - 'query': [ - 'id', 'query', 'started', 'completed', 'aborted', 'userID', 'transferred', - 'corpus_name', 'total_results', - ], - 'download': [ - 'id', 'started', 'completed', 'download_type', 'corpus_name', 'user_id', 'parameters', - 'filename' - ] -} - - -def extract_row_data(values, table): - columns = flask_table_columns[table] - return dict(zip(columns, values)) - - -def import_table_data(directory, table): - ''' - Import a data file. `directory` is the directory of the - flask data dump, `table` is the name of the table, which - should also be the file. E.g. table `user` is imported from `user.txt`. - Returns an empty list if the file does not exist. - ''' - - filepath = os.path.join(directory, f'{table}.txt') - if not os.path.exists(filepath): - warnings.warn( - f'Missing file {table}.txt to import data: skipping table migration', - Warning) - return [] - - with open(filepath) as userfile: - reader = csv.reader(userfile, delimiter='\t') - data = [extract_row_data(row, table) for row in reader] - return data - - -def save_flask_group(row): - ''' - Save a Group based on a datarow from the flask SQL data - - The `Group` argument specifies the relevant model, in this case `Group`. - Relevant during migrations - for unit testing this can be left blank, - so it is imported directly from users.models. - - Other models can be included for compatiblity with other functions, - they don't do anything. - ''' - - group = Group(id=row['id'], name=row['name']) - group.save() - - -def save_flask_user(row): - 'Save a User based on a datarow from the flask SQL data' - user = CustomUser( - id=row['id'], - username=row['username'], - password='', # we will set the password below - email=row['email'], - download_limit=row['download_limit'], - saml=null_to_none(row['saml']), - ) - user.save() - - if not null_to_none(row['role_id']): - group = Group.objects.get(name='basic') - else: - group = Group.objects.get(id=row['role_id']) - - user.groups.add(group) - - if group.name == 'admin': - user.is_staff = True - user.is_superuser = True - user.save() - - # now set the password hash - old_hash = null_to_none(row['password']) # for saml users, password can be null - if old_hash: - new_hash = adapt_password_encoding(old_hash) - with connection.cursor() as cursor: - cursor.execute( - 'UPDATE users_customuser SET password = %s WHERE id = %s', - [new_hash, row['id']] - ) - - # add an Allauth verified email address - allauth_email = EmailAddress.objects.filter(email=user.email).first() - if not allauth_email: - allauth_email = EmailAddress(email=user.email) - else: - print(f'duplicate user found for email: {user}') - - # set further details - allauth_email.verified = row['active'] - allauth_email.primary = True - allauth_email.user = user - allauth_email.save() - - - -def save_flask_corpus(row): - corpus = Corpus(id=row['id'], name=row['name']) - corpus.save() - - -def save_flask_corpus_role(row): - corpus = Corpus.objects.get(id=row['corpus_id']) - group = Group.objects.get(id=row['role_id']) - corpus.groups.add(group) - - -def null_to_none(value): - '''return None if the value is `'\\N'`, i.e. null''' - return value if value != '\\N' else None - -def load_json_value(string_value): - return json.loads(string_value.replace('\\\\', '\\')) - -def save_flask_query(row): - user_id = null_to_none(row['userID']) - - if not user_id: - return - - corpus_name = row['corpus_name'] - if not Corpus.objects.filter(name=corpus_name): - # some queries refer to corpus names that no longer exist - return - - query_model = load_json_value(row['query']) - es_query = query_model_to_es_query(query_model) - query = Query( - id=row['id'], - query_json=es_query, - corpus=Corpus.objects.get(name=corpus_name), - user=CustomUser.objects.get(id=user_id), - completed=null_to_none(row['completed']), - aborted=null_to_none(row['aborted']), - transferred=null_to_none(row['transferred']), - total_results=null_to_none(row['total_results']) - ) - query.save() - - # started would be overridden on first save, so set it now - query.started = row['started'] - query.save() - - -def save_flask_download(row): - download = Download( - id=row['id'], - completed=null_to_none(row['completed']), - download_type=row['download_type'], - corpus=Corpus.objects.get(name=row['corpus_name']), - user=CustomUser.objects.get(id=row['user_id']), - parameters=load_json_value(row['parameters']), - filename=os.path.relpath(row['filename'], settings.CSV_FILES_PATH), - ) - download.save() - - # started would be overridden on first save, so set it now - download.started = row['started'] - download.save() - - -def import_and_save_table(directory, flask_table_name, save_function, **kwargs): - for row in import_table_data(directory, flask_table_name): - try: - save_function(row, **kwargs) - except: - warnings.warn( - f'Could not migrate row {row}', - Warning - ) - - -def import_and_save_all_data(directory): - - if not os.path.isdir(directory): - warnings.warn( - f'Directory {directory} to import Flask data does not exist: skipping database migration', - Warning - ) - pass - - tables = [ - ('role', save_flask_group), - ('user', save_flask_user), - ('corpus', save_flask_corpus), - ('corpora_roles', save_flask_corpus_role), - ('query', save_flask_query), - ('download', save_flask_download) - ] - - for flask_table_name, save_function in tables: - with warnings.catch_warnings(): - # ignore runtime warnings about time zones - # (the imported does not include timezone info and django warns about that) - warnings.simplefilter('ignore', RuntimeWarning) - import_and_save_table(directory, flask_table_name, save_function) diff --git a/backend/ianalyzer/tests/test_flask_data_transfer.py b/backend/ianalyzer/tests/test_flask_data_transfer.py deleted file mode 100644 index be0b71936..000000000 --- a/backend/ianalyzer/tests/test_flask_data_transfer.py +++ /dev/null @@ -1,172 +0,0 @@ -import os -from datetime import datetime - -import django.contrib.auth.hashers as hashers -import pytest -from addcorpus.models import Corpus -from allauth.account.models import EmailAddress -from api.models import Query -from django.contrib.auth.models import Group -from download.models import Download -from ianalyzer.flask_data_transfer import * -from users.models import CustomUser - -_here = os.path.abspath(os.path.dirname(__file__)) -flask_test_data_dir = os.path.join(_here, 'flask_test_data') - - -def test_legacy_data_import(): - user_data = import_table_data(flask_test_data_dir, 'user') - - assert len(user_data) == 4 - - user = user_data[0] - expected_user = { - 'id': '1', - 'username': 'admin', - 'password': 'password', - 'email': 'admin@textcavator.nl', - 'active': '1', - 'authenticated': '1', - 'download_limit': '10000', - 'role_id': '2', - 'saml': '0' - } - - for key in expected_user: - if key == 'password': - encoded = adapt_password_encoding(user['password']) - assert hashers.check_password(expected_user['password'], encoded) - else: - assert user[key] == expected_user[key] - - -def test_roles_import(): - role_data = import_table_data(flask_test_data_dir, 'role') - - assert len(role_data) == 2 - - role = role_data[0] - expected_role = { - 'id': '1', - 'name': 'basic', - 'description': 'corpora for public access' - } - - assert role == expected_role - - -def test_save_groups(db): - import_and_save_table(flask_test_data_dir, 'role', save_flask_group) - - groups = Group.objects.all() - assert len(groups) == 2 - - -def test_save_legacy_user(db): - import_and_save_table(flask_test_data_dir, 'role', save_flask_group) - import_and_save_table(flask_test_data_dir, 'user', save_flask_user) - - users = CustomUser.objects.all() - - assert len(users) == 4 - admin = CustomUser.objects.get(username='admin') - assert admin.username == 'admin' - assert admin.email == 'admin@textcavator.nl' - assert admin.is_superuser - assert admin.is_staff - assert not admin.saml - assert list(admin.groups.all()) == [Group.objects.get( - name='basic'), Group.objects.get(name='admin')] - - allauth_email = EmailAddress.objects.get(user=admin) - assert allauth_email.email == admin.email - assert allauth_email.verified - - saml = users[1] - assert not saml.is_superuser - assert saml.saml - -def test_save_corpora(db): - import_and_save_table(flask_test_data_dir, 'role', save_flask_group) - import_and_save_table(flask_test_data_dir, 'corpus', save_flask_corpus) - import_and_save_table(flask_test_data_dir, - 'corpora_roles', save_flask_corpus_role) - - corpora = Corpus.objects.all() - assert len(corpora) == 13 - - corpus = Corpus.objects.get(id='13') - assert corpus.name == 'parliament-ireland' - assert set(corpus.groups.all()) == set(Group.objects.all()) - - -def dates_match(datetime1, datetime2): - '''To avoid timezone issues, just check the dates to compare to datetime objects''' - return datetime1.date() == datetime2.date() - - -@pytest.mark.filterwarnings( - 'ignore:DateTimeField .* received a naive datetime (.*) while time zone support is active' -) -def test_save_queries(db): - import_and_save_all_data(flask_test_data_dir) - - queries = Query.objects.all() - assert len(queries) == 11 - - query = Query.objects.get(id='507') - - assert query.query_json == { - "sort": [{"date": "desc"}], - "query": {"bool": {"must": {"match_all": {}}, "filter": []}} - } - - - assert dates_match(query.started, - datetime(year=2022, month=12, day=7, hour=14, minute=18, second=6)) - assert query.completed is None - assert query.total_results == 7915 - assert query.aborted is False - assert query.transferred == 0 - assert query.user == CustomUser.objects.get(username='admin') - assert query.corpus == Corpus.objects.get(name='parliament-ireland') - - -@pytest.mark.filterwarnings( - 'ignore:DateTimeField .* received a naive datetime (.*) while time zone support is active' -) -def test_save_downloads(db): - import_and_save_all_data(flask_test_data_dir) - - downloads = Download.objects.all() - assert len(downloads) == 10 - - download = Download.objects.get(id='49') - assert dates_match(download.started, - datetime(year=2022, month=11, day=21, hour=10, minute=59, second=26)) - assert dates_match(download.completed, - datetime(year=2022, month=11, day=21, hour=10, minute=59, second=27)) - assert download.corpus == Corpus.objects.get(name='parliament-uk') - assert download.user == CustomUser.objects.get(id='1') - assert download.parameters == { - "corpus": "parliament-uk", - "es_query": { - "query": {"bool": {"must": {"match_all": {}}, "filter": []}}, - "sort": [{"date": "desc"}]}, - "fields": ["date", "speech", "id", "sequence", "speaker"], - "route": "/search/parliament-uk" - } - - _, filename = os.path.split(download.filename) - assert filename == 'parliament-uk.csv' - - -def test_no_data_to_import(db): - '''Assert that a missing directory or missing files will raise a warning - but not crash''' - with pytest.warns(Warning, match='skipping database migration'): - import_and_save_all_data('./nonexistent-directory') - - with pytest.warns(Warning, match='skipping table migration'): - import_table_data(flask_test_data_dir, 'nonexistent_table.txt') diff --git a/documentation/Migration-from-flask.md b/documentation/Migration-from-flask.md deleted file mode 100644 index d5e496c8d..000000000 --- a/documentation/Migration-from-flask.md +++ /dev/null @@ -1,107 +0,0 @@ -# Migrating from Flask - -This documentation concerns environments that were already running Textcavator with a Flask backend (version 3.x or lower). - -## Install django backend - -Run `yarn install-back` to install the new python requirements. - -If you do not have postreSQL installed, install it now. - -Set up a new database and run migrations by running these commands from the backend: - -```bash -psql -f create_db.sql -yarn django migrate -``` - -## Migrating SQL data - -You may want to to migrate the SQL data to the new django backend. (You do not need to do anything for elasticsearch data.) - -The SQL database contains the user data, so migration is essential for production environments. In a development environment, you may prefer to skip this step and just create a new superuser with `yarn django createsuperuser`. - -This update constitutes a switch from Flask to Django, as well as a switch from mySQL to postgreSQL. Migration consists of three steps: -- Create .txt files of the old database from mySQL. -- Move your database backup to the desired location -- Import data into the postgreSQL database using the django shell. - -In a *production* environment, the first two steps will need to be carried out by the system admin. - -### Exporting data from mysql - -To make database exports, your mySQL database user needs to have file privileges, which may not already be the case. You can grant these privileges to your user by getting into mySQL as the root user and running - -```sql -grant all privileges on {database}.* to {user}@'localhost' with grant option; -``` - -or just execute the following as the root user. - -Next, check where you are allowed to export files. (It seems to be the standard that mySQL will only allow you to export files to a specific directory.) - -Run the following: - -```sql -show variables like "secure_file_priv"; -``` - -The output will specify the directory. I will assume that this is `/var/lib/mysql-files/`. If you get a different directory, substitute it in the steps below. - -Run the following to export the data. - -```sql -use ianalyzer; -select * from corpus into outfile '/var/lib/mysql-files/corpus.txt'; -select * from corpora_roles into outfile '/var/lib/mysql-files/corpora_roles.txt'; -select * from download into outfile '/var/lib/mysql-files/download.txt'; -select * from query into outfile '/var/lib/mysql-files/query.txt'; -select * from role into outfile '/var/lib/mysql-files/role.txt'; -select * from user into outfile '/var/lib/mysql-files/user.txt'; -``` - -### Move exported data - -You exported data are in `/var/lib/mysql-files/`, which is inconvenient and requires sudo privileges to access, so you should move the files to a more convenient folder. - -### Import data in django - -Activate your python environment and run `yarn django shell`. Then run: - -```python -from ianalyzer.flask_data_transfer import import_and_save_all_data -directory = 'path/to/your/data' -import_and_save_all_data(directory) -``` - -Regarding the directory: - -- In production, the location of the flask migration is stored in the django settings. Use `directory = settings.FLASK_MIGRATION_DATA` -- If `directory` does not exist or does not contain relevant files, the script will not import anything. - -The script expects to run on an **empty** database, as it will also copy object IDs. This means that if the script fails halfway through, you will need to reset the database before you can re-attempt. You can do this from the command line with `yarn django flush`. - -### Update object IDs - -Now you need to make sure that your new database is aware of the newest object IDs, or it will attempt to create new rows with duplicate IDs. Run the following commands from the backend: - -```bash -python manage.py sqlsequencereset users | python manage.py dbshell -python manage.py sqlsequencereset addcorpus | python manage.py dbshell -python manage.py sqlsequencereset api | python manage.py dbshell -python manage.py sqlsequencereset download | python manage.py dbshell -``` - -## Add local settings - -In `backend/ianalyzer`, make a file `settings_local.py`. Transfer relevant local settings you had configured in your `config.py` file for Flask. - -Note that the new `settings_local` does not need all the information you had provided in`config`. For a development environment, is is probably sufficient to simply specify the `CORPORA`, and the locations of corpus source data and word model files. - -## Transfer downloads - -In the flask backend, the default storage location for CSV files was `/backend/api/csv_files/`. - -In a development environment, the new default location is `/backend/download/csv_files/`. (This can be configured in settings.) You will have to move the contents of your CSV directory here if you want to keep your download history. - -For a production environment, the csv files need to be moved from the old flask server to the new django server, if you are also moving servers. Check the deployment settings for the new location of the downloads. (This should be outside of the repository.)