From 576ccd0c2d701a19041b63a741a97b63904ba802 Mon Sep 17 00:00:00 2001
From: Philip Mateescu
Date: Sat, 7 Feb 2026 16:33:46 +0800
Subject: [PATCH] Add SQLite import support

---
Review note: sqlite/importcsv.py was revised after this patch was generated,
so the blob id on its "index" line below is stale.

 README.md                    |  13 ++
 sqlite/importcsv.py          | 147 +++++++++++++++++++++++
 sqlite/sql/CreateIndexes.sql |  39 ++++++
 sqlite/sql/CreateTables.sql  | 226 +++++++++++++++++++++++++++++++++++
 4 files changed, 425 insertions(+)
 create mode 100644 sqlite/importcsv.py
 create mode 100644 sqlite/sql/CreateIndexes.sql
 create mode 100644 sqlite/sql/CreateTables.sql

diff --git a/README.md b/README.md
index 3f6d200..f7ffebe 100644
--- a/README.md
+++ b/README.md
@@ -190,6 +190,19 @@ $ mysql/importcsv.sh /csvdir/*
 $ mysql/exec_sql.sh < mysql/AssignPrimaryKeys.sql
 ```
 
+#### Importing into SQLite
+
+```sh
+# Create database tables
+$ sqlite3 /path/to/discogs.sqlite < sqlite/sql/CreateTables.sql
+
+# Import CSV files
+$ python3 sqlite/importcsv.py --db=/path/to/discogs.sqlite /csvdir/*
+
+# Create indexes (optional but recommended for querying)
+$ sqlite3 /path/to/discogs.sqlite < sqlite/sql/CreateIndexes.sql
+```
+
 #### Importing into MongoDB
 
 The CSV files can be imported into MongoDB using
diff --git a/sqlite/importcsv.py b/sqlite/importcsv.py
new file mode 100644
index 0000000..3bf1f0a
--- /dev/null
+++ b/sqlite/importcsv.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+"""Usage:
+  importcsv.py --db=<path> [--batch=<n>] [--fast] PATH ...
+
+Options:
+  --db=<path>   path to sqlite database file
+  --batch=<n>   number of rows per batch insert [default: 5000]
+  --fast        enable faster (less durable) sqlite settings
+  PATH          one or more csv files (optionally .bz2)
+
+"""
+import bz2
+import csv
+import os
+import pathlib
+import sqlite3
+import sys
+
+from docopt import docopt
+
+# since we run this as a script, we need to add the parent folder
+# so we can import discogsxml2db from it
+parent_path = str(pathlib.Path(__file__).absolute().parent.parent)
+sys.path.insert(1, parent_path)
+from discogsxml2db.exporter import csv_headers  # noqa
+
+
+def _open_csv(path):
+    """Open a .csv or .csv.bz2 file as UTF-8 text, or return None."""
+    if path.endswith('.csv'):
+        return open(path, newline='', encoding='utf-8')
+    if path.endswith('.csv.bz2'):
+        return bz2.open(path, mode='rt', newline='', encoding='utf-8')
+    return None
+
+
+def _normalize_header(header):
+    """Strip a leading byte order mark from the first header cell."""
+    if not header:
+        return header
+    header[0] = header[0].lstrip('\ufeff')
+    return header
+
+
+def _apply_fast_pragmas(db):
+    """Apply the --fast settings to the connection.
+
+    Disabling the rollback journal and synchronous writes speeds up the
+    import, but a crash part-way through can leave the database corrupt.
+    """
+    db.execute("PRAGMA synchronous=OFF")
+    db.execute("PRAGMA journal_mode=OFF")
+    db.execute("PRAGMA temp_store=MEMORY")
+    db.execute("PRAGMA foreign_keys=OFF")
+
+
+def import_csv(path, db, batch_size):
+    """Import one CSV file into the table named after the file.
+
+    The table name is the part of the file name before the first dot and
+    the column list is looked up in csv_headers. Rows are inserted with
+    executemany() and committed every batch_size rows. A file that cannot
+    be imported is reported on stdout and skipped.
+    """
+    filename = os.path.basename(path)
+    # partition() never raises, so a name without a dot (e.g. "README")
+    # is reported below instead of crashing the run with a ValueError
+    table, _, ext = filename.partition('.')
+    if ext not in ('csv', 'csv.bz2'):
+        print("%s can not be imported: not a .csv or .csv.bz2 file" % filename)
+        return
+
+    columns = csv_headers.get(table)
+    if not columns:
+        print("%s can not be imported: unknown table" % filename)
+        return
+
+    fp = _open_csv(path)
+    if not fp:
+        print("%s can not be imported: failed to open" % filename)
+        return
+
+    # the with block closes the file on every exit path, including the
+    # empty-file return and any error raised while inserting
+    with fp:
+        print("importing %s" % filename)
+        reader = csv.reader(fp)
+        header = _normalize_header(next(reader, None))
+        if header is None:
+            print("%s can not be imported: empty file" % filename)
+            return
+        if header != columns:
+            print("warning: header mismatch in %s" % filename)
+
+        placeholders = ", ".join(["?"] * len(columns))
+        col_list = ", ".join(columns)
+        sql = "INSERT INTO {} ({}) VALUES ({})".format(
+            table, col_list, placeholders)
+
+        # NOTE(review): empty CSV fields are inserted as '' rather than
+        # NULL, also into INTEGER columns; confirm this is intended
+        cursor = db.cursor()
+        try:
+            batch = []
+            for row in reader:
+                batch.append(row)
+                if len(batch) >= batch_size:
+                    cursor.executemany(sql, batch)
+                    db.commit()
+                    batch.clear()
+
+            if batch:
+                cursor.executemany(sql, batch)
+                db.commit()
+        finally:
+            cursor.close()
+
+
+arguments = docopt(__doc__, version='0.1')
+db_path = arguments['--db']
+try:
+    batch_size = int(arguments['--batch'])
+except ValueError:
+    print("error: --batch must be an integer")
+    sys.exit(1)
+
+if not db_path:
+    print("error: --db is required")
+    sys.exit(1)
+
+if batch_size <= 0:
+    print("error: --batch must be positive")
+    sys.exit(1)
+
+connection = sqlite3.connect(db_path)
+try:
+    if arguments['--fast']:
+        _apply_fast_pragmas(connection)
+
+    for path in arguments['PATH']:
+        if os.path.isfile(path):
+            import_csv(os.path.abspath(path), connection, batch_size)
+        else:
+            print("error: '%s' is not a readable file" % path)
+finally:
+    # close the connection even if an import aborts with an exception
+    connection.close()
diff --git a/sqlite/sql/CreateIndexes.sql b/sqlite/sql/CreateIndexes.sql
new file mode 100644
index 0000000..c15cb91
--- /dev/null
+++ b/sqlite/sql/CreateIndexes.sql
@@ -0,0 +1,39 @@
+-- artists
+CREATE INDEX artist_url_idx_artist ON artist_url (artist_id);
+CREATE INDEX artist_namevariation_idx_artist ON artist_namevariation (artist_id);
+CREATE INDEX artist_alias_idx_artist ON artist_alias (artist_id);
+CREATE INDEX group_member_idx_group ON group_member (group_artist_id);
+CREATE INDEX group_member_idx_member ON group_member (member_artist_id);
+
+-- labels
+CREATE INDEX label_idx_parent_label ON label (parent_id);
+CREATE INDEX label_url_idx_url ON label_url (label_id);
+
+-- masters
+CREATE INDEX master_artist_idx_master ON master_artist (master_id);
+CREATE INDEX master_artist_idx_artist ON master_artist (artist_id);
+CREATE INDEX master_video_idx_master ON master_video (master_id);
+CREATE INDEX master_genre_idx_master ON master_genre (master_id);
+CREATE INDEX master_style_idx_master ON master_style (master_id);
+
+-- releases
+CREATE INDEX release_idx_master ON release (master_id);
+CREATE INDEX release_artist_idx_release ON release_artist (release_id);
+CREATE INDEX release_artist_idx_artist ON release_artist (artist_id);
+CREATE INDEX release_label_idx_release ON release_label (release_id);
+CREATE INDEX release_label_idx_label ON release_label (label_id);
+CREATE INDEX release_genre_idx_release ON release_genre (release_id);
+CREATE INDEX release_style_idx_release ON release_style (release_id);
+CREATE INDEX release_format_idx_release ON release_format (release_id);
+CREATE INDEX release_track_idx_release ON release_track (release_id);
+CREATE INDEX release_track_idx_sequence ON release_track (sequence);
+CREATE INDEX release_track_idx_parent ON release_track (parent);
+CREATE INDEX release_track_idx_title ON release_track (title);
+CREATE INDEX release_track_artist_idx_release ON release_track_artist (release_id);
+CREATE INDEX release_track_artist_idx_track_id ON release_track_artist (track_id);
+CREATE INDEX release_track_artist_idx_track_sequence ON release_track_artist (track_sequence);
+CREATE INDEX release_track_artist_idx_artist ON release_track_artist (artist_id);
+CREATE INDEX release_identifier_idx_release ON release_identifier (release_id);
+CREATE INDEX release_video_idx_release ON release_video (release_id);
+CREATE INDEX release_company_idx_release ON release_company (release_id);
+CREATE INDEX release_company_idx_company ON release_company (company_id);
diff --git a/sqlite/sql/CreateTables.sql b/sqlite/sql/CreateTables.sql
new file mode 100644
index 0000000..d695edf
--- /dev/null
+++ b/sqlite/sql/CreateTables.sql
@@ -0,0 +1,226 @@
+-- artists
+CREATE TABLE artist (
+    id INTEGER PRIMARY KEY, -- INTEGER PRIMARY KEY makes id an alias for the SQLite rowid
+    name TEXT,
+    realname TEXT,
+    profile TEXT,
+
+    data_quality TEXT
+);
+
+CREATE TABLE artist_url (
+    id INTEGER PRIMARY KEY AUTOINCREMENT, -- AUTOINCREMENT: SQLite never reuses an id, at some extra insert cost
+    artist_id INTEGER NOT NULL,
+    url TEXT
+);
+
+CREATE TABLE artist_namevariation (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    artist_id INTEGER NOT NULL,
+    name TEXT NOT NULL
+);
+
+CREATE TABLE artist_alias (
+    artist_id INTEGER NOT NULL,
+    alias_name TEXT NOT NULL,
+    alias_artist_id INTEGER
+);
+
+CREATE TABLE artist_image ( -- NOTE(review): CreateIndexes.sql in this patch creates no index on any *_image table; confirm
+    artist_id INTEGER NOT NULL,
+    type TEXT,
+    width INTEGER,
+    height INTEGER
+);
+
+CREATE TABLE group_member (
+    group_artist_id INTEGER NOT NULL,
+    member_artist_id INTEGER NOT NULL,
+    member_name TEXT NOT NULL
+);
+
+-- labels
+CREATE TABLE label (
+    id INTEGER PRIMARY KEY,
+    name TEXT NOT NULL,
+    contact_info TEXT,
+    profile TEXT,
+    parent_id INTEGER,
+    parent_name TEXT,
+    data_quality TEXT
+);
+
+CREATE TABLE label_url (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    label_id INTEGER NOT NULL,
+    url TEXT NOT NULL
+);
+
+CREATE TABLE label_image (
+    label_id INTEGER NOT NULL,
+    type TEXT,
+    width INTEGER,
+    height INTEGER
+);
+
+-- masters
+CREATE TABLE master (
+    id INTEGER PRIMARY KEY,
+    title TEXT NOT NULL,
+    year INTEGER,
+    main_release INTEGER NOT NULL,
+    data_quality TEXT
+);
+
+CREATE TABLE master_artist (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    master_id INTEGER NOT NULL,
+    artist_id INTEGER NOT NULL,
+    artist_name TEXT,
+    anv TEXT,
+    position INTEGER,
+    join_string TEXT,
+    role TEXT
+);
+
+CREATE TABLE master_video (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    master_id INTEGER NOT NULL,
+    duration INTEGER,
+    title TEXT,
+    description TEXT,
+    uri TEXT
+);
+
+CREATE TABLE master_genre (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    master_id INTEGER NOT NULL,
+    genre TEXT
+);
+
+CREATE TABLE master_style (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    master_id INTEGER NOT NULL,
+    style TEXT
+);
+
+CREATE TABLE master_image (
+    master_id INTEGER NOT NULL,
+    type TEXT,
+    width INTEGER,
+    height INTEGER
+);
+
+-- releases
+CREATE TABLE release (
+    id INTEGER PRIMARY KEY,
+    title TEXT NOT NULL,
+    released TEXT,
+    country TEXT,
+    notes TEXT,
+    data_quality TEXT,
+    main INTEGER,
+    master_id INTEGER,
+    status TEXT
+);
+
+CREATE TABLE release_artist (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    release_id INTEGER NOT NULL,
+    artist_id INTEGER NOT NULL,
+    artist_name TEXT,
+    extra INTEGER NOT NULL,
+    anv TEXT,
+    position INTEGER,
+    join_string TEXT,
+    role TEXT,
+    tracks TEXT
+);
+
+CREATE TABLE release_label (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    release_id INTEGER NOT NULL,
+    label_id INTEGER,
+    label_name TEXT NOT NULL,
+    catno TEXT
+);
+
+CREATE TABLE release_genre (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    release_id INTEGER NOT NULL,
+    genre TEXT
+);
+
+CREATE TABLE release_style ( -- NOTE(review): no id column here, unlike release_genre and master_style; confirm intended
+    release_id INTEGER NOT NULL,
+    style TEXT
+);
+
+CREATE TABLE release_format (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    release_id INTEGER NOT NULL,
+    name TEXT,
+    qty NUMERIC,
+    text_string TEXT,
+    descriptions TEXT
+);
+
+CREATE TABLE release_track (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    release_id INTEGER NOT NULL,
+    sequence INTEGER NOT NULL,
+    position TEXT,
+    parent TEXT,
+    title TEXT,
+    duration TEXT,
+    track_id TEXT
+);
+
+CREATE TABLE release_track_artist (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    track_id TEXT,
+    release_id INTEGER NOT NULL,
+    track_sequence TEXT, -- NOTE(review): release_track.sequence is INTEGER NOT NULL but this is TEXT; confirm
+    artist_id INTEGER NOT NULL,
+    artist_name TEXT,
+    extra INTEGER NOT NULL,
+    anv TEXT,
+    position INTEGER,
+    join_string TEXT,
+    role TEXT,
+    tracks TEXT
+);
+
+CREATE TABLE release_identifier (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    release_id INTEGER NOT NULL,
+    description TEXT,
+    type TEXT,
+    value TEXT
+);
+
+CREATE TABLE release_video (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    release_id INTEGER NOT NULL,
+    duration INTEGER,
+    title TEXT,
+    description TEXT,
+    uri TEXT
+);
+
+CREATE TABLE release_company (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    release_id INTEGER NOT NULL,
+    company_id INTEGER NOT NULL,
+    company_name TEXT NOT NULL,
+    entity_type TEXT,
+    entity_type_name TEXT,
+    uri TEXT
+);
+
+CREATE TABLE release_image (
+    release_id INTEGER NOT NULL,
+    type TEXT,
+    width INTEGER,
+    height INTEGER
+);