Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,19 @@ $ mysql/importcsv.sh /csvdir/*
$ mysql/exec_sql.sh < mysql/AssignPrimaryKeys.sql
```

#### Importing into SQLite

```sh
# Create database tables
$ sqlite3 /path/to/discogs.sqlite < sqlite/sql/CreateTables.sql

# Import CSV files
$ python3 sqlite/importcsv.py --db=/path/to/discogs.sqlite /csvdir/*

# Create indexes (optional but recommended for querying)
$ sqlite3 /path/to/discogs.sqlite < sqlite/sql/CreateIndexes.sql
```

#### Importing into MongoDB

The CSV files can be imported into MongoDB using
Expand Down
122 changes: 122 additions & 0 deletions sqlite/importcsv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env python
"""Usage:
importcsv.py --db=<db> [--batch=<n>] [--fast] PATH ...

Options:
--db=<db> path to sqlite database file
--batch=<n> number of rows per batch insert [default: 5000]
--fast enable faster (less durable) sqlite settings
PATH one or more csv files (optionally .bz2)

"""
import bz2
import csv
import os
import pathlib
import sqlite3
import sys

from docopt import docopt

# since we run this as a script, we need to add the parent folder
# so we can import discogsxml2db from it
parent_path = str(pathlib.Path(__file__).absolute().parent.parent)
sys.path.insert(1, parent_path)
from discogsxml2db.exporter import csv_headers # noqa


def _open_csv(path):
    """Open *path* for text-mode CSV reading based on its suffix.

    Returns a UTF-8 text file object for ``.csv`` and ``.csv.bz2`` files,
    or ``None`` when the suffix is neither of those.
    """
    # newline='' leaves line endings untouched so the csv module can
    # handle them itself.
    text_opts = {'newline': '', 'encoding': 'utf-8'}
    if path.endswith('.csv'):
        return open(path, **text_opts)
    if path.endswith('.csv.bz2'):
        return bz2.open(path, mode='rt', **text_opts)
    return None


def _normalize_header(header):
    """Strip leading byte-order marks from the first header cell.

    The list is modified in place and returned. Falsy input (``None`` or
    an empty list) is returned unchanged.
    """
    if header:
        first_cell = header[0]
        header[0] = first_cell.lstrip('\ufeff')
    return header


def _apply_fast_pragmas(db):
    """Run the PRAGMA statements used by the ``--fast`` option on *db*.

    Turns off synchronous writes, the rollback journal and foreign key
    enforcement, and keeps temporary storage in memory.
    """
    fast_settings = (
        "PRAGMA synchronous=OFF",
        "PRAGMA journal_mode=OFF",
        "PRAGMA temp_store=MEMORY",
        "PRAGMA foreign_keys=OFF",
    )
    for statement in fast_settings:
        db.execute(statement)


def import_csv(path, db, batch_size):
    """Import one CSV (optionally bz2-compressed) file into its table.

    The table name is the part of the file name before the first dot; the
    remainder must be ``csv`` or ``csv.bz2``. The expected column list is
    looked up in ``csv_headers``. Problems (wrong extension, unknown
    table, unopenable or empty file) are reported on stdout and the file
    is skipped.

    Args:
        path: path of the file to import.
        db: open sqlite3 connection to insert into.
        batch_size: number of rows passed to each ``executemany`` call;
            the connection is committed after every batch.
    """
    filename = os.path.basename(path)
    # partition() never fails, whereas unpacking split('.', 1) raised
    # ValueError for a file name without any dot (e.g. "README").
    table, _, ext = filename.partition('.')
    if ext not in ('csv', 'csv.bz2'):
        print("%s can not be imported: not a .csv or .csv.bz2 file" % filename)
        return

    columns = csv_headers.get(table)
    if not columns:
        print("%s can not be imported: unknown table" % filename)
        return

    fp = _open_csv(path)
    if not fp:
        print("%s can not be imported: failed to open" % filename)
        return

    # The context manager closes the file on every exit path, including
    # the empty-file early return and errors raised while inserting.
    with fp:
        print("importing %s" % filename)
        reader = csv.reader(fp)
        header = _normalize_header(next(reader, None))
        if header is None:
            print("%s can not be imported: empty file" % filename)
            return
        if header != columns:
            # Only a warning: rows are still inserted positionally using
            # the expected column list, not the header found in the file.
            print("warning: header mismatch in %s" % filename)

        placeholders = ", ".join(["?"] * len(columns))
        col_list = ", ".join(columns)
        # The table and column names come from csv_headers, not from
        # user input; row values are bound through placeholders.
        sql = "INSERT INTO {} ({}) VALUES ({})".format(
            table, col_list, placeholders)

        cursor = db.cursor()
        try:
            batch = []
            for row in reader:
                batch.append(row)
                if len(batch) >= batch_size:
                    cursor.executemany(sql, batch)
                    db.commit()
                    batch.clear()

            # Flush the final, partially filled batch.
            if batch:
                cursor.executemany(sql, batch)
                db.commit()
        finally:
            cursor.close()


# Script entry: parse the command line, open the database and import
# every PATH argument in the order given.
arguments = docopt(__doc__, version='0.1')
db_path = arguments['--db']
try:
    batch_size = int(arguments['--batch'])
except ValueError:
    print("error: --batch must be an integer")
    sys.exit(1)

if not db_path:
    print("error: --db is required")
    sys.exit(1)

if batch_size <= 0:
    print("error: --batch must be positive")
    sys.exit(1)

connection = sqlite3.connect(db_path)
# try/finally guarantees the connection is closed even when one of the
# imports raises (e.g. a malformed row or a missing table).
try:
    if arguments['--fast']:
        _apply_fast_pragmas(connection)

    for path in arguments['PATH']:
        if os.path.isfile(path):
            import_csv(os.path.abspath(path), connection, batch_size)
        else:
            # Keep going: one bad path should not stop the other imports.
            print("error: '%s' is not a readable file" % path)
finally:
    connection.close()
39 changes: 39 additions & 0 deletions sqlite/sql/CreateIndexes.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
-- Secondary indexes for the SQLite schema.
-- Every statement uses IF NOT EXISTS so the script can be re-run on a
-- database that already has some or all of these indexes.

-- artists
CREATE INDEX IF NOT EXISTS artist_url_idx_artist ON artist_url (artist_id);
CREATE INDEX IF NOT EXISTS artist_namevariation_idx_artist ON artist_namevariation (artist_id);
CREATE INDEX IF NOT EXISTS artist_alias_idx_artist ON artist_alias (artist_id);
CREATE INDEX IF NOT EXISTS group_member_idx_group ON group_member (group_artist_id);
CREATE INDEX IF NOT EXISTS group_member_idx_member ON group_member (member_artist_id);

-- labels
CREATE INDEX IF NOT EXISTS label_idx_parent_label ON label (parent_id);
-- NOTE: despite the "_url" suffix in its name, this index is on label_id.
CREATE INDEX IF NOT EXISTS label_url_idx_url ON label_url (label_id);

-- masters
CREATE INDEX IF NOT EXISTS master_artist_idx_master ON master_artist (master_id);
CREATE INDEX IF NOT EXISTS master_artist_idx_artist ON master_artist (artist_id);
CREATE INDEX IF NOT EXISTS master_video_idx_master ON master_video (master_id);
CREATE INDEX IF NOT EXISTS master_genre_idx_master ON master_genre (master_id);
CREATE INDEX IF NOT EXISTS master_style_idx_master ON master_style (master_id);

-- releases
CREATE INDEX IF NOT EXISTS release_idx_master ON release (master_id);
CREATE INDEX IF NOT EXISTS release_artist_idx_release ON release_artist (release_id);
CREATE INDEX IF NOT EXISTS release_artist_idx_artist ON release_artist (artist_id);
CREATE INDEX IF NOT EXISTS release_label_idx_release ON release_label (release_id);
CREATE INDEX IF NOT EXISTS release_label_idx_label ON release_label (label_id);
CREATE INDEX IF NOT EXISTS release_genre_idx_release ON release_genre (release_id);
CREATE INDEX IF NOT EXISTS release_style_idx_release ON release_style (release_id);
CREATE INDEX IF NOT EXISTS release_format_idx_release ON release_format (release_id);
CREATE INDEX IF NOT EXISTS release_track_idx_release ON release_track (release_id);
CREATE INDEX IF NOT EXISTS release_track_idx_sequence ON release_track (sequence);
CREATE INDEX IF NOT EXISTS release_track_idx_parent ON release_track (parent);
CREATE INDEX IF NOT EXISTS release_track_idx_title ON release_track (title);
CREATE INDEX IF NOT EXISTS release_track_artist_idx_release ON release_track_artist (release_id);
CREATE INDEX IF NOT EXISTS release_track_artist_idx_track_id ON release_track_artist (track_id);
CREATE INDEX IF NOT EXISTS release_track_artist_idx_track_sequence ON release_track_artist (track_sequence);
CREATE INDEX IF NOT EXISTS release_track_artist_idx_artist ON release_track_artist (artist_id);
CREATE INDEX IF NOT EXISTS release_identifier_idx_release ON release_identifier (release_id);
CREATE INDEX IF NOT EXISTS release_video_idx_release ON release_video (release_id);
CREATE INDEX IF NOT EXISTS release_company_idx_release ON release_company (release_id);
CREATE INDEX IF NOT EXISTS release_company_idx_company ON release_company (company_id);
Loading
Loading