forked from gdcc/dataverse-recipes
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_datasets_from_excel.py
More file actions
152 lines (120 loc) · 4.58 KB
/
create_datasets_from_excel.py
File metadata and controls
152 lines (120 loc) · 4.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
Create Dataverse datasets from an Excel spreadsheet.
This script reads data from a 'data.xlsx' Excel file and creates corresponding datasets
in a Dataverse collection. Each row in the spreadsheet becomes a separate dataset.
Usage:
1. Place your data in 'data.xlsx' in the same directory as this script
2. Set environment variables (optional):
- SERVER_URL: URL of your Dataverse instance (default: http://localhost:8080)
- API_TOKEN: Your Dataverse API token (default: 3e2b87bf-6652-47f1-b0bb-ddc0aa1a9106)
- COLLECTION: Name of collection to create datasets in (default: collection1)
3. Run the script: python create-datasets-from-excel.py
The Excel file should contain columns matching the metadata fields you want to populate
in your datasets. Special characters will be automatically cleaned during import.
Requirements:
- easyDataverse
- openpyxl
"""
import time
import os
import json
from easyDataverse import Dataverse
# we use openpyxl because it can pull hyperlinks out of Excel
from openpyxl import load_workbook
###### Environment variables ######
# SERVER_URL="https://demo.dataverse.org"
SERVER_URL = str(
os.environ.get(
"SERVER_URL",
"http://localhost:8080", # default if not set
)
)
API_TOKEN = str(
os.environ.get(
"API_TOKEN",
"3e2b87bf-6652-47f1-b0bb-ddc0aa1a9106", # default if not set
)
)
# The collection where the datasets will be created
COLLECTION = str(
os.environ.get(
"COLLECTION",
"collection1", # default if not set
)
)
###### Functions ######
def clean(value):
"""Helper function to replace special characters with standard ones"""
if value is None:
return ""
if isinstance(value, str):
# Replace special characters with standard ones
return value.replace("’", "'").replace("–", "-")
return str(value)
###### Main script ######
wb = load_workbook("data.xlsx")
sheets = wb.sheetnames
ws = wb[sheets[1]]
# Connect to a Dataverse installation
dataverse = Dataverse(
server_url=SERVER_URL,
api_token=API_TOKEN,
)
# min_row=2 to skip the header row
count = 0
for key, *values in ws.iter_rows(min_row=2):
# Strangely, datasets created with EasyDataverse have
# Custom Terms rather than CC0, which is the default,
# and it's not possible to change the license afterward
# using EasyDataverse: https://github.com/gdcc/easyDataverse/issues/29
# So, we create a basic dataset using requests and
# then update the fields later with EasyDataverse.
with open("initial_dataset.json") as f:
# Use the native API to create the dataset
response = dataverse.native_api.create_dataset(
dataverse=COLLECTION,
metadata=json.load(f),
)
# Catch errors early
response.raise_for_status()
# We will need this pid later to update the dataset
pid = response.json()["data"]["persistentId"]
# Get the fields from Excel
title = clean(values[0].value)
description = clean(values[1].value)
agency_responsible = clean(values[2].value)
access_link_name = clean(values[3].value)
if values[3].hyperlink:
access_link_url = values[3].hyperlink.target
else:
# In case there is no hyperlink, we set the
# access link url to None
access_link_url = None
print(f"Title: {title}")
print(f"Description: {description}")
print(f"Agency Responsible: {agency_responsible}")
print(f"Access Link Name: {access_link_name}")
print(f"Access Link URL: {access_link_url}")
# Load the dataset we created earlier
dataset = dataverse.load_dataset(pid)
# Update the metadata fields
dataset.citation.title = title
dataset.citation.subject = ["Other"]
dataset.metadatablocks["citation"].ds_description[0].value = description
dataset.metadatablocks["citation"].author[0].name = agency_responsible
dataset.metadatablocks["citation"].author[0].affiliation = None
ds_contact = dataset.metadatablocks["citation"].dataset_contact[0]
ds_contact.name = "Harvard Dataverse Support"
ds_contact.email = "support@dataverse.harvard.edu"
if access_link_url:
dataset.metadatablocks[
"citation"
].origin_of_sources = f'<a href="{access_link_url}">{access_link_name}</a>'
# Update the dataset, pushing the new fields to the server
dataset.update()
# Sleep a bit before creating the next dataset
time.sleep(2)
# Uncomment the following line to create only one dataset
# break
# Uncomment the following line to create only one dataset
# break