dataverse-recipes/python/create_datasets_from_excel/create_datasets_from_excel.py at main · koormath/dataverse-recipes · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
Create Dataverse datasets from an Excel spreadsheet.

This script reads data from a 'data.xlsx' Excel file and creates corresponding datasets
in a Dataverse collection. Each row in the spreadsheet becomes a separate dataset.

Usage:
    1. Place your data in 'data.xlsx' in the same directory as this script
    2. Set environment variables (optional):
        - SERVER_URL: URL of your Dataverse instance (default: http://localhost:8080)
        - API_TOKEN: Your Dataverse API token (default: 3e2b87bf-6652-47f1-b0bb-ddc0aa1a9106)
        - COLLECTION: Name of collection to create datasets in (default: collection1)
    3. Run the script: python create-datasets-from-excel.py

The Excel file should contain columns matching the metadata fields you want to populate
in your datasets. Special characters will be automatically cleaned during import.

Requirements:
    - easyDataverse
    - openpyxl
"""

import time
import os
import json

from easyDataverse import Dataverse

# we use openpyxl because it can pull hyperlinks out of Excel
from openpyxl import load_workbook

###### Environment variables ######

# SERVER_URL="https://demo.dataverse.org"
SERVER_URL = str(
    os.environ.get(
        "SERVER_URL",
        "http://localhost:8080",  # default if not set
    )
)
API_TOKEN = str(
    os.environ.get(
        "API_TOKEN",
        "3e2b87bf-6652-47f1-b0bb-ddc0aa1a9106",  # default if not set
    )
)

# The collection where the datasets will be created
COLLECTION = str(
    os.environ.get(
        "COLLECTION",
        "collection1",  # default if not set
    )
)


###### Functions ######


def clean(value):
    """Helper function to replace special characters with standard ones"""
    if value is None:
        return ""
    if isinstance(value, str):
        # Replace special characters with standard ones
        return value.replace("’", "'").replace("–", "-")
    return str(value)


###### Main script ######

wb = load_workbook("data.xlsx")
sheets = wb.sheetnames
ws = wb[sheets[1]]

# Connect to a Dataverse installation
dataverse = Dataverse(
    server_url=SERVER_URL,
    api_token=API_TOKEN,
)

# min_row=2 to skip the header row
count = 0
for key, *values in ws.iter_rows(min_row=2):
    # Strangely, datasets created with EasyDataverse have
    # Custom Terms rather than CC0, which is the default,
    # and it's not possible to change the license afterward
    # using EasyDataverse: https://github.com/gdcc/easyDataverse/issues/29
    # So, we create a basic dataset using requests and
    # then update the fields later with EasyDataverse.

    with open("initial_dataset.json") as f:
        # Use the native API to create the dataset
        response = dataverse.native_api.create_dataset(
            dataverse=COLLECTION,
            metadata=json.load(f),
        )

        # Catch errors early
        response.raise_for_status()

        # We will need this pid later to update the dataset
        pid = response.json()["data"]["persistentId"]

    # Get the fields from Excel
    title = clean(values[0].value)
    description = clean(values[1].value)
    agency_responsible = clean(values[2].value)
    access_link_name = clean(values[3].value)

    if values[3].hyperlink:
        access_link_url = values[3].hyperlink.target
    else:
        # In case there is no hyperlink, we set the
        # access link url to None
        access_link_url = None

    print(f"Title: {title}")
    print(f"Description: {description}")
    print(f"Agency Responsible: {agency_responsible}")
    print(f"Access Link Name: {access_link_name}")
    print(f"Access Link URL: {access_link_url}")

    # Load the dataset we created earlier
    dataset = dataverse.load_dataset(pid)

    # Update the metadata fields
    dataset.citation.title = title
    dataset.citation.subject = ["Other"]
    dataset.metadatablocks["citation"].ds_description[0].value = description
    dataset.metadatablocks["citation"].author[0].name = agency_responsible
    dataset.metadatablocks["citation"].author[0].affiliation = None

    ds_contact = dataset.metadatablocks["citation"].dataset_contact[0]
    ds_contact.name = "Harvard Dataverse Support"
    ds_contact.email = "support@dataverse.harvard.edu"

    if access_link_url:
        dataset.metadatablocks[
            "citation"
        ].origin_of_sources = f'<a href="{access_link_url}">{access_link_name}</a>'

    # Update the dataset, pushing the new fields to the server
    dataset.update()

    # Sleep a bit before creating the next dataset
    time.sleep(2)

    # Uncomment the following line to create only one dataset
    # break
    # Uncomment the following line to create only one dataset
    # break