forked from openMetadataInitiative/openMINDS_Python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcollection.py
More file actions
249 lines (212 loc) · 8.98 KB
/
collection.py
File metadata and controls
249 lines (212 loc) · 8.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
"""
This module provides the Collection class, which can be used to
create a collection of openMINDS metadata nodes.
The collection can be saved to and loaded from disk, in JSON-LD format.
"""
import json
import os
from .registry import lookup_type
from .base import Link
DEFAULT_VERSION = "v4"


class Collection:
    """
    A collection of metadata nodes that can be saved to
    and loaded from disk.

    Nodes are stored in the ``nodes`` dict, keyed by their ``id``.

    Args
    ----
    *nodes (LinkedMetadata):
        Nodes to store in the collection when creating it.
        Child nodes that are referenced from the explicitly
        listed nodes will also be added.
    """

    def __init__(self, *nodes):
        self.nodes = {}
        self.add(*nodes)

    def __len__(self):
        """Return the number of nodes in the collection."""
        return len(self.nodes)

    def __iter__(self):
        """Iterate over the nodes (not their ids) in storage order."""
        return iter(self.nodes.values())

    def add(self, *nodes):
        """
        Add one or more metadata nodes to the collection.

        Child nodes that are referenced from the explicitly
        listed nodes will also be added.
        """
        for node in nodes:
            self._add_node(node)

    def _add_node(self, node):
        """
        Store `node` and everything reachable through its `links`.

        Nodes without an id are given a blank node identifier.
        The traversal is an explicit-stack, depth-first, pre-order walk that
        remembers which objects it has already handled, so link graphs
        containing cycles (or very deep chains) terminate instead of
        exhausting the recursion limit.
        """
        pending = [node]
        visited = set()  # id() of objects already handled during this call
        while pending:
            current = pending.pop()
            if id(current) in visited:
                continue
            visited.add(id(current))
            if current.id is None:
                current.id = self._get_blank_node_identifier()
            self.nodes[current.id] = current
            # reversed so that the first link is popped (and numbered) first,
            # matching a recursive pre-order traversal
            pending.extend(reversed(list(current.links)))

    def _get_blank_node_identifier(self):
        """
        Return a blank node identifier that is not yet used in the collection.

        See https://www.w3.org/TR/json-ld11/#identifying-blank-nodes
        A zero-padded counter is used to make testing and debugging easier
        (uuids would be an alternative).
        """
        identifier = len(self.nodes)
        candidate = f"_:{identifier:06d}"
        # len(self.nodes) is only a starting guess: a node that was added with
        # an explicit blank-node id may already occupy it, and reusing the id
        # would silently replace that node.
        while candidate in self.nodes:
            identifier += 1
            candidate = f"_:{identifier:06d}"
        return candidate

    def _sort_nodes_by_id(self):
        """Rebuild `self.nodes` so that iteration follows sorted id order."""
        self.nodes = dict(sorted(self.nodes.items()))

    def save(self, path, individual_files=False, include_empty_properties=False):
        """
        Save the node collection to disk in JSON-LD format.

        Args
        ----
        path (str):
            either a file or a directory into which the metadata will be written.
        individual_files (bool):
            if False (default), save the entire collection into a single file.
            if True, `path` must be a directory, and each node is saved into a
            separate file within that directory.
        include_empty_properties (bool):
            passed through unchanged to each node's `to_jsonld()` method.

        Returns
        -------
        A list of the file paths created.

        Raises
        ------
        OSError:
            if `path` is a directory when saving to a single file, or is not
            a directory when saving to individual files.
        """
        # Context used when the collection is empty; without a default the
        # single-file branch below would fail on an unbound variable.
        data_context = {"@vocab": "https://openminds.om-i.org/props/"}

        # in case a user has added additional child nodes _after_ adding the parent node to the collection
        # we first re-add all child nodes to the collection.
        # This is probably not the most elegant or fast way to do this, but it is simple and robust.
        # We iterate over a snapshot because adding children mutates self.nodes.
        for node in tuple(self.nodes.values()):
            # NOTE: the context is chosen from the type namespace of the last
            # node visited here, so mixed-namespace collections get the
            # context of whichever node comes last.
            if node.type_.startswith("https://openminds.ebrains.eu/"):
                data_context = {"@vocab": "https://openminds.ebrains.eu/vocab/"}
            else:
                data_context = {"@vocab": "https://openminds.om-i.org/props/"}
            for linked_node in node.links:
                self._add_node(linked_node)

        # Now we can actually save the nodes
        if individual_files:
            return self._save_individual_files(path, include_empty_properties)
        return self._save_single_file(path, data_context, include_empty_properties)

    def _save_single_file(self, path, data_context, include_empty_properties):
        """Write all nodes as one JSON-LD document with a shared context and an "@graph" list."""
        if os.path.exists(path):
            if not os.path.isfile(path):
                raise OSError(f"Cannot create file {path} because a directory with that name already exists.")
        else:
            parent_dir = os.path.dirname(path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
        self._sort_nodes_by_id()
        data = {
            "@context": data_context,
            "@graph": [
                node.to_jsonld(
                    embed_linked_nodes=False, include_empty_properties=include_empty_properties, with_context=False
                )
                for node in self
            ],
        }
        with open(path, "w", encoding="utf-8") as fp:
            json.dump(data, fp, indent=2)
        return [path]

    def _save_individual_files(self, path, include_empty_properties):
        """Write each node to its own ".jsonld" file inside the directory `path`."""
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
        if not os.path.isdir(path):
            raise OSError(
                f"If saving to multiple files, `path` must be a directory. path={path}, pwd={os.getcwd()}"
            )
        self._sort_nodes_by_id()
        output_paths = []
        for node in self:
            # the file name comes from node.uuid for "http..." ids,
            # otherwise from the blank node id without its "_:" prefix
            if node.id.startswith("http"):
                file_identifier = node.uuid
            else:
                assert node.id.startswith("_:")
                file_identifier = node.id[2:]
            file_path = os.path.join(path, f"{file_identifier}.jsonld")
            data = node.to_jsonld(embed_linked_nodes=False, include_empty_properties=include_empty_properties)
            with open(file_path, "w", encoding="utf-8") as fp:
                json.dump(data, fp, indent=2)
            output_paths.append(file_path)
        return output_paths

    @staticmethod
    def _node_from_jsonld(item, version):
        """
        Build a node from a single JSON-LD object.

        Objects with an "@type" are turned into an instance of the class that
        `lookup_type` returns for that type and `version`. Objects without
        "@type" are accepted only as links to metadata instances outside
        this collection, i.e. their "@id" must start with "http".
        """
        if "@type" in item:
            cls = lookup_type(item["@type"], version=version)
            return cls.from_jsonld(item)
        # allow links to metadata instances outside this collection
        if not item["@id"].startswith("http"):
            raise ValueError("Local nodes must have @type specified")
        return Link(item["@id"])

    def load(self, *paths, version=DEFAULT_VERSION):
        """
        Load openMINDS metadata from one or more JSON-LD files.

        `*paths` may contain either:

        1) a single directory, in which case
           all JSON-LD files at the top level of this directory will be loaded
           (but without descending into subdirectories)
        2) one or more JSON-LD files, which will all be loaded.

        A file may hold either a single node or a document with an "@graph" list.

        By default, openMINDS v4 will be used.
        If the JSON-LD files use a different openMINDS version, specify it with the `version` argument.

        Raises
        ------
        ValueError:
            if an object has no "@type" and its "@id" does not start with "http".
        """
        if len(paths) == 1 and os.path.isdir(paths[0]):
            data_dir = paths[0]
            # os.listdir returns entries in arbitrary order; sort them so that
            # loading the same directory always processes files in the same order
            json_paths = [
                os.path.join(data_dir, item)
                for item in sorted(os.listdir(data_dir))
                if os.path.splitext(item)[1] in (".json", ".jsonld")
            ]
        else:
            json_paths = paths

        for path in json_paths:
            assert os.path.isfile(path), f"{path} is not a file"
            # JSON text is UTF-8; do not depend on the platform default encoding
            with open(path, "r", encoding="utf-8") as fp:
                data = json.load(fp)
            items = data["@graph"] if "@graph" in data else [data]
            for item in items:
                # `version` is applied to single-node files as well as to "@graph" documents
                self.add(self._node_from_jsonld(item, version))
        self._resolve_links()

    def _resolve_links(self):
        """Replace `Link` attributes with typed Nodes where possible"""
        for node in self.nodes.values():
            node._resolve_links(self.nodes)

    def validate(self, ignore=None):
        """
        Check whether all constraints are satisfied.

        Arguments:
            ignore: an optional list of check types that should be ignored
                    ("required", "type", "multiplicity")

        Returns a dict containing information about any validation failures,
        keyed by node id. Nodes without failures are omitted.
        """
        all_failures = {}
        for node in self:
            failures = node.validate(ignore=ignore)
            if failures:
                all_failures[node.id] = failures
        return all_failures

    @property
    def is_valid(self):
        """True if `validate()` reports no failures for any node."""
        return not self.validate()

    def sort_nodes_for_upload(self):
        """
        Return a list of nodes, sorted so that they can be uploaded to a graph database safely,
        i.e., child nodes will be saved before their parents.

        The upload code is assumed to generate @ids and update the Python instances accordingly.

        Among nodes that are ready at the same time, the order in which they
        are stored in the collection is kept, so the result is deterministic.

        Raises
        ------
        ValueError:
            if no valid order exists, because the links form a cycle or
            because a linked node is not part of the collection.
        """
        ordered_ids = []
        done = set()  # ids already placed in `ordered_ids`, for fast membership tests
        remaining = list(self.nodes)
        while remaining:
            still_waiting = []
            for node_id in remaining:
                child_ids = {child.id for child in self.nodes[node_id].links}
                # a node is ready once every child has been placed
                # (trivially true for nodes without links)
                if child_ids <= done:
                    ordered_ids.append(node_id)
                    done.add(node_id)
                else:
                    still_waiting.append(node_id)
            if len(still_waiting) == len(remaining):
                # a full pass without progress would otherwise repeat forever
                raise ValueError(
                    "Cannot sort nodes for upload: the links form a cycle or point to nodes "
                    f"that are not in the collection. Unsortable node ids: {still_waiting}"
                )
            remaining = still_waiting
        return [self.nodes[node_id] for node_id in ordered_ids]