-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathproperties_to_json.py
More file actions
127 lines (104 loc) · 4.61 KB
/
properties_to_json.py
File metadata and controls
127 lines (104 loc) · 4.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""Transforms a set of ``properties_Description.csv`` files from the main CodeMeta repo
into JSON processable by Hugo.
For example, this turns this :file:`v3.0.csv`:
.. code-block:: csv
Parent Type,Property,Type,Description
schema:CreativeWork,author,Organization or Person,The author of this content
codemeta:SoftwareSourceCode,readme,URL,link to software Readme file
codemeta:SoftwareSourceCode,embargoEndDate,Date,"Software may be embargoed from public access until a specified date"
and this :file:`v2.0.csv`:
.. code-block:: csv
Parent Type,Property,Type,Description
schema:CreativeWork,author,Organization or Person,The author of this content
codemeta:SoftwareSourceCode,readme,URL,link to software Readme file
codemeta:SoftwareSourceCode,embargoDate,Date,"Software may be embargoed from public access until a specified date"
into:
.. code-block:: json
[
{
"versions": ["v3.0", "v2.0"],
"Parent Type": "schema:CreativeWork",
"Property": "author",
"Type": "Organization or Person",
"Description": "The author of this content"
},
{
"versions": ["v3.0", "v2.0"],
"Parent Type": "codemeta:SoftwareSourceCode",
"Property": "readme",
"Type": "URL",
"Description": "link to software Readme file"
},
{
"versions": ["v3.0"],
"Parent Type": "codemeta:SoftwareSourceCode",
"Property": "embargoEndDate",
"Type": "Date",
"Description": "Software may be embargoed from public access until a specified date"
},
{
"versions": ["v2.0"],
"Parent Type": "codemeta:SoftwareSourceCode",
"Property": "embargoDate",
"Type": "Date",
"Description": "Software may be embargoed from public access until a specified date"
}
]
"""
import csv
import json
import pathlib
import re
# Directory two levels above this file (the parent of the script's own directory).
DIR = pathlib.Path(__file__).parent.parent
# Directory scanned for the per-version ``*.csv`` inputs.
CSV_PATH = DIR / "data/properties_description/"
# Destination of the merged JSON document.
JSON_PATH = DIR / "data/properties_description.json"

# Accumulates the merged property records; filled by the main loop below.
json_items = []


def version_key(path):
    """Return a sort key for a CSV path named after a version, e.g. ``v3.0.csv``.

    The file stem is stripped of its leading ``v`` and split on dots into a
    tuple of integers, so versions compare component by component:
    ``v3.10`` sorts after ``v3.9`` and ``v2.0.1`` is accepted. Parsing the
    stem as a single float would collapse ``v3.10`` onto ``v3.1`` and reject
    three-component versions.

    Raises:
        ValueError: if a dot-separated component is not an integer.
    """
    return tuple(int(part) for part in path.stem.lstrip("v").split("."))


# List .csv files in reverse version order, so Description from the latest version
# takes precedence.
paths = sorted(CSV_PATH.glob("*.csv"), key=version_key, reverse=True)
def canonicalize(s):
    """Return *s* lower-cased, keeping only its word characters.

    Anything that is not a word character as defined by the ``re`` module
    (alphanumerics and the underscore) is dropped, so spacing and
    punctuation differences between two strings are ignored.
    """
    word_chars = re.findall(r"\w", s)
    return "".join(word_chars).lower()
# Merge every CSV into ``json_items``, in the order given by ``paths``.
for csv_path in paths:
    # The file stem (e.g. "v3.0") is the label recorded in each item's "versions".
    version = csv_path.stem

    # header = ["Parent Type", "Property", "Type", "Description"]
    # ``newline=""`` is what the csv module asks for when handing it a file,
    # and the context manager guarantees the handle is closed.
    # NOTE(review): assumes the CSV files are UTF-8 encoded - confirm.
    with csv_path.open(newline="", encoding="utf-8") as csv_file:
        (header, *rows) = csv.reader(csv_file)

    for row in rows:
        item = dict(zip(header, row))
        # csv.reader yields [] for a completely blank line; treat it like a
        # row whose Property cell is empty.
        if not row or item["Property"] == "":
            continue  # skip empty rows

        # Look for a similar existing item from a newer CodeMeta version
        for existing_item in json_items:
            if existing_item.items() >= item.items():
                # We found an identical existing item, add this version to its
                # list. Raised explicitly rather than with ``assert`` so the
                # duplicate check still runs under ``python -O``.
                if version in existing_item["versions"]:
                    raise AssertionError(
                        f"CodeMeta {version} has duplicated property {item}"
                    )
                existing_item["versions"].append(version)

            # Same property on the same parent type, possibly with a different
            # Type or Description.
            # NOTE(review): only the first entry with this Parent Type/Property
            # is consulted. If several entries already exist for it (because
            # their types differ), a row matching a later entry is appended as
            # yet another entry - confirm whether that is intended.
            if (
                item["Property"] == existing_item["Property"]
                and item["Parent Type"] == existing_item["Parent Type"]
            ):
                if canonicalize(item["Type"]) != canonicalize(existing_item["Type"]):
                    # both types meaningfully differ: keep this row as its own entry
                    item["versions"] = [version]
                    json_items.append(item)
                elif version not in existing_item["versions"]:
                    # Types agree up to case/punctuation: the existing entry's
                    # Type and Description are kept, and this row contributes
                    # only its version.
                    existing_item["versions"].append(version)
                # Stop here in every case; this also avoids iterating over an
                # item that was just appended to the list being scanned.
                break
        else:
            # No similar item, create a new one
            item["versions"] = [version]
            json_items.append(item)
# Order the merged records in place by their "Property" value.
json_items.sort(key=lambda entry: entry["Property"])

# Serialize the whole list, then write it to the output file in one call.
serialized = json.dumps(json_items, indent=" ")
JSON_PATH.write_text(serialized)