Skip to content

Commit 9b733f6

Browse files
progval and meldra authored
Fix merging of properties type and description (#104)
* Update properties_to_json.py * Refactor the merging logic to work * Further refactoring to address multiple issues: - There are multiple Identifier properties with different parents. These are now addressed independently. - A property with a change between versions for both type and description was not correctly limiting the versions. I believe this now happens correctly. * Deduplicate canonicalization, add comments * Fix canonicalize() * Append version even when types have slight differences, instead of hiding one --------- Co-authored-by: Melissa Draper <melissa@meldraweb.com>
1 parent 122aea7 commit 9b733f6

1 file changed

Lines changed: 26 additions & 1 deletion

File tree

scripts/properties_to_json.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
import csv
5959
import json
6060
import pathlib
61+
import re
6162

6263
DIR = pathlib.Path(__file__).parent.parent
6364
CSV_PATH = DIR / "data/properties_description/"
@@ -71,6 +72,10 @@
7172
CSV_PATH.glob("*.csv"), key=lambda p: float(p.stem.lstrip("v")), reverse=True
7273
)
7374

75+
def canonicalize(s):
    """Return *s* lower-cased with all non-word characters removed.

    "Non-word" is the regex ``W`` class: everything except letters,
    digits and the underscore. Whitespace and punctuation are dropped,
    while digits and underscores are kept, so two strings that differ only
    in spacing, punctuation or letter case canonicalize to the same value.
    """
    # Raw string keeps the regex readable and avoids double escaping.
    return re.sub(r"\W", "", s).lower()
78+
7479
for csv_path in paths:
7580
version = csv_path.stem
7681
# header = ["Parent Type", "Property", "Type", "Description"]
@@ -85,12 +90,32 @@
8590
# Look for a similar existing item from a newer CodeMeta version
8691
for existing_item in json_items:
8792
if existing_item.items() >= item.items():
88-
# We found an existing item, add this version to its list
93+
# We found an identical existing item, add this version to its list
8994
assert (
9095
version not in existing_item["versions"]
9196
), f"CodeMeta {version} has duplicated property {item}"
9297
existing_item["versions"].append(version)
98+
# check for existing properties that have differing types or descriptions
99+
# values from newer versions of properties_description.json take precedence
100+
# over the values read from older versions.
101+
# update the versions for these here and break to avoid duplicate rows
102+
if item["Property"] == existing_item["Property"] and item["Parent Type"] == existing_item["Parent Type"]:
103+
if canonicalize(item["Type"]) != canonicalize(existing_item["Type"]):
104+
# both types meaningfully differ
105+
item["versions"] = [version]
106+
json_items.append(item)
107+
else:
108+
item["Type"] = existing_item["Type"]
109+
if version not in existing_item["versions"]:
110+
existing_item["versions"].append(version)
111+
112+
if item["Description"] != existing_item["Description"] and item["Type"] == existing_item["Type"]:
113+
item["Description"] = existing_item["Description"]
114+
if version not in existing_item["versions"]:
115+
existing_item["versions"].append(version)
116+
93117
break
118+
94119
else:
95120
# No similar item, create a new one
96121
item["versions"] = [version]

0 commit comments

Comments
 (0)