-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathopen_data_relationships.py
More file actions
134 lines (118 loc) · 4.37 KB
/
open_data_relationships.py
File metadata and controls
134 lines (118 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import csv
import gzip
import json
from datetime import datetime, timezone
import requests
# Gzip-compressed JSON Lines file that fetch_relationships() streams and parses
# one JSON document per line.
SOURCE_URL = "https://open.canada.ca/static/od-do-canada.jsonl.gz"
# Flat export written by fetch_relationships(): a header row, then one CSV row
# per relationship entry.
OUTPUT_CSV = "docs/open_data_relationships.csv"
# Nested export written by fetch_relationships(): one JSON object per line for
# each package that has at least one relationship.
OUTPUT_JSONL = "docs/open_data_relationships.jsonl"
def _translated_value(data, key):
value = data.get(key) if isinstance(data, dict) else None
if isinstance(value, dict):
return value
return {}
def _relationship_rows(
source_level,
package_id,
package_titles,
resource_id,
resource_titles,
relationship,
):
related_urls = relationship.get("related_url") or {}
related_url_en = related_urls.get("en", "") if isinstance(related_urls, dict) else ""
related_url_fr = related_urls.get("fr", "") if isinstance(related_urls, dict) else ""
return [
source_level,
package_id,
package_titles.get("en", ""),
package_titles.get("fr", ""),
resource_id or "",
resource_titles.get("en", ""),
resource_titles.get("fr", ""),
relationship.get("related_relationship", ""),
relationship.get("resource_type", ""),
related_url_en,
related_url_fr,
]
def fetch_relationships():
    """Download the catalogue dump and export every relationship it declares.

    Streams the gzip-compressed JSON Lines file at ``SOURCE_URL`` and writes
    two files:

    * ``OUTPUT_CSV``: a header row followed by one row per relationship,
      covering both package-level and resource-level relationships.
    * ``OUTPUT_JSONL``: one JSON object per package that has at least one
      relationship (its own or on any of its resources), with the keys
      ``package_id``, ``title_translated``, ``relationship``, ``resources``
      and ``source_timestamp``.

    Raises:
        requests.HTTPError: if the server answers with an error status. The
            status is checked before the output files are opened, so a failed
            download leaves any previously generated outputs untouched.
    """
    headers = [
        "source_level",
        "package_id",
        "package_title_en",
        "package_title_fr",
        "resource_id",
        "resource_title_en",
        "resource_title_fr",
        "relationship_type",
        "related_resource_type",
        "related_url_en",
        "related_url_fr",
    ]

    with requests.get(SOURCE_URL, stream=True, timeout=60) as response:
        # Validate the response BEFORE opening the outputs in "w" mode;
        # otherwise an HTTP error would truncate the existing exports.
        response.raise_for_status()

        # One timestamp per run: every record comes from the same download,
        # so they all share the same source_timestamp.
        fetched_at = datetime.now(timezone.utc).isoformat()

        with (
            gzip.GzipFile(fileobj=response.raw) as archive,
            open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csv_file,
            open(OUTPUT_JSONL, "w", encoding="utf-8") as jsonl_file,
        ):
            writer = csv.writer(csv_file)
            writer.writerow(headers)

            for line in archive:
                if not line.strip():
                    continue  # tolerate blank lines in the dump

                package = json.loads(line)
                package_id = package.get("id") or package.get("package_id")
                package_titles = _translated_value(package, "title_translated")
                package_relationships = package.get("relationship") or []
                resources_with_relationships = []

                # Package-level relationships carry no resource id or titles.
                for relationship in package_relationships:
                    writer.writerow(
                        _relationship_rows(
                            "package",
                            package_id,
                            package_titles,
                            "",
                            {},
                            relationship,
                        )
                    )

                for resource in package.get("resources") or []:
                    resource_relationships = resource.get("relationship") or []
                    if not resource_relationships:
                        continue

                    resource_id = resource.get("resource_id") or resource.get("id")
                    resource_titles = _translated_value(resource, "name_translated")
                    resources_with_relationships.append(
                        {
                            "resource_id": resource_id,
                            "name_translated": resource_titles,
                            "relationship": resource_relationships,
                        }
                    )
                    for relationship in resource_relationships:
                        writer.writerow(
                            _relationship_rows(
                                "resource",
                                package_id,
                                package_titles,
                                resource_id,
                                resource_titles,
                                relationship,
                            )
                        )

                # Packages without any relationship are left out of the JSONL.
                if package_relationships or resources_with_relationships:
                    record = {
                        "package_id": package_id,
                        "title_translated": package_titles,
                        "relationship": package_relationships,
                        "resources": resources_with_relationships,
                        "source_timestamp": fetched_at,
                    }
                    jsonl_file.write(json.dumps(record, ensure_ascii=False) + "\n")
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    fetch_relationships()