-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathextract_resource_data.py
More file actions
70 lines (58 loc) · 2.66 KB
/
extract_resource_data.py
File metadata and controls
70 lines (58 loc) · 2.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# -*- coding: utf-8 -*-
#!/usr/bin/env python
# NOTE that query-slinger is a good tool for downloading filtered versions of
# CKAN tables.
import sys, os
from pprint import pprint
import json
from gadgets import write_to_csv, get_all_records, get_fields, get_metadata
from icecream import ic
def obtain_resource(site, r_id, API_key, filename=None):
    """Download one CKAN resource to a CSV file plus a JSON metadata file.

    The records are written to ``filename`` and the resource metadata is
    written to ``filename + '-metadata.json'``.

    Tests suggest that there are some small differences between this file
    and the one downloaded directly through the CKAN web interface:

    1. The row order is different. This version is ordered by _id
       (logically), while the downloaded version has the same weird order
       as that shown in the Data Explorer (which can have an _id sequence
       like 1,2,3,4,5,6,7,45,8,9,10... for no obvious reason; it may be
       that the data is being sorted by the order they appear in the
       database, which may be by their time of last update).
    2. Weird non-ASCII characters are not being handled optimally, so
       probably some work on character encoding is in order in the
       Python script.

    Args:
        site: CKAN site, passed through unchanged to the ``gadgets`` helpers.
        r_id: ID of the resource to download.
        API_key: CKAN API key, passed through to the ``gadgets`` helpers.
        filename: Output CSV path. Defaults to ``"<r_id>.csv"``.

    Returns:
        True if the CSV and metadata files were written; False if the
        records or fields could not be obtained.
    """
    # Fetched once and reused for the JSON file written at the end (the
    # original fetched the same metadata a second time inside the try block).
    metadata = get_metadata(site, r_id, API_key)
    ic(metadata)

    if filename is None:
        filename = "{}.csv".format(r_id)

    try:
        list_of_dicts = get_all_records(site, r_id, API_key, chunk_size=20000)
        print("len(list_of_dicts) = {}".format(len(list_of_dicts)))
        fields = get_fields(site, r_id, API_key)
        print("len(fields) = {}".format(len(fields)))
    except Exception as err:
        # Exception (not a bare except) so that Ctrl-C and SystemExit still
        # propagate instead of being reported as a download failure.
        print("Something went wrong and the resource/fields/metadata was not obtained.")
        print("(Note that if the CKAN package is private, this function can not obtain its data through SQL queries in older versions of CKAN.)")
        print("Error: {!r}".format(err))
        return False

    # Eliminate the _id field from the CSV columns; tolerate a field list
    # that does not contain it rather than raising ValueError.
    if "_id" in fields:
        fields.remove("_id")
    print("The resource has the following fields: {}".format(fields))
    write_to_csv(filename, list_of_dicts, fields)

    metaname = filename + '-metadata.json'
    with open(metaname, 'w', encoding='utf-8') as outfile:
        json.dump(metadata, outfile, indent=4, sort_keys=True)
    return True
def main():
    """Command-line entry point.

    Usage: ``extract_resource_data.py <resource_id> [output_filename]``

    Reads the resource ID (and optional output filename) from ``sys.argv``,
    loads ``API_key`` and ``site`` from the local ``credentials`` module, and
    passes them to ``obtain_resource``.

    Returns:
        The result of ``obtain_resource``: True on success, False on failure.

    Raises:
        SystemExit: If no resource ID was given on the command line.
    """
    print("This script may work best under Python 3 to handle Unicode data.")
    # obtain_resource opens its metadata output file with encoding='utf-8'.

    # Exit with a usage message instead of an IndexError traceback when the
    # required argument is missing.
    if len(sys.argv) < 2:
        sys.exit("Usage: extract_resource_data.py <resource_id> [output_filename]")

    resource_id = sys.argv[1]
    filename = sys.argv[2] if len(sys.argv) > 2 else None

    # Imported here rather than at module level, as in the original, so the
    # credentials module is only required when the script is actually run.
    from credentials import API_key, site

    return obtain_resource(site, resource_id, API_key, filename)
############
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()