Skip to content

Commit 5c35a46

Browse files
author
Steve Baskauf
authored
Merge pull request #19 from HeardLibrary/v1_7_1
update vanderbot.py to v1.7.1
2 parents cb036a9 + 77acda9 commit 5c35a46

2 files changed

Lines changed: 111 additions & 47 deletions

File tree

vanderbot/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ Username and password are created on the `Bot passwords` page, accessed from `Sp
6363
| --path | -P | credentials directory: "home", "working", or path with trailing "/" | "home" |
6464
| --update | -U | "allow" or "suppress" automatic updates to labels and descriptions | "suppress" |
6565
| --endpoint | -E | a Wikibase SPARQL endpoint URL | "https://query.wikidata.org/sparql" |
66+
| --version | -V | no values; displays current version information | |
67+
| --help | -H | no values; displays link to this page | |
6668

6769
**Examples:**
6870

vanderbot/vanderbot.py

Lines changed: 109 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
# VanderBot v1.7 (2021-03-01) vanderbot.py
1+
# VanderBot, a script for writing CSV data to a Wikibase API. vanderbot.py
2+
version = '1.7.1'
3+
created = '2021-04-xx'
4+
25
# (c) 2021 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
36
# Author: Steve Baskauf
47
# For more information, see https://github.com/HeardLibrary/linked-data/tree/master/vanderbot
@@ -86,6 +89,10 @@
8689
# - enable logging of some errors to be displayed (and saved to the log file if used): label/description fault, date fault
8790
# - prior to writing new items, check that there are no existing items with the same labels and descriptions
8891
# - move mutable configuration variables to the top of the script
92+
# -----------------------------------------
93+
# Version 1.7.1 change notes (2021-03-xx):
94+
# - enable --version option.
95+
# - add more complete error trapping for dates
8996

9097
import json
9198
import requests
@@ -94,6 +101,8 @@
94101
from time import sleep
95102
import sys
96103
import uuid
104+
import re
105+
from datetime import datetime
97106

98107
# Change the following lines to hard-code different defaults if not running from the command line.
99108

@@ -102,7 +111,7 @@
102111
log_object = sys.stdout # log output defaults to the console screen
103112
allow_label_description_changes = False # labels and descriptions in the local CSV file that differ from existing Wikidata items are not automatically written
104113
endpoint = 'https://query.wikidata.org/sparql' # default to the Wikidata Query Service endpoint
105-
sparqlSleep = 0.25 # delay time between calls to SPARQL endpoint
114+
sparqlSleep = 0.1 # delay time between calls to SPARQL endpoint
106115
json_metadata_description_file = 'csv-metadata.json' # "Generating RDF from Tabular Data on the Web" metadata description file (mapping schema)
107116
credentials_path_string = 'home' # value is "home", "working", "gdrive", or a relative or absolute path with trailing "/"
108117
credentials_filename = 'wikibase_credentials.txt' # name of the API credentials file
@@ -115,9 +124,40 @@
115124
password=465jli90dslhgoiuhsaoi9s0sj5ki3lo
116125
'''
117126

127+
arg_vals = sys.argv[1:]

# Handle the no-value options first: both print their text and terminate the script.
# --version output follows https://www.gnu.org/prep/standards/html_node/_002d_002dversion.html
if not {'--version', '-V'}.isdisjoint(arg_vals):
    # Drop the flag so it cannot disturb the option/value pairing done later.
    # Not strictly needed because the script exits below, but it is the pattern
    # to follow for any future no-value option.
    if '-V' in arg_vals:
        arg_vals.remove('-V')
    if '--version' in arg_vals:
        arg_vals.remove('--version')
    print('\n'.join((
        f'VanderBot {version}',
        f'Copyright © {created[:4]} Vanderbilt University',
        'License GNU GPL version 3.0 <http://www.gnu.org/licenses/gpl-3.0>',
        'This is free software: you are free to change and redistribute it.',
        'There is NO WARRANTY, to the extent permitted by law.',
        'Author: Steve Baskauf',
        f'Revision date: {created}',
    )))
    sys.exit()

# --help output; still needs to be expanded with brief info on invoking the program
if not {'--help', '-H'}.isdisjoint(arg_vals):
    print('\n'.join((
        'For help, see the VanderBot landing page at https://github.com/HeardLibrary/linked-data/blob/master/vanderbot/README.md',
        'Report bugs to: steve.baskauf@vanderbilt.edu',
    )))
    sys.exit()
150+
118151
# Code from https://realpython.com/python-command-line-arguments/#a-few-methods-for-parsing-python-command-line-arguments
# Split the remaining command-line tokens into option flags and their values.
# The two lists are paired by position, so every option left in arg_vals must carry exactly one value.
opts = [opt for opt in arg_vals if opt.startswith('-')]
args = [arg for arg in arg_vals if not arg.startswith('-')]

# --update / -U: allow labels and descriptions that differ locally from existing Wikidata items to be updated.
# This check previously tested '--version' / '-V', which can never be present here because the
# script exits as soon as it sees them; the flags documented for this behavior are --update / -U.
for update_flag in ('--update', '-U'):
    if update_flag in opts and args[opts.index(update_flag)] == 'allow':
        allow_label_description_changes = True
121161

122162
if '--log' in opts: # set output to specified log file or path including file name
123163
log_path = args[opts.index('--log')]
@@ -390,6 +430,38 @@ def generateNodeId(rowData, columnNameRoot):
390430
rowData[columnNameRoot + '_nodeId'] = str(uuid.uuid4())
391431
return rowData
392432

433+
# Function to check for the particular form of xsd:dateTime required for full dates in Wikidata
434+
# See https://stackoverflow.com/questions/41129921/validate-an-iso-8601-datetime-string-in-python
435+
regex = r'^(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[0-9])-(3[01]|0[0-9]|[12][0-9])T([0][0]):([0][0]):([0][0])(Z)$'
# fullmatch (rather than match) so that a value with a trailing newline is rejected:
# with match, the '$' anchor also succeeds just before a final '\n'.
match_iso8601 = re.compile(regex).fullmatch


def validate_iso8601(str_val):
    """Return True if str_val is a full date in the dateTime form checked here.

    The accepted form is [-]YYYY-MM-DDT00:00:00Z: an optional minus sign, a year
    of at least four digits, a month of 00-12, a day of 00-31, and a time part
    that must be exactly T00:00:00Z. Month and day values of 00 are accepted.

    Returns False for any string that does not match, and for any value that is
    not a string (for example None).
    """
    try:
        return match_iso8601(str_val) is not None
    except TypeError:
        # The compiled pattern raises TypeError for non-string input; treat it as invalid.
        return False
444+
445+
# Function to check for valid abbreviated dates
446+
def validate_time(date_text):
    """Classify an abbreviated date string by its precision.

    The text is tried against YYYY-MM-DD, then YYYY-MM, then YYYY. A format
    counts only if the text parses AND formatting the parsed date back gives
    exactly the same text, so a value that parses but is not written in the
    same form (e.g. '2021-3-1') is rejected.

    Returns 'day', 'month', or 'year' for the first format that fits, or
    'none' when no format fits.
    """
    candidate_formats = (
        ('%Y-%m-%d', 'day'),
        ('%Y-%m', 'month'),
        ('%Y', 'year'),
    )
    for pattern, label in candidate_formats:
        try:
            round_trip = datetime.strptime(date_text, pattern).strftime(pattern)
        except ValueError:
            # text does not parse in this format; fall through to the next, coarser one
            continue
        if round_trip == date_text:
            return label
    return 'none'
464+
393465
# Function to convert times to the format required by Wikidata
394466
def convertDates(rowData, dateColumnNameRoot):
395467
error = False
@@ -404,33 +476,43 @@ def convertDates(rowData, dateColumnNameRoot):
404476
timeString = rowData[dateColumnNameRoot + '_val']
405477

406478
value = rowData[dateColumnNameRoot + '_val']
479+
date_type = validate_time(value)
407480
# date is YYYY-MM-DD
408-
if len(value) == 10:
481+
if date_type == 'day':
409482
timeString = value + 'T00:00:00Z'
410483
precisionNumber = 11 # precision to days
411484
# date is YYYY-MM
412-
elif len(value) == 7:
485+
elif date_type == 'month':
413486
timeString = value + '-00T00:00:00Z'
414487
precisionNumber = 10 # precision to months
415488
# date is YYYY
416-
elif len(value) == 4:
489+
elif date_type == 'year':
417490
timeString = value + '-00-00T00:00:00Z'
418491
precisionNumber = 9 # precision to years
419-
# date is xsd:dateTime and doesn't need adjustment
420-
elif len(value) == 20:
421-
timeString = value
422-
precisionNumber = 11 # assume precision to days since Wikibase doesn't support greater resolution than that
423-
# date form unknown, don't adjust
492+
# date does not conform to any of the tested options
424493
else:
425-
#print('Warning: date for ' + dateColumnNameRoot + '_val:', rowData[dateColumnNameRoot + '_val'], 'does not conform to any standard format! Check manually.')
426-
error = True
427-
precisionNumber = '' # must have a value to prevent an error, will be ignored since the write and save will be killed
494+
# date is xsd:dateTime and doesn't need adjustment
495+
if validate_iso8601(value):
496+
timeString = value
497+
precisionNumber = 11 # assume precision to days since Wikibase doesn't support greater resolution than that
498+
# date form unknown, don't adjust
499+
else:
500+
#print('Warning: date for ' + dateColumnNameRoot + '_val:', rowData[dateColumnNameRoot + '_val'], 'does not conform to any standard format! Check manually.')
501+
error = True
502+
precisionNumber = '' # must have a value to prevent an error, will be ignored since the write and save will be killed
428503
# assign the changed values back to the dict
429504
rowData[dateColumnNameRoot + '_val'] = timeString
430505
rowData[dateColumnNameRoot + '_prec'] = precisionNumber
431506
else:
432-
# a pre-existing precisionNumber must be an integer when written to the API
433-
rowData[dateColumnNameRoot + '_prec'] = int(rowData[dateColumnNameRoot + '_prec'])
507+
# Check that a pre-existing value for the date string conforms to the Wikidata format requirements
508+
if validate_iso8601(rowData[dateColumnNameRoot + '_val']):
509+
# a pre-existing precisionNumber must be an integer when written to the API
510+
try:
511+
rowData[dateColumnNameRoot + '_prec'] = int(rowData[dateColumnNameRoot + '_prec'])
512+
except: # throw an error if characters can't be converted to an integer
513+
error = True
514+
else:
515+
error = True
434516

435517
return rowData, error
436518

@@ -1194,9 +1276,6 @@ def attemptPost(apiUrl, parameters):
11941276
print()
11951277
print()
11961278

1197-
# If there are dates in the table that are not in the format Wikibase requires, they will be converted here
1198-
print('converting dates and generating value node IDs')
1199-
12001279
# Figure out the column name roots for column sets that are dates and value nodes
12011280
dateColumnNameList = []
12021281
valueColumnNameList = []
@@ -1226,29 +1305,6 @@ def attemptPost(apiUrl, parameters):
12261305
valueColumnNameList.append(propertiesReferencesList[propertyNumber][referenceNumber]['refValueColumnList'][refPropNumber])
12271306
#print(dateColumnNameList)
12281307

1229-
#errorFlag = False
1230-
#for rowNumber in range(0, len(tableData)):
1231-
#print('row: ' + str(rowNumber))
1232-
#print(tableData[rowNumber])
1233-
# for dateColumnName in dateColumnNameList:
1234-
# tableData[rowNumber], error = convertDates(tableData[rowNumber], dateColumnName)
1235-
# if error:
1236-
# errorFlag = True
1237-
# error_log += 'Incorrect date format in row ' + str(rowNumber) + ', column ' + dateColumnName + '\n'
1238-
# for valueColumnName in valueColumnNameList:
1239-
# tableData[rowNumber] = generateNodeId(tableData[rowNumber], valueColumnName)
1240-
#print(tableData[rowNumber])
1241-
#print()
1242-
1243-
# Write the file with the converted dates in case the script crashes
1244-
#writeToFile(tableFileName, fieldnames, tableData)
1245-
1246-
# If any of the date formats in the table were bad, don't try to write to the API
1247-
#if errorFlag:
1248-
# print(error_log)
1249-
# sys.exit('Fix incorrectly formatted dates in file and restart')
1250-
#print()
1251-
12521308
# Find out what languages are represented in the labels and descriptions
12531309
languages_list = labelLanguageList + descriptionLanguageList
12541310
languages_list = list(set(languages_list))
@@ -1296,7 +1352,8 @@ def attemptPost(apiUrl, parameters):
12961352
new_item = True
12971353
if log_path != '': # if logging to a file, print something so we know something is going on
12981354
print(status_message)
1299-
print(status_message, file=log_object)
1355+
if status_message != 'processing row: ':
1356+
print(status_message, file=log_object)
13001357

13011358
if new_item:
13021359
# -------------
@@ -1673,8 +1730,9 @@ def attemptPost(apiUrl, parameters):
16731730

16741731
# don't try to write if there aren't any data to send
16751732
if parameterDictionary['data'] == '{}':
1676-
print('no data to write', file=log_object)
1677-
print('', file=log_object)
1733+
pass
1734+
#print('no data to write', file=log_object)
1735+
#print('', file=log_object)
16781736
else:
16791737
if maxlag > 0:
16801738
parameterDictionary['maxlag'] = maxlag
@@ -1747,7 +1805,11 @@ def attemptPost(apiUrl, parameters):
17471805
if count > 1:
17481806
# I don't think this should actually happen, since if there were already at least one statement with this value,
17491807
# it would have already been downloaded in the processing prior to running this script.
1750-
print('Warning: duplicate statement ', tableData[rowNumber][subjectWikidataIdColumnHeader], ' ', propertiesIdList[statementIndex], ' ', tableData[rowNumber][propertiesColumnList[statementIndex]])
1808+
# OK, here's the situation where it happens: the script fails or is killed after writing to the API, but before the data are written to the CSV.
1809+
# In that case, the statement will be written a second time and both will show up in the JSON returned from the API
1810+
# Build the warning as a single string. A comma-separated expression here would create a
# tuple, which prints as a tuple and cannot be concatenated onto the error_log string.
dup_message = ' '.join([
    'Warning: duplicate statement',
    str(tableData[rowNumber][subjectWikidataIdColumnHeader]),
    str(propertiesIdList[statementIndex]),
    str(tableData[rowNumber][propertiesColumnList[statementIndex]]),
])
print(dup_message)
error_log += dup_message + '\n'
17511813
tableData[rowNumber][propertiesUuidColumnList[statementIndex]] = statement['id'].split('$')[1] # just keep the UUID part after the dollar sign
17521814

17531815
if 'references' in statement: # skip reference checking if the item doesn't have any references

0 commit comments

Comments
 (0)