Skip to content

Commit ea2e101

Browse files
committed
Add --validate-data flag to run in validation mode
In this mode we validate the data against the table schema and report any errors. No RDFization is done, and upon a failure a non-zero return code is propagated to the shell.
1 parent 9fcfa8c commit ea2e101

7 files changed

Lines changed: 140 additions & 8 deletions

File tree

src/csv2rdf/csvw.clj

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,53 @@
1818
annotated-rows (csv/annotated-rows url table dialect)]
1919
(table-statements context table annotated-rows)))
2020

21+
(defn annotate-tables [tabular-source metadata-source]
22+
(processing/get-metadata tabular-source metadata-source))
23+
24+
(defn- validate-rows
25+
"Validates the CSVW schema for the given tabular file, metadata and options.
26+
27+
`tabular-source` and `metadata-source` can be any of the following
28+
types:
29+
30+
- java.io.File
31+
- java.lang.String
32+
- java.net.URI
33+
- java.nio.file.Path (including nio Paths that are inside zip filesystems)
34+
35+
If metadata-source is non-nil then processing will start from the
36+
associated metadata document, otherwise it will start from
37+
tabular-source."
38+
[tabular-source metadata-source]
39+
(let [{:keys [tables] :as metadata} (processing/get-metadata tabular-source metadata-source)
40+
table-group-dialect (:dialect metadata)
41+
output-tables (remove properties/suppress-output? tables)
42+
;;ctx (table-group-context mode metadata) ;; TODO this might be useful later when iterating over tables
43+
]
44+
45+
(util/liberal-mapcat (fn [{:keys [url dialect] :as table}]
46+
;;(validated-rows ctx table table-group-dialect)
47+
(let [dialect (or dialect table-group-dialect)]
48+
(csv/annotated-rows url table dialect)))
49+
50+
output-tables)))
51+
52+
(defn only-validate-schema
53+
"Only validate the data against the schemas in the metadata file, and
54+
report errors. Does not convert into RDF.
55+
56+
Returns a map with the key `:data-validation-errors?` set to a
57+
boolean indicating whether any schema errors occurred."
58+
[{:keys [tabular-source metadata-source]}]
59+
(let [errors? (atom false)]
60+
(doseq [{:keys [cells] row-number :source-number :as row} (validate-rows tabular-source metadata-source)
61+
{:keys [errors column-number column] :as cell} cells
62+
:when (seq errors)]
63+
(reset! errors? true)
64+
(doseq [error errors]
65+
(println (format "Row #%d col #%d (column '%s') has error: " row-number column-number (:name column)) error)))
66+
{:data-validation-errors? @errors?}))
67+
2168
(defn csv->rdf
2269
"Runs the CSVW process for the given tabular or metadata data sources
2370
and options.

src/csv2rdf/main.clj

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
(def options-spec
1313
[["-t" "--tabular TABULAR" "Location of the tabular file"]
1414
["-u" "--user-metadata METADATA" "Location of the metadata file"]
15+
["-s" "--validate-schema" "Validate the schema only"]
16+
["-d" "--validate-data" "Validate the data against the schema only (no RDFization)"]
1517
["-o" "--output-file OUTPUT" "Output file to write to"]
1618
["-m" "--mode MODE" "CSVW mode to run"
1719
:validate [#(contains? #{:minimal :standard :annotated} %)]
@@ -67,30 +69,37 @@
6769
(println "Usage:")
6870
(println summary)))
6971

70-
(defn- inner-main [args]
72+
73+
74+
(defn inner-main [args]
7175
(let [options (parse-cli-options args)
72-
{:keys [mode tabular user-metadata output-file]} options
76+
{:keys [mode tabular user-metadata output-file validate-data annotate-tables]} options
7377
opts {:tabular-source (some-> tabular parse-source)
7478
:metadata-source (some-> user-metadata parse-source)
7579
:rdf-format (or (some-> output-file formats/->rdf-format) RDFFormat/TURTLE)
7680
:mode mode}
7781
output-file (some-> output-file io/file)]
78-
(if output-file
79-
(with-open [w (io/writer output-file)]
80-
(write-output w opts))
81-
(write-output (io/writer *out*) opts))))
82+
83+
(cond validate-data (csvw/only-validate-schema opts)
84+
85+
:else (if output-file
86+
(with-open [w (io/writer output-file)]
87+
(write-output w opts))
88+
(write-output (io/writer *out*) opts)))))
8289

8390
(defn- -main [& args]
8491
(try
85-
(inner-main args)
86-
(System/exit 0)
92+
(if (:data-validation-errors? (inner-main args))
93+
(System/exit 2)
94+
(System/exit 0))
8795
(catch Throwable ex
8896
(display-error ex)
8997
(System/exit 1))))
9098

9199

92100
(comment
93101

102+
(inner-main ["-s" "-t" "/Users/rick/repos/dclg-epcs/resources/public/csvw/basic/certificates.csv" "-u" "/Users/rick/repos/dclg-epcs/resources/public/csvw/basic/epc_domestic.json"])
94103
(time (inner-main ["-t" "out/hmrc-rts-small-area.csv" "-u" "out/hmrc-rts-small-area.csv-metadata.json" "-m" "annotated" "-o" "cube.nt"]))
95104

96105
(require '[clj-async-profiler.core :as prof])

test/csv2rdf/main_test.clj

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
(ns csv2rdf.main-test
2+
(:require [csv2rdf.main :as sut]
3+
[clojure.test :as t]))
4+
5+
;; See issue 47
6+
;; Resolving template property URIs with values containing spaces should work
7+
8+
(defmacro capture
9+
"Capture return value of body and stdout, and return a hashmap
10+
of :return-value and :stdout."
11+
[body]
12+
`(let [s# (new java.io.StringWriter)]
13+
(binding [*out* s#]
14+
(let [ret# ~body]
15+
{:return-value ret#
16+
:stdout (str s#)}))))
17+
18+
(t/deftest inner-main-test-validate-data
19+
(t/testing "--validate-data")
20+
(let [{:keys [return-value stdout]}
21+
(capture (sut/inner-main ["-t" "./test/examples/validation/success.csv"
22+
"-u" "./test/examples/validation/named-numbers.json"
23+
"--validate-data"]))]
24+
(t/is (= {:data-validation-errors? false} return-value))
25+
(t/is (= "" stdout)))
26+
27+
(let [{:keys [return-value stdout]}
28+
(capture (sut/inner-main ["-t" "./test/examples/validation/fail-1.csv"
29+
"-u" "./test/examples/validation/named-numbers.json"
30+
"--validate-data"]))]
31+
(t/is (= {:data-validation-errors? true} return-value))
32+
(t/is (= "Row #3 col #2 (column 'number') has error: Cannot parse 'two' as type 'int': For input string: \"two\"\n"
33+
stdout)))
34+
35+
(let [{:keys [return-value stdout]}
36+
(capture (sut/inner-main ["-t" "./test/examples/validation/fail-2.csv"
37+
"-u" "./test/examples/validation/named-numbers.json"
38+
"--validate-data"]))]
39+
(t/is (= {:data-validation-errors? true} return-value))
40+
(t/is (= "Row #3 col #2 (column 'number') has error: Cannot parse 'three' as type 'int': For input string: \"three\"\n"
41+
stdout))))
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name,number
2+
one,1
3+
two,two
4+
three,3
5+
four,4
6+
five,5
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
name,number
2+
one,1
3+
3,three
4+
four,4
5+
five,5
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"@context": "http://www.w3.org/ns/csvw",
3+
"url": "fail-2.csv",
4+
"tableSchema": {
5+
"columns": [
6+
{
7+
"name": "name",
8+
"datatype": "string",
9+
"required": true
10+
},
11+
{
12+
"name": "number",
13+
"required": true,
14+
"datatype": "int"
15+
}
16+
]
17+
}
18+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name,number
2+
one,1
3+
two,2
4+
three,3
5+
four,4
6+
five,5

0 commit comments

Comments
 (0)