Skip to content

Commit ca0ef6f

Browse files
authored
#139 - Support checksums and conditional loading for a given CSV row (#322)
1 parent 43788da commit ca0ef6f

8 files changed

Lines changed: 364 additions & 3 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ See the [documentation on Initializer's logging properties](readme/rtprops.md#lo
231231
* Fix conceptsets domain to prevent incorrect unretiring of associated concept
232232
* Fix to ensure concept lookups correctly handle concepts with % characters
233233
* Fix for performance to only process displays preloaders if they are present within csvs
234+
* Add support for row-level checksums in CSV domains to enable only re-loading those rows that have changed
234235

235236
#### Version 2.11.0
236237
* Added support for patient flags (flags, flagpriorities, flagtags) domains

api/src/main/java/org/openmrs/module/initializer/InitializerConfig.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232

3333
import static org.openmrs.module.initializer.InitializerConstants.PROPS_DOMAINS;
3434
import static org.openmrs.module.initializer.InitializerConstants.PROPS_EXCLUDE;
35+
import static org.openmrs.module.initializer.InitializerConstants.PROPS_ROW_CHECKSUMS_ENABLED;
3536
import static org.openmrs.module.initializer.InitializerConstants.PROPS_SKIPCHECKSUMS;
3637
import static org.openmrs.module.initializer.InitializerConstants.PROPS_STARTUP_LOAD;
3738
import static org.openmrs.module.initializer.InitializerConstants.PROPS_STARTUP_LOAD_CONTINUE_ON_ERROR;
@@ -53,6 +54,8 @@ public class InitializerConfig implements InitializingBean {
5354

5455
private Boolean skipChecksums = false;
5556

57+
private Boolean rowChecksumsEnabled = false;
58+
5659
private String startupLoadingMode = "";
5760

5861
@Override
@@ -90,6 +93,8 @@ public void init() {
9093

9194
// checksums
9295
skipChecksums = BooleanUtils.toBoolean(Optional.ofNullable(getPropertyValue(PROPS_SKIPCHECKSUMS)).orElse(""));
96+
rowChecksumsEnabled = BooleanUtils
97+
.toBoolean(Optional.ofNullable(getPropertyValue(PROPS_ROW_CHECKSUMS_ENABLED)).orElse(""));
9398

9499
// Startup Loading Configuration
95100
startupLoadingMode = getPropertyValue(PROPS_STARTUP_LOAD);
@@ -138,6 +143,14 @@ public boolean skipChecksums() {
138143
return skipChecksums;
139144
}
140145

146+
/**
147+
* @return true to enable per-row checksum tracking for CSV loaders, false (default) to use only
148+
* file-level checksums
149+
*/
150+
public boolean isRowChecksumsEnabled() {
151+
return rowChecksumsEnabled;
152+
}
153+
141154
/**
142155
* @return how configuration should be loaded in at startup in the module activator
143156
*/

api/src/main/java/org/openmrs/module/initializer/InitializerConstants.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ public class InitializerConstants {
2929

3030
public static final String PROPS_SKIPCHECKSUMS = MODULE_ARTIFACT_ID + "." + "skip.checksums";
3131

32+
public static final String PROPS_ROW_CHECKSUMS_ENABLED = MODULE_ARTIFACT_ID + "." + "row.checksums.enabled";
33+
3234
/*
3335
* Startup properties
3436
*/

api/src/main/java/org/openmrs/module/initializer/api/ConfigDirUtil.java

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,19 @@
44
import java.io.FileInputStream;
55
import java.io.FilenameFilter;
66
import java.io.IOException;
7+
import java.nio.charset.StandardCharsets;
78
import java.nio.file.Files;
89
import java.nio.file.Path;
910
import java.nio.file.Paths;
1011
import java.util.ArrayList;
1112
import java.util.Arrays;
1213
import java.util.Collections;
14+
import java.util.HashSet;
1315
import java.util.List;
16+
import java.util.Map;
1417
import java.util.Optional;
18+
import java.util.Set;
19+
import java.util.TreeMap;
1520
import java.util.stream.Stream;
1621

1722
import org.apache.commons.codec.digest.DigestUtils;
@@ -36,6 +41,8 @@ public class ConfigDirUtil {
3641

3742
public static final String CHECKSUM_FILE_EXT = "checksum";
3843

44+
public static final String ROW_CHECKSUM_FILE_EXT = "rows.checksum";
45+
3946
protected static final Logger log = LoggerFactory.getLogger(ConfigDirUtil.class);
4047

4148
protected boolean skipChecksums = false;
@@ -303,6 +310,109 @@ public void writeChecksum(File configFile, String checksum) {
303310
}
304311
}
305312

313+
/**
314+
* Computes a stable hash for a single CSV row. Each populated column header is paired with its
315+
* (possibly empty) value, and pairs are sorted by header name. This means column reordering does
316+
* not change the hash, but column addition/removal/rename or any value change — including a
317+
* transition between "absent" and "present-but-empty" — does. The latter distinction matters
318+
* because some line processors (e.g. NestedConceptLineProcessor) treat a present-but-empty cell as
319+
* an explicit clear, distinct from an absent column.
320+
*
321+
* @param headerLine The CSV header line (column names).
322+
* @param line The CSV data row.
323+
* @return The MD5 hex hash of the normalized row.
324+
*/
325+
public static String computeRowChecksum(String[] headerLine, String[] line) {
326+
if (headerLine == null || line == null) {
327+
return NOT_COMPUTABLE_CHECKSUM;
328+
}
329+
Map<String, String> sorted = new TreeMap<>();
330+
for (int i = 0; i < headerLine.length; i++) {
331+
String header = headerLine[i];
332+
if (StringUtils.isEmpty(header)) {
333+
continue;
334+
}
335+
String value = (i < line.length && line[i] != null) ? line[i] : "";
336+
sorted.put(header, value);
337+
}
338+
StringBuilder sb = new StringBuilder();
339+
for (Map.Entry<String, String> e : sorted.entrySet()) {
340+
sb.append(e.getKey()).append('=').append(e.getValue()).append('\n');
341+
}
342+
return DigestUtils.md5Hex(sb.toString().getBytes(StandardCharsets.UTF_8));
343+
}
344+
345+
/**
346+
* Reads the previously-saved set of row hashes for a CSV configuration file.
347+
*
348+
* @param configFile The CSV configuration file.
349+
* @return The set of row hashes from the previous successful processing, or an empty set if no row
350+
* checksum file exists yet.
351+
*/
352+
public Set<String> readRowChecksums(File configFile) {
353+
final String rowChecksumFilename = getLocatedFilename(domainDirPath, configFile) + "." + ROW_CHECKSUM_FILE_EXT;
354+
final Path rowChecksumPath = Paths.get(domainChecksumsDirPath, rowChecksumFilename);
355+
Set<String> hashes = new HashSet<>();
356+
if (!rowChecksumPath.toFile().exists()) {
357+
return hashes;
358+
}
359+
try {
360+
for (String line : Files.readAllLines(rowChecksumPath, StandardCharsets.UTF_8)) {
361+
String trimmed = line.trim();
362+
if (!trimmed.isEmpty()) {
363+
hashes.add(trimmed);
364+
}
365+
}
366+
}
367+
catch (IOException e) {
368+
log.warn("Error reading row checksum file: " + rowChecksumPath, e);
369+
}
370+
return hashes;
371+
}
372+
373+
/**
374+
* Writes the given set of row hashes for a CSV configuration file. Overwrites any previously saved
375+
* row checksums for the same file.
376+
*
377+
* @param configFile The CSV configuration file.
378+
* @param rowChecksums The row hashes to persist.
379+
*/
380+
public void writeRowChecksums(File configFile, Set<String> rowChecksums) {
381+
if (skipChecksums) {
382+
return;
383+
}
384+
final String rowChecksumFilename = getLocatedFilename(domainDirPath, configFile) + "." + ROW_CHECKSUM_FILE_EXT;
385+
final Path rowChecksumPath = Paths.get(domainChecksumsDirPath, rowChecksumFilename);
386+
try {
387+
Files.deleteIfExists(rowChecksumPath);
388+
if (rowChecksums == null || rowChecksums.isEmpty()) {
389+
return;
390+
}
391+
List<String> sorted = new ArrayList<>(rowChecksums);
392+
Collections.sort(sorted);
393+
FileUtils.writeLines(rowChecksumPath.toFile(), "UTF-8", sorted);
394+
}
395+
catch (IOException e) {
396+
log.error("Error writing row checksum file at: " + rowChecksumPath, e);
397+
}
398+
}
399+
400+
/**
401+
* Removes the row checksum file for a CSV configuration file, if any.
402+
*
403+
* @param configFile The CSV configuration file.
404+
*/
405+
public void deleteRowChecksums(File configFile) {
406+
final String rowChecksumFilename = getLocatedFilename(domainDirPath, configFile) + "." + ROW_CHECKSUM_FILE_EXT;
407+
final Path rowChecksumPath = Paths.get(domainChecksumsDirPath, rowChecksumFilename);
408+
try {
409+
Files.deleteIfExists(rowChecksumPath);
410+
}
411+
catch (IOException e) {
412+
log.warn("Error deleting row checksum file at: " + rowChecksumPath, e);
413+
}
414+
}
415+
306416
/**
307417
* Removes the specified checksum file in the specified checksums directory.
308418
*

api/src/main/java/org/openmrs/module/initializer/api/loaders/BaseCsvLoader.java

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55
import java.io.File;
66
import java.io.IOException;
77
import java.io.InputStream;
8+
import java.util.ArrayList;
9+
import java.util.HashSet;
810
import java.util.List;
11+
import java.util.Set;
912
import java.util.stream.Collectors;
1013

1114
import org.openmrs.OpenmrsObject;
@@ -88,8 +91,32 @@ protected void load(InputStream is) throws Exception {
8891
//
8992

9093
final CsvParser<T, BaseLineProcessor<T>> parser = getParser(is);
91-
List<String[]> remainingLines = parser.getLines();
92-
int totalCount = remainingLines.size();
94+
final List<String[]> allLines = parser.getLines();
95+
final int totalCount = allLines.size();
96+
final String[] headerLine = parser.getHeaderLine();
97+
98+
final boolean rowChecksumsEnabled = cfg.isRowChecksumsEnabled() && !cfg.skipChecksums();
99+
final File file = getLoadedFile();
100+
final ConfigDirUtil dirUtil = getDirUtil();
101+
102+
final Set<String> previousRowHashes = rowChecksumsEnabled ? dirUtil.readRowChecksums(file) : new HashSet<>();
103+
104+
List<String[]> remainingLines;
105+
if (rowChecksumsEnabled) {
106+
remainingLines = new ArrayList<>();
107+
for (String[] line : allLines) {
108+
if (!previousRowHashes.contains(ConfigDirUtil.computeRowChecksum(headerLine, line))) {
109+
remainingLines.add(line);
110+
}
111+
}
112+
int skipped = totalCount - remainingLines.size();
113+
if (skipped > 0) {
114+
log.info(skipped + " of " + totalCount + " CSV rows in " + file.getName()
115+
+ " are unchanged since last load and will be skipped.");
116+
}
117+
} else {
118+
remainingLines = new ArrayList<>(allLines);
119+
}
93120

94121
int lastFailCount = 0;
95122
CsvFailingLines result = new CsvFailingLines();
@@ -100,11 +127,28 @@ protected void load(InputStream is) throws Exception {
100127
remainingLines = result.getFailingLines();
101128
}
102129

130+
// Row checksum bookkeeping: persist hashes for rows considered processed — i.e. all rows in
131+
// the file except those that failed on this run. Failing rows are excluded so they will be
132+
// retried on the next load.
133+
if (rowChecksumsEnabled) {
134+
Set<String> failingHashes = new HashSet<>();
135+
for (String[] failingLine : result.getFailingLines()) {
136+
failingHashes.add(ConfigDirUtil.computeRowChecksum(headerLine, failingLine));
137+
}
138+
Set<String> newRowHashes = new HashSet<>();
139+
for (String[] line : allLines) {
140+
String rowHash = ConfigDirUtil.computeRowChecksum(headerLine, line);
141+
if (!failingHashes.contains(rowHash)) {
142+
newRowHashes.add(rowHash);
143+
}
144+
}
145+
dirUtil.writeRowChecksums(file, newRowHashes);
146+
}
147+
103148
//
104149
// logging
105150
//
106151

107-
final File file = getLoadedFile();
108152
// success logging
109153
if (isEmpty(result.getFailingLines())) {
110154
log.info(file.getName() + " ('" + getDomainName() + "' domain) was entirely successfully processed.");

api/src/test/java/org/openmrs/module/initializer/api/ConfigDirUtilTest.java

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
package org.openmrs.module.initializer.api;
22

33
import static org.hamcrest.CoreMatchers.is;
4+
import static org.hamcrest.CoreMatchers.not;
45
import static org.hamcrest.MatcherAssert.assertThat;
56
import static org.hamcrest.Matchers.containsInAnyOrder;
67
import static org.hamcrest.Matchers.empty;
78
import static org.openmrs.module.initializer.api.ConfigDirUtil.CHECKSUM_FILE_EXT;
9+
import static org.openmrs.module.initializer.api.ConfigDirUtil.ROW_CHECKSUM_FILE_EXT;
10+
import static org.openmrs.module.initializer.api.ConfigDirUtil.computeRowChecksum;
811
import static org.openmrs.module.initializer.api.ConfigDirUtil.getLocatedFilename;
912

1013
import java.io.File;
@@ -14,8 +17,10 @@
1417
import java.util.Arrays;
1518
import java.util.Collection;
1619
import java.util.HashMap;
20+
import java.util.HashSet;
1721
import java.util.List;
1822
import java.util.Map;
23+
import java.util.Set;
1924
import java.util.stream.Collectors;
2025

2126
import org.apache.commons.codec.digest.DigestUtils;
@@ -136,6 +141,100 @@ public void getFiles_shouldHandleChecksumsWhenNestedFiles() throws IOException {
136141
}
137142
}
138143

144+
@Test
145+
public void computeRowChecksum_shouldBeStableUnderColumnReordering() {
146+
String[] header1 = { "uuid", "name", "description" };
147+
String[] row1 = { "abc-123", "Acme", "A clinic" };
148+
String[] header2 = { "name", "description", "uuid" };
149+
String[] row2 = { "Acme", "A clinic", "abc-123" };
150+
Assert.assertEquals(computeRowChecksum(header1, row1), computeRowChecksum(header2, row2));
151+
}
152+
153+
@Test
154+
public void computeRowChecksum_shouldChangeWhenAValueChanges() {
155+
String[] header = { "uuid", "name" };
156+
String[] row1 = { "abc-123", "Acme" };
157+
String[] row2 = { "abc-123", "Beta" };
158+
assertThat(computeRowChecksum(header, row1), is(not(computeRowChecksum(header, row2))));
159+
}
160+
161+
@Test
162+
public void computeRowChecksum_shouldDistinguishAbsentFromPresentButEmpty() {
163+
// Adding a column — even when its cell is empty — must change the hash, because some line
164+
// processors (e.g. NestedConceptLineProcessor) treat a present-but-empty cell as a directive
165+
// to clear an existing field and treat an absent column as "leave the field alone".
166+
String[] header1 = { "uuid", "name" };
167+
String[] row1 = { "abc-123", "Acme" };
168+
String[] header2 = { "uuid", "name", "description" };
169+
String[] row2 = { "abc-123", "Acme", null };
170+
String[] row3 = { "abc-123", "Acme", "" };
171+
assertThat(computeRowChecksum(header1, row1), is(not(computeRowChecksum(header2, row2))));
172+
assertThat(computeRowChecksum(header1, row1), is(not(computeRowChecksum(header2, row3))));
173+
// However null and empty values within an existing column should be treated identically,
174+
// since CsvParser normalizes blank cells to null on read.
175+
Assert.assertEquals(computeRowChecksum(header2, row2), computeRowChecksum(header2, row3));
176+
}
177+
178+
@Test
179+
public void computeRowChecksum_shouldChangeWhenAColumnIsRenamed() {
180+
String[] header1 = { "uuid", "name" };
181+
String[] header2 = { "uuid", "label" };
182+
String[] row = { "abc-123", "Acme" };
183+
assertThat(computeRowChecksum(header1, row), is(not(computeRowChecksum(header2, row))));
184+
}
185+
186+
@Test
187+
public void rowChecksums_shouldRoundTripThroughDisk() throws IOException {
188+
String configDirPath = getClass().getClassLoader().getResource("org/openmrs/module/initializer/include").getPath();
189+
String checksumsDirPath = Files.createTempDirectory("configuration_checksums_rows").toString();
190+
String domain = "file_patterns";
191+
192+
ConfigDirUtil dirUtil = new ConfigDirUtil(configDirPath, checksumsDirPath, domain);
193+
File configFile = new File(Paths.get(configDirPath, domain, "diagnoses.csv").toString());
194+
195+
Set<String> hashes = new HashSet<>();
196+
hashes.add("hash-a");
197+
hashes.add("hash-b");
198+
hashes.add("hash-c");
199+
200+
// Writing then reading should round-trip the set.
201+
dirUtil.writeRowChecksums(configFile, hashes);
202+
File rowsFile = Paths.get(checksumsDirPath, domain,
203+
getLocatedFilename(Paths.get(configDirPath, domain).toString(), configFile) + "." + ROW_CHECKSUM_FILE_EXT)
204+
.toFile();
205+
assertThat(rowsFile.exists(), is(true));
206+
assertThat(dirUtil.readRowChecksums(configFile), is(hashes));
207+
208+
// Writing an empty set should remove the file.
209+
dirUtil.writeRowChecksums(configFile, new HashSet<>());
210+
assertThat(rowsFile.exists(), is(false));
211+
212+
// Reading when no file exists returns an empty set.
213+
assertThat(dirUtil.readRowChecksums(configFile), is(empty()));
214+
}
215+
216+
@Test
217+
public void deleteRowChecksums_shouldRemoveTheRowChecksumFile() throws IOException {
218+
String configDirPath = getClass().getClassLoader().getResource("org/openmrs/module/initializer/include").getPath();
219+
String checksumsDirPath = Files.createTempDirectory("configuration_checksums_rows_delete").toString();
220+
String domain = "file_patterns";
221+
222+
ConfigDirUtil dirUtil = new ConfigDirUtil(configDirPath, checksumsDirPath, domain);
223+
File configFile = new File(Paths.get(configDirPath, domain, "diagnoses.csv").toString());
224+
225+
Set<String> hashes = new HashSet<>();
226+
hashes.add("hash-a");
227+
dirUtil.writeRowChecksums(configFile, hashes);
228+
229+
File rowsFile = Paths.get(checksumsDirPath, domain,
230+
getLocatedFilename(Paths.get(configDirPath, domain).toString(), configFile) + "." + ROW_CHECKSUM_FILE_EXT)
231+
.toFile();
232+
assertThat(rowsFile.exists(), is(true));
233+
234+
dirUtil.deleteRowChecksums(configFile);
235+
assertThat(rowsFile.exists(), is(false));
236+
}
237+
139238
/*
140239
* One of the CSV files has a non-parseable _order.
141240
* The resulting exception is logged as an error.

0 commit comments

Comments
 (0)