Skip to content

Commit 5f5bb97

Browse files
authored
Merge pull request #1566 from lesserwhirls/gh-1542
Zarr bug fix and improvements
2 parents d8e4e42 + 65eb73b commit 5f5bb97

410 files changed

Lines changed: 138 additions & 10 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZarrHeader.java

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ private class DelayedVarMaker {
5454
private RandomAccessDirectoryItem var;
5555
private ZArray zarray;
5656
private Map<Integer, Long> initializedChunks; // track initialized chunks for var (chunk index -> item length)
57+
private Map<Integer, Long> chunkStarts; // byte offset of each chunk within the store, keyed by chunk index
5758
private List<Attribute> attrs; // list of variable attributes
5859
private long dataOffset; // byte position where data starts
5960

@@ -65,6 +66,7 @@ void setVar(RandomAccessDirectoryItem var) {
6566
this.var = var;
6667
this.attrs = null;
6768
this.initializedChunks = new HashMap<>();
69+
this.chunkStarts = new HashMap<>();
6870
this.dataOffset = -1;
6971
if (var != null) {
7072
try {
@@ -104,6 +106,11 @@ void processItem(RandomAccessDirectoryItem item) {
104106
this.var = null; // skip rest of var if unrecognized files found
105107
}
106108
this.initializedChunks.put(index, item.length());
109+
// Record the actual byte offset of this chunk within the store, keyed by its numeric chunk index.
110+
// This avoids any dependency on the order in which the store lists files (which is lexicographic
111+
// and would otherwise place e.g. chunk 0.10 before chunk 0.2, which is the root cause of
112+
// https://github.com/Unidata/netcdf-java/issues/1542)
113+
this.chunkStarts.put(index, item.startIndex());
107114
// if data offset is uninitialized, set here
108115
if (this.dataOffset < 0) {
109116
this.dataOffset = item.startIndex();
@@ -115,7 +122,7 @@ void makeVar() {
115122
return; // do nothing if no variable is in progress
116123
}
117124
try {
118-
makeVariable(var, dataOffset, zarray, initializedChunks, attrs);
125+
makeVariable(var, dataOffset, zarray, initializedChunks, chunkStarts, attrs);
119126
} catch (ZarrFormatException ex) {
120127
logger.error(ex.getMessage());
121128
}
@@ -200,7 +207,8 @@ private void makeGroup(RandomAccessDirectoryItem item, List<Attribute> attrs) {
200207
}
201208

202209
private void makeVariable(RandomAccessDirectoryItem item, long dataOffset, ZArray zarray,
203-
Map<Integer, Long> initializedChunks, List<Attribute> attrs) throws ZarrFormatException {
210+
Map<Integer, Long> initializedChunks, Map<Integer, Long> chunkStarts, List<Attribute> attrs)
211+
throws ZarrFormatException {
204212
// make new Variable
205213
Variable.Builder<?> var = Variable.builder();
206214
String location = ZarrUtils.trimLocation(item.getLocation());
@@ -303,7 +311,7 @@ private void makeVariable(RandomAccessDirectoryItem item, long dataOffset, ZArra
303311

304312
// create VInfo
305313
VInfo vinfo = new VInfo(chunks, zarray.getFillValue(), zarray.getCompressor(), zarray.getByteOrder(),
306-
zarray.getOrder(), zarray.getSeparator(), zarray.getFilters(), dataOffset, initializedChunks,
314+
zarray.getOrder(), zarray.getSeparator(), zarray.getFilters(), dataOffset, initializedChunks, chunkStarts,
307315
zarray.getElementSize(), zarray.isUnicodeString());
308316
var.setSPobject(vinfo);
309317

@@ -389,7 +397,7 @@ private static int getChunkIndex(RandomAccessDirectoryItem item, ZArray zarray)
389397
int[] shape = zarray.getShape();
390398
int[] chunkSize = zarray.getChunks();
391399
for (int i = 0; i < nDims; i++) {
392-
nChunks[i] = (int) Math.ceil(shape[i] / chunkSize[i]);
400+
nChunks[i] = (int) Math.ceil((double) shape[i] / chunkSize[i]);
393401
}
394402
return ZarrUtils.subscriptsToIndex(subs, nChunks);
395403
} else {
@@ -422,12 +430,13 @@ class VInfo {
422430
private final List<Filter> filters;
423431
private final long offset;
424432
private final Map<Integer, Long> initializedChunks;
433+
private final Map<Integer, Long> chunkStarts;
425434
private final int elementSize;
426435
private final boolean unicodeString;
427436

428437
VInfo(int[] chunks, Object fillValue, Filter compressor, ByteOrder byteOrder, ZArray.Order order, String separator,
429-
List<Filter> filters, long offset, Map<Integer, Long> initializedChunks, int elementSize,
430-
boolean unicodeString) {
438+
List<Filter> filters, long offset, Map<Integer, Long> initializedChunks, Map<Integer, Long> chunkStarts,
439+
int elementSize, boolean unicodeString) {
431440
this.chunks = chunks;
432441
this.fillValue = fillValue;
433442
this.byteOrder = byteOrder;
@@ -437,6 +446,7 @@ class VInfo {
437446
this.filters = filters;
438447
this.offset = offset;
439448
this.initializedChunks = initializedChunks;
449+
this.chunkStarts = chunkStarts;
440450
this.elementSize = elementSize;
441451
this.unicodeString = unicodeString;
442452
}
@@ -477,6 +487,10 @@ public Map<Integer, Long> getInitializedChunks() {
477487
return this.initializedChunks;
478488
}
479489

490+
public Map<Integer, Long> getChunkStarts() {
491+
return this.chunkStarts;
492+
}
493+
480494
int getElementSize() {
481495
return this.elementSize;
482496
}

cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZarrLayoutBB.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ public class ZarrLayoutBB implements LayoutBB {
3838
private int totalNChunks; // total number of chunks
3939
private boolean F_order = false; // F order storage?
4040
private Map<Integer, Long> initializedChunks; // set of chunks that exist as files and their compressed size
41+
private Map<Integer, Long> chunkStarts; // byte offset of each existing chunk within the store, keyed by chunk index
4142
private Filter compressor;
4243
private List<Filter> filters;
4344

@@ -55,12 +56,13 @@ public ZarrLayoutBB(Variable v2, Section wantSection, RandomAccessFile raf) {
5556
this.chunkSize = vinfo.getChunks();
5657
int ndims = this.chunkSize.length;
5758
this.initializedChunks = vinfo.getInitializedChunks();
59+
this.chunkStarts = vinfo.getChunkStarts();
5860
this.nChunks = new int[ndims];
5961
this.totalNChunks = 1;
6062
for (int i = 0; i < ndims; i++) {
6163
Dimension dim = v2.getDimension(i);
6264
// round up nchunks if not evenly divisible by chunk size
63-
this.nChunks[i] = (int) Math.ceil(dim.getLength() / this.chunkSize[i]);
65+
this.nChunks[i] = (int) Math.ceil((double) dim.getLength() / this.chunkSize[i]);
6466
this.totalNChunks *= nChunks[i];
6567
}
6668

@@ -120,15 +122,16 @@ private class DataChunkIterator implements LayoutBBTiled.DataChunkIterator {
120122
DataChunkIterator() {
121123
this.currChunk = new int[chunkSize.length];
122124
this.chunkNum = 0;
123-
this.currOffset = varOffset; // start at start of variable data
125+
this.currOffset = chunkStarts.getOrDefault(this.chunkNum, varOffset);
124126
}
125127

126128
public boolean hasNext() {
127129
return this.chunkNum < totalNChunks;
128130
}
129131

130132
public LayoutBBTiled.DataChunk next() {
131-
DataChunk chunk = new ZarrLayoutBB.DataChunk(this.currChunk, this.chunkNum, this.currOffset);
133+
long offset = chunkStarts.getOrDefault(this.chunkNum, this.currOffset);
134+
DataChunk chunk = new ZarrLayoutBB.DataChunk(this.currChunk, this.chunkNum, offset);
132135
incrementChunk();
133136
return chunk;
134137
}
@@ -142,7 +145,6 @@ private void incrementChunk() {
142145
i--;
143146
}
144147
this.currChunk[i]++;
145-
this.currOffset += initializedChunks.getOrDefault(this.chunkNum, (long) 0);
146148
this.chunkNum = ZarrUtils.subscriptsToIndex(this.currChunk, nChunks);
147149
} else {
148150
// scalar array
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import numpy as np
2+
import zarr
3+
4+
store = zarr.storage.LocalStore('../test_o10_multichunk.zarr')
5+
6+
# create array
7+
data = np.arange(10000).reshape((100,100))
8+
9+
root_group = zarr.group(store, overwrite=True, zarr_format=2)
10+
11+
# create array with at least 10 chunks in each dimension
12+
# 10 chunks in first dimension, 20 chunks in second
13+
# so chunks will be [0-9].[0-19]
14+
multichunk = root_group.create_array('ten_by_five', shape=data.shape, chunks=(10,5), dtype='<u8', overwrite=True, compressors=None)
15+
multichunk[:] = data
16+
17+
multichunk_blosc = root_group.create_array('ten_by_five_blosc', shape=data.shape, chunks=(10,5), dtype='<u8', overwrite=True)
18+
multichunk_blosc[:] = data
19+
20+
compressors=None
21+
print(multichunk)
22+
print(multichunk[:])
23+
24+
print(multichunk_blosc)
25+
print(multichunk_blosc[:])
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"zarr_format": 2
3+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"shape": [
3+
100,
4+
100
5+
],
6+
"chunks": [
7+
10,
8+
5
9+
],
10+
"dtype": "<u8",
11+
"fill_value": 0,
12+
"order": "C",
13+
"filters": null,
14+
"dimension_separator": ".",
15+
"compressor": null,
16+
"zarr_format": 2
17+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{}
400 Bytes
Binary file not shown.
400 Bytes
Binary file not shown.
400 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)