Skip to content

Commit c11f046

Browse files
committed
Handle U and S dtypes in Zarr
Zarr arrays with U and S dtypes need special handling, as the numeric value in their dtype definition does not simply represent the byte size of each element. Also, update the script that generates test_dtypes.zarr to include a U2 and an S2 array, and migrate it to the Zarr-Python 3 API (still writing the Zarr V2 format). Fixes #1534.
1 parent a7a2ec2 commit c11f046

35 files changed

Lines changed: 538 additions & 352 deletions

File tree

cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZArray.java

Lines changed: 57 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
/*
2+
* Copyright (c) 2021-2026 University Corporation for Atmospheric Research/Unidata
3+
* See LICENSE for license information.
4+
*/
5+
16
package ucar.nc2.iosp.zarr;
27

38
import com.fasterxml.jackson.core.JsonParser;
@@ -16,6 +21,7 @@
1621
import java.io.IOException;
1722
import java.nio.ByteOrder;
1823
import java.util.*;
24+
import java.util.regex.Pattern;
1925
import java.util.stream.Collectors;
2026
import java.util.stream.Stream;
2127
import java.util.stream.StreamSupport;
@@ -36,6 +42,10 @@ public enum Order {
3642
// maps zarr datatypes to CDM datatypes
3743
private static Map<String, DataType> dTypeMap;
3844

45+
// regex for matching numpy byteorder marks
46+
// see https://numpy.org/doc/stable/reference/generated/numpy.dtype.byteorder.html#numpy-dtype-byteorder
47+
private static final Pattern BYTE_ORDER_PATTERN = Pattern.compile("[><|=]");
48+
3949
static {
4050
dTypeMap = new HashMap<>();
4151
dTypeMap.put("b1", DataType.BOOLEAN);
@@ -71,6 +81,8 @@ public enum Order {
7181
private final Order order;
7282
private final List<Filter> filters;
7383
private final String separator;
84+
private final int elementSize; // size of a single element on disk, in bytes
85+
private final boolean unicodeString; // true for numpy U dtype fixed-length strings
7486

7587
public ZArray(int[] shape, int[] chunks, Object fill_value, String dtype, Filter compressor, String order,
7688
List<Filter> filters, String separator) throws ZarrFormatException {
@@ -80,6 +92,8 @@ public ZArray(int[] shape, int[] chunks, Object fill_value, String dtype, Filter
8092
this.dtype = dtype;
8193
this.datatype = parseDataType(this.dtype);
8294
this.byteOrder = parseByteOrder(this.dtype);
95+
this.elementSize = parseElementSize(this.dtype);
96+
this.unicodeString = stripByteOrder(this.dtype).charAt(0) == 'U';
8397
this.compressor = compressor;
8498
this.filters = filters;
8599
this.order = parseOrder(order);
@@ -126,17 +140,57 @@ public ByteOrder getByteOrder() {
126140
return this.byteOrder;
127141
}
128142

143+
/**
144+
* The size, in bytes, of a single element of this array as stored on disk.
145+
*/
146+
public int getElementSize() {
147+
return this.elementSize;
148+
}
149+
150+
/**
151+
* True if this array holds numpy U dtype.
152+
*/
153+
boolean isUnicodeString() {
154+
return this.unicodeString;
155+
}
156+
157+
private static String stripByteOrder(String dtype) {
158+
return BYTE_ORDER_PATTERN.matcher(dtype).replaceAll("");
159+
}
160+
129161
private static DataType parseDataType(String dtype) throws ZarrFormatException {
130-
dtype = dtype.replace(">", "");
131-
dtype = dtype.replace("<", "");
132-
dtype = dtype.replace("|", "");
162+
dtype = stripByteOrder(dtype);
163+
final char typeChar = dtype.charAt(0);
164+
// S (fixed-length byte strings) and U (fixed-length unicode strings) do not follow the
165+
// usual [type char][type size in bytes] pattern: the trailing integer is a fixed character
166+
// count, not a byte size. See https://github.com/Unidata/netcdf-java/issues/1534
167+
if (typeChar == 'S' || typeChar == 'U') {
168+
final int nChars = parseLength(dtype);
169+
// a single byte char maps to CDM CHAR, otherwise it is a fixed-length String
170+
return (typeChar == 'S' && nChars == 1) ? DataType.CHAR : DataType.STRING;
171+
}
133172
DataType dataType = dTypeMap.get(dtype);
134173
if (dataType == null) {
135174
throw new ZarrFormatException(ZarrKeys.DTYPE, dtype);
136175
}
137176
return dataType;
138177
}
139178

179+
private static int parseElementSize(String dtype) throws ZarrFormatException {
180+
dtype = stripByteOrder(dtype);
181+
final char typeChar = dtype.charAt(0);
182+
final int length = parseLength(dtype);
183+
return (typeChar == 'U') ? 4 * length : length;
184+
}
185+
186+
private static int parseLength(String dtype) throws ZarrFormatException {
187+
try {
188+
return Integer.parseInt(dtype.substring(1));
189+
} catch (NumberFormatException | IndexOutOfBoundsException ex) {
190+
throw new ZarrFormatException(ZarrKeys.DTYPE, dtype);
191+
}
192+
}
193+
140194
private static ByteOrder parseByteOrder(String dtype) throws ZarrFormatException {
141195
if (dtype.startsWith(">")) {
142196
return ByteOrder.BIG_ENDIAN;

cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZarrHeader.java

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021-2025 University Corporation for Atmospheric Research/Unidata
2+
* Copyright (c) 2021-2026 University Corporation for Atmospheric Research/Unidata
33
* See LICENSE for license information.
44
*/
55

@@ -303,7 +303,8 @@ private void makeVariable(RandomAccessDirectoryItem item, long dataOffset, ZArra
303303

304304
// create VInfo
305305
VInfo vinfo = new VInfo(chunks, zarray.getFillValue(), zarray.getCompressor(), zarray.getByteOrder(),
306-
zarray.getOrder(), zarray.getSeparator(), zarray.getFilters(), dataOffset, initializedChunks);
306+
zarray.getOrder(), zarray.getSeparator(), zarray.getFilters(), dataOffset, initializedChunks,
307+
zarray.getElementSize(), zarray.isUnicodeString());
307308
var.setSPobject(vinfo);
308309

309310
// Include some info from .zarray file in attributes for display when showing variable detail.
@@ -421,9 +422,12 @@ class VInfo {
421422
private final List<Filter> filters;
422423
private final long offset;
423424
private final Map<Integer, Long> initializedChunks;
425+
private final int elementSize;
426+
private final boolean unicodeString;
424427

425428
VInfo(int[] chunks, Object fillValue, Filter compressor, ByteOrder byteOrder, ZArray.Order order, String separator,
426-
List<Filter> filters, long offset, Map<Integer, Long> initializedChunks) {
429+
List<Filter> filters, long offset, Map<Integer, Long> initializedChunks, int elementSize,
430+
boolean unicodeString) {
427431
this.chunks = chunks;
428432
this.fillValue = fillValue;
429433
this.byteOrder = byteOrder;
@@ -433,6 +437,8 @@ class VInfo {
433437
this.filters = filters;
434438
this.offset = offset;
435439
this.initializedChunks = initializedChunks;
440+
this.elementSize = elementSize;
441+
this.unicodeString = unicodeString;
436442
}
437443

438444
public int[] getChunks() {
@@ -471,6 +477,14 @@ public Map<Integer, Long> getInitializedChunks() {
471477
return this.initializedChunks;
472478
}
473479

480+
int getElementSize() {
481+
return this.elementSize;
482+
}
483+
484+
boolean isUnicodeString() {
485+
return this.unicodeString;
486+
}
487+
474488
}
475489

476490
}

cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZarrIosp.java

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021 University Corporation for Atmospheric Research/Unidata
2+
* Copyright (c) 2021-2026 University Corporation for Atmospheric Research/Unidata
33
* See LICENSE for license information.
44
*/
55

@@ -20,6 +20,10 @@
2020

2121
import java.io.IOException;
2222
import java.lang.invoke.MethodHandles;
23+
import java.nio.ByteBuffer;
24+
import java.nio.ByteOrder;
25+
import java.nio.charset.Charset;
26+
import java.nio.charset.StandardCharsets;
2327

2428
/**
2529
* IOSP for reading/writing Zarr/NCZarr formats
@@ -83,8 +87,14 @@ public Array readData(Variable v2, Section section) {
8387
Object fillValue = getFillValue(vinfo, dataType);
8488

8589
// create layout object
86-
Layout layout = new ZarrLayoutBB(v2, section, this.raf);
87-
Object data = IospHelper.readDataFill((LayoutBB) layout, dataType, fillValue);
90+
LayoutBB layout = new ZarrLayoutBB(v2, section, this.raf);
91+
final Object data;
92+
if (dataType == DataType.STRING) {
93+
// fixed-length string types (S/U) need custom decoding (not handled by the generic IospHelper string reader).
94+
data = readStringData(layout, vinfo, fillValue);
95+
} else {
96+
data = IospHelper.readDataFill(layout, dataType, fillValue);
97+
}
8898

8999
Array array = Array.factory(dataType, section.getShape(), data);
90100
if (vinfo.getOrder() == ZArray.Order.F) {
@@ -99,6 +109,56 @@ public Array readData(Variable v2, Section section) {
99109
return array;
100110
}
101111

112+
/**
113+
* Read fixed-length string data ('S' or 'U' dtypes) from the layout.
114+
*
115+
* <p>
116+
* See https://github.com/Unidata/netcdf-java/issues/1534
117+
*/
118+
private static String[] readStringData(LayoutBB layout, ZarrHeader.VInfo vinfo, Object fillValue) {
119+
final int nelems = (int) layout.getTotalNelems();
120+
final int recSize = layout.getElemSize();
121+
final String[] pa = new String[nelems];
122+
if (fillValue instanceof String) {
123+
java.util.Arrays.fill(pa, (String) fillValue);
124+
}
125+
126+
final Charset charset;
127+
if (vinfo.isUnicodeString()) {
128+
charset =
129+
vinfo.getByteOrder() == ByteOrder.BIG_ENDIAN ? Charset.forName("UTF-32BE") : Charset.forName("UTF-32LE");
130+
} else {
131+
charset = StandardCharsets.ISO_8859_1;
132+
}
133+
134+
while (layout.hasNext()) {
135+
LayoutBB.Chunk chunk = layout.next();
136+
ByteBuffer bb = chunk.getByteBuffer();
137+
// if chunk is empty, use fill value
138+
if (!bb.hasRemaining()) {
139+
continue;
140+
}
141+
bb.position(chunk.getSrcElem() * recSize);
142+
int pos = (int) chunk.getDestElem();
143+
final byte[] raw = new byte[recSize];
144+
for (int i = 0; i < chunk.getNelems(); i++) {
145+
bb.get(raw);
146+
pa[pos++] = decodeFixedLengthString(raw, charset);
147+
}
148+
}
149+
return pa;
150+
}
151+
152+
private static String decodeFixedLengthString(byte[] raw, Charset charset) {
153+
String s = new String(raw, charset);
154+
// NumPy fixed-length strings are null-padded, so strip trailing NUL characters
155+
int end = s.length();
156+
while (end > 0 && s.charAt(end - 1) == '\0') {
157+
end--;
158+
}
159+
return s.substring(0, end);
160+
}
161+
102162
private Object getFillValue(ZarrHeader.VInfo vinfo, DataType dataType) {
103163

104164
// Watch for floating point fill values encoded as Strings

cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZarrLayoutBB.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021-2025 University Corporation for Atmospheric Research/Unidata
2+
* Copyright (c) 2021-2026 University Corporation for Atmospheric Research/Unidata
33
* See LICENSE for license information.
44
*/
55

@@ -80,7 +80,10 @@ public ZarrLayoutBB(Variable v2, Section wantSection, RandomAccessFile raf) {
8080
this.want = wantSection;
8181
}
8282

83-
this.elemSize = v2.getDataType().getSize();
83+
// Use the on-disk element byte width from the .zarray metadata. For most types this matches
84+
// DataType.getSize(), but for fixed-length string types (S/U) it captures the true element
85+
// width (N bytes for S, 4*N bytes for U).
86+
this.elemSize = vinfo.getElementSize();
8487

8588
// create delegate and chunk iterator
8689
ZarrLayoutBB.DataChunkIterator iter = new ZarrLayoutBB.DataChunkIterator();

cdm/zarr/src/test/data/scripts/make_zarr_dtype_test_data.py

Lines changed: 35 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,15 @@
5757

5858

5959
import zarr
60-
store = zarr.DirectoryStore('../test_dtypes.zarr')
60+
# Zarr-Python 3 API. The store is written using the Zarr format 2
61+
# specification by passing zarr_format=2 to the top level API.
62+
store = zarr.storage.LocalStore('../test_dtypes.zarr')
6163

6264

6365
# In[ ]:
6466

6567

66-
root_grp = zarr.group(store, overwrite=True)
68+
root_grp = zarr.group(store, overwrite=True, zarr_format=2)
6769
# create a group for byte-order independent data types
6870
unordered_group = root_grp.create_group('unordered_group', overwrite=True)
6971

@@ -82,66 +84,74 @@
8284

8385

8486
# add data to unordered group
85-
b = unordered_group.create_dataset('boolean_data', shape=(4,5), chunks=(2,5), dtype='|b1', overwrite=True, compressor=None)
87+
b = unordered_group.create_array('boolean_data', shape=(4,5), chunks=(2,5), dtype='|b1', overwrite=True, compressors=None)
8688
b[:] = boolean_data
87-
byte = unordered_group.create_dataset('byte_data', shape=(10,8), chunks=(5,4), dtype='|i1', overwrite=True, compressor=None)
89+
byte = unordered_group.create_array('byte_data', shape=(10,8), chunks=(5,4), dtype='|i1', overwrite=True, compressors=None)
8890
byte[:] = bdata
89-
ubyte = unordered_group.create_dataset('ubyte_data', shape=(10,8), chunks=(5,4), dtype='|u1', overwrite=True, compressor=None)
91+
ubyte = unordered_group.create_array('ubyte_data', shape=(10,8), chunks=(5,4), dtype='|u1', overwrite=True, compressors=None)
9092
ubyte[:] = bdata
9193

9294

9395
# In[ ]:
9496

9597

9698
# add data to big endian group
97-
shorts = big_endian.create_dataset('short_data', shape=(4,5), chunks=(2,5), dtype='>i2', overwrite=True, compressor=None)
99+
shorts = big_endian.create_array('short_data', shape=(4,5), chunks=(2,5), dtype='>i2', overwrite=True, compressors=None)
98100
shorts[:] = be_short_data
99-
ushorts = big_endian.create_dataset('ushort_data', shape=(4,5), chunks=(2,5), dtype='>u2', overwrite=True, compressor=None)
101+
ushorts = big_endian.create_array('ushort_data', shape=(4,5), chunks=(2,5), dtype='>u2', overwrite=True, compressors=None)
100102
ushorts[:] = be_short_data
101-
ints = big_endian.create_dataset('int_data', shape=(4,5), chunks=(2,5), dtype='>i4', overwrite=True, compressor=None)
103+
ints = big_endian.create_array('int_data', shape=(4,5), chunks=(2,5), dtype='>i4', overwrite=True, compressors=None)
102104
ints[:] = be_int_data
103-
uints = big_endian.create_dataset('uint_data', shape=(4,5), chunks=(2,5), dtype='>u4', overwrite=True, compressor=None)
105+
uints = big_endian.create_array('uint_data', shape=(4,5), chunks=(2,5), dtype='>u4', overwrite=True, compressors=None)
104106
uints[:] = be_int_data
105-
longs = big_endian.create_dataset('long_data', shape=(5,4), chunks=(5,2), dtype='>i8', overwrite=True, compressor=None)
107+
longs = big_endian.create_array('long_data', shape=(5,4), chunks=(5,2), dtype='>i8', overwrite=True, compressors=None)
106108
longs[:] = be_long_data
107-
ulongs = big_endian.create_dataset('ulong_data', shape=(5,4), chunks=(5,2), dtype='>u8', overwrite=True, compressor=None)
109+
ulongs = big_endian.create_array('ulong_data', shape=(5,4), chunks=(5,2), dtype='>u8', overwrite=True, compressors=None)
108110
ulongs[:] = be_long_data
109-
floats = big_endian.create_dataset('float_data', shape=(4,5), chunks=(2,5), dtype='>f4', overwrite=True, compressor=None)
111+
floats = big_endian.create_array('float_data', shape=(4,5), chunks=(2,5), dtype='>f4', overwrite=True, compressors=None)
110112
floats[:] = be_float_data
111-
doubles = big_endian.create_dataset('double_data', shape=(5,4), chunks=(5,2), dtype='>f8', overwrite=True, compressor=None)
113+
doubles = big_endian.create_array('double_data', shape=(5,4), chunks=(5,2), dtype='>f8', overwrite=True, compressors=None)
112114
doubles[:] = be_double_data
113115

114116

115117
# In[ ]:
116118

117119

118120
# add data to little endian group
119-
shorts = little_endian.create_dataset('short_data', shape=(4,5), chunks=(2,5), dtype='<i2', overwrite=True, compressor=None)
121+
shorts = little_endian.create_array('short_data', shape=(4,5), chunks=(2,5), dtype='<i2', overwrite=True, compressors=None)
120122
shorts[:] = le_short_data
121-
ushorts = little_endian.create_dataset('ushort_data', shape=(4,5), chunks=(2,5), dtype='<u2', overwrite=True, compressor=None)
123+
ushorts = little_endian.create_array('ushort_data', shape=(4,5), chunks=(2,5), dtype='<u2', overwrite=True, compressors=None)
122124
ushorts[:] = le_short_data
123-
ints = little_endian.create_dataset('int_data', shape=(4,5), chunks=(2,5), dtype='<i4', overwrite=True, compressor=None)
125+
ints = little_endian.create_array('int_data', shape=(4,5), chunks=(2,5), dtype='<i4', overwrite=True, compressors=None)
124126
ints[:] = le_int_data
125-
uints = little_endian.create_dataset('uint_data', shape=(4,5), chunks=(2,5), dtype='<u4', overwrite=True, compressor=None)
127+
uints = little_endian.create_array('uint_data', shape=(4,5), chunks=(2,5), dtype='<u4', overwrite=True, compressors=None)
126128
uints[:] = le_int_data
127-
longs = little_endian.create_dataset('long_data', shape=(5,4), chunks=(5,2), dtype='<i8', overwrite=True, compressor=None)
129+
longs = little_endian.create_array('long_data', shape=(5,4), chunks=(5,2), dtype='<i8', overwrite=True, compressors=None)
128130
longs[:] = le_long_data
129-
ulongs = little_endian.create_dataset('ulong_data', shape=(5,4), chunks=(5,2), dtype='<u8', overwrite=True, compressor=None)
131+
ulongs = little_endian.create_array('ulong_data', shape=(5,4), chunks=(5,2), dtype='<u8', overwrite=True, compressors=None)
130132
ulongs[:] = le_long_data
131-
floats = little_endian.create_dataset('float_data', shape=(4,5), chunks=(2,5), dtype='<f4', overwrite=True, compressor=None)
133+
floats = little_endian.create_array('float_data', shape=(4,5), chunks=(2,5), dtype='<f4', overwrite=True, compressors=None)
132134
floats[:] = le_float_data
133-
doubles = little_endian.create_dataset('double_data', shape=(5,4), chunks=(5,2), dtype='<f8', overwrite=True, compressor=None)
135+
doubles = little_endian.create_array('double_data', shape=(5,4), chunks=(5,2), dtype='<f8', overwrite=True, compressors=None)
134136
doubles[:] = le_double_data
135137

136138

137139
# In[ ]:
138140

139141

140142
# add string data
141-
chars = string_group.create_dataset('char_data', shape=(10,12), chunks=(5,3), dtype='S1', overwrite=True, compressor=None)
143+
chars = string_group.create_array('char_data', shape=(10,12), chunks=(5,3), dtype='S1', overwrite=True, compressors=None)
142144
chars[:] = charar
143-
strs = string_group.create_dataset('str_data', shape=(10,12), chunks=(5,6), dtype='S4', overwrite=True, compressor=None)
145+
strs = string_group.create_array('str_data', shape=(10,12), chunks=(5,6), dtype='S4', overwrite=True, compressors=None)
144146
strs[:] = charar
145-
unicode = string_group.create_dataset('unicode_data', shape=(10,12), chunks=(5,6), dtype='U4', overwrite=True, compressor=None)
147+
strs2 = string_group.create_array('str_data_2', shape=(10,12), chunks=(5,6), dtype='S2', overwrite=True, compressors=None)
148+
strs2[:] = charar
149+
unicode = string_group.create_array('unicode_data', shape=(10,12), chunks=(5,6), dtype='U4', overwrite=True, compressors=None)
146150
unicode[:] = charar
147-
151+
unicode2 = string_group.create_array('unicode_data_2', shape=(10,12), chunks=(5,6), dtype='U2', overwrite=True, compressors=None)
152+
unicode2[:] = charar
153+
print(chars[:])
154+
print(strs[:])
155+
print(strs2[:])
156+
print(unicode[:])
157+
print(unicode2[:])
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
{
2-
"zarr_format": 2
2+
"zarr_format": 2
33
}

0 commit comments

Comments
 (0)