Skip to content

Commit 95722c3

Browse files
committed
Fix var-length write for equal-length sub-arrays
1 parent: ed8ba44 · commit: 95722c3

3 files changed

Lines changed: 101 additions & 3 deletions

File tree

tiledb/array.py

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -869,7 +869,21 @@ def _write_array(
869869
attr_val = np.nan_to_num(values[i])
870870
else:
871871
attr_val = values[i]
872-
buffer, offsets = array_to_buffer(attr_val, True, False)
872+
873+
# Numpy coalesces equal-length sub-arrays into N-D
874+
if (
875+
attr_val.ndim > 1
876+
and attr_val.size > 0
877+
and not isinstance(attr_val.flat[0], (np.ndarray, str, bytes))
878+
):
879+
attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
880+
n = attr_val.shape[0]
881+
offsets = np.arange(n, dtype=np.uint64) * np.uint64(
882+
attr_val.strides[0]
883+
)
884+
buffer = attr_val.ravel().view(np.uint8)
885+
else:
886+
buffer, offsets = array_to_buffer(attr_val, True, False)
873887
except Exception as exc:
874888
raise type(exc)(
875889
f"Failed to convert buffer for attribute: '{attr.name}'"

tiledb/sparse_array.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -174,10 +174,14 @@ def _setitem_impl_sparse(self, selection, val, nullmaps: dict):
174174
)
175175

176176
ncells = sparse_coords[0].shape[0]
177-
if attr_val.size != ncells:
177+
# For var-length attributes, numpy may coalesce equal-length
178+
# sub-arrays into a higher-dimensional array (e.g. shape (n, m)
179+
# instead of (n,) with dtype=object). Use shape[0] in that case.
180+
nvals = attr_val.shape[0] if attr.isvar and attr_val.ndim > 1 else attr_val.size
181+
if nvals != ncells:
178182
raise ValueError(
179183
"value length ({}) does not match "
180-
"coordinate length ({})".format(attr_val.size, ncells)
184+
"coordinate length ({})".format(nvals, ncells)
181185
)
182186
sparse_attributes.append(attr._internal_name)
183187
sparse_values.append(attr_val)

tiledb/tests/test_libtiledb.py

Lines changed: 80 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1419,6 +1419,39 @@ def test_varlen_write_floats(self):
14191419
# can't use assert_array_equal w/ object array
14201420
self.assertTrue(all(np.array_equal(x, A[i]) for i, x in enumerate(T_)))
14211421

1422+
def test_varlen_write_homogeneous_subarrays(self):
1423+
"""Test writing var-length attributes where all sub-arrays have the
1424+
same length. numpy coalesces these into a 2D array which previously
1425+
caused errors. See https://github.com/TileDB-Inc/TileDB-Py/issues/494
1426+
"""
1427+
# All sub-arrays have length 3 — numpy will coalesce into shape (4, 3)
1428+
A = np.array(
1429+
[
1430+
np.array([1, 2, 9], dtype=np.int64),
1431+
np.array([3, 4, 5], dtype=np.int64),
1432+
np.array([7, 8, 6], dtype=np.int64),
1433+
np.array([10, 11, 12], dtype=np.int64),
1434+
],
1435+
dtype="O",
1436+
)
1437+
1438+
dom = tiledb.Domain(tiledb.Dim(domain=(1, 4), tile=4))
1439+
att = tiledb.Attr(name="val", dtype=np.int64, var=True)
1440+
schema = tiledb.ArraySchema(dom, (att,))
1441+
tiledb.DenseArray.create(self.path("homogeneous_varlen"), schema)
1442+
1443+
with tiledb.DenseArray(self.path("homogeneous_varlen"), mode="w") as T:
1444+
T[:] = {"val": A}
1445+
1446+
with tiledb.DenseArray(self.path("homogeneous_varlen"), mode="r") as T:
1447+
res = T[:]["val"]
1448+
expected = np.empty(4, dtype=object)
1449+
expected[0] = np.array([1, 2, 9], dtype=np.int64)
1450+
expected[1] = np.array([3, 4, 5], dtype=np.int64)
1451+
expected[2] = np.array([7, 8, 6], dtype=np.int64)
1452+
expected[3] = np.array([10, 11, 12], dtype=np.int64)
1453+
assert_subarrays_equal(res, expected)
1454+
14221455
def test_varlen_write_floats_2d(self):
14231456
A = np.array(
14241457
[np.random.rand(x) for x in np.arange(1, 10)], dtype=object
@@ -2249,6 +2282,53 @@ def test_sparse_2d_varlen_int(self, fx_sparse_cell_order):
22492282
assert_unordered_equal(res["__dim_0"], c1)
22502283
assert_unordered_equal(res["__dim_1"], c2)
22512284

2285+
@pytest.mark.parametrize(
2286+
"dtype,use_object_dtype",
2287+
[
2288+
(np.int64, True),
2289+
(np.int64, False),
2290+
(np.int32, True),
2291+
(np.float32, True),
2292+
(np.float64, False),
2293+
(np.uint32, True),
2294+
],
2295+
)
2296+
def test_sparse_varlen_homogeneous_subarrays(
2297+
self, fx_sparse_cell_order, dtype, use_object_dtype
2298+
):
2299+
"""Test writing var-length attributes where all sub-arrays have the
2300+
same length. numpy coalesces these into a 2D array which previously
2301+
caused a 'value length does not match coordinate length' error.
2302+
See https://github.com/TileDB-Inc/TileDB-Py/issues/494
2303+
"""
2304+
path = self.path("test_sparse_varlen_homogeneous_subarrays")
2305+
dom = tiledb.Domain(tiledb.Dim(domain=(0, 10), dtype=np.int64))
2306+
att = tiledb.Attr(name="val", var=True, dtype=dtype)
2307+
schema = tiledb.ArraySchema(
2308+
dom, (att,), sparse=True, cell_order=fx_sparse_cell_order
2309+
)
2310+
tiledb.SparseArray.create(path, schema)
2311+
2312+
a = np.array([1, 2, 9], dtype=dtype)
2313+
b = np.array([3, 4, 5], dtype=dtype)
2314+
2315+
if use_object_dtype:
2316+
# User explicitly passes dtype='O'; becomes 2D after dtype conversion
2317+
vals = np.array([a, b], dtype="O")
2318+
else:
2319+
# User has no control over dtype; numpy coalesces to 2D native
2320+
vals = np.array([a, b])
2321+
2322+
with tiledb.SparseArray(path, "w") as A:
2323+
A[[1, 2]] = {"val": vals}
2324+
2325+
with tiledb.SparseArray(path, "r") as A:
2326+
res = A[:]
2327+
expected = np.empty(2, dtype=object)
2328+
expected[0] = a
2329+
expected[1] = b
2330+
assert_subarrays_equal(res["val"], expected)
2331+
22522332
def test_sparse_mixed_domain_uint_float64(self, fx_sparse_cell_order):
22532333
path = self.path("mixed_domain_uint_float64")
22542334
dims = [

0 commit comments

Comments
 (0)