|
16 | 16 | lance_dataset_module = importlib.import_module("lance.dataset") |
17 | 17 |
|
18 | 18 |
|
def _blob_row_ids(dataset):
    """Return the ``_rowid`` value of every row in *dataset* as a Python list.

    No data columns are projected (``columns=[]``); only the row-id
    metadata column requested through ``with_row_id=True`` is read.
    """
    id_only_table = dataset.to_table(columns=[], with_row_id=True)
    rowid_column = id_only_table.column("_rowid")
    return rowid_column.to_pylist()
| 21 | + |
| 22 | + |
def _blob_row_addresses(dataset):
    """Return the ``_rowaddr`` value of every row in *dataset* as a Python list.

    The ``idx`` column is projected alongside the row-address metadata
    column requested through ``with_row_address=True``.
    """
    table_with_addresses = dataset.to_table(columns=["idx"], with_row_address=True)
    return table_with_addresses.column("_rowaddr").to_pylist()
| 29 | + |
| 30 | + |
def _out_of_order_blob_selection(dataset_with_blobs, selection_kind):
    """Build a deliberately out-of-order selection: position 4, then position 0.

    Parameters
    ----------
    dataset_with_blobs
        Dataset handed to ``_blob_row_addresses`` / ``_blob_row_ids``.
    selection_kind
        ``"ids"`` selects by row id, ``"addresses"`` by row address; any
        other value (the tests pass ``"indices"``) yields positional
        indices ``[4, 0]``.

    Returns
    -------
    tuple
        ``(selection_values, expected)`` where ``expected`` pairs the row
        addresses at positions 4 and 0 with ``b"quux"`` and ``b"foo"``
        respectively, in that order.
    """
    addresses = _blob_row_addresses(dataset_with_blobs)
    expected = [(addresses[4], b"quux"), (addresses[0], b"foo")]

    if selection_kind == "ids":
        # Fetch the row ids once; each call scans the dataset again.
        row_ids = _blob_row_ids(dataset_with_blobs)
        return [row_ids[4], row_ids[0]], expected
    if selection_kind == "addresses":
        return [addresses[4], addresses[0]], expected
    # Fall through: positional indices.
    return [4, 0], expected
| 43 | + |
| 44 | + |
19 | 45 | def test_blob_read_from_binary(): |
20 | 46 | values = [b"foo", b"bar", b"baz"] |
21 | 47 | data = pa.table( |
@@ -251,6 +277,134 @@ def test_blob_by_indices(tmp_path, dataset_with_blobs): |
251 | 277 | assert f1.read() == f2.read() |
252 | 278 |
|
253 | 279 |
|
@pytest.mark.parametrize(
    "selection_kind, selection_values, expected",
    [
        ("ids", [0, (1 << 32) + 1], [(0, b"foo"), ((1 << 32) + 1, b"quux")]),
        ("addresses", [0, (1 << 32) + 1], [(0, b"foo"), ((1 << 32) + 1, b"quux")]),
        ("indices", [0, 4], [(0, b"foo"), ((1 << 32) + 1, b"quux")]),
    ],
)
def test_read_blobs(dataset_with_blobs, selection_kind, selection_values, expected):
    """``read_blobs`` returns the expected ``(int, bytes)`` pairs per selector.

    Each case passes exactly one selector keyword (``ids``, ``addresses`` or
    ``indices``) together with an explicit ``io_buffer_size`` and
    ``preserve_order=True``.
    """
    selector = {selection_kind: selection_values}

    result = dataset_with_blobs.read_blobs(
        "blobs",
        io_buffer_size=1024,
        preserve_order=True,
        **selector,
    )

    assert result == expected
| 299 | + |
| 300 | + |
def test_read_blobs_requires_single_selector(dataset_with_blobs):
    """Passing both ``ids`` and ``indices`` must raise ``ValueError``."""
    expected_message = "Exactly one of ids, indices, or addresses must be specified"
    with pytest.raises(ValueError, match=expected_message):
        dataset_with_blobs.read_blobs("blobs", ids=[0], indices=[0])
| 306 | + |
| 307 | + |
def test_read_blobs_requires_selector(dataset_with_blobs):
    """Calling ``read_blobs`` with no selector at all must raise ``ValueError``."""
    expected_message = "Exactly one of ids, indices, or addresses must be specified"
    with pytest.raises(ValueError, match=expected_message):
        dataset_with_blobs.read_blobs("blobs")
| 313 | + |
| 314 | + |
def test_read_blobs_rejects_non_blob_column(dataset_with_blobs):
    """Requesting blobs from the ``idx`` column must raise ``ValueError``."""
    non_blob_column = "idx"
    with pytest.raises(ValueError, match="not a blob column"):
        dataset_with_blobs.read_blobs(non_blob_column, indices=[0])
| 318 | + |
| 319 | + |
@pytest.mark.parametrize(
    "selection_kind, selection_values, expected",
    [
        (
            "ids",
            pa.array([0, (1 << 32) + 1], type=pa.uint64()),
            [(0, b"foo"), ((1 << 32) + 1, b"quux")],
        ),
        (
            "addresses",
            pa.array([0, (1 << 32) + 1], type=pa.uint64()),
            [(0, b"foo"), ((1 << 32) + 1, b"quux")],
        ),
        (
            "indices",
            pa.array([0, 4], type=pa.uint64()),
            [(0, b"foo"), ((1 << 32) + 1, b"quux")],
        ),
    ],
)
def test_read_blobs_accepts_arrow_array_selectors(
    dataset_with_blobs, selection_kind, selection_values, expected
):
    """Selectors given as ``uint64`` Arrow arrays yield the expected pairs.

    Only the selector keyword is passed; every other ``read_blobs`` option
    is left at its default.
    """
    selector = {selection_kind: selection_values}

    result = dataset_with_blobs.read_blobs("blobs", **selector)

    assert result == expected
| 348 | + |
| 349 | + |
@pytest.mark.parametrize(
    "selection_kind, selection_values",
    [
        ("ids", []),
        ("addresses", []),
        ("indices", []),
        ("ids", pa.array([], type=pa.uint64())),
        ("addresses", pa.array([], type=pa.uint64())),
        ("indices", pa.array([], type=pa.uint64())),
    ],
)
def test_read_blobs_accepts_empty_selection(
    dataset_with_blobs, selection_kind, selection_values
):
    """An empty selector (Python list or Arrow array) returns an empty list."""
    selector = {selection_kind: selection_values}

    result = dataset_with_blobs.read_blobs("blobs", **selector)

    assert result == []
| 367 | + |
| 368 | + |
@pytest.mark.parametrize(
    "planner_kwargs, error_message",
    [
        ({"io_buffer_size": 0}, "io_buffer_size must be greater than 0"),
    ],
)
def test_read_blobs_rejects_invalid_planner_options(
    dataset_with_blobs, planner_kwargs, error_message
):
    """Each invalid option set must raise ``ValueError`` with its message."""
    # pytest.raises is entered before the call so only read_blobs is guarded.
    expect_value_error = pytest.raises(ValueError, match=error_message)
    with expect_value_error:
        dataset_with_blobs.read_blobs("blobs", indices=[0], **planner_kwargs)
| 380 | + |
| 381 | + |
@pytest.mark.parametrize("selection_kind", ["ids", "addresses", "indices"])
def test_read_blobs_preserves_input_order(dataset_with_blobs, selection_kind):
    """With ``preserve_order=True`` results come back in the requested order.

    The selection is built by ``_out_of_order_blob_selection`` and compared
    against its ``expected`` list without any re-sorting.
    """
    selection_and_expected = _out_of_order_blob_selection(
        dataset_with_blobs, selection_kind
    )
    selection_values, expected = selection_and_expected

    result = dataset_with_blobs.read_blobs(
        "blobs", preserve_order=True, **{selection_kind: selection_values}
    )

    assert result == expected
| 392 | + |
| 393 | + |
@pytest.mark.parametrize("selection_kind", ["ids", "addresses", "indices"])
def test_read_blobs_without_preserve_order_returns_same_rows(
    dataset_with_blobs, selection_kind
):
    """With ``preserve_order=False`` the same rows come back in any order.

    Both the result and the expectation are sorted before comparison, so
    only the set of returned pairs is checked.
    """
    selection_and_expected = _out_of_order_blob_selection(
        dataset_with_blobs, selection_kind
    )
    selection_values, expected = selection_and_expected

    result = dataset_with_blobs.read_blobs(
        "blobs", preserve_order=False, **{selection_kind: selection_values}
    )

    assert sorted(result) == sorted(expected)
| 406 | + |
| 407 | + |
254 | 408 | def test_blob_file_seek(tmp_path, dataset_with_blobs): |
255 | 409 | row_ids = ( |
256 | 410 | dataset_with_blobs.to_table(columns=[], with_row_id=True) |
@@ -466,6 +620,12 @@ def test_blob_extension_write_external_slice(tmp_path): |
466 | 620 | with blob_file as f: |
467 | 621 | assert f.read() == expected |
468 | 622 |
|
| 623 | + assert ds.read_blobs("blob", indices=[0, 1, 2]) == [ |
| 624 | + (0, b"alpha"), |
| 625 | + (1, b"bravo"), |
| 626 | + (2, b"charlie"), |
| 627 | + ] |
| 628 | + |
469 | 629 |
|
470 | 630 | def test_blob_extension_write_external_slice_ingest(tmp_path): |
471 | 631 | tar_path = tmp_path / "container.tar" |
@@ -548,6 +708,8 @@ def test_blob_extension_take_blobs_multi_base(payload, is_dataset_root, tmp_path |
548 | 708 | with blobs[0] as f: |
549 | 709 | assert f.read() == payload |
550 | 710 |
|
| 711 | + assert ds.read_blobs("blob", indices=[0]) == [(0, payload)] |
| 712 | + |
551 | 713 |
|
552 | 714 | @pytest.fixture |
553 | 715 | def dataset_for_pandas_blob_tests(tmp_path): |
|
0 commit comments