|
16 | 16 | lance_dataset_module = importlib.import_module("lance.dataset") |
17 | 17 |
|
18 | 18 |
|
def _blob_row_ids(dataset):
    """Return the ``_rowid`` value of every row in ``dataset`` as a list.

    The scan projects no data columns (``columns=[]``) and requests only the
    row-id column.
    """
    row_id_table = dataset.to_table(columns=[], with_row_id=True)
    row_id_column = row_id_table.column("_rowid")
    return row_id_column.to_pylist()
| 21 | + |
| 22 | + |
def _blob_row_addresses(dataset):
    """Return the ``_rowaddr`` value of every row in ``dataset`` as a list.

    The scan projects the ``idx`` column together with the row addresses.
    """
    address_table = dataset.to_table(columns=["idx"], with_row_address=True)
    address_column = address_table.column("_rowaddr")
    return address_column.to_pylist()
| 29 | + |
| 30 | + |
def _out_of_order_blob_selection(dataset_with_blobs, selection_kind):
    """Build a "row 4, then row 0" selection and the result it should yield.

    Args:
        dataset_with_blobs: Dataset handed to ``_blob_row_addresses`` and,
            for the ``"ids"`` kind, to ``_blob_row_ids``.
        selection_kind: ``"ids"`` or ``"addresses"``; any other value yields
            positional indices.

    Returns:
        A ``(selection_values, expected)`` pair. ``selection_values`` lists
        row 4 before row 0 in the requested selector space. ``expected`` is
        always ``(row_address, payload)`` tuples in that same order.
    """
    addresses = _blob_row_addresses(dataset_with_blobs)
    expected = [(addresses[4], b"quux"), (addresses[0], b"foo")]

    if selection_kind == "ids":
        # Look the row ids up once instead of rescanning the dataset for
        # each of the two elements.
        row_ids = _blob_row_ids(dataset_with_blobs)
        return [row_ids[4], row_ids[0]], expected
    if selection_kind == "addresses":
        return [addresses[4], addresses[0]], expected
    return [4, 0], expected
| 40 | + |
| 41 | + |
19 | 42 | def test_blob_read_from_binary(): |
20 | 43 | values = [b"foo", b"bar", b"baz"] |
21 | 44 | data = pa.table( |
@@ -251,6 +274,125 @@ def test_blob_by_indices(tmp_path, dataset_with_blobs): |
251 | 274 | assert f1.read() == f2.read() |
252 | 275 |
|
253 | 276 |
|
@pytest.mark.parametrize(
    ("selection_kind", "selection_values", "expected"),
    [
        ("ids", [0, (1 << 32) + 1], [(0, b"foo"), ((1 << 32) + 1, b"quux")]),
        ("addresses", [0, (1 << 32) + 1], [(0, b"foo"), ((1 << 32) + 1, b"quux")]),
        ("indices", [0, 4], [(0, b"foo"), ((1 << 32) + 1, b"quux")]),
    ],
)
def test_read_blobs(dataset_with_blobs, selection_kind, selection_values, expected):
    """Check ``read_blobs`` for each selector kind with explicit planner options."""
    # The selector keyword name is itself the parametrized value.
    selector = {selection_kind: selection_values}
    planner_options = {
        "target_request_bytes": 1024,
        "max_gap_bytes": 64,
        "max_concurrency": 2,
        "preserve_order": True,
    }

    result = dataset_with_blobs.read_blobs("blobs", **selector, **planner_options)

    assert result == expected
| 298 | + |
| 299 | + |
def test_read_blobs_requires_single_selector(dataset_with_blobs):
    """Supplying both ``ids`` and ``indices`` must raise ``ValueError``."""
    expected_message = "Exactly one of ids, indices, or addresses must be specified"
    with pytest.raises(ValueError, match=expected_message):
        dataset_with_blobs.read_blobs("blobs", ids=[0], indices=[0])
| 305 | + |
| 306 | + |
def test_read_blobs_requires_selector(dataset_with_blobs):
    """Calling ``read_blobs`` with no selector at all must raise ``ValueError``."""
    expected_message = "Exactly one of ids, indices, or addresses must be specified"
    with pytest.raises(ValueError, match=expected_message):
        dataset_with_blobs.read_blobs("blobs")
| 312 | + |
| 313 | + |
def test_read_blobs_rejects_non_blob_column(dataset_with_blobs):
    """Requesting blobs from the ``idx`` column must raise ``ValueError``."""
    rejection = pytest.raises(ValueError, match="not a blob column")
    with rejection:
        dataset_with_blobs.read_blobs("idx", indices=[0])
| 317 | + |
| 318 | + |
@pytest.mark.parametrize(
    ("selection_kind", "selection_values", "expected"),
    [
        (
            "ids",
            pa.array([0, (1 << 32) + 1], type=pa.uint64()),
            [(0, b"foo"), ((1 << 32) + 1, b"quux")],
        ),
        (
            "addresses",
            pa.array([0, (1 << 32) + 1], type=pa.uint64()),
            [(0, b"foo"), ((1 << 32) + 1, b"quux")],
        ),
        (
            "indices",
            pa.array([0, 4], type=pa.uint64()),
            [(0, b"foo"), ((1 << 32) + 1, b"quux")],
        ),
    ],
)
def test_read_blobs_accepts_arrow_array_selectors(
    dataset_with_blobs, selection_kind, selection_values, expected
):
    """Check that ``uint64`` Arrow arrays work as selectors for every kind."""
    selector = {selection_kind: selection_values}

    result = dataset_with_blobs.read_blobs("blobs", **selector)

    assert result == expected
| 335 | + |
| 336 | + |
@pytest.mark.parametrize(
    ("selection_kind", "selection_values"),
    [
        ("ids", []),
        ("addresses", []),
        ("indices", []),
        ("ids", pa.array([], type=pa.uint64())),
        ("addresses", pa.array([], type=pa.uint64())),
        ("indices", pa.array([], type=pa.uint64())),
    ],
)
def test_read_blobs_accepts_empty_selection(
    dataset_with_blobs, selection_kind, selection_values
):
    """An empty selector (list or Arrow array) must produce an empty list."""
    selector = {selection_kind: selection_values}

    result = dataset_with_blobs.read_blobs("blobs", **selector)

    assert result == []
| 354 | + |
| 355 | + |
@pytest.mark.parametrize(
    ("planner_kwargs", "error_message"),
    [
        ({"target_request_bytes": 0}, "target_request_bytes must be greater than 0"),
        ({"max_concurrency": 0}, "max_concurrency must be greater than 0"),
    ],
)
def test_read_blobs_rejects_invalid_planner_options(
    dataset_with_blobs, planner_kwargs, error_message
):
    """A zero-valued planner option must raise ``ValueError`` with its message."""
    rejection = pytest.raises(ValueError, match=error_message)
    with rejection:
        dataset_with_blobs.read_blobs("blobs", indices=[0], **planner_kwargs)
| 368 | + |
| 369 | + |
@pytest.mark.parametrize("selection_kind", ["ids", "addresses", "indices"])
def test_read_blobs_preserves_input_order(dataset_with_blobs, selection_kind):
    """With ``preserve_order=True`` results come back in the requested order."""
    requested, expected = _out_of_order_blob_selection(
        dataset_with_blobs, selection_kind
    )
    selector = {selection_kind: requested}

    result = dataset_with_blobs.read_blobs("blobs", **selector, preserve_order=True)

    assert result == expected
| 380 | + |
| 381 | + |
@pytest.mark.parametrize("selection_kind", ["ids", "addresses", "indices"])
def test_read_blobs_without_preserve_order_returns_same_rows(
    dataset_with_blobs, selection_kind
):
    """With ``preserve_order=False`` the same rows come back, in any order."""
    requested, expected = _out_of_order_blob_selection(
        dataset_with_blobs, selection_kind
    )
    selector = {selection_kind: requested}

    result = dataset_with_blobs.read_blobs("blobs", **selector, preserve_order=False)

    # Sort both sides so the comparison ignores ordering.
    assert sorted(result) == sorted(expected)
| 394 | + |
| 395 | + |
254 | 396 | def test_blob_file_seek(tmp_path, dataset_with_blobs): |
255 | 397 | row_ids = ( |
256 | 398 | dataset_with_blobs.to_table(columns=[], with_row_id=True) |
@@ -466,6 +608,12 @@ def test_blob_extension_write_external_slice(tmp_path): |
466 | 608 | with blob_file as f: |
467 | 609 | assert f.read() == expected |
468 | 610 |
|
| 611 | + assert ds.read_blobs("blob", indices=[0, 1, 2]) == [ |
| 612 | + (0, b"alpha"), |
| 613 | + (1, b"bravo"), |
| 614 | + (2, b"charlie"), |
| 615 | + ] |
| 616 | + |
469 | 617 |
|
470 | 618 | def test_blob_extension_write_external_slice_ingest(tmp_path): |
471 | 619 | tar_path = tmp_path / "container.tar" |
@@ -548,6 +696,7 @@ def test_blob_extension_take_blobs_multi_base(payload, is_dataset_root, tmp_path |
548 | 696 | with blobs[0] as f: |
549 | 697 | assert f.read() == payload |
550 | 698 |
|
| 699 | + assert ds.read_blobs("blob", indices=[0]) == [(0, payload)] |
551 | 700 |
|
552 | 701 | @pytest.fixture |
553 | 702 | def dataset_for_pandas_blob_tests(tmp_path): |
|
0 commit comments