|
13 | 13 | from lance import Blob, BlobColumn, DatasetBasePath |
14 | 14 |
|
15 | 15 |
|
def _blob_row_ids(dataset):
    """Return the ``_rowid`` values from a scan of ``dataset`` as a Python list."""
    # No data columns are projected; only the row-id meta column is requested.
    id_table = dataset.to_table(columns=[], with_row_id=True)
    rowid_column = id_table.column("_rowid")
    return rowid_column.to_pylist()
| 18 | + |
| 19 | + |
def _blob_row_addresses(dataset):
    """Return the ``_rowaddr`` values from a scan of ``dataset`` as a Python list."""
    # The scan projects the "idx" column and asks for row addresses alongside it.
    address_table = dataset.to_table(columns=["idx"], with_row_address=True)
    address_column = address_table.column("_rowaddr")
    return address_column.to_pylist()
| 26 | + |
| 27 | + |
def _out_of_order_blob_selection(dataset_with_blobs, selection_kind):
    """Build a two-row selection listed in reverse order (row 4, then row 0).

    Args:
        dataset_with_blobs: Dataset to look up row ids / row addresses in.
        selection_kind: One of ``"ids"``, ``"addresses"`` or ``"indices"``;
            picks which kind of selector values is produced.

    Returns:
        A ``(selection_values, expected)`` tuple. ``selection_values`` holds the
        selector for row 4 followed by the selector for row 0. ``expected`` is
        the matching ``[(address, payload), ...]`` list in that same order,
        pairing row 4 with ``b"quux"`` and row 0 with ``b"foo"``.

    Raises:
        ValueError: If ``selection_kind`` is not one of the supported kinds.
    """
    addresses = _blob_row_addresses(dataset_with_blobs)
    expected = [(addresses[4], b"quux"), (addresses[0], b"foo")]

    if selection_kind == "ids":
        # Fetch the row ids once instead of scanning the dataset per element.
        row_ids = _blob_row_ids(dataset_with_blobs)
        return [row_ids[4], row_ids[0]], expected
    if selection_kind == "addresses":
        return [addresses[4], addresses[0]], expected
    if selection_kind == "indices":
        return [4, 0], expected
    # Fail loudly on a typo rather than silently handing back index selectors.
    raise ValueError(f"Unsupported selection_kind: {selection_kind!r}")
| 37 | + |
| 38 | + |
16 | 39 | def test_blob_read_from_binary(): |
17 | 40 | values = [b"foo", b"bar", b"baz"] |
18 | 41 | data = pa.table( |
@@ -248,6 +271,125 @@ def test_blob_by_indices(tmp_path, dataset_with_blobs): |
248 | 271 | assert f1.read() == f2.read() |
249 | 272 |
|
250 | 273 |
|
@pytest.mark.parametrize(
    ("selection_kind", "selection_values", "expected"),
    [
        (
            "ids",
            [0, (1 << 32) + 1],
            [(0, b"foo"), ((1 << 32) + 1, b"quux")],
        ),
        (
            "addresses",
            [0, (1 << 32) + 1],
            [(0, b"foo"), ((1 << 32) + 1, b"quux")],
        ),
        (
            "indices",
            [0, 4],
            [(0, b"foo"), ((1 << 32) + 1, b"quux")],
        ),
    ],
)
def test_read_blobs(dataset_with_blobs, selection_kind, selection_values, expected):
    """read_blobs returns the expected pairs for each selector kind.

    Explicit planner options are passed along with the selector.
    """
    selector = {selection_kind: selection_values}
    planner_options = {
        "target_request_bytes": 1024,
        "max_gap_bytes": 64,
        "max_concurrency": 2,
        "preserve_order": True,
    }

    result = dataset_with_blobs.read_blobs("blobs", **selector, **planner_options)

    assert result == expected
| 295 | + |
| 296 | + |
def test_read_blobs_requires_single_selector(dataset_with_blobs):
    """Passing both ``ids`` and ``indices`` must raise ``ValueError``."""
    expected_message = "Exactly one of ids, indices, or addresses must be specified"
    with pytest.raises(ValueError, match=expected_message):
        dataset_with_blobs.read_blobs("blobs", ids=[0], indices=[0])
| 302 | + |
| 303 | + |
def test_read_blobs_requires_selector(dataset_with_blobs):
    """Calling read_blobs with no selector at all must raise ``ValueError``."""
    expected_message = "Exactly one of ids, indices, or addresses must be specified"
    with pytest.raises(ValueError, match=expected_message):
        dataset_with_blobs.read_blobs("blobs")
| 309 | + |
| 310 | + |
def test_read_blobs_rejects_non_blob_column(dataset_with_blobs):
    """Reading the ``idx`` column through read_blobs must raise ``ValueError``."""
    expected_message = "not a blob column"
    with pytest.raises(ValueError, match=expected_message):
        dataset_with_blobs.read_blobs("idx", indices=[0])
| 314 | + |
| 315 | + |
@pytest.mark.parametrize(
    ("selection_kind", "selection_values", "expected"),
    [
        (
            "ids",
            pa.array([0, (1 << 32) + 1], type=pa.uint64()),
            [(0, b"foo"), ((1 << 32) + 1, b"quux")],
        ),
        (
            "addresses",
            pa.array([0, (1 << 32) + 1], type=pa.uint64()),
            [(0, b"foo"), ((1 << 32) + 1, b"quux")],
        ),
        (
            "indices",
            pa.array([0, 4], type=pa.uint64()),
            [(0, b"foo"), ((1 << 32) + 1, b"quux")],
        ),
    ],
)
def test_read_blobs_accepts_arrow_array_selectors(
    dataset_with_blobs, selection_kind, selection_values, expected
):
    """Selectors supplied as uint64 Arrow arrays yield the expected pairs."""
    selector = {selection_kind: selection_values}

    result = dataset_with_blobs.read_blobs("blobs", **selector)

    assert result == expected
| 332 | + |
| 333 | + |
@pytest.mark.parametrize(
    ("selection_kind", "selection_values"),
    [
        # Plain empty Python lists.
        ("ids", []),
        ("addresses", []),
        ("indices", []),
        # Empty uint64 Arrow arrays.
        ("ids", pa.array([], type=pa.uint64())),
        ("addresses", pa.array([], type=pa.uint64())),
        ("indices", pa.array([], type=pa.uint64())),
    ],
)
def test_read_blobs_accepts_empty_selection(
    dataset_with_blobs, selection_kind, selection_values
):
    """An empty selector of any kind produces an empty result list."""
    selector = {selection_kind: selection_values}

    result = dataset_with_blobs.read_blobs("blobs", **selector)

    assert result == []
| 351 | + |
| 352 | + |
@pytest.mark.parametrize(
    ("planner_kwargs", "error_message"),
    [
        (
            {"target_request_bytes": 0},
            "target_request_bytes must be greater than 0",
        ),
        (
            {"max_concurrency": 0},
            "max_concurrency must be greater than 0",
        ),
    ],
)
def test_read_blobs_rejects_invalid_planner_options(
    dataset_with_blobs, planner_kwargs, error_message
):
    """A zero-valued planner option is rejected with a matching ``ValueError``."""
    with pytest.raises(ValueError, match=error_message):
        dataset_with_blobs.read_blobs("blobs", indices=[0], **planner_kwargs)
| 365 | + |
| 366 | + |
@pytest.mark.parametrize("selection_kind", ["ids", "addresses", "indices"])
def test_read_blobs_preserves_input_order(dataset_with_blobs, selection_kind):
    """With ``preserve_order=True`` results come back in the requested order."""
    requested, expected = _out_of_order_blob_selection(
        dataset_with_blobs, selection_kind
    )
    selector = {selection_kind: requested}

    result = dataset_with_blobs.read_blobs("blobs", preserve_order=True, **selector)

    assert result == expected
| 377 | + |
| 378 | + |
@pytest.mark.parametrize("selection_kind", ["ids", "addresses", "indices"])
def test_read_blobs_without_preserve_order_returns_same_rows(
    dataset_with_blobs, selection_kind
):
    """With ``preserve_order=False`` the same rows come back, in any order."""
    requested, expected = _out_of_order_blob_selection(
        dataset_with_blobs, selection_kind
    )
    selector = {selection_kind: requested}

    result = dataset_with_blobs.read_blobs("blobs", preserve_order=False, **selector)

    # Order is not guaranteed here, so both sides are compared after sorting.
    assert sorted(result) == sorted(expected)
| 391 | + |
| 392 | + |
251 | 393 | def test_blob_file_seek(tmp_path, dataset_with_blobs): |
252 | 394 | row_ids = ( |
253 | 395 | dataset_with_blobs.to_table(columns=[], with_row_id=True) |
@@ -422,6 +564,12 @@ def test_blob_extension_write_external_slice(tmp_path): |
422 | 564 | with blob_file as f: |
423 | 565 | assert f.read() == expected |
424 | 566 |
|
| 567 | + assert ds.read_blobs("blob", indices=[0, 1, 2]) == [ |
| 568 | + (0, b"alpha"), |
| 569 | + (1, b"bravo"), |
| 570 | + (2, b"charlie"), |
| 571 | + ] |
| 572 | + |
425 | 573 |
|
426 | 574 | @pytest.mark.parametrize( |
427 | 575 | ("payload", "is_dataset_root"), |
@@ -460,3 +608,5 @@ def test_blob_extension_take_blobs_multi_base(payload, is_dataset_root, tmp_path |
460 | 608 | assert len(blobs) == 1 |
461 | 609 | with blobs[0] as f: |
462 | 610 | assert f.read() == payload |
| 611 | + |
| 612 | + assert ds.read_blobs("blob", indices=[0]) == [(0, payload)] |
0 commit comments