From 8f527bd4c098f160c0d73b68cb86d7faf3d4a8d6 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Mon, 9 Feb 2026 09:33:48 -0700 Subject: [PATCH 1/2] Added support to Python API for regions of type numpy.ndarray Also, regions of type None are now handled explicitly with all other types raising an exception. --- apis/python/src/tiledbvcf/dataset.py | 52 +++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/apis/python/src/tiledbvcf/dataset.py b/apis/python/src/tiledbvcf/dataset.py index 3e88cb427..00b6a3de5 100644 --- a/apis/python/src/tiledbvcf/dataset.py +++ b/apis/python/src/tiledbvcf/dataset.py @@ -4,6 +4,7 @@ from collections import namedtuple from typing import Generator, List +import numpy as np import pandas as pd import pyarrow as pa import pyarrow.compute as pc @@ -279,7 +280,7 @@ def read_arrow( self, attrs: List[str] = DEFAULT_ATTRS, samples: (str, List[str]) = None, - regions: (str, List[str]) = None, + regions: (str, List[str], np.ndarray) = None, samples_file: str = None, bed_file: str = None, skip_check_samples: bool = False, @@ -324,10 +325,18 @@ def read_arrow( if isinstance(regions, str): regions = [regions] + elif isinstance(regions, np.ndarray): + if regions.ndim != 1: + raise Exception( + f'"regions" parameter of type {type(regions)} must be 1-dimensional' + ) + regions = regions.tolist() if isinstance(regions, list): regions = map(str, self._prepare_regions(regions)) - else: + elif regions is None: regions = "" + else: + raise Exception(f'"regions" parameter cannot have type: {type(regions)}') if isinstance(samples, str): samples = [samples] @@ -526,7 +535,7 @@ def read( self, attrs: List[str] = DEFAULT_ATTRS, samples: (str, List[str]) = None, - regions: (str, List[str]) = None, + regions: (str, List[str], np.ndarray) = None, samples_file: str = None, bed_file: str = None, skip_check_samples: bool = False, @@ -571,10 +580,19 @@ def read( if isinstance(regions, str): regions = [regions] + elif 
isinstance(regions, np.ndarray): + if regions.ndim != 1: + raise Exception( + f'"regions" parameter of type {type(regions)} must be 1-dimensional' + ) + regions = regions.tolist() if isinstance(regions, list): regions = map(str, self._prepare_regions(regions)) - else: + elif regions is None: regions = "" + else: + raise Exception(f'"regions" parameter cannot have type: {type(regions)}') + if isinstance(samples, str): samples = [samples] @@ -596,7 +614,7 @@ def read( def export( self, samples: (str, List[str]) = None, - regions: (str, List[str]) = None, + regions: (str, List[str], np.ndarray) = None, samples_file: str = None, bed_file: str = None, skip_check_samples: bool = False, @@ -639,10 +657,19 @@ def export( if isinstance(regions, str): regions = [regions] + elif isinstance(regions, np.ndarray): + if regions.ndim != 1: + raise Exception( + f'"regions" parameter of type {type(regions)} must be 1-dimensional' + ) + regions = regions.tolist() if isinstance(regions, list): regions = map(str, self._prepare_regions(regions)) - else: + elif regions is None: regions = "" + else: + raise Exception(f'"regions" parameter cannot have type: {type(regions)}') + if isinstance(samples, str): samples = [samples] @@ -671,7 +698,7 @@ def read_iter( self, attrs: List[str] = DEFAULT_ATTRS, samples: (str, List[str]) = None, - regions: (str, List[str]) = None, + regions: (str, List[str], np.ndarray) = None, samples_file: str = None, bed_file: str = None, ): @@ -696,10 +723,19 @@ def read_iter( if isinstance(regions, str): regions = [regions] + elif isinstance(regions, np.ndarray): + if regions.ndim != 1: + raise Exception( + f'"regions" parameter of type {type(regions)} must be 1-dimensional' + ) + regions = regions.tolist() if isinstance(regions, list): regions = map(str, self._prepare_regions(regions)) - else: + elif regions is None: regions = "" + else: + raise Exception(f'"regions" parameter cannot have type: {type(regions)}') + if isinstance(samples, str): samples = [samples] 
From 2170dfdbda2d0f300b0acbbfc03fcba29658b628 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Mon, 9 Feb 2026 09:35:17 -0700 Subject: [PATCH 2/2] Updated Python tests to include new numpy.ndarray regions type Previously untested regions types were added as well. --- apis/python/tests/test_tiledbvcf.py | 99 ++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 23 deletions(-) diff --git a/apis/python/tests/test_tiledbvcf.py b/apis/python/tests/test_tiledbvcf.py index 3d0eb3a27..bd49d084b 100755 --- a/apis/python/tests/test_tiledbvcf.py +++ b/apis/python/tests/test_tiledbvcf.py @@ -125,6 +125,27 @@ def test_retrieve_samples(test_ds): assert test_ds.samples() == ["HG00280", "HG01762"] +def test_read_unsupported_regions_type(test_ds): + unsupported_region = 3.14 + unsupported_type_error = f'"regions" parameter cannot have type: {type(unsupported_region)}' + wrong_dimension_region = np.array([["1:12700-13400"], ["1:12700-13400"]]) + ndarray_wrong_dimension_error = f'"regions" parameter of type {type(wrong_dimension_region)} must be 1-dimensional' + with pytest.raises(Exception, match=unsupported_type_error): + test_ds.read(regions=unsupported_region) + with pytest.raises(Exception, match=ndarray_wrong_dimension_error): + test_ds.read(regions=wrong_dimension_region) + with pytest.raises(Exception, match=unsupported_type_error): + test_ds.read_arrow(regions=unsupported_region) + with pytest.raises(Exception, match=ndarray_wrong_dimension_error): + test_ds.read_arrow(regions=wrong_dimension_region) + with pytest.raises(Exception, match=unsupported_type_error): + for variant in test_ds.read_iter(regions=unsupported_region): + print(variant) + with pytest.raises(Exception, match=ndarray_wrong_dimension_error): + for variant in test_ds.read_iter(regions=wrong_dimension_region): + print(variant) + + def test_read_attrs(test_ds_attrs): attrs = ["sample_name"] df = test_ds_attrs.read(attrs=attrs) @@ -233,6 +254,40 @@ def test_basic_reads(test_ds): _check_dfs( 
expected_df, df.sort_values(ignore_index=True, by=["sample_name", "pos_start"]) ) + df = test_ds.read_arrow( + attrs=["sample_name", "pos_start", "pos_end"], regions=["1:12700-13400"] + ).to_pandas() + _check_dfs( + expected_df, df.sort_values(ignore_index=True, by=["sample_name", "pos_start"]) + ) + + # Regions as string + df = test_ds.read( + attrs=["sample_name", "pos_start", "pos_end"], regions="1:12700-13400" + ) + _check_dfs( + expected_df, df.sort_values(ignore_index=True, by=["sample_name", "pos_start"]) + ) + df = test_ds.read_arrow( + attrs=["sample_name", "pos_start", "pos_end"], regions="1:12700-13400" + ).to_pandas() + _check_dfs( + expected_df, df.sort_values(ignore_index=True, by=["sample_name", "pos_start"]) + ) + + # Regions as numpy.ndarray + df = test_ds.read( + attrs=["sample_name", "pos_start", "pos_end"], regions=np.array(["1:12700-13400"]) + ) + _check_dfs( + expected_df, df.sort_values(ignore_index=True, by=["sample_name", "pos_start"]) + ) + df = test_ds.read_arrow( + attrs=["sample_name", "pos_start", "pos_end"], regions=np.array(["1:12700-13400"]) + ).to_pandas() + _check_dfs( + expected_df, df.sort_values(ignore_index=True, by=["sample_name", "pos_start"]) + ) # Region and sample intersection df = test_ds.read( @@ -382,41 +437,39 @@ def test_incomplete_read_generator(): uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples") cfg = tiledbvcf.ReadConfig(memory_budget_mb=0) test_ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg) - - dfs = [] - for df in test_ds.read_iter(attrs=["pos_end"], regions=["1:12700-13400"]): - dfs.append(df) - overall_df = pd.concat(dfs, ignore_index=True) - - assert len(overall_df) == 6 - _check_dfs( - pd.DataFrame.from_dict( + expected_df = pd.DataFrame.from_dict( { "pos_end": np.array( [12771, 12771, 13374, 13389, 13395, 13413], dtype=np.int32 ) } - ), - overall_df, - ) + ) + + # NOTE: Running multiple tests shows that the iterator can be reused + + # Regions as string + dfs = [] + for df in
test_ds.read_iter(attrs=["pos_end"], regions="1:12700-13400"): + dfs.append(df) + overall_df = pd.concat(dfs, ignore_index=True) + assert len(overall_df) == 6 + _check_dfs(expected_df, overall_df) - # Test that the iterator can be used again + # Regions as list dfs = [] for df in test_ds.read_iter(attrs=["pos_end"], regions=["1:12700-13400"]): dfs.append(df) overall_df = pd.concat(dfs, ignore_index=True) + assert len(overall_df) == 6 + _check_dfs(expected_df, overall_df) + # Regions as numpy.ndarray + dfs = [] + for df in test_ds.read_iter(attrs=["pos_end"], regions=np.array(["1:12700-13400"])): + dfs.append(df) + overall_df = pd.concat(dfs, ignore_index=True) assert len(overall_df) == 6 - _check_dfs( - pd.DataFrame.from_dict( - { - "pos_end": np.array( - [12771, 12771, 13374, 13389, 13395, 13413], dtype=np.int32 - ) - } - ), - overall_df, - ) + _check_dfs(expected_df, overall_df) def test_read_filters(test_ds):