From e5eeafb6d2be7665467a4f11ac9eaa5906e8dbf5 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Wed, 21 Jan 2026 15:08:47 -0700 Subject: [PATCH 1/3] Added edge cases to Python Arrow bindings These edge cases are to handle when the buffers returned by a dataset read are empty. Specifically, an array's offsets must still contain the initial 0 value even when there's no data, and a string array must always have 3 buffers, even when there's no data. --- .../python/src/tiledbvcf/binding/vcf_arrow.cc | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/apis/python/src/tiledbvcf/binding/vcf_arrow.cc b/apis/python/src/tiledbvcf/binding/vcf_arrow.cc index 0a124c4df..0f7eaf55f 100644 --- a/apis/python/src/tiledbvcf/binding/vcf_arrow.cc +++ b/apis/python/src/tiledbvcf/binding/vcf_arrow.cc @@ -141,13 +141,11 @@ static std::pair arrow_string_array( schema->release = &release_schema; schema->private_data = nullptr; - int n_buffers = offsets.capacity() ? 3 : 2; - auto array = (struct ArrowArray*)malloc(sizeof(struct ArrowArray)); array->length = length; array->null_count = 0; array->offset = 0; - array->n_buffers = n_buffers; + array->n_buffers = 3; array->n_children = 0; array->buffers = nullptr; array->children = nullptr; @@ -155,18 +153,20 @@ static std::pair arrow_string_array( array->release = &release_array; array->private_data = (void*)new ArrowBuffer(buffer); - array->buffers = (const void**)malloc(n_buffers * sizeof(void*)); - array->buffers[0] = nullptr; // validity - array->buffers[n_buffers - 1] = data.data(); // data - if (n_buffers == 3) { - array->buffers[1] = offsets.data(); // offsets - } - + array->buffers = (const void**)malloc(3 * sizeof(void*)); if (bitmap.capacity()) { schema->flags |= ARROW_FLAG_NULLABLE; array->null_count = -1; array->buffers[0] = bitmap.data(); + } else { + array->buffers[0] = nullptr; // validity + } + // Edge case: empty results should still have a single 0 offset + if (!length && !offsets.size()) { + offsets.push_back(0); } + array->buffers[1] = offsets.data(); // offsets + array->buffers[2] = data.data(); // data return std::make_pair(schema, array); } @@ -205,7 +205,11 @@ static std::pair arrow_list_array( array->private_data = nullptr; // Buffers will be deleted by the child array array->buffers = (const void**)malloc(array->n_buffers * sizeof(void*)); - array->buffers[0] = nullptr; // validity + array->buffers[0] = nullptr; // validity + // Edge case: empty results should still have a single 0 offset + if (!length && !value_offsets.size()) { + value_offsets.push_back(0); + } array->buffers[1] = value_offsets.data(); // data if (bitmap.capacity()) { @@ -234,12 +238,12 @@ void build_arrow_array( auto [values_schema, values_array] = arrow_data_array(buffer, num_data_elements, buffer->data()); + // Edge case: only subtract if there's offsets to avoid underflow + if (num_offsets) { + num_offsets -= 1; + } std::tie(array_schema, array_array) = arrow_list_array( - buffer, - num_offsets - 1, - buffer->offsets(), - values_schema, - values_array); + buffer, num_offsets, buffer->offsets(), values_schema, values_array); } else { std::tie(array_schema, array_array) = arrow_data_array(buffer, num_data_elements, buffer->data()); @@ -278,9 +282,13 @@ void build_arrow_array_from_buffer( uint64_t num_data_elements) { if (buffer->datatype() == TILEDB_VCF_CHAR) { if (buffer->list_offsets().capacity() > 0) { + // Edge case: only subtract if there's offsets to avoid underflow + if (num_offsets) { + num_offsets -= 1; + } // Array of strings auto [values_schema, values_array] = arrow_string_array( - buffer, num_offsets - 1, buffer->offsets(), buffer->data()); + buffer, num_offsets, buffer->offsets(), buffer->data()); // Array of lists of strings auto [arrow_schema, arrow_array] = arrow_list_array( From fb2a5b177729e59e028392121966599f68f0a200 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Wed, 21 Jan 2026 15:12:39 -0700 Subject: [PATCH 2/3] The allele count and variant stats Python bindings now return empty arrow tables Previously the allele count binding would be return None and the variant stats binding would try to parse the empty buffers returned by the reader, causing a segmentation fault. --- apis/python/src/tiledbvcf/binding/reader.cc | 62 +++++++++++---------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/apis/python/src/tiledbvcf/binding/reader.cc b/apis/python/src/tiledbvcf/binding/reader.cc index 53f9b17ec..bb1a6e43e 100644 --- a/apis/python/src/tiledbvcf/binding/reader.cc +++ b/apis/python/src/tiledbvcf/binding/reader.cc @@ -516,16 +516,18 @@ py::object Reader::get_variant_stats_results() { auto an = BufferInfo::create("an", TILEDB_VCF_INT32, num_rows); auto af = BufferInfo::create("af", TILEDB_VCF_FLOAT32, num_rows); - check_error( - reader, - tiledb_vcf_reader_read_from_variant_stats( - reader, - reinterpret_cast(pos->data().data()), - reinterpret_cast(allele->data().data()), - reinterpret_cast(allele->offsets().data()), - reinterpret_cast(ac->data().data()), - reinterpret_cast(an->data().data()), - reinterpret_cast(af->data().data()))); + if (num_rows > 0) { + check_error( + reader, + tiledb_vcf_reader_read_from_variant_stats( + reader, + reinterpret_cast(pos->data().data()), + reinterpret_cast(allele->data().data()), + reinterpret_cast(allele->offsets().data()), + reinterpret_cast(ac->data().data()), + reinterpret_cast(an->data().data()), + reinterpret_cast(af->data().data()))); + } build_arrow_array_from_buffer(pos, num_rows, 0, num_rows); build_arrow_array_from_buffer(allele, num_rows, 0, alleles_size); @@ -546,15 +548,16 @@ py::object Reader::get_allele_count_results() { reader, tiledb_vcf_reader_get_allele_count_buffer_sizes( reader, &num_rows, &refs_size, &alts_size, &filters_size, >s_size)); - if (num_rows > 0) { - auto pos = BufferInfo::create("pos", TILEDB_VCF_INT32, num_rows); - auto ref = BufferInfo::create("ref", TILEDB_VCF_CHAR, num_rows, refs_size); - auto alt = BufferInfo::create("alt", TILEDB_VCF_CHAR, num_rows, alts_size); - auto filter = - BufferInfo::create("filter", TILEDB_VCF_CHAR, num_rows, filters_size); - auto gt = BufferInfo::create("gt", TILEDB_VCF_CHAR, num_rows, gts_size); - auto count = BufferInfo::create("count", TILEDB_VCF_INT32, num_rows); + auto pos = BufferInfo::create("pos", TILEDB_VCF_INT32, num_rows); + auto ref = BufferInfo::create("ref", TILEDB_VCF_CHAR, num_rows, refs_size); + auto alt = BufferInfo::create("alt", TILEDB_VCF_CHAR, num_rows, alts_size); + auto filter = + BufferInfo::create("filter", TILEDB_VCF_CHAR, num_rows, filters_size); + auto gt = BufferInfo::create("gt", TILEDB_VCF_CHAR, num_rows, gts_size); + auto count = BufferInfo::create("count", TILEDB_VCF_INT32, num_rows); + + if (num_rows > 0) { check_error( reader, tiledb_vcf_reader_read_from_allele_count( @@ -569,19 +572,18 @@ py::object Reader::get_allele_count_results() { reinterpret_cast(gt->data().data()), reinterpret_cast(gt->offsets().data()), reinterpret_cast(count->data().data()))); - - build_arrow_array_from_buffer(pos, num_rows, 0, num_rows); - build_arrow_array_from_buffer(ref, num_rows, 0, num_rows); - build_arrow_array_from_buffer(alt, num_rows, 0, num_rows); - build_arrow_array_from_buffer(filter, num_rows, 0, num_rows); - build_arrow_array_from_buffer(gt, num_rows, 0, num_rows); - build_arrow_array_from_buffer(count, num_rows, 0, num_rows); - - std::vector> buffers = { - pos, ref, alt, filter, gt, count}; - return buffers_to_table(buffers); } - return py::cast Py_None; + + build_arrow_array_from_buffer(pos, num_rows, 0, num_rows); + build_arrow_array_from_buffer(ref, num_rows, 0, num_rows); + build_arrow_array_from_buffer(alt, num_rows, 0, num_rows); + build_arrow_array_from_buffer(filter, num_rows, 0, num_rows); + build_arrow_array_from_buffer(gt, num_rows, 0, num_rows); + build_arrow_array_from_buffer(count, num_rows, 0, num_rows); + + std::vector> buffers = { + pos, ref, alt, filter, gt, count}; + return buffers_to_table(buffers); } void Reader::deleter(tiledb_vcf_reader_t* r) { From ce01c2b61f062f9df8dade8b1001f5e9c67a8f94 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Wed, 21 Jan 2026 15:39:36 -0700 Subject: [PATCH 3/3] Added empty region tests for read_variant_stats() and read_allele_count() These tests ensure that both methods return an empty DataFrame when the regions are empty or don't match the constraints of the query. --- apis/python/tests/test_tiledbvcf.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/apis/python/tests/test_tiledbvcf.py b/apis/python/tests/test_tiledbvcf.py index 1057e0e29..3d0eb3a27 100755 --- a/apis/python/tests/test_tiledbvcf.py +++ b/apis/python/tests/test_tiledbvcf.py @@ -1344,6 +1344,9 @@ def test_ingest_with_stats_v3( with pytest.raises(Exception, match=interval_error): test_stats_v3_ingestion.read_variant_stats_arrow(regions=["chr1:100-1"]) + # test empty region + assert test_stats_v3_ingestion.read_variant_stats(regions=["chr3:1-10000"]).empty + # test types and deprecated region parameter region1 = "chr1:1-10000" df = test_stats_v3_ingestion.read_variant_stats(region1) @@ -1551,7 +1554,8 @@ def test_ingest_with_stats_v3( with pytest.raises(Exception, match=interval_error): test_stats_v3_ingestion.read_allele_count_arrow(regions=["chr1:100-1"]) - # test allele count + # test empty region + assert test_stats_v3_ingestion.read_allele_count(regions=["chr3:1-10000"]).empty # test types and deprecated region parameter region1 = "chr1:1-10000" @@ -2458,6 +2462,7 @@ def test_delete_dataset(tmp_path): # Check that the dataset does not exist assert not os.path.exists(uri) + def test_equality_old_new_format(): old_ds = tiledbvcf.Dataset(os.path.join(TESTS_INPUT_DIR, "arrays/old_format")) new_ds = tiledbvcf.Dataset(os.path.join(TESTS_INPUT_DIR, "arrays/new_format"))