diff --git a/apis/python/src/tiledbvcf/binding/reader.cc b/apis/python/src/tiledbvcf/binding/reader.cc index 53f9b17ec..bb1a6e43e 100644 --- a/apis/python/src/tiledbvcf/binding/reader.cc +++ b/apis/python/src/tiledbvcf/binding/reader.cc @@ -516,16 +516,18 @@ py::object Reader::get_variant_stats_results() { auto an = BufferInfo::create("an", TILEDB_VCF_INT32, num_rows); auto af = BufferInfo::create("af", TILEDB_VCF_FLOAT32, num_rows); - check_error( - reader, - tiledb_vcf_reader_read_from_variant_stats( - reader, - reinterpret_cast(pos->data().data()), - reinterpret_cast(allele->data().data()), - reinterpret_cast(allele->offsets().data()), - reinterpret_cast(ac->data().data()), - reinterpret_cast(an->data().data()), - reinterpret_cast(af->data().data()))); + if (num_rows > 0) { + check_error( + reader, + tiledb_vcf_reader_read_from_variant_stats( + reader, + reinterpret_cast(pos->data().data()), + reinterpret_cast(allele->data().data()), + reinterpret_cast(allele->offsets().data()), + reinterpret_cast(ac->data().data()), + reinterpret_cast(an->data().data()), + reinterpret_cast(af->data().data()))); + } build_arrow_array_from_buffer(pos, num_rows, 0, num_rows); build_arrow_array_from_buffer(allele, num_rows, 0, alleles_size); @@ -546,15 +548,16 @@ py::object Reader::get_allele_count_results() { reader, tiledb_vcf_reader_get_allele_count_buffer_sizes( reader, &num_rows, &refs_size, &alts_size, &filters_size, >s_size)); - if (num_rows > 0) { - auto pos = BufferInfo::create("pos", TILEDB_VCF_INT32, num_rows); - auto ref = BufferInfo::create("ref", TILEDB_VCF_CHAR, num_rows, refs_size); - auto alt = BufferInfo::create("alt", TILEDB_VCF_CHAR, num_rows, alts_size); - auto filter = - BufferInfo::create("filter", TILEDB_VCF_CHAR, num_rows, filters_size); - auto gt = BufferInfo::create("gt", TILEDB_VCF_CHAR, num_rows, gts_size); - auto count = BufferInfo::create("count", TILEDB_VCF_INT32, num_rows); + auto pos = BufferInfo::create("pos", TILEDB_VCF_INT32, num_rows); + auto ref = BufferInfo::create("ref", TILEDB_VCF_CHAR, num_rows, refs_size); + auto alt = BufferInfo::create("alt", TILEDB_VCF_CHAR, num_rows, alts_size); + auto filter = + BufferInfo::create("filter", TILEDB_VCF_CHAR, num_rows, filters_size); + auto gt = BufferInfo::create("gt", TILEDB_VCF_CHAR, num_rows, gts_size); + auto count = BufferInfo::create("count", TILEDB_VCF_INT32, num_rows); + + if (num_rows > 0) { check_error( reader, tiledb_vcf_reader_read_from_allele_count( @@ -569,19 +572,18 @@ py::object Reader::get_allele_count_results() { reinterpret_cast(gt->data().data()), reinterpret_cast(gt->offsets().data()), reinterpret_cast(count->data().data()))); - - build_arrow_array_from_buffer(pos, num_rows, 0, num_rows); - build_arrow_array_from_buffer(ref, num_rows, 0, num_rows); - build_arrow_array_from_buffer(alt, num_rows, 0, num_rows); - build_arrow_array_from_buffer(filter, num_rows, 0, num_rows); - build_arrow_array_from_buffer(gt, num_rows, 0, num_rows); - build_arrow_array_from_buffer(count, num_rows, 0, num_rows); - - std::vector> buffers = { - pos, ref, alt, filter, gt, count}; - return buffers_to_table(buffers); } - return py::cast Py_None; + + build_arrow_array_from_buffer(pos, num_rows, 0, num_rows); + build_arrow_array_from_buffer(ref, num_rows, 0, num_rows); + build_arrow_array_from_buffer(alt, num_rows, 0, num_rows); + build_arrow_array_from_buffer(filter, num_rows, 0, num_rows); + build_arrow_array_from_buffer(gt, num_rows, 0, num_rows); + build_arrow_array_from_buffer(count, num_rows, 0, num_rows); + + std::vector> buffers = { + pos, ref, alt, filter, gt, count}; + return buffers_to_table(buffers); } void Reader::deleter(tiledb_vcf_reader_t* r) { diff --git a/apis/python/src/tiledbvcf/binding/vcf_arrow.cc b/apis/python/src/tiledbvcf/binding/vcf_arrow.cc index 0a124c4df..0f7eaf55f 100644 --- a/apis/python/src/tiledbvcf/binding/vcf_arrow.cc +++ b/apis/python/src/tiledbvcf/binding/vcf_arrow.cc @@ -141,13 +141,11 @@ static std::pair arrow_string_array( schema->release = &release_schema; schema->private_data = nullptr; - int n_buffers = offsets.capacity() ? 3 : 2; - auto array = (struct ArrowArray*)malloc(sizeof(struct ArrowArray)); array->length = length; array->null_count = 0; array->offset = 0; - array->n_buffers = n_buffers; + array->n_buffers = 3; array->n_children = 0; array->buffers = nullptr; array->children = nullptr; @@ -155,18 +153,20 @@ static std::pair arrow_string_array( array->release = &release_array; array->private_data = (void*)new ArrowBuffer(buffer); - array->buffers = (const void**)malloc(n_buffers * sizeof(void*)); - array->buffers[0] = nullptr; // validity - array->buffers[n_buffers - 1] = data.data(); // data - if (n_buffers == 3) { - array->buffers[1] = offsets.data(); // offsets - } - + array->buffers = (const void**)malloc(3 * sizeof(void*)); if (bitmap.capacity()) { schema->flags |= ARROW_FLAG_NULLABLE; array->null_count = -1; array->buffers[0] = bitmap.data(); + } else { + array->buffers[0] = nullptr; // validity + } + // Edge case: empty results should still have a single 0 offset + if (!length && !offsets.size()) { + offsets.push_back(0); } + array->buffers[1] = offsets.data(); // offsets + array->buffers[2] = data.data(); // data return std::make_pair(schema, array); } @@ -205,7 +205,11 @@ static std::pair arrow_list_array( array->private_data = nullptr; // Buffers will be deleted by the child array array->buffers = (const void**)malloc(array->n_buffers * sizeof(void*)); - array->buffers[0] = nullptr; // validity + array->buffers[0] = nullptr; // validity + // Edge case: empty results should still have a single 0 offset + if (!length && !value_offsets.size()) { + value_offsets.push_back(0); + } array->buffers[1] = value_offsets.data(); // data if (bitmap.capacity()) { @@ -234,12 +238,12 @@ void build_arrow_array( auto [values_schema, values_array] = arrow_data_array(buffer, num_data_elements, buffer->data()); + // Edge case: only subtract if there's offsets to avoid underflow + if (num_offsets) { + num_offsets -= 1; + } std::tie(array_schema, array_array) = arrow_list_array( - buffer, - num_offsets - 1, - buffer->offsets(), - values_schema, - values_array); + buffer, num_offsets, buffer->offsets(), values_schema, values_array); } else { std::tie(array_schema, array_array) = arrow_data_array(buffer, num_data_elements, buffer->data()); @@ -278,9 +282,13 @@ void build_arrow_array_from_buffer( uint64_t num_data_elements) { if (buffer->datatype() == TILEDB_VCF_CHAR) { if (buffer->list_offsets().capacity() > 0) { + // Edge case: only subtract if there's offsets to avoid underflow + if (num_offsets) { + num_offsets -= 1; + } // Array of strings auto [values_schema, values_array] = arrow_string_array( - buffer, num_offsets - 1, buffer->offsets(), buffer->data()); + buffer, num_offsets, buffer->offsets(), buffer->data()); // Array of lists of strings auto [arrow_schema, arrow_array] = arrow_list_array( diff --git a/apis/python/tests/test_tiledbvcf.py b/apis/python/tests/test_tiledbvcf.py index 1057e0e29..3d0eb3a27 100755 --- a/apis/python/tests/test_tiledbvcf.py +++ b/apis/python/tests/test_tiledbvcf.py @@ -1344,6 +1344,9 @@ def test_ingest_with_stats_v3( with pytest.raises(Exception, match=interval_error): test_stats_v3_ingestion.read_variant_stats_arrow(regions=["chr1:100-1"]) + # test empty region + assert test_stats_v3_ingestion.read_variant_stats(regions=["chr3:1-10000"]).empty + # test types and deprecated region parameter region1 = "chr1:1-10000" df = test_stats_v3_ingestion.read_variant_stats(region1) @@ -1551,7 +1554,8 @@ def test_ingest_with_stats_v3( with pytest.raises(Exception, match=interval_error): test_stats_v3_ingestion.read_allele_count_arrow(regions=["chr1:100-1"]) - # test allele count + # test empty region + assert test_stats_v3_ingestion.read_allele_count(regions=["chr3:1-10000"]).empty # test types and deprecated region parameter region1 = "chr1:1-10000" @@ -2458,6 +2462,7 @@ def test_delete_dataset(tmp_path): # Check that the dataset does not exist assert not os.path.exists(uri) + def test_equality_old_new_format(): old_ds = tiledbvcf.Dataset(os.path.join(TESTS_INPUT_DIR, "arrays/old_format")) new_ds = tiledbvcf.Dataset(os.path.join(TESTS_INPUT_DIR, "arrays/new_format"))