Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 32 additions & 30 deletions apis/python/src/tiledbvcf/binding/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -516,16 +516,18 @@ py::object Reader::get_variant_stats_results() {
auto an = BufferInfo::create("an", TILEDB_VCF_INT32, num_rows);
auto af = BufferInfo::create("af", TILEDB_VCF_FLOAT32, num_rows);

check_error(
reader,
tiledb_vcf_reader_read_from_variant_stats(
reader,
reinterpret_cast<uint32_t*>(pos->data().data()),
reinterpret_cast<char*>(allele->data().data()),
reinterpret_cast<int32_t*>(allele->offsets().data()),
reinterpret_cast<int*>(ac->data().data()),
reinterpret_cast<int*>(an->data().data()),
reinterpret_cast<float*>(af->data().data())));
if (num_rows > 0) {
check_error(
reader,
tiledb_vcf_reader_read_from_variant_stats(
reader,
reinterpret_cast<uint32_t*>(pos->data().data()),
reinterpret_cast<char*>(allele->data().data()),
reinterpret_cast<int32_t*>(allele->offsets().data()),
reinterpret_cast<int*>(ac->data().data()),
reinterpret_cast<int*>(an->data().data()),
reinterpret_cast<float*>(af->data().data())));
}

build_arrow_array_from_buffer(pos, num_rows, 0, num_rows);
build_arrow_array_from_buffer(allele, num_rows, 0, alleles_size);
Expand All @@ -546,15 +548,16 @@ py::object Reader::get_allele_count_results() {
reader,
tiledb_vcf_reader_get_allele_count_buffer_sizes(
reader, &num_rows, &refs_size, &alts_size, &filters_size, &gts_size));
if (num_rows > 0) {
auto pos = BufferInfo::create("pos", TILEDB_VCF_INT32, num_rows);
auto ref = BufferInfo::create("ref", TILEDB_VCF_CHAR, num_rows, refs_size);
auto alt = BufferInfo::create("alt", TILEDB_VCF_CHAR, num_rows, alts_size);
auto filter =
BufferInfo::create("filter", TILEDB_VCF_CHAR, num_rows, filters_size);
auto gt = BufferInfo::create("gt", TILEDB_VCF_CHAR, num_rows, gts_size);
auto count = BufferInfo::create("count", TILEDB_VCF_INT32, num_rows);

auto pos = BufferInfo::create("pos", TILEDB_VCF_INT32, num_rows);
auto ref = BufferInfo::create("ref", TILEDB_VCF_CHAR, num_rows, refs_size);
auto alt = BufferInfo::create("alt", TILEDB_VCF_CHAR, num_rows, alts_size);
auto filter =
BufferInfo::create("filter", TILEDB_VCF_CHAR, num_rows, filters_size);
auto gt = BufferInfo::create("gt", TILEDB_VCF_CHAR, num_rows, gts_size);
auto count = BufferInfo::create("count", TILEDB_VCF_INT32, num_rows);

if (num_rows > 0) {
check_error(
reader,
tiledb_vcf_reader_read_from_allele_count(
Expand All @@ -569,19 +572,18 @@ py::object Reader::get_allele_count_results() {
reinterpret_cast<char*>(gt->data().data()),
reinterpret_cast<uint32_t*>(gt->offsets().data()),
reinterpret_cast<int32_t*>(count->data().data())));

build_arrow_array_from_buffer(pos, num_rows, 0, num_rows);
build_arrow_array_from_buffer(ref, num_rows, 0, num_rows);
build_arrow_array_from_buffer(alt, num_rows, 0, num_rows);
build_arrow_array_from_buffer(filter, num_rows, 0, num_rows);
build_arrow_array_from_buffer(gt, num_rows, 0, num_rows);
build_arrow_array_from_buffer(count, num_rows, 0, num_rows);

std::vector<std::shared_ptr<BufferInfo>> buffers = {
pos, ref, alt, filter, gt, count};
return buffers_to_table(buffers);
}
return py::cast<py::none> Py_None;

build_arrow_array_from_buffer(pos, num_rows, 0, num_rows);
build_arrow_array_from_buffer(ref, num_rows, 0, num_rows);
build_arrow_array_from_buffer(alt, num_rows, 0, num_rows);
build_arrow_array_from_buffer(filter, num_rows, 0, num_rows);
build_arrow_array_from_buffer(gt, num_rows, 0, num_rows);
build_arrow_array_from_buffer(count, num_rows, 0, num_rows);

std::vector<std::shared_ptr<BufferInfo>> buffers = {
pos, ref, alt, filter, gt, count};
return buffers_to_table(buffers);
}

void Reader::deleter(tiledb_vcf_reader_t* r) {
Expand Down
42 changes: 25 additions & 17 deletions apis/python/src/tiledbvcf/binding/vcf_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -141,32 +141,32 @@ static std::pair<ArrowSchema*, ArrowArray*> arrow_string_array(
schema->release = &release_schema;
schema->private_data = nullptr;

int n_buffers = offsets.capacity() ? 3 : 2;

auto array = (struct ArrowArray*)malloc(sizeof(struct ArrowArray));
array->length = length;
array->null_count = 0;
array->offset = 0;
array->n_buffers = n_buffers;
array->n_buffers = 3;
array->n_children = 0;
array->buffers = nullptr;
array->children = nullptr;
array->dictionary = nullptr;
array->release = &release_array;
array->private_data = (void*)new ArrowBuffer(buffer);

array->buffers = (const void**)malloc(n_buffers * sizeof(void*));
array->buffers[0] = nullptr; // validity
array->buffers[n_buffers - 1] = data.data(); // data
if (n_buffers == 3) {
array->buffers[1] = offsets.data(); // offsets
}

array->buffers = (const void**)malloc(3 * sizeof(void*));
if (bitmap.capacity()) {
schema->flags |= ARROW_FLAG_NULLABLE;
array->null_count = -1;
array->buffers[0] = bitmap.data();
} else {
array->buffers[0] = nullptr; // validity
}
// Edge case: empty results should still have a single 0 offset
if (!length && !offsets.size()) {
offsets.push_back(0);
}
array->buffers[1] = offsets.data(); // offsets
array->buffers[2] = data.data(); // data

return std::make_pair(schema, array);
}
Expand Down Expand Up @@ -205,7 +205,11 @@ static std::pair<ArrowSchema*, ArrowArray*> arrow_list_array(
array->private_data = nullptr; // Buffers will be deleted by the child array

array->buffers = (const void**)malloc(array->n_buffers * sizeof(void*));
array->buffers[0] = nullptr; // validity
array->buffers[0] = nullptr; // validity
// Edge case: empty results should still have a single 0 offset
if (!length && !value_offsets.size()) {
value_offsets.push_back(0);
}
array->buffers[1] = value_offsets.data(); // data

if (bitmap.capacity()) {
Expand Down Expand Up @@ -234,12 +238,12 @@ void build_arrow_array(
auto [values_schema, values_array] =
arrow_data_array(buffer, num_data_elements, buffer->data());

// Edge case: only subtract if there's offsets to avoid underflow
if (num_offsets) {
num_offsets -= 1;
}
std::tie(array_schema, array_array) = arrow_list_array(
buffer,
num_offsets - 1,
buffer->offsets(),
values_schema,
values_array);
buffer, num_offsets, buffer->offsets(), values_schema, values_array);
} else {
std::tie(array_schema, array_array) =
arrow_data_array(buffer, num_data_elements, buffer->data());
Expand Down Expand Up @@ -278,9 +282,13 @@ void build_arrow_array_from_buffer(
uint64_t num_data_elements) {
if (buffer->datatype() == TILEDB_VCF_CHAR) {
if (buffer->list_offsets().capacity() > 0) {
// Edge case: only subtract if there's offsets to avoid underflow
if (num_offsets) {
num_offsets -= 1;
}
// Array of strings
auto [values_schema, values_array] = arrow_string_array(
buffer, num_offsets - 1, buffer->offsets(), buffer->data());
buffer, num_offsets, buffer->offsets(), buffer->data());

// Array of lists of strings
auto [arrow_schema, arrow_array] = arrow_list_array(
Expand Down
7 changes: 6 additions & 1 deletion apis/python/tests/test_tiledbvcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1344,6 +1344,9 @@ def test_ingest_with_stats_v3(
with pytest.raises(Exception, match=interval_error):
test_stats_v3_ingestion.read_variant_stats_arrow(regions=["chr1:100-1"])

# test empty region
assert test_stats_v3_ingestion.read_variant_stats(regions=["chr3:1-10000"]).empty

# test types and deprecated region parameter
region1 = "chr1:1-10000"
df = test_stats_v3_ingestion.read_variant_stats(region1)
Expand Down Expand Up @@ -1551,7 +1554,8 @@ def test_ingest_with_stats_v3(
with pytest.raises(Exception, match=interval_error):
test_stats_v3_ingestion.read_allele_count_arrow(regions=["chr1:100-1"])

# test allele count
# test empty region
assert test_stats_v3_ingestion.read_allele_count(regions=["chr3:1-10000"]).empty

# test types and deprecated region parameter
region1 = "chr1:1-10000"
Expand Down Expand Up @@ -2458,6 +2462,7 @@ def test_delete_dataset(tmp_path):
# Check that the dataset does not exist
assert not os.path.exists(uri)


def test_equality_old_new_format():
old_ds = tiledbvcf.Dataset(os.path.join(TESTS_INPUT_DIR, "arrays/old_format"))
new_ds = tiledbvcf.Dataset(os.path.join(TESTS_INPUT_DIR, "arrays/new_format"))
Expand Down