Skip to content

Commit 7d637f7

Browse files
authored
VCF roundtrip tests (#91)
Test icechunk transaction
1 parent 5ddf187 commit 7d637f7

6 files changed

Lines changed: 283 additions & 320 deletions

File tree

tests/test_append.py

Lines changed: 8 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,3 @@
1-
# Create the VCF files, one with samples NA00001 and NA00002 and the other with NA00003
2-
3-
# bcftools view -s NA00001,NA00002 --no-update -O z tests/data/vcf/sample.vcf.gz \
4-
# > tests/data/vcf/sample-part1.vcf.gz
5-
# bcftools view -s NA00003 --no-update -O z tests/data/vcf/sample.vcf.gz \
6-
# > tests/data/vcf/sample-part2.vcf.gz
7-
# bcftools index -c tests/data/vcf/sample-part1.vcf.gz
8-
# bcftools index -c tests/data/vcf/sample-part2.vcf.gz
9-
10-
# Similarly for chr22.vcf.gz
11-
# bcftools view --no-update \
12-
# -S <(bcftools query -l tests/data/vcf/chr22.vcf.gz | head -55) \
13-
# tests/data/vcf/chr22.vcf.gz --write-index=csi -o tests/data/vcf/chr22-part1.vcf.gz
14-
# bcftools view --no-update \
15-
# -S <(bcftools query -l tests/data/vcf/chr22.vcf.gz | tail -45) \
16-
# tests/data/vcf/chr22.vcf.gz --write-index=csi -o tests/data/vcf/chr22-part2.vcf.gz
17-
18-
# Create a variants list VCF with no samples.
19-
# Note that the header contains FORMAT fields, even though there are no samples,
20-
# which is necessary for vcf2zarr to create empty arrays.
21-
22-
# bin/vcf-drop-samples.sh tests/data/vcf/sample.vcf.gz \
23-
# tests/data/vcf/sample-variants.vcf.gz
24-
25-
261
import numpy as np
272
import pytest
283
import zarr
@@ -31,13 +6,7 @@
316

327
from vczstore.append import append
338

34-
from .utils import (
35-
compare_vcf_and_vcz,
36-
convert_vcf_to_vcz,
37-
convert_vcf_to_vcz_icechunk,
38-
make_vcz,
39-
run_vcztools,
40-
)
9+
from .utils import make_vcz
4110

4211

4312
def test_append():
@@ -104,67 +73,21 @@ def test_append():
10473
)
10574

10675

107-
@pytest.mark.parametrize("samples_chunk_size", [1, 2, 4])
108-
@pytest.mark.parametrize("backend_storage", [None, "obstore"])
109-
def test_append_compare_vcf(tmp_path, samples_chunk_size, backend_storage):
110-
vcz1 = convert_vcf_to_vcz(
111-
"sample-part1.vcf.gz", tmp_path, samples_chunk_size=samples_chunk_size
112-
)
113-
vcz2 = convert_vcf_to_vcz("sample-part2.vcf.gz", tmp_path)
114-
115-
# check samples query
116-
vcztools_out, _ = run_vcztools(f"query -l {vcz1}")
117-
assert vcztools_out.strip() == "NA00001\nNA00002"
118-
119-
append(vcz1, vcz2, backend_storage=backend_storage)
120-
121-
# check samples query
122-
vcztools_out, _ = run_vcztools(f"query -l {vcz1}")
123-
assert vcztools_out.strip() == "NA00001\nNA00002\nNA00003"
124-
125-
# check equivalence with original VCF
126-
compare_vcf_and_vcz(
127-
tmp_path, "view --no-version", "sample.vcf.gz", "view --no-version", vcz1
128-
)
129-
130-
131-
def test_append_from_variants_list(tmp_path):
132-
vcz0 = convert_vcf_to_vcz(
133-
"sample-variants.vcf.gz", tmp_path, ploidy=2, samples_chunk_size=2
134-
)
135-
vcz1 = convert_vcf_to_vcz("sample-part1.vcf.gz", tmp_path)
136-
137-
# check samples query
138-
vcztools_out, _ = run_vcztools(f"query -l {vcz0}")
139-
assert vcztools_out.strip() == ""
140-
141-
append(vcz0, vcz1)
142-
143-
# check samples query
144-
vcztools_out, _ = run_vcztools(f"query -l {vcz0}")
145-
assert vcztools_out.strip() == "NA00001\nNA00002"
146-
147-
# check equivalence with original VCF
148-
compare_vcf_and_vcz(
149-
tmp_path, "view --no-version", "sample-part1.vcf.gz", "view --no-version", vcz0
150-
)
151-
152-
153-
def test_append_fail_num_variants_mismatch(tmp_path):
154-
vcz1 = convert_vcf_to_vcz("sample-part1.vcf.gz", tmp_path)
155-
vcz2 = convert_vcf_to_vcz("alleles-1.vcf.gz", tmp_path)
76+
def test_append_fail_num_variants_mismatch():
77+
vcz1 = make_vcz([0, 0], [1, 2], [["A", "T"], ["A", "G"]])
78+
vcz2 = make_vcz([0], [1], [["A", "C"]])
15679

15780
with pytest.raises(
15881
ValueError,
15982
match="Stores being appended must have same number of variants. "
160-
"First has 9, second has 2",
83+
"First has 2, second has 1",
16184
):
16285
append(vcz1, vcz2)
16386

16487

165-
def test_append_fail_alleles_mismatch(tmp_path):
166-
vcz1 = convert_vcf_to_vcz("sample-part1.vcf.gz", tmp_path)
167-
vcz2 = convert_vcf_to_vcz("sample-part2-alleles-mismatch.vcf.gz", tmp_path)
88+
def test_append_fail_alleles_mismatch():
89+
vcz1 = make_vcz([0], [1], [["A", "T"]])
90+
vcz2 = make_vcz([0], [1], [["A", "C"]])
16891

16992
with pytest.raises(
17093
ValueError,
@@ -419,60 +342,6 @@ def test_append_deletes_stale_destination_chunk_when_source_chunk_is_sparse():
419342
)
420343

421344

422-
def test_append_multiple_chunks(tmp_path):
423-
vcz1 = convert_vcf_to_vcz(
424-
"chr22-part1.vcf.gz", tmp_path, variants_chunk_size=10, samples_chunk_size=50
425-
)
426-
vcz2 = convert_vcf_to_vcz(
427-
"chr22-part2.vcf.gz", tmp_path, variants_chunk_size=10, samples_chunk_size=50
428-
)
429-
430-
# check samples query
431-
vcztools_out, _ = run_vcztools(f"query -l {vcz1}")
432-
assert len(vcztools_out.strip().split("\n")) == 55
433-
434-
append(vcz1, vcz2)
435-
436-
# check samples query
437-
vcztools_out, _ = run_vcztools(f"query -l {vcz1}")
438-
assert len(vcztools_out.strip().split("\n")) == 100
439-
440-
# check equivalence with original VCF
441-
compare_vcf_and_vcz(
442-
tmp_path, "view --no-version", "chr22.vcf.gz", "view --no-version", vcz1
443-
)
444-
445-
446-
def test_append_icechunk(tmp_path):
447-
pytest.importorskip("icechunk")
448-
449-
# note that vcz1 is in icechunk, but the dataset being appended, vcz2, needn't be
450-
vcz1 = convert_vcf_to_vcz_icechunk("sample-part1.vcf.gz", tmp_path)
451-
vcz2 = convert_vcf_to_vcz("sample-part2.vcf.gz", tmp_path, zarr_format=3)
452-
453-
print(vcz1)
454-
print(vcz2)
455-
456-
# check samples query
457-
vcztools_out, _ = run_vcztools(f"query -l {vcz1} --backend-storage icechunk")
458-
assert vcztools_out.strip() == "NA00001\nNA00002"
459-
460-
append(vcz1, vcz2, backend_storage="icechunk")
461-
462-
# check samples query
463-
vcztools_out, _ = run_vcztools(f"query -l {vcz1} --backend-storage icechunk")
464-
assert vcztools_out.strip() == "NA00001\nNA00002\nNA00003"
465-
466-
# check equivalence with original VCF
467-
compare_vcf_and_vcz(
468-
tmp_path,
469-
"view --no-version",
470-
"sample.vcf.gz",
471-
"view --no-version --backend-storage icechunk",
472-
vcz1,
473-
)
474-
475-
476345
def _make_genotype(num_variants, num_samples):
477346
values = np.zeros((num_variants, num_samples, 2), dtype=np.int8)
478347
for variant_index in range(num_variants):

tests/test_normalise.py

Lines changed: 1 addition & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,7 @@
1414
variant_alleles_are_equivalent,
1515
)
1616

17-
from .utils import (
18-
compare_vcf_and_vcz,
19-
convert_vcf_to_vcz,
20-
convert_vcf_to_vcz_icechunk,
21-
make_vcz,
22-
)
17+
from .utils import make_vcz
2318

2419

2520
@pytest.mark.parametrize(
@@ -409,41 +404,3 @@ def test_normalise__other_call_fields_not_implemented():
409404

410405
with pytest.raises(NotImplementedError):
411406
normalise(vcz1, vcz2, vcz2_norm)
412-
413-
414-
def test_normalise_and_append(tmp_path):
415-
vcz0 = convert_vcf_to_vcz("sample-variants.vcf.gz", tmp_path, ploidy=2)
416-
vcz1 = convert_vcf_to_vcz("sample-part1.vcf.gz", tmp_path)
417-
vcz1_norm = zarr.storage.MemoryStore()
418-
419-
normalise(vcz0, vcz1, vcz1_norm)
420-
421-
append(vcz0, vcz1_norm)
422-
423-
# check equivalence with original VCF
424-
compare_vcf_and_vcz(
425-
tmp_path, "view --no-version", "sample-part1.vcf.gz", "view --no-version", vcz0
426-
)
427-
428-
429-
def test_normalise_and_append_icechunk(tmp_path):
430-
pytest.importorskip("icechunk")
431-
from vczstore.utils import icechunk_transaction
432-
433-
# note that vcz0 is in icechunk, but the others needn't be
434-
vcz0 = convert_vcf_to_vcz_icechunk("sample-variants.vcf.gz", tmp_path, ploidy=2)
435-
vcz1 = convert_vcf_to_vcz("sample-part1.vcf.gz", tmp_path, zarr_format=3)
436-
vcz1_norm = zarr.storage.MemoryStore()
437-
438-
with icechunk_transaction(vcz0, "main", message="append") as store:
439-
normalise(store, vcz1, vcz1_norm)
440-
append(store, vcz1_norm)
441-
442-
# check equivalence with original VCF
443-
compare_vcf_and_vcz(
444-
tmp_path,
445-
"view --no-version",
446-
"sample-part1.vcf.gz",
447-
"view --no-version --backend-storage icechunk",
448-
vcz0,
449-
)

tests/test_remove.py

Lines changed: 1 addition & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,7 @@
55

66
from vczstore.remove import remove
77

8-
from .utils import (
9-
check_removed_sample,
10-
compare_vcf_and_vcz,
11-
convert_vcf_to_vcz,
12-
convert_vcf_to_vcz_icechunk,
13-
make_vcz,
14-
run_vcztools,
15-
)
8+
from .utils import make_vcz
169

1710

1811
def test_remove():
@@ -61,108 +54,6 @@ def test_remove():
6154
)
6255

6356

64-
@pytest.mark.parametrize("backend_storage", [None, "obstore"])
65-
def test_remove_compare_vcf(tmp_path, backend_storage):
66-
vcz = convert_vcf_to_vcz("sample.vcf.gz", tmp_path)
67-
68-
# check samples query
69-
vcztools_out, _ = run_vcztools(f"query -l {vcz}")
70-
assert vcztools_out.strip() == "NA00001\nNA00002\nNA00003"
71-
72-
remove(vcz, "NA00002", backend_storage=backend_storage)
73-
74-
# check samples query
75-
vcztools_out, _ = run_vcztools(f"query -l {vcz}")
76-
assert vcztools_out.strip() == "NA00001\nNA00003"
77-
78-
# check equivalence with original VCF (with sample subsetting)
79-
compare_vcf_and_vcz(
80-
tmp_path,
81-
"view --no-version -s NA00001,NA00003 --no-update",
82-
"sample.vcf.gz",
83-
"view --no-version",
84-
vcz,
85-
)
86-
87-
# check sample values are missing
88-
check_removed_sample(vcz, "NA00002")
89-
90-
91-
def test_remove_multiple_chunks(tmp_path):
92-
vcz = convert_vcf_to_vcz("chr22.vcf.gz", tmp_path, variants_chunk_size=10)
93-
94-
# check samples query
95-
vcztools_out, _ = run_vcztools(f"query -l {vcz}")
96-
assert len(vcztools_out.strip().split("\n")) == 100
97-
98-
remove(vcz, "HG00100")
99-
100-
# check samples query
101-
vcztools_out, _ = run_vcztools(f"query -l {vcz}")
102-
assert "HG00100" not in vcztools_out
103-
assert len(vcztools_out.strip().split("\n")) == 99
104-
105-
# check equivalence with original VCF (with sample subsetting)
106-
reduced_samples = ",".join(vcztools_out.strip().split("\n"))
107-
compare_vcf_and_vcz(
108-
tmp_path,
109-
f"view --no-version -s {reduced_samples} --no-update",
110-
"chr22.vcf.gz",
111-
"view --no-version",
112-
vcz,
113-
)
114-
115-
# check sample values are missing
116-
check_removed_sample(vcz, "HG00100")
117-
118-
119-
def test_remove_icechunk(tmp_path):
120-
pytest.importorskip("icechunk")
121-
from icechunk import Repository
122-
123-
from vczstore.utils import make_icechunk_storage
124-
125-
vcz = convert_vcf_to_vcz_icechunk("sample.vcf.gz", tmp_path)
126-
127-
# check samples query
128-
vcztools_out, _ = run_vcztools(f"query -l {vcz} --backend-storage icechunk")
129-
assert vcztools_out.strip() == "NA00001\nNA00002\nNA00003"
130-
131-
icechunk_storage = make_icechunk_storage(vcz)
132-
repo = Repository.open(icechunk_storage)
133-
134-
snapshots = [snapshot for snapshot in repo.ancestry(branch="main")]
135-
assert len(snapshots) == 2
136-
assert snapshots[0].message == "create"
137-
assert snapshots[1].message == "Repository initialized"
138-
139-
remove(vcz, "NA00002", backend_storage="icechunk")
140-
141-
snapshots = [snapshot for snapshot in repo.ancestry(branch="main")]
142-
assert len(snapshots) == 2
143-
# note that 'create' has been deleted
144-
assert snapshots[0].message == "remove"
145-
assert snapshots[1].message == "Repository initialized"
146-
147-
# check samples query
148-
vcztools_out, _ = run_vcztools(f"query -l {vcz} --backend-storage icechunk")
149-
assert vcztools_out.strip() == "NA00001\nNA00003"
150-
151-
# check equivalence with original VCF (with sample subsetting)
152-
compare_vcf_and_vcz(
153-
tmp_path,
154-
"view --no-version -s NA00001,NA00003 --no-update",
155-
"sample.vcf.gz",
156-
"view --no-version --backend-storage icechunk",
157-
vcz,
158-
)
159-
160-
# check sample values are missing
161-
session = repo.readonly_session("main")
162-
store = session.store
163-
check_removed_sample(store, "NA00002")
164-
165-
16657
def test_remove_fails_for_misaligned_variant_chunks():
16758
vcz = make_vcz(
16859
variant_contig=[0, 0],

0 commit comments

Comments
 (0)