|
1 | | -# Create the VCF files, one with samples NA00001 and NA00002 and the other with NA00003 |
2 | | - |
3 | | -# bcftools view -s NA00001,NA00002 --no-update -O z tests/data/vcf/sample.vcf.gz \ |
4 | | -# > tests/data/vcf/sample-part1.vcf.gz |
5 | | -# bcftools view -s NA00003 --no-update -O z tests/data/vcf/sample.vcf.gz \ |
6 | | -# > tests/data/vcf/sample-part2.vcf.gz |
7 | | -# bcftools index -c tests/data/vcf/sample-part1.vcf.gz |
8 | | -# bcftools index -c tests/data/vcf/sample-part2.vcf.gz |
9 | | - |
10 | | -# Similarly for chr22.vcf.gz |
11 | | -# bcftools view --no-update \ |
12 | | -# -S <(bcftools query -l tests/data/vcf/chr22.vcf.gz | head -55) \ |
13 | | -# tests/data/vcf/chr22.vcf.gz --write-index=csi -o tests/data/vcf/chr22-part1.vcf.gz |
14 | | -# bcftools view --no-update \ |
15 | | -# -S <(bcftools query -l tests/data/vcf/chr22.vcf.gz | tail -45) \ |
16 | | -# tests/data/vcf/chr22.vcf.gz --write-index=csi -o tests/data/vcf/chr22-part2.vcf.gz |
17 | | - |
18 | | -# Create a variants list VCF with no samples. |
19 | | -# Note that the header contains FORMAT fields, even though there are no samples, |
20 | | -# which is necessary for vcf2zarr to create empty arrays. |
21 | | - |
22 | | -# bin/vcf-drop-samples.sh tests/data/vcf/sample.vcf.gz \ |
23 | | -# tests/data/vcf/sample-variants.vcf.gz |
24 | | - |
25 | | - |
26 | 1 | import numpy as np |
27 | 2 | import pytest |
28 | 3 | import zarr |
|
31 | 6 |
|
32 | 7 | from vczstore.append import append |
33 | 8 |
|
34 | | -from .utils import ( |
35 | | - compare_vcf_and_vcz, |
36 | | - convert_vcf_to_vcz, |
37 | | - convert_vcf_to_vcz_icechunk, |
38 | | - make_vcz, |
39 | | - run_vcztools, |
40 | | -) |
| 9 | +from .utils import make_vcz |
41 | 10 |
|
42 | 11 |
|
43 | 12 | def test_append(): |
@@ -104,67 +73,21 @@ def test_append(): |
104 | 73 | ) |
105 | 74 |
|
106 | 75 |
|
107 | | -@pytest.mark.parametrize("samples_chunk_size", [1, 2, 4]) |
108 | | -@pytest.mark.parametrize("backend_storage", [None, "obstore"]) |
109 | | -def test_append_compare_vcf(tmp_path, samples_chunk_size, backend_storage): |
110 | | - vcz1 = convert_vcf_to_vcz( |
111 | | - "sample-part1.vcf.gz", tmp_path, samples_chunk_size=samples_chunk_size |
112 | | - ) |
113 | | - vcz2 = convert_vcf_to_vcz("sample-part2.vcf.gz", tmp_path) |
114 | | - |
115 | | - # check samples query |
116 | | - vcztools_out, _ = run_vcztools(f"query -l {vcz1}") |
117 | | - assert vcztools_out.strip() == "NA00001\nNA00002" |
118 | | - |
119 | | - append(vcz1, vcz2, backend_storage=backend_storage) |
120 | | - |
121 | | - # check samples query |
122 | | - vcztools_out, _ = run_vcztools(f"query -l {vcz1}") |
123 | | - assert vcztools_out.strip() == "NA00001\nNA00002\nNA00003" |
124 | | - |
125 | | - # check equivalence with original VCF |
126 | | - compare_vcf_and_vcz( |
127 | | - tmp_path, "view --no-version", "sample.vcf.gz", "view --no-version", vcz1 |
128 | | - ) |
129 | | - |
130 | | - |
131 | | -def test_append_from_variants_list(tmp_path): |
132 | | - vcz0 = convert_vcf_to_vcz( |
133 | | - "sample-variants.vcf.gz", tmp_path, ploidy=2, samples_chunk_size=2 |
134 | | - ) |
135 | | - vcz1 = convert_vcf_to_vcz("sample-part1.vcf.gz", tmp_path) |
136 | | - |
137 | | - # check samples query |
138 | | - vcztools_out, _ = run_vcztools(f"query -l {vcz0}") |
139 | | - assert vcztools_out.strip() == "" |
140 | | - |
141 | | - append(vcz0, vcz1) |
142 | | - |
143 | | - # check samples query |
144 | | - vcztools_out, _ = run_vcztools(f"query -l {vcz0}") |
145 | | - assert vcztools_out.strip() == "NA00001\nNA00002" |
146 | | - |
147 | | - # check equivalence with original VCF |
148 | | - compare_vcf_and_vcz( |
149 | | - tmp_path, "view --no-version", "sample-part1.vcf.gz", "view --no-version", vcz0 |
150 | | - ) |
151 | | - |
152 | | - |
153 | | -def test_append_fail_num_variants_mismatch(tmp_path): |
154 | | - vcz1 = convert_vcf_to_vcz("sample-part1.vcf.gz", tmp_path) |
155 | | - vcz2 = convert_vcf_to_vcz("alleles-1.vcf.gz", tmp_path) |
| 76 | +def test_append_fail_num_variants_mismatch(): |
| 77 | + vcz1 = make_vcz([0, 0], [1, 2], [["A", "T"], ["A", "G"]]) |
| 78 | + vcz2 = make_vcz([0], [1], [["A", "C"]]) |
156 | 79 |
|
157 | 80 | with pytest.raises( |
158 | 81 | ValueError, |
159 | 82 | match="Stores being appended must have same number of variants. " |
160 | | - "First has 9, second has 2", |
| 83 | + "First has 2, second has 1", |
161 | 84 | ): |
162 | 85 | append(vcz1, vcz2) |
163 | 86 |
|
164 | 87 |
|
165 | | -def test_append_fail_alleles_mismatch(tmp_path): |
166 | | - vcz1 = convert_vcf_to_vcz("sample-part1.vcf.gz", tmp_path) |
167 | | - vcz2 = convert_vcf_to_vcz("sample-part2-alleles-mismatch.vcf.gz", tmp_path) |
| 88 | +def test_append_fail_alleles_mismatch(): |
| 89 | + vcz1 = make_vcz([0], [1], [["A", "T"]]) |
| 90 | + vcz2 = make_vcz([0], [1], [["A", "C"]]) |
168 | 91 |
|
169 | 92 | with pytest.raises( |
170 | 93 | ValueError, |
@@ -419,60 +342,6 @@ def test_append_deletes_stale_destination_chunk_when_source_chunk_is_sparse(): |
419 | 342 | ) |
420 | 343 |
|
421 | 344 |
|
422 | | -def test_append_multiple_chunks(tmp_path): |
423 | | - vcz1 = convert_vcf_to_vcz( |
424 | | - "chr22-part1.vcf.gz", tmp_path, variants_chunk_size=10, samples_chunk_size=50 |
425 | | - ) |
426 | | - vcz2 = convert_vcf_to_vcz( |
427 | | - "chr22-part2.vcf.gz", tmp_path, variants_chunk_size=10, samples_chunk_size=50 |
428 | | - ) |
429 | | - |
430 | | - # check samples query |
431 | | - vcztools_out, _ = run_vcztools(f"query -l {vcz1}") |
432 | | - assert len(vcztools_out.strip().split("\n")) == 55 |
433 | | - |
434 | | - append(vcz1, vcz2) |
435 | | - |
436 | | - # check samples query |
437 | | - vcztools_out, _ = run_vcztools(f"query -l {vcz1}") |
438 | | - assert len(vcztools_out.strip().split("\n")) == 100 |
439 | | - |
440 | | - # check equivalence with original VCF |
441 | | - compare_vcf_and_vcz( |
442 | | - tmp_path, "view --no-version", "chr22.vcf.gz", "view --no-version", vcz1 |
443 | | - ) |
444 | | - |
445 | | - |
446 | | -def test_append_icechunk(tmp_path): |
447 | | - pytest.importorskip("icechunk") |
448 | | - |
449 | | - # note that vcz1 is in icechunk, but the dataset being appended, vcz2, needn't be |
450 | | - vcz1 = convert_vcf_to_vcz_icechunk("sample-part1.vcf.gz", tmp_path) |
451 | | - vcz2 = convert_vcf_to_vcz("sample-part2.vcf.gz", tmp_path, zarr_format=3) |
452 | | - |
453 | | - print(vcz1) |
454 | | - print(vcz2) |
455 | | - |
456 | | - # check samples query |
457 | | - vcztools_out, _ = run_vcztools(f"query -l {vcz1} --backend-storage icechunk") |
458 | | - assert vcztools_out.strip() == "NA00001\nNA00002" |
459 | | - |
460 | | - append(vcz1, vcz2, backend_storage="icechunk") |
461 | | - |
462 | | - # check samples query |
463 | | - vcztools_out, _ = run_vcztools(f"query -l {vcz1} --backend-storage icechunk") |
464 | | - assert vcztools_out.strip() == "NA00001\nNA00002\nNA00003" |
465 | | - |
466 | | - # check equivalence with original VCF |
467 | | - compare_vcf_and_vcz( |
468 | | - tmp_path, |
469 | | - "view --no-version", |
470 | | - "sample.vcf.gz", |
471 | | - "view --no-version --backend-storage icechunk", |
472 | | - vcz1, |
473 | | - ) |
474 | | - |
475 | | - |
476 | 345 | def _make_genotype(num_variants, num_samples): |
477 | 346 | values = np.zeros((num_variants, num_samples, 2), dtype=np.int8) |
478 | 347 | for variant_index in range(num_variants): |
|
0 commit comments