|
7 | 7 | from types import SimpleNamespace |
8 | 8 | from unittest.mock import MagicMock, patch, call |
9 | 9 |
|
| 10 | +import numpy as np |
10 | 11 | import pytest |
11 | 12 |
|
12 | 13 | from neat.read_simulator.utils.stitch_outputs import concat, merge_vcfs, merge_bam, main |
| 14 | +from neat.variants import SingleNucleotideVariant |
| 15 | +from neat.variants.contig_variants import ContigVariants |
13 | 16 |
|
14 | 17 |
|
15 | | -# --------------------------------------------------------------------------- |
16 | 18 | # Helpers |
17 | | -# --------------------------------------------------------------------------- |
18 | 19 |
|
19 | 20 | def _write_gz(path: Path, text: str) -> Path: |
20 | 21 | with gzip.open(path, "wt") as fh: |
@@ -49,9 +50,7 @@ def _make_ofw(tmp_path: Path, vcf_path: Path = None): |
49 | 50 | return ofw |
50 | 51 |
|
51 | 52 |
|
52 | | -# =========================================================================== |
53 | 53 | # concat |
54 | | -# =========================================================================== |
55 | 54 |
|
56 | 55 | def test_concat_single_file(tmp_path): |
57 | 56 | src = _write_gz(tmp_path / "a.gz", "hello\n") |
@@ -91,9 +90,7 @@ def test_concat_order_is_preserved(tmp_path): |
91 | 90 | assert positions == sorted(positions) |
92 | 91 |
|
93 | 92 |
|
94 | | -# =========================================================================== |
95 | 93 | # merge_vcfs |
96 | | -# =========================================================================== |
97 | 94 |
|
98 | 95 | def test_merge_vcfs_skips_comment_lines(tmp_path): |
99 | 96 | vcf_text = "##header line\n#CHROM\tPOS\n1\t100\tA\tT\n" |
@@ -139,9 +136,70 @@ def test_merge_vcfs_preserves_data_line_order(tmp_path): |
139 | 136 | assert positions == list(range(1, 6)) |
140 | 137 |
|
141 | 138 |
|
142 | | -# =========================================================================== |
def test_merge_vcfs_dedup_removes_identical_lines(tmp_path):
    """The same record written by two thread VCFs appears once after merging (Issue #256)."""
    record = "chr1\t100\t.\tA\tT\t42\tPASS\t.\tGT\t0|1\n"
    # Both per-thread files carry the exact same data line.
    thread_vcfs = [
        _write_gz(tmp_path / file_name, record)
        for file_name in ("t0.vcf.gz", "t1.vcf.gz")
    ]
    writer = _make_ofw(tmp_path)
    merge_vcfs(thread_vcfs, writer)
    # Ignore blank rows so only real output lines are counted.
    merged_rows = list(filter(str.strip, writer._vcf_buf.getvalue().splitlines()))
    assert len(merged_rows) == 1
| 148 | + |
| 149 | + |
def test_merge_vcfs_distinct_lines_are_all_kept(tmp_path):
    """Two thread VCFs with different records both contribute a line to the merged output."""
    records_by_file = {
        "t0.vcf.gz": "chr1\t100\t.\tA\tT\t42\tPASS\t.\tGT\t0|1\n",
        "t1.vcf.gz": "chr1\t200\t.\tC\tG\t42\tPASS\t.\tGT\t0|1\n",
    }
    thread_vcfs = [
        _write_gz(tmp_path / file_name, record)
        for file_name, record in records_by_file.items()
    ]
    writer = _make_ofw(tmp_path)
    merge_vcfs(thread_vcfs, writer)
    # Count only non-blank rows of the captured VCF output.
    merged_rows = [row for row in writer._vcf_buf.getvalue().splitlines() if row.strip()]
    assert len(merged_rows) == 2
| 158 | + |
| 159 | + |
def test_merge_vcfs_partial_overlap_deduped(tmp_path):
    """A record shared by both threads plus one unique record merges to two lines."""
    shared = "chr1\t100\t.\tA\tT\t42\tPASS\t.\tGT\t0|1\n"
    unique = "chr1\t200\t.\tC\tG\t42\tPASS\t.\tGT\t0|1\n"
    # Thread 0 holds both records; thread 1 repeats only the shared one.
    thread_vcfs = [
        _write_gz(tmp_path / "t0.vcf.gz", shared + unique),
        _write_gz(tmp_path / "t1.vcf.gz", shared),
    ]
    writer = _make_ofw(tmp_path)
    merge_vcfs(thread_vcfs, writer)
    captured = writer._vcf_buf.getvalue()
    merged_rows = [row for row in captured.splitlines() if row.strip()]
    assert len(merged_rows) == 2
| 170 | + |
| 171 | + |
| 172 | +# find_dups (ContigVariants deduplication, Issue #256) |
| 173 | + |
def test_find_dups_same_alt_different_genotype_rejected(tmp_path):
    """A second SNV with the same position and ALT counts as a duplicate even if its genotype differs."""
    variants = ContigVariants()
    first_snv = SingleNucleotideVariant(10, "T", np.array([1, 0]), 40)
    second_snv = SingleNucleotideVariant(10, "T", np.array([0, 1]), 40)
    variants.add_variant(first_snv)
    # This test expects add_variant to return 1 for the rejected duplicate.
    outcome = variants.add_variant(second_snv)
    assert outcome == 1
| 181 | + |
| 182 | + |
def test_find_dups_different_alt_same_position_accepted(tmp_path):
    """SNVs sharing a position but carrying different ALTs are both stored."""
    variants = ContigVariants()
    snv_t = SingleNucleotideVariant(10, "T", np.array([1, 0]), 40)
    snv_g = SingleNucleotideVariant(10, "G", np.array([0, 1]), 40)
    variants.add_variant(snv_t)
    # This test expects add_variant to return 0 when the variant is accepted.
    outcome = variants.add_variant(snv_g)
    assert outcome == 0
    stored_at_position = variants.contig_variants[10]
    assert len(stored_at_position) == 2
| 191 | + |
| 192 | + |
def test_find_dups_exact_duplicate_rejected(tmp_path):
    """Adding an SNV identical in position, ALT, and genotype to an existing one is rejected."""
    variants = ContigVariants()
    first_snv = SingleNucleotideVariant(10, "T", np.array([0, 1]), 40)
    identical_snv = SingleNucleotideVariant(10, "T", np.array([0, 1]), 40)
    variants.add_variant(first_snv)
    # This test expects add_variant to return 1 for the rejected duplicate.
    outcome = variants.add_variant(identical_snv)
    assert outcome == 1
| 200 | + |
| 201 | + |
143 | 202 | # merge_bam |
144 | | -# =========================================================================== |
145 | 203 |
|
146 | 204 | def test_merge_bam_calls_pysam_merge_and_sort(tmp_path): |
147 | 205 | ofw = _make_ofw(tmp_path) |
@@ -191,9 +249,7 @@ def test_merge_bam_chunks_large_bam_list(tmp_path): |
191 | 249 | assert mock_pysam.merge.call_count == 3 |
192 | 250 |
|
193 | 251 |
|
194 | | -# =========================================================================== |
195 | 252 | # main |
196 | | -# =========================================================================== |
197 | 253 |
|
198 | 254 | def _file_dict(fq1=None, fq2=None, vcf=None, bam=None): |
199 | 255 | return {"fq1": fq1, "fq2": fq2, "vcf": vcf, "bam": bam} |
|
0 commit comments