Skip to content

Commit 342356c

Browse files
Merge pull request #267 from ncsa/vcf-fixes
Solving VCF-related issues.
2 parents ead3e07 + ce60026 commit 342356c

9 files changed

Lines changed: 310 additions & 148 deletions

File tree

neat/models/mutation_model.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,19 @@ def generate_snv(self, trinucleotide: Seq, reference_location: int, rng: Generat
108108
# First determine which matrix to use
109109
transition_matrix = self.trinuc_trans_matrices[DINUC_IND[trinucleotide[0] + "_" + trinucleotide[2]]]
110110
# then determine the trans probs based on the middle nucleotide
111-
transition_probs = transition_matrix[NUC_IND[trinucleotide[1]]]
112-
# Creating probabilities from the weights
111+
transition_probs = list(transition_matrix[NUC_IND[trinucleotide[1]]])
112+
# Zero the ref-base probability so that we never pick REF==ALT, even with custom models with trans matrices
113+
# that have non-zero diagonal entries (edge case)
114+
ref_base_idx = NUC_IND[trinucleotide[1]]
115+
transition_probs[ref_base_idx] = 0.0
113116
transition_sum = sum(transition_probs)
117+
if transition_sum == 0.0:
118+
_LOG.warning(
119+
f"Transition matrix row for '{trinucleotide[1]}' has all weight on the reference base. "
120+
f"Falling back to uniform sampling of non-ref bases."
121+
)
122+
transition_probs = [1.0 if i != ref_base_idx else 0.0 for i in range(len(ALLOWED_NUCL))]
123+
transition_sum = sum(transition_probs)
114124
transition_probs = [x/transition_sum for x in transition_probs]
115125
# Now pick a random alternate, weighted by the probabilities
116126
alt = rng.choice(ALLOWED_NUCL, p=transition_probs)

neat/read_simulator/utils/stitch_outputs.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,20 @@ def concat(files_to_join: List[Path], dest_file: gzip.GzipFile) -> None:
2929

3030
def merge_vcfs(vcfs: List[Path], ofw: OutputFileWriter) -> None:
3131
dest = ofw.files_to_write[ofw.vcf]
32+
seen: set[str] = set()
33+
n_duplicates = 0
3234
for vcf in vcfs:
3335
with gzip.open(vcf, 'rt') as fh:
3436
for line in fh:
3537
if not line.startswith("#"):
38+
normalized = line.rstrip("\r\n")
39+
if normalized in seen:
40+
n_duplicates += 1
41+
continue
42+
seen.add(normalized)
3643
dest.write(line)
44+
if n_duplicates:
45+
_LOG.warning(f"merge_vcfs: removed {n_duplicates} duplicate VCF line(s) during merge.")
3746

3847
def merge_bam(bam_files: List[Path], ofw: OutputFileWriter, threads: int):
3948
merged_file = ofw.tmp_dir / "temp_merged.bam"

neat/read_simulator/utils/vcf_func.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,13 @@ def parse_input_vcf(
228228
count = 0
229229
for alt in alts:
230230
count += 1
231+
if ref == alt:
232+
_LOG.warning(
233+
f"Skipping variant at {chrom}:{location + 1} — REF == ALT ({ref!r}). "
234+
f"This is not a valid variant."
235+
)
236+
n_skipped += 1
237+
continue
231238
# This temp genotype teases out only the ploids with this particular variant
232239
temp_genotype = variant_genotype(options.ploidy, genotype, count)
233240
if len(ref) == len(alt) == 1:

neat/variants/contig_variants.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,13 +94,26 @@ def generate_field(self, variant, field):
9494

9595
def find_dups(self, variant):
9696
"""
97-
Checks if the given genotype is already present in a list of variants.
97+
Checks if an equivalent variant already exists at this position.
98+
Two variants are duplicates when they share the same type and ALT allele.
99+
Genotype-only comparison was insufficient: two variants with identical ALT
100+
but different genotypes would produce two identical VCF output lines.
98101
99102
:param variant: A variant to check for duplicates
100-
:return: True or False if found or not
103+
:return: True if a duplicate exists, False otherwise
101104
"""
105+
try:
106+
variant_alt = variant.get_alt()
107+
except (KeyError, AttributeError):
108+
variant_alt = None
109+
102110
for existing_var in self.contig_variants[variant.position1]:
103-
if np.array_equal(variant.genotype, existing_var.genotype):
111+
try:
112+
existing_alt = existing_var.get_alt()
113+
except (KeyError, AttributeError):
114+
existing_alt = None
115+
116+
if type(variant) == type(existing_var) and variant_alt == existing_alt:
104117
return True
105118

106119
return False

tests/test_read_simulator/test_generate_variants.py

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@
1616
from neat.variants import ContigVariants, SingleNucleotideVariant, Insertion, Deletion
1717

1818

19-
# ---------------------------------------------------------------------------
2019
# Helpers
21-
# ---------------------------------------------------------------------------
2220

2321
_CLEAN_SEQ = "ACGT" * 50 # 200 bp, no N's
2422
_N_HEAVY = "N" * 95 + "ACGT" # 99 bases, >10% N
@@ -44,9 +42,7 @@ def _full_rate_regions(seq_len: int, rate: float = 0.01, offset: int = 0):
4442
return [(offset, offset + seq_len, rate)]
4543

4644

47-
# ===========================================================================
4845
# find_random_non_n
49-
# ===========================================================================
5046

5147
def test_find_random_non_n_returns_valid_index():
5248
rng = np.random.default_rng(0)
@@ -77,9 +73,7 @@ def test_find_random_non_n_single_element():
7773
assert find_random_non_n(rng, safe_zones) == 0
7874

7975

80-
# ===========================================================================
8176
# map_non_n_regions
82-
# ===========================================================================
8377

8478
def test_map_non_n_regions_clean_sequence():
8579
result = map_non_n_regions("ACGTACGTACGT")
@@ -88,7 +82,7 @@ def test_map_non_n_regions_clean_sequence():
8882

8983

9084
def test_map_non_n_regions_single_n():
91-
# 1 N in 50 bases = 2% N valid map returned
85+
# 1 N in 50 bases = 2% N (valid map returned)
9286
seq = "A" * 24 + "N" + "A" * 25
9387
result = map_non_n_regions(seq)
9488
assert len(result) == 50
@@ -106,7 +100,7 @@ def test_map_non_n_regions_too_many_ns_returns_empty():
106100

107101
def test_map_non_n_regions_exactly_at_threshold():
108102
"""Exactly 10% N → should return empty (condition is <= 0.90 non-N)."""
109-
# 10 N's + 90 ACGT 90% non-N, average == 0.90, which hits the <= boundary
103+
# 10 N's + 90 ACGT (90% non-N, average == 0.90, which hits the <= boundary)
110104
seq = "N" * 10 + "A" * 90
111105
result = map_non_n_regions(seq)
112106
assert len(result) == 0
@@ -118,7 +112,7 @@ def test_map_non_n_regions_all_n_returns_empty():
118112

119113

120114
def test_map_non_n_regions_run_of_ns():
121-
seq = "ACGT" + "NNNN" + "ACGT" # 12 bp, 4/12 ≈ 33% N empty
115+
seq = "ACGT" + "NNNN" + "ACGT" # 12 bp, 4/12 ≈ 33% N (empty)
122116
result = map_non_n_regions(seq)
123117
assert len(result) == 0
124118

@@ -133,9 +127,7 @@ def test_map_non_n_regions_short_n_run_in_long_sequence():
133127
assert result[0] == 1
134128

135129

136-
# ===========================================================================
137130
# generate_variants — input variants are copied into output
138-
# ===========================================================================
139131

140132
def test_generate_variants_returns_contig_variants():
141133
ref = _make_reference()
@@ -206,9 +198,7 @@ def test_generate_variants_input_variant_before_offset_excluded():
206198
assert 50 not in result
207199

208200

209-
# ===========================================================================
210201
# generate_variants — random mutation generation
211-
# ===========================================================================
212202

213203
def test_generate_variants_adds_at_least_min_mutations():
214204
"""With min_mutations=1, at least 1 variant should be added."""
@@ -339,9 +329,7 @@ def test_generate_variants_input_and_random_together():
339329
assert len(result.variant_locations) > 1
340330

341331

342-
# ===========================================================================
343332
# generate_variants — N-handling paths (lines 139-157, 204)
344-
# ===========================================================================
345333

346334
def test_generate_variants_n_in_mutation_region_completes():
347335
"""Sequence with N's in the mutation region runs to completion.
@@ -370,7 +358,7 @@ def test_generate_variants_n_heavy_subsequence_skipped():
370358
ref = _make_reference(seq)
371359
model = _make_model()
372360
opts = _make_options(seed=3)
373-
opts.min_mutations = 0 # let poisson decide; main goal is no crash
361+
opts.min_mutations = 0 # let Poisson decide; main goal is no crash
374362

375363
result = generate_variants(ref, 0, _full_rate_regions(len(seq)), ContigVariants(), model, opts, 40)
376364
assert isinstance(result, ContigVariants)
@@ -406,9 +394,16 @@ def test_generate_variants_deletion_overlap_handling():
406394

407395
result = generate_variants(ref, 0, _full_rate_regions(len(seq), 0.05), ContigVariants(), model_high, opts, 40)
408396
assert isinstance(result, ContigVariants)
409-
# Variants at the same location are deduplicated correctly
397+
# Variants at the same location are deduplicated by (type, ALT) — not genotype.
398+
# Two variants of the same type with the same ALT at the same position are duplicates.
410399
for loc in result.variant_locations:
411400
variants_here = result.contig_variants[loc]
412-
genotypes = [tuple(v.genotype) for v in variants_here]
413-
assert len(genotypes) == len(set(genotypes)), \
414-
f"Duplicate genotype at location {loc}"
401+
type_alt_keys = []
402+
for v in variants_here:
403+
try:
404+
alt = v.get_alt()
405+
except (KeyError, AttributeError):
406+
alt = None
407+
type_alt_keys.append((type(v).__name__, alt))
408+
assert len(type_alt_keys) == len(set(type_alt_keys)), \
409+
f"Duplicate (type, ALT) pair at location {loc}: {type_alt_keys}"

tests/test_read_simulator/test_stitch_outputs.py

Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,15 @@
77
from types import SimpleNamespace
88
from unittest.mock import MagicMock, patch, call
99

10+
import numpy as np
1011
import pytest
1112

1213
from neat.read_simulator.utils.stitch_outputs import concat, merge_vcfs, merge_bam, main
14+
from neat.variants import SingleNucleotideVariant
15+
from neat.variants.contig_variants import ContigVariants
1316

1417

15-
# ---------------------------------------------------------------------------
1618
# Helpers
17-
# ---------------------------------------------------------------------------
1819

1920
def _write_gz(path: Path, text: str) -> Path:
2021
with gzip.open(path, "wt") as fh:
@@ -49,9 +50,7 @@ def _make_ofw(tmp_path: Path, vcf_path: Path = None):
4950
return ofw
5051

5152

52-
# ===========================================================================
5353
# concat
54-
# ===========================================================================
5554

5655
def test_concat_single_file(tmp_path):
5756
src = _write_gz(tmp_path / "a.gz", "hello\n")
@@ -91,9 +90,7 @@ def test_concat_order_is_preserved(tmp_path):
9190
assert positions == sorted(positions)
9291

9392

94-
# ===========================================================================
9593
# merge_vcfs
96-
# ===========================================================================
9794

9895
def test_merge_vcfs_skips_comment_lines(tmp_path):
9996
vcf_text = "##header line\n#CHROM\tPOS\n1\t100\tA\tT\n"
@@ -139,9 +136,70 @@ def test_merge_vcfs_preserves_data_line_order(tmp_path):
139136
assert positions == list(range(1, 6))
140137

141138

142-
# ===========================================================================
139+
def test_merge_vcfs_dedup_removes_identical_lines(tmp_path):
140+
"""Identical lines from two thread VCFs are collapsed to one (Issue #256)."""
141+
line = "chr1\t100\t.\tA\tT\t42\tPASS\t.\tGT\t0|1\n"
142+
v1 = _write_gz(tmp_path / "t0.vcf.gz", line)
143+
v2 = _write_gz(tmp_path / "t1.vcf.gz", line)
144+
ofw = _make_ofw(tmp_path)
145+
merge_vcfs([v1, v2], ofw)
146+
result = [l for l in ofw._vcf_buf.getvalue().splitlines() if l.strip()]
147+
assert len(result) == 1
148+
149+
150+
def test_merge_vcfs_distinct_lines_are_all_kept(tmp_path):
151+
"""Distinct lines from two threads both appear in merged output."""
152+
v1 = _write_gz(tmp_path / "t0.vcf.gz", "chr1\t100\t.\tA\tT\t42\tPASS\t.\tGT\t0|1\n")
153+
v2 = _write_gz(tmp_path / "t1.vcf.gz", "chr1\t200\t.\tC\tG\t42\tPASS\t.\tGT\t0|1\n")
154+
ofw = _make_ofw(tmp_path)
155+
merge_vcfs([v1, v2], ofw)
156+
result = [l for l in ofw._vcf_buf.getvalue().splitlines() if l.strip()]
157+
assert len(result) == 2
158+
159+
160+
def test_merge_vcfs_partial_overlap_deduped(tmp_path):
161+
"""Three lines total, two of which are identical: result has two unique lines."""
162+
line_a = "chr1\t100\t.\tA\tT\t42\tPASS\t.\tGT\t0|1\n"
163+
line_b = "chr1\t200\t.\tC\tG\t42\tPASS\t.\tGT\t0|1\n"
164+
v1 = _write_gz(tmp_path / "t0.vcf.gz", line_a + line_b)
165+
v2 = _write_gz(tmp_path / "t1.vcf.gz", line_a)
166+
ofw = _make_ofw(tmp_path)
167+
merge_vcfs([v1, v2], ofw)
168+
result = [l for l in ofw._vcf_buf.getvalue().splitlines() if l.strip()]
169+
assert len(result) == 2
170+
171+
172+
# find_dups (ContigVariants deduplication, Issue #256)
173+
174+
def test_find_dups_same_alt_different_genotype_rejected(tmp_path):
175+
"""Same position + same ALT is a duplicate regardless of genotype."""
176+
cv = ContigVariants()
177+
v1 = SingleNucleotideVariant(10, "T", np.array([1, 0]), 40)
178+
v2 = SingleNucleotideVariant(10, "T", np.array([0, 1]), 40)
179+
cv.add_variant(v1)
180+
assert cv.add_variant(v2) == 1
181+
182+
183+
def test_find_dups_different_alt_same_position_accepted(tmp_path):
184+
"""Two SNVs at the same position with different ALTs are not duplicates."""
185+
cv = ContigVariants()
186+
v1 = SingleNucleotideVariant(10, "T", np.array([1, 0]), 40)
187+
v2 = SingleNucleotideVariant(10, "G", np.array([0, 1]), 40)
188+
cv.add_variant(v1)
189+
assert cv.add_variant(v2) == 0
190+
assert len(cv.contig_variants[10]) == 2
191+
192+
193+
def test_find_dups_exact_duplicate_rejected(tmp_path):
194+
"""Exact duplicates (same position, ALT, and genotype) are rejected."""
195+
cv = ContigVariants()
196+
v1 = SingleNucleotideVariant(10, "T", np.array([0, 1]), 40)
197+
v2 = SingleNucleotideVariant(10, "T", np.array([0, 1]), 40)
198+
cv.add_variant(v1)
199+
assert cv.add_variant(v2) == 1
200+
201+
143202
# merge_bam
144-
# ===========================================================================
145203

146204
def test_merge_bam_calls_pysam_merge_and_sort(tmp_path):
147205
ofw = _make_ofw(tmp_path)
@@ -191,9 +249,7 @@ def test_merge_bam_chunks_large_bam_list(tmp_path):
191249
assert mock_pysam.merge.call_count == 3
192250

193251

194-
# ===========================================================================
195252
# main
196-
# ===========================================================================
197253

198254
def _file_dict(fq1=None, fq2=None, vcf=None, bam=None):
199255
return {"fq1": fq1, "fq2": fq2, "vcf": vcf, "bam": bam}

0 commit comments

Comments
 (0)