Skip to content

Commit 71c775d

Browse files
Merge pull request #280 from ncsa/develop
Develop
2 parents 3a88096 + 47d572f commit 71c775d

5 files changed

Lines changed: 153 additions & 4 deletions

File tree

ChangeLog.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# NEAT v4.4.1
2+
- Added `readme = "README.md"` to `pyproject.toml` so the project description appears correctly on PyPI.
3+
14
# NEAT v4.4
25
- Official release of NEAT 4.0. Represents major contributions from NCSA and beyond.
36
- Added parallel processing support, making NEAT production-ready for large genomes.

neat/read_simulator/utils/generate_reads.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ def generate_reads(
272272
end_point=read1[1] + ref_start,
273273
padding=actual_padding,
274274
run_read_len=options.read_len,
275+
segment_start=read1[0] + ref_start,
275276
is_paired=options.paired_ended,
276277
)
277278

@@ -309,6 +310,7 @@ def generate_reads(
309310
end_point=read2[1] + ref_start,
310311
padding=actual_padding,
311312
run_read_len=options.read_len,
313+
segment_start=start_coordinate + ref_start,
312314
is_reverse=True,
313315
is_paired=options.paired_ended
314316
)

neat/read_simulator/utils/read.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def __init__(self,
5050
end_point: int,
5151
padding: int,
5252
run_read_len: int,
53+
segment_start: int = None,
5354
is_reverse: bool = False,
5455
is_paired: bool = False):
5556

@@ -63,6 +64,9 @@ def __init__(self,
6364
self.end_point = end_point
6465
self.padding = padding
6566
self.run_read_length = run_read_len
67+
# segment_start is the reference coordinate of reference_segment[0], which may be
68+
# earlier than self.position for reverse reads (padding prepended for deletion headroom).
69+
self.segment_start = segment_start if segment_start is not None else position
6670
self.is_reverse = is_reverse
6771
self.is_paired = is_paired
6872

@@ -202,8 +206,10 @@ def apply_mutations(self, quality_scores: list, rng: Generator):
202206

203207
# Fetch parameters
204208
qual_score = variant_to_apply.get_qual_score()
205-
# Find the position within the read for this variant, cast as a python int, instead of a numpy int
206-
position = int(variant_to_apply.get_0_location() - self.position)
209+
# Find the position within the reference_segment for this variant. Use segment_start rather
210+
# than self.position because for reverse reads the segment begins padding bases before
211+
# self.position (leading padding for deletion headroom after reverse_complement).
212+
position = int(variant_to_apply.get_0_location() - self.segment_start)
207213
# Figure out if the variant is in this read. If 'to_mutate' selects any 1, then it is mutated.
208214
# For diploid animals, for example, this should result in approximately 50% of reads having a
209215
# given heterozygous mutation and 100% for homozygous mutations.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
[tool.poetry]
22
name = "neat-genreads"
3-
version = "4.4"
3+
version = "4.4.1"
44
description = "NGS Simulation toolkit"
5+
readme = "README.md"
56
authors = ["Joshua Allen <jallen17@illinois.edu>"]
67
license = "BSD 3-Clause License"
78
packages = [{include = "neat"}]

tests/test_read_simulator/test_read.py

Lines changed: 138 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -478,4 +478,141 @@ def test_make_cigar_reverse_strand():
478478
r.finalize_read_and_write(err_model, qual_model, None, 33, False, 3, rng)
479479
cigar = r.make_cigar()
480480
assert isinstance(cigar, str)
481-
assert len(cigar) > 0
481+
assert len(cigar) > 0
482+
483+
484+
# ---------------------------------------------------------------------------
485+
# segment_start — indel position in reverse reads (PR 276 regression)
486+
#
487+
# For read2 (reverse), the reference segment starts `padding` bases BEFORE
488+
# self.position to allow room for deletions before reverse-complementing.
489+
# Variant positions must be offset from segment_start, not position.
490+
# ---------------------------------------------------------------------------
491+
492+
# Build a 120-base segment (read_len=100 + padding=20) for reverse-read tests.
493+
_SEG_LEN = _READ_LEN + 20
494+
_REV_SEG = "ACGT" * (_SEG_LEN // 4 + 1)
495+
_REV_SEG = _REV_SEG[:_SEG_LEN] # exactly 120 bases
496+
497+
_SEGMENT_START = 80 # reference coord where segment begins
498+
_READ2_POS = 100 # reference coord of read2's nominal start (= segment_start + padding)
499+
_PADDING = _READ2_POS - _SEGMENT_START # 20
500+
501+
502+
def _make_read2(segment=_REV_SEG, segment_start=_SEGMENT_START, position=_READ2_POS,
503+
padding=_PADDING, read_len=_READ_LEN):
504+
"""Return a reverse read mimicking a real read2 where segment_start < position."""
505+
r = Read(
506+
name="test_read2",
507+
raw_read=(segment_start, segment_start + len(segment), position + 150, position + 150 + read_len),
508+
reference_segment=Seq(segment),
509+
reference_id="chr1",
510+
ref_id_index=0,
511+
position=position,
512+
end_point=position + read_len,
513+
padding=padding,
514+
run_read_len=read_len,
515+
segment_start=segment_start,
516+
is_reverse=True,
517+
is_paired=True,
518+
)
519+
r.read_sequence = Seq(segment)
520+
r.quality_array = np.array([30] * len(segment), dtype=float)
521+
return r
522+
523+
524+
def test_segment_start_defaults_to_position():
525+
"""When segment_start is omitted, it falls back to position."""
526+
r = _make_read(position=50)
527+
assert r.segment_start == 50
528+
529+
530+
def test_segment_start_stored_independently_from_position():
531+
"""segment_start is kept separate from position for reverse reads."""
532+
r = _make_read2()
533+
assert r.segment_start == _SEGMENT_START
534+
assert r.position == _READ2_POS
535+
assert r.segment_start < r.position
536+
537+
538+
def test_apply_mutations_snv_reverse_read_uses_segment_start():
539+
"""
540+
Regression for PR 276: an SNV at reference position V must land at index
541+
V - segment_start in the segment, not V - position.
542+
543+
With segment_start=80 and position=100, a variant at ref pos 110 must go
544+
to segment index 30 (correct) not index 10 (old, wrong).
545+
"""
546+
r = _make_read2()
547+
variant_ref_pos = 110
548+
correct_idx = variant_ref_pos - _SEGMENT_START # 30
549+
wrong_idx = variant_ref_pos - _READ2_POS # 10
550+
551+
# "ACGT"*30 — index 30 is 'G', index 10 is also 'G'; use alt='T' to detect placement
552+
assert str(r.read_sequence[correct_idx]) == "G"
553+
assert str(r.read_sequence[wrong_idx]) == "G"
554+
555+
snv = SingleNucleotideVariant(position1=variant_ref_pos, alt=Seq("T"),
556+
genotype=np.array([1, 1]), qual_score=30)
557+
r.mutations = {variant_ref_pos: [snv]}
558+
r.apply_mutations([30], _make_rng())
559+
560+
assert str(r.read_sequence[correct_idx]) == "T", "SNV not at V - segment_start"
561+
assert str(r.read_sequence[wrong_idx]) == "G", "SNV incorrectly placed at V - position"
562+
563+
564+
def test_apply_mutations_snv_reverse_read_variant_before_position():
565+
"""
566+
A variant in the padded region (segment_start <= V < position) must also
567+
be placed correctly. The old code (V - position) would give a negative index.
568+
"""
569+
r = _make_read2()
570+
variant_ref_pos = 85 # before position=100, inside segment starting at 80
571+
correct_idx = variant_ref_pos - _SEGMENT_START # 5
572+
573+
# "ACGT"*30 — index 5 is 'C'
574+
assert str(r.read_sequence[correct_idx]) == "C"
575+
576+
snv = SingleNucleotideVariant(position1=variant_ref_pos, alt=Seq("T"),
577+
genotype=np.array([1, 1]), qual_score=30)
578+
r.mutations = {variant_ref_pos: [snv]}
579+
r.apply_mutations([30], _make_rng())
580+
581+
assert str(r.read_sequence[correct_idx]) == "T"
582+
583+
584+
def test_apply_mutations_insertion_reverse_read_correct_position():
585+
"""Insertion in a reverse read is placed at V - segment_start."""
586+
r = _make_read2()
587+
variant_ref_pos = 115
588+
correct_idx = variant_ref_pos - _SEGMENT_START # 35
589+
590+
ins = Insertion(position1=variant_ref_pos, length=2, alt=Seq("TTT"),
591+
genotype=np.array([1, 1]), qual_score=30)
592+
r.mutations = {variant_ref_pos: [ins]}
593+
r.apply_mutations([30], _make_rng())
594+
595+
# The three bases starting at correct_idx should now be "TTT"
596+
assert str(r.read_sequence[correct_idx: correct_idx + 3]) == "TTT"
597+
598+
599+
def test_apply_mutations_deletion_reverse_read_correct_position():
600+
"""Deletion in a reverse read removes bases starting at V - segment_start."""
601+
r = _make_read2()
602+
variant_ref_pos = 105
603+
correct_idx = variant_ref_pos - _SEGMENT_START # 25
604+
del_len = 3
605+
606+
# "ACGT"*30: indices 25,26,27 = 'C','G','T'; index 28 = 'A'
607+
pre_at_28 = str(r.read_sequence[correct_idx + del_len]) # 'A'
608+
pre_len = len(r.read_sequence)
609+
610+
deletion = Deletion(position1=variant_ref_pos, length=del_len,
611+
genotype=np.array([1, 1]), qual_score=30)
612+
r.mutations = {variant_ref_pos: [deletion]}
613+
r.apply_mutations([30], _make_rng())
614+
615+
# Sequence is shorter by del_len - 1 (one base kept at position, rest removed)
616+
assert len(r.read_sequence) == pre_len - (del_len - 1)
617+
# The base that was at correct_idx + del_len is now at correct_idx + 1
618+
assert str(r.read_sequence[correct_idx + 1]) == pre_at_28

0 commit comments

Comments
 (0)