@@ -478,4 +478,141 @@ def test_make_cigar_reverse_strand():
478478 r .finalize_read_and_write (err_model , qual_model , None , 33 , False , 3 , rng )
479479 cigar = r .make_cigar ()
480480 assert isinstance (cigar , str )
481- assert len (cigar ) > 0
481+ assert len (cigar ) > 0
482+
483+
484+ # ---------------------------------------------------------------------------
485+ # segment_start — indel position in reverse reads (PR 276 regression)
486+ #
487+ # For read2 (reverse), the reference segment starts `padding` bases BEFORE
488+ # self.position to allow room for deletions before reverse-complementing.
489+ # Variant positions must be offset from segment_start, not position.
490+ # ---------------------------------------------------------------------------
491+
492+ # Build a 120-base segment (read_len=100 + padding=20) for reverse-read tests.
493+ _SEG_LEN = _READ_LEN + 20
494+ _REV_SEG = "ACGT" * (_SEG_LEN // 4 + 1 )
495+ _REV_SEG = _REV_SEG [:_SEG_LEN ] # exactly 120 bases
496+
497+ _SEGMENT_START = 80 # reference coord where segment begins
498+ _READ2_POS = 100 # reference coord of read2's nominal start (= segment_start + padding)
499+ _PADDING = _READ2_POS - _SEGMENT_START # 20
500+
501+
502+ def _make_read2 (segment = _REV_SEG , segment_start = _SEGMENT_START , position = _READ2_POS ,
503+ padding = _PADDING , read_len = _READ_LEN ):
504+ """Return a reverse read mimicking a real read2 where segment_start < position."""
505+ r = Read (
506+ name = "test_read2" ,
507+ raw_read = (segment_start , segment_start + len (segment ), position + 150 , position + 150 + read_len ),
508+ reference_segment = Seq (segment ),
509+ reference_id = "chr1" ,
510+ ref_id_index = 0 ,
511+ position = position ,
512+ end_point = position + read_len ,
513+ padding = padding ,
514+ run_read_len = read_len ,
515+ segment_start = segment_start ,
516+ is_reverse = True ,
517+ is_paired = True ,
518+ )
519+ r .read_sequence = Seq (segment )
520+ r .quality_array = np .array ([30 ] * len (segment ), dtype = float )
521+ return r
522+
523+
524+ def test_segment_start_defaults_to_position ():
525+ """When segment_start is omitted, it falls back to position."""
526+ r = _make_read (position = 50 )
527+ assert r .segment_start == 50
528+
529+
530+ def test_segment_start_stored_independently_from_position ():
531+ """segment_start is kept separate from position for reverse reads."""
532+ r = _make_read2 ()
533+ assert r .segment_start == _SEGMENT_START
534+ assert r .position == _READ2_POS
535+ assert r .segment_start < r .position
536+
537+
538+ def test_apply_mutations_snv_reverse_read_uses_segment_start ():
539+ """
540+ Regression for PR 276: an SNV at reference position V must land at index
541+ V - segment_start in the segment, not V - position.
542+
543+ With segment_start=80 and position=100, a variant at ref pos 110 must go
544+ to segment index 30 (correct) not index 10 (old, wrong).
545+ """
546+ r = _make_read2 ()
547+ variant_ref_pos = 110
548+ correct_idx = variant_ref_pos - _SEGMENT_START # 30
549+ wrong_idx = variant_ref_pos - _READ2_POS # 10
550+
551+ # "ACGT"*30 — index 30 is 'G', index 10 is also 'G'; use alt='T' to detect placement
552+ assert str (r .read_sequence [correct_idx ]) == "G"
553+ assert str (r .read_sequence [wrong_idx ]) == "G"
554+
555+ snv = SingleNucleotideVariant (position1 = variant_ref_pos , alt = Seq ("T" ),
556+ genotype = np .array ([1 , 1 ]), qual_score = 30 )
557+ r .mutations = {variant_ref_pos : [snv ]}
558+ r .apply_mutations ([30 ], _make_rng ())
559+
560+ assert str (r .read_sequence [correct_idx ]) == "T" , "SNV not at V - segment_start"
561+ assert str (r .read_sequence [wrong_idx ]) == "G" , "SNV incorrectly placed at V - position"
562+
563+
564+ def test_apply_mutations_snv_reverse_read_variant_before_position ():
565+ """
566+ A variant in the padded region (segment_start <= V < position) must also
567+ be placed correctly. The old code (V - position) would give a negative index.
568+ """
569+ r = _make_read2 ()
570+ variant_ref_pos = 85 # before position=100, inside segment starting at 80
571+ correct_idx = variant_ref_pos - _SEGMENT_START # 5
572+
573+ # "ACGT"*30 — index 5 is 'C'
574+ assert str (r .read_sequence [correct_idx ]) == "C"
575+
576+ snv = SingleNucleotideVariant (position1 = variant_ref_pos , alt = Seq ("T" ),
577+ genotype = np .array ([1 , 1 ]), qual_score = 30 )
578+ r .mutations = {variant_ref_pos : [snv ]}
579+ r .apply_mutations ([30 ], _make_rng ())
580+
581+ assert str (r .read_sequence [correct_idx ]) == "T"
582+
583+
584+ def test_apply_mutations_insertion_reverse_read_correct_position ():
585+ """Insertion in a reverse read is placed at V - segment_start."""
586+ r = _make_read2 ()
587+ variant_ref_pos = 115
588+ correct_idx = variant_ref_pos - _SEGMENT_START # 35
589+
590+ ins = Insertion (position1 = variant_ref_pos , length = 2 , alt = Seq ("TTT" ),
591+ genotype = np .array ([1 , 1 ]), qual_score = 30 )
592+ r .mutations = {variant_ref_pos : [ins ]}
593+ r .apply_mutations ([30 ], _make_rng ())
594+
595+ # The three bases starting at correct_idx should now be "TTT"
596+ assert str (r .read_sequence [correct_idx : correct_idx + 3 ]) == "TTT"
597+
598+
599+ def test_apply_mutations_deletion_reverse_read_correct_position ():
600+ """Deletion in a reverse read removes bases starting at V - segment_start."""
601+ r = _make_read2 ()
602+ variant_ref_pos = 105
603+ correct_idx = variant_ref_pos - _SEGMENT_START # 25
604+ del_len = 3
605+
606+ # "ACGT"*30: indices 25,26,27 = 'C','G','T'; index 28 = 'A'
607+ pre_at_28 = str (r .read_sequence [correct_idx + del_len ]) # 'A'
608+ pre_len = len (r .read_sequence )
609+
610+ deletion = Deletion (position1 = variant_ref_pos , length = del_len ,
611+ genotype = np .array ([1 , 1 ]), qual_score = 30 )
612+ r .mutations = {variant_ref_pos : [deletion ]}
613+ r .apply_mutations ([30 ], _make_rng ())
614+
615+ # Sequence is shorter by del_len - 1 (one base kept at position, rest removed)
616+ assert len (r .read_sequence ) == pre_len - (del_len - 1 )
617+ # The base that was at correct_idx + del_len is now at correct_idx + 1
618+ assert str (r .read_sequence [correct_idx + 1 ]) == pre_at_28
0 commit comments