Skip to content
This repository was archived by the owner on May 22, 2026. It is now read-only.

Commit 3ddc1f1

Browse files
committed
Fixed issue with AMP in allele: hicnt and hicov for many insertions and SV were changed. Total coverage for part of the insertion changed.
Added changes for chimeric reads in modify cigar part.
1 parent b1cc115 commit 3ddc1f1

53 files changed

Lines changed: 1330 additions & 1240 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
package com.astrazeneca.vardict.data;
2+
3+
public class ModifiedCigar {
4+
public int position;
5+
public String cigar;
6+
public String querySequence;
7+
public String queryQuality;
8+
9+
public ModifiedCigar(int position, String cigar,String querySequence, String queryQuality) {
10+
this.position = position;
11+
this.cigar = cigar;
12+
this.querySequence = querySequence;
13+
this.queryQuality = queryQuality;
14+
}
15+
}

src/main/java/com/astrazeneca/vardict/data/Patterns.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,11 @@ public class Patterns {
153153

154154
public static final jregex.Pattern SOFT_CLIPPED = new jregex.Pattern("(\\d+)[MIS]");
155155
public static final Pattern SA_CIGAR_D_S_5clip = Pattern.compile("^\\d\\d+S");
156+
public static final Pattern SA_CIGAR_D_S_5clip_GROUP = Pattern.compile("^(\\d\\d+)S");
157+
public static final jregex.Pattern SA_CIGAR_D_S_5clip_GROUP_Repl = new jregex.Pattern("^\\d+S");
156158
public static final Pattern SA_CIGAR_D_S_3clip = Pattern.compile("\\d\\dS$");
159+
public static final Pattern SA_CIGAR_D_S_3clip_GROUP = Pattern.compile("(\\d\\d+)S$");
160+
public static final jregex.Pattern SA_CIGAR_D_S_3clip_GROUP_Repl = new jregex.Pattern("\\d\\d+S$");
157161
public static final Pattern BEGIN_dig_dig_S_ANY_dig_dig_S_END = Pattern.compile("^\\d\\dS.*\\d\\dS$");
158162
public static final jregex.Pattern BEGIN_NUM_S_OR_BEGIN_NUM_H = new jregex.Pattern("^(\\d+)S|^\\d+H");
159163
public static final jregex.Pattern END_NUM_S_OR_NUM_H = new jregex.Pattern("(\\d+)S$|H$");

src/main/java/com/astrazeneca/vardict/modules/CigarModifier.java

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,21 @@
22

33
import com.astrazeneca.vardict.Configuration;
44
import com.astrazeneca.vardict.collection.Tuple;
5+
import com.astrazeneca.vardict.data.ModifiedCigar;
6+
import com.astrazeneca.vardict.data.Reference;
57
import com.astrazeneca.vardict.data.Region;
68
import jregex.Replacer;
79

810
import java.util.HashSet;
11+
import java.util.List;
912
import java.util.Map;
1013
import java.util.Set;
1114
import java.util.regex.Matcher;
1215

1316
import static com.astrazeneca.vardict.Utils.*;
1417
import static com.astrazeneca.vardict.collection.Tuple.tuple;
1518
import static com.astrazeneca.vardict.data.Patterns.*;
19+
import static com.astrazeneca.vardict.data.scopedata.GlobalReadOnlyScope.instance;
1620
import static com.astrazeneca.vardict.variations.VariationUtils.isHasAndEquals;
1721
import static com.astrazeneca.vardict.variations.VariationUtils.isHasAndNotEquals;
1822

@@ -26,26 +30,30 @@ public class CigarModifier {
2630
private String querySequence;
2731
private String queryQuality;
2832
private Map<Integer, Character> reference;
33+
private Reference ref;
2934
private int indel;
35+
private int maxReadLength;
3036
private Region region;
3137

3238
CigarModifier(int position, String cigarStr, String querySequence,
33-
String queryQuality, Map<Integer, Character> reference, int indel, Region region) {
39+
String queryQuality, Reference ref, int indel, Region region, int maxReadLength) {
3440
this.position = position;
3541
this.cigarStr = cigarStr;
3642
this.originalCigar = cigarStr;
3743
this.querySequence = querySequence;
3844
this.queryQuality = queryQuality;
39-
this.reference = reference;
45+
this.ref = ref;
46+
this.reference = ref.referenceSequences;
4047
this.indel = indel;
4148
this.region = region;
49+
this.maxReadLength = maxReadLength;
4250
}
4351

4452
/**
4553
* Method modify cigar by applying different matchers.
4654
* @return modified position and cigar string
4755
*/
48-
public Tuple.Tuple2<Integer, String> modifyCigar() {
56+
public ModifiedCigar modifyCigar() {
4957
//flag is set to true if CIGAR string is modified and should be looked at again
5058
boolean flag = true;
5159
try {
@@ -72,6 +80,54 @@ public Tuple.Tuple2<Integer, String> modifyCigar() {
7280
Replacer r = END_NUMBER_I.replacer(toInt(mm.group(1)) + "S");
7381
cigarStr = r.replace(cigarStr);
7482
}
83+
Matcher sc5_mm = SA_CIGAR_D_S_5clip_GROUP.matcher(cigarStr);
84+
Matcher sc3_mm = SA_CIGAR_D_S_3clip_GROUP.matcher(cigarStr);
85+
Map<String, List<Integer>> referenceSeedMap = ref.seed;
86+
if (sc5_mm.find()) {
87+
int cigarElement = toInt(sc5_mm.group(1));
88+
if (!instance().conf.chimeric && cigarElement >= Configuration.SEED_2 ) {
89+
String sseq = substr(querySequence, 0, cigarElement);
90+
String sequence = complement(reverse(sseq));
91+
String reverseComplementedSeed = sequence.substring(0, Configuration.SEED_2);
92+
93+
if (referenceSeedMap.containsKey(reverseComplementedSeed)) {
94+
List<Integer> positions = referenceSeedMap.get(reverseComplementedSeed);
95+
96+
if (positions.size() == 1 && Math.abs(position - positions.get(0)) < 2 * maxReadLength) {
97+
Replacer r = SA_CIGAR_D_S_5clip_GROUP_Repl.replacer(""); // $a[5] = ~s / ^\d + S//;
98+
cigarStr = r.replace(cigarStr);
99+
querySequence = substr(querySequence, cigarElement);
100+
queryQuality = substr(queryQuality, cigarElement);
101+
if (instance().conf.y) {
102+
System.err.println(sequence + " at 5' is a chimeric at "
103+
+ position + " by SEED " + Configuration.SEED_2);
104+
}
105+
}
106+
}
107+
}
108+
} else if (sc3_mm.find()) {
109+
int cigarElement = toInt(sc3_mm.group(1));
110+
if (!instance().conf.chimeric && cigarElement >= Configuration.SEED_2 ) {
111+
String sseq = substr(querySequence, -cigarElement, cigarElement);
112+
String sequence = complement(reverse(sseq));
113+
String reverseComplementedSeed = substr(sequence, -Configuration.SEED_2, Configuration.SEED_2);
114+
115+
if (referenceSeedMap.containsKey(reverseComplementedSeed)) {
116+
List<Integer> positions = referenceSeedMap.get(reverseComplementedSeed);
117+
118+
if (positions.size() == 1 && Math.abs(position - positions.get(0)) < 2* maxReadLength ) {
119+
Replacer r = SA_CIGAR_D_S_3clip_GROUP_Repl.replacer(""); // $a[5] = ~s /\d\d + S$//;
120+
cigarStr = r.replace(cigarStr);
121+
querySequence = substr(querySequence, 0, querySequence.length() - cigarElement);
122+
queryQuality = substr(queryQuality, 0, queryQuality.length() - cigarElement);
123+
if (instance().conf.y) {
124+
System.err.println(sequence + " at 3' is a chimeric at "
125+
+ position + " by SEED " + Configuration.SEED_2);
126+
}
127+
}
128+
}
129+
}
130+
}
75131

76132
while (flag && indel > 0) {
77133
flag = false;
@@ -184,11 +240,11 @@ public Tuple.Tuple2<Integer, String> modifyCigar() {
184240
} else if ((mtch = BEGIN_DIG_M.matcher(cigarStr)).find()) {
185241
combineBeginDigM(mtch);
186242
}
187-
return tuple(position, cigarStr);
243+
return new ModifiedCigar(position, cigarStr, querySequence, queryQuality);
188244
} catch (Exception exception) {
189245
printExceptionAndContinue(exception, "cigar", String.valueOf(position) + " " + originalCigar, region);
190246
}
191-
return tuple(position, cigarStr);
247+
return new ModifiedCigar(position, cigarStr, querySequence, queryQuality);
192248
}
193249

194250
/**

src/main/java/com/astrazeneca/vardict/modules/CigarParser.java

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ private void parseCigar(String chrName,
242242
}
243243
}
244244

245-
final String queryQuality = getBaseQualityString(record);
245+
String queryQuality = getBaseQualityString(record);
246246
final boolean isMateReferenceNameEqual = record.getReferenceName().equals(record.getMateReferenceName());
247247
final int numberOfMismatches = totalNumberOfMismatches;
248248
boolean direction = record.getReadNegativeStrandFlag();
@@ -264,12 +264,15 @@ private void parseCigar(String chrName,
264264
record.getCigarString(),
265265
querySequence,
266266
queryQuality,
267-
ref,
267+
reference,
268268
insertionDeletionLength,
269-
region);
270-
Tuple.Tuple2<Integer, String> modifiedCigar = cigarModifier.modifyCigar();
271-
position = modifiedCigar._1;
272-
cigar = TextCigarCodec.decode(modifiedCigar._2);
269+
region,
270+
maxReadLength);
271+
ModifiedCigar modifiedCigar = cigarModifier.modifyCigar();
272+
position = modifiedCigar.position;
273+
cigar = TextCigarCodec.decode(modifiedCigar.cigar);
274+
querySequence = modifiedCigar.querySequence;
275+
queryQuality = modifiedCigar.queryQuality;
273276
} else {
274277
position = record.getAlignmentStart();
275278
cigar = record.getCigar();
@@ -574,7 +577,7 @@ && isHasAndNotEquals(querySequence.charAt(readPositionIncludingSoftClipped + 1 +
574577
if (!trim) {
575578
//If start - qbases + 1 is in region of interest
576579
final int pos = start - qbases + 1;
577-
if (pos >= region.start && pos <= region.end) {
580+
if (pos >= region.start && pos <= region.end && !s.contains("N")) {
578581
addVariationForMatchingPart(mappingQuality, numberOfMismatches,
579582
direction, readLengthIncludeMatchingAndInsertions, nmoff, s, startWithDeletion,
580583
q, qbases, qibases, ddlen, pos);
@@ -1663,15 +1666,19 @@ private void addVariationForMatchingPart(int mappingQuality, int nm, boolean dir
16631666
int rlen1, int nmoff, String s,
16641667
boolean startWithDelition, double q,
16651668
int qbases, int qibases, int ddlen, int pos) {
1666-
//add variation record for $s
1667-
Variation hv = getVariation(nonInsertionVariants, pos, s); //reference to variant structure
1668-
hv.incDir(dir);
1669-
1670-
if(isBEGIN_ATGC_AMP_ATGCs_END(s)) {
1671-
//if s is one base followed by '&' and one or more bases
1672-
//add variant record for s to mnp
1673-
increment(mnp, pos, s);
1669+
Variation hv;
1670+
if (s.startsWith("+")) {
1671+
increment(positionToInsertionCount, pos, s);
1672+
hv = getVariation(insertionVariants, pos, s); //reference to variant structure
1673+
} else {
1674+
if(isBEGIN_ATGC_AMP_ATGCs_END(s)) {
1675+
//if s is one base followed by '&' and one or more bases
1676+
//add variant record for s to mnp
1677+
increment(mnp, pos, s);
1678+
}
1679+
hv = getVariation(nonInsertionVariants, pos, s); //reference to variant structure
16741680
}
1681+
hv.incDir(dir);
16751682

16761683
//increment count
16771684
hv.varsCount++;
@@ -1704,9 +1711,10 @@ private void addVariationForMatchingPart(int mappingQuality, int nm, boolean dir
17041711
} else {
17051712
hv.lowQualityReadsCount++;
17061713
}
1714+
int shift = (s.startsWith("+") && s.contains("&")) ? 1 : 0;
17071715

17081716
//increase coverage for bases covered by the variation
1709-
for (int qi = 1; qi <= qbases; qi++) {
1717+
for (int qi = 1; qi <= qbases - shift; qi++) {
17101718
incCnt(refCoverage, start - qi + 1, 1);
17111719
}
17121720

@@ -1792,7 +1800,7 @@ private boolean isReadChimericWithSA(SAMRecord record, int position, String saTa
17921800
String saDirectionString = saTagArray[2];
17931801
String saCigar = saTagArray[3];
17941802
boolean saDirectionIsForward = saDirectionString.equals("+");
1795-
Matcher mm = SA_CIGAR_D_S_3clip.matcher(saCigar);;
1803+
Matcher mm = SA_CIGAR_D_S_3clip.matcher(saCigar);
17961804

17971805
if (is5Side) {
17981806
mm = SA_CIGAR_D_S_5clip.matcher(saCigar);
@@ -1806,7 +1814,7 @@ private boolean isReadChimericWithSA(SAMRecord record, int position, String saTa
18061814
if (instance().conf.y && isChimericWithSA) {
18071815
System.err.println(record.getReadName() + " " + record.getReferenceName()
18081816
+ " " + position + " " + record.getMappingQuality()
1809-
+ " " + record.getCigarString()
1817+
+ " " + cigar.toString()
18101818
+ " is ignored as chimeric with SA: " +
18111819
saPosition + "," + saDirectionString + "," + saCigar);
18121820
}
@@ -1988,7 +1996,7 @@ private void prepareSVStructuresForAnalysis(final SAMRecord record,
19881996
matcher = END_NUM_S_OR_NUM_H.matcher(cigar.toString());
19891997
if (matcher.find() && matcher.group(1) != null) {
19901998
int tt = toInt(matcher.group(1));
1991-
if (tt != 0 && queryQuality.charAt(record.getReadLength() - tt) - 33 > instance().conf.goodq) {
1999+
if (tt != 0 && queryQuality.charAt(queryQuality.length() - tt) - 33 > instance().conf.goodq) {
19922000
soft3 = end;
19932001
}
19942002
}

src/main/java/com/astrazeneca/vardict/modules/StructuralVariantsProcessor.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,10 @@ void findDEL() {
152152
.collect(toList());
153153

154154
int softp = soft.isEmpty() ? 0 : soft.get(0)._1;
155+
if (instance().conf.y) {
156+
System.err.printf("%n%nWorking DEL 5' %d mate cluster cnt: %d%n", softp, del.varsCount);
157+
}
155158
if (softp != 0) {
156-
if (instance().conf.y) {
157-
System.err.printf("%n%nWorking DEL 5' %d mate cluster cnt: %d%n", softp, del.varsCount);
158-
}
159159
if (!softClips3End.containsKey(softp)) {
160160
continue;
161161
}
@@ -1429,7 +1429,7 @@ void findDUPdisc (){
14291429
String ins3 = joinRef(ref, pe - Configuration.SVFLANK + 1, pe);
14301430
String ins = ins5 + "<dup" + (mlen - 2 * Configuration.SVFLANK) + ">" + ins3;
14311431

1432-
final Variation vref = getVariation(nonInsertionVariants, bp, "+" + ins);
1432+
final Variation vref = getVariation(insertionVariants, bp, "+" + ins);
14331433
vref.varsCount = 0;
14341434

14351435
SV sv = getSV(nonInsertionVariants, bp);
@@ -1564,7 +1564,7 @@ void findDUPdisc (){
15641564
String ins3 = joinRef(ref, pe - Configuration.SVFLANK + 1, pe);
15651565
String ins = ins5 + "<dup" + (mlen - 2 * Configuration.SVFLANK) + ">" + ins3;
15661566

1567-
final Variation vref = getVariation(nonInsertionVariants, bp, "+" + ins);
1567+
final Variation vref = getVariation(insertionVariants, bp, "+" + ins);
15681568
vref.varsCount = 0;
15691569

15701570
SV sv = getSV(nonInsertionVariants, bp);

src/main/java/com/astrazeneca/vardict/modules/ToVarsBuilder.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,9 @@ int createInsertion(double duprate, int position, int totalPosCoverage,
304304
Collections.sort(insertionDescriptionStrings);
305305
//Loop over insertion variants
306306
for (String descriptionString : insertionDescriptionStrings) {
307+
if (descriptionString.contains("&") && refCoverage.containsKey(position + 1)) {
308+
totalPosCoverage = refCoverage.get(position + 1);
309+
}
307310
// String n = entV.getKey();
308311
Variation cnt = insertionVariations.get(descriptionString);
309312
//count of variants in forward strand
@@ -346,6 +349,9 @@ int createInsertion(double duprate, int position, int totalPosCoverage,
346349
}
347350

348351
Variant tvref = new Variant();
352+
if (hicov < hicnt) {
353+
hicov = hicnt;
354+
}
349355
tvref.descriptionString = descriptionString;
350356
tvref.positionCoverage = cnt.varsCount;
351357
tvref.varsCountOnForward = fwd;
@@ -494,14 +500,14 @@ private int calcHicov(VariationMap<String, Variation> insertionVariations,
494500
VariationMap<String, Variation> nonInsertionVariations) {
495501
int hicov = 0;
496502
for (Map.Entry<String, Variation> descVariantEntry : nonInsertionVariations.entrySet()) {
497-
if (descVariantEntry.getKey().equals("SV")) {
503+
if (descVariantEntry.getKey().equals("SV") || descVariantEntry.getKey().startsWith("+")) {
498504
continue;
499505
}
500506
hicov += descVariantEntry.getValue().highQualityReadsCount;
501507
}
502508
if (insertionVariations != null) {
503509
for (Variation variation : insertionVariations.values()) {
504-
hicov += variation.highQualityReadsCount;
510+
//hicov += variation.highQualityReadsCount;
505511
}
506512
}
507513
return hicov;

src/test/java/com/astrazeneca/vardict/modules/CigarModifierTest.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package com.astrazeneca.vardict.modules;
22

33
import com.astrazeneca.vardict.Configuration;
4+
import com.astrazeneca.vardict.data.Reference;
45
import com.astrazeneca.vardict.data.Region;
56
import com.astrazeneca.vardict.data.scopedata.GlobalReadOnlyScope;
67
import com.astrazeneca.vardict.variations.Variation;
@@ -325,11 +326,12 @@ public void threeIndels(final String initialCigar, final String expected, final
325326
reference.put(i, base.charAt(0));
326327
i++;
327328
}
328-
329+
Reference ref = new Reference();
330+
ref.referenceSequences = reference;
329331
Region region = new Region("chr1", 1, 100, "");
330332
CigarModifier cigarModifier = new CigarModifier(position, initialCigar, querySequence,
331-
queryQuality, reference, indel, region);
332-
String actual = cigarModifier.modifyCigar()._2;
333+
queryQuality, ref, indel, region, 150);
334+
String actual = cigarModifier.modifyCigar().cigar;
333335
Assert.assertEquals(actual, expected);
334336
}
335337
}

src/test/java/com/astrazeneca/vardict/modules/ToVarsBuilderTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,9 @@ public void createInsertion() {
7373
meanMappingQuality = 7.8;
7474
numberOfMismatches = 2.0;
7575
hicnt = 44;
76+
hicov = 44;
7677
highQualityToLowQualityRatio = 1.2571428571428571;
78+
highQualityReadsFrequency = 1.0;
7779
}};
7880

7981
assertEquals(var.get(0).toString(), expectedVariant.toString());
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)