Skip to content
This repository was archived by the owner on May 22, 2026. It is now read-only.

Commit f860b5b

Browse files
committed
Added NM tag for STAR aligner: if nM is presented, it will be used.
1 parent 68bacbb commit f860b5b

5 files changed

Lines changed: 5667 additions & 5666 deletions

File tree

Readme.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,9 @@ These are only rough classification. You need to examine the p-value (after test
406406
- `-q double`
407407
The phred score for a base to be considered a good call. Default: 22.5 (for Illumina). For PGM, set it to ~15, as PGM tends to underestimate base quality.
408408
- `-m INT`
409-
If set, reads with mismatches more than `INT` will be filtered and ignored. Gaps are not counted as mismatches. Valid only for bowtie2/TopHat or BWA aln followed by sampe. BWA mem is calculated as NM - Indels. Default: 8, or reads with more than 8 mismatches will not be used.
409+
If set, reads with mismatches more than `INT` will be filtered and ignored. Gaps are not counted as mismatches.
410+
Valid only for bowtie2/TopHat or BWA aln followed by sampe. BWA mem is calculated as NM - Indels. For STAR
411+
you have to increase default if nM tag (for "paired" alignment) is presented in reads.Default: 8, or reads with more than 8 mismatches will not be used.
410412
- `-T|--trim INT`
411413
Trim bases after `[INT]` bases in the reads
412414
- `-X INT`

src/main/java/com/astrazeneca/vardict/Configuration.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,14 @@ public class Configuration {
233233
* The minimum distance between two SV clusters in term of read length
234234
*/
235235
public static final double MINSVCDIST = 1.5;
236+
/**
237+
* The minimum position in read for mapping quality in SV analysis
238+
*/
239+
public static final int MINMAPBASE = 15;
240+
/**
241+
* The minimum distance between start position and end of SV structure in inter-chr translocation
242+
*/
243+
public static final int MINSVPOS = 25;
236244
/**
237245
* Mean Insert size
238246
*/

src/main/java/com/astrazeneca/vardict/modules/CigarParser.java

Lines changed: 34 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package com.astrazeneca.vardict.modules;
22

33
import com.astrazeneca.vardict.Configuration;
4-
import com.astrazeneca.vardict.collection.Tuple;
54
import com.astrazeneca.vardict.collection.VariationMap;
65
import com.astrazeneca.vardict.data.*;
76
import com.astrazeneca.vardict.data.scopedata.Scope;
@@ -222,13 +221,19 @@ private void parseCigar(String chrName,
222221
Map<Integer, Character> ref = reference.referenceSequences;
223222

224223
final int insertionDeletionLength = getInsertionDeletionLength(cigar);
224+
225+
// Number of mismatches from NM tag (or nM for STAR).
225226
int totalNumberOfMismatches = 0;
226-
Integer numberOfMismatchesFromTag = record.getIntegerAttribute(SAMTag.NM.name());
227+
Integer numberOfMismatches_NM = record.getIntegerAttribute(SAMTag.NM.name());
228+
Integer numberOfMismatches_STAR = record.getIntegerAttribute("nM");
229+
if (numberOfMismatches_STAR != null) {
230+
numberOfMismatches_NM = numberOfMismatches_STAR;
231+
}
227232

228-
// Number of mismatches from NM tag. Don't use NM since it includes gaps, which can be from indels
229-
if (numberOfMismatchesFromTag != null) {
233+
// Don't use NM since it includes gaps, which can be from indels
234+
if (numberOfMismatches_NM != null) {
230235
// Edit distance - indels is the # of mismatches
231-
totalNumberOfMismatches = numberOfMismatchesFromTag - insertionDeletionLength;
236+
totalNumberOfMismatches = numberOfMismatches_NM - insertionDeletionLength;
232237
if (totalNumberOfMismatches > instance().conf.mismatch) {
233238
return;
234239
}
@@ -2015,7 +2020,8 @@ private void prepareSVStructuresForAnalysis(final SAMRecord record,
20152020
} else if (record.getIntegerAttribute(SAMTag.MQ.name()) != null
20162021
&& record.getIntegerAttribute(SAMTag.MQ.name()) < 15) {
20172022
// Ignore those with mate mapping quality less than 15
2018-
} else if (readDirNum * mateDirNum == -1 && (mlen * readDirNum) > 0 && queryQuality.length() > 15) {
2023+
} else if (readDirNum * mateDirNum == -1 && (mlen * readDirNum) > 0
2024+
&& queryQuality.length() > Configuration.MINMAPBASE) {
20192025
// deletion candidate
20202026
mlen = mateStart > start ? mend - start : end - mateStart;
20212027
if(abs(mlen) > instance().conf.INSSIZE + instance().conf.INSSTDAMT * instance().conf.INSSTD) {
@@ -2028,7 +2034,7 @@ private void prepareSVStructuresForAnalysis(final SAMRecord record,
20282034
}
20292035
addSV(getLastSVStructure(svStructures.svfdel), start, end, mateStart,
20302036
mend, readDirNum, totalLengthIncludingSoftClipped, mlen, soft3, maxReadLength/2.0,
2031-
queryQuality.charAt(15) - 33, record.getMappingQuality(), numberOfMismatches);
2037+
queryQuality.charAt(Configuration.MINMAPBASE) - 33, record.getMappingQuality(), numberOfMismatches);
20322038
svStructures.svdelfend = end;
20332039
} else {
20342040
if (svStructures.svrdel.size() == 0
@@ -2039,7 +2045,7 @@ private void prepareSVStructuresForAnalysis(final SAMRecord record,
20392045
}
20402046
addSV(getLastSVStructure(svStructures.svrdel), start, end, mateStart,
20412047
mend, readDirNum, totalLengthIncludingSoftClipped, mlen, soft5, maxReadLength/2.0,
2042-
queryQuality.charAt(15) - 33, record.getMappingQuality(), numberOfMismatches);
2048+
queryQuality.charAt(Configuration.MINMAPBASE) - 33, record.getMappingQuality(), numberOfMismatches);
20432049
svStructures.svdelrend = end;
20442050
}
20452051

@@ -2070,7 +2076,8 @@ && abs(start - svStructures.svdelrend) <= Configuration.MINSVCDIST * maxReadLeng
20702076
adddisccnt(getLastSVStructure(svStructures.svrinv3));
20712077
}
20722078
}
2073-
} else if (readDirNum * mateDirNum == -1 && readDirNum * mlen < 0 && queryQuality.length() > 15) {
2079+
} else if (readDirNum * mateDirNum == -1 && readDirNum * mlen < 0
2080+
&& queryQuality.length() > Configuration.MINMAPBASE) {
20742081
//duplication
20752082
if (readDirNum == 1) {
20762083
if (svStructures.svfdup.size() == 0
@@ -2081,7 +2088,7 @@ && abs(start - svStructures.svdelrend) <= Configuration.MINSVCDIST * maxReadLeng
20812088
}
20822089
addSV(getLastSVStructure(svStructures.svfdup), start, end, mateStart,
20832090
mend, readDirNum, totalLengthIncludingSoftClipped, mlen, soft3, maxReadLength/2.0,
2084-
queryQuality.charAt(15) - 33, record.getMappingQuality(), numberOfMismatches);
2091+
queryQuality.charAt(Configuration.MINMAPBASE) - 33, record.getMappingQuality(), numberOfMismatches);
20852092
svStructures.svdupfend = end;
20862093
} else {
20872094
if (svStructures.svrdup.size() == 0
@@ -2092,7 +2099,7 @@ && abs(start - svStructures.svdelrend) <= Configuration.MINSVCDIST * maxReadLeng
20922099
}
20932100
addSV(getLastSVStructure(svStructures.svrdup), start, end, mateStart,
20942101
mend, readDirNum, totalLengthIncludingSoftClipped, mlen, soft5, maxReadLength/2.0,
2095-
queryQuality.charAt(15) - 33, record.getMappingQuality(), numberOfMismatches);
2102+
queryQuality.charAt(Configuration.MINMAPBASE) - 33, record.getMappingQuality(), numberOfMismatches);
20962103
svStructures.svduprend = end;
20972104
}
20982105
if (!svStructures.svfdup.isEmpty()
@@ -2121,7 +2128,7 @@ && abs(start - svStructures.svduprend) <= Configuration.MINSVCDIST * maxReadLeng
21212128
if (!svStructures.svrinv3.isEmpty() && abs(start - svStructures.svinvrend3) <= MIN_D) {
21222129
adddisccnt(getLastSVStructure(svStructures.svrinv3));
21232130
}
2124-
} else if (readDirNum * mateDirNum == 1 && queryQuality.length() > 15) { // Inversion
2131+
} else if (readDirNum * mateDirNum == 1 && queryQuality.length() > Configuration.MINMAPBASE) { // Inversion
21252132
if (readDirNum == 1 && mlen != 0 ) {
21262133
if (mlen < -3 * maxReadLength) {
21272134
if (svStructures.svfinv3.size() == 0
@@ -2132,7 +2139,7 @@ && abs(start - svStructures.svduprend) <= Configuration.MINSVCDIST * maxReadLeng
21322139
}
21332140
addSV(getLastSVStructure(svStructures.svfinv3), start, end, mateStart,
21342141
mend, readDirNum, totalLengthIncludingSoftClipped, mlen, soft3, maxReadLength/2.0,
2135-
queryQuality.charAt(15) - 33, record.getMappingQuality(), numberOfMismatches);
2142+
queryQuality.charAt(Configuration.MINMAPBASE) - 33, record.getMappingQuality(), numberOfMismatches);
21362143
svStructures.svinvfend3 = end;
21372144
getLastSVStructure(svStructures.svfinv3).disc++;
21382145
} else if (mlen > 3 * maxReadLength) {
@@ -2144,7 +2151,7 @@ && abs(start - svStructures.svduprend) <= Configuration.MINSVCDIST * maxReadLeng
21442151
}
21452152
addSV(getLastSVStructure(svStructures.svfinv5), start, end, mateStart,
21462153
mend, readDirNum, totalLengthIncludingSoftClipped, mlen, soft3, maxReadLength/2.0,
2147-
queryQuality.charAt(15) - 33, record.getMappingQuality(), numberOfMismatches);
2154+
queryQuality.charAt(Configuration.MINMAPBASE) - 33, record.getMappingQuality(), numberOfMismatches);
21482155
svStructures.svinvfend5 = end;
21492156
getLastSVStructure(svStructures.svfinv5).disc++;
21502157
}
@@ -2158,7 +2165,7 @@ && abs(start - svStructures.svduprend) <= Configuration.MINSVCDIST * maxReadLeng
21582165
}
21592166
addSV(getLastSVStructure(svStructures.svrinv3), start, end, mateStart,
21602167
mend, readDirNum, totalLengthIncludingSoftClipped, mlen, soft5, maxReadLength/2.0,
2161-
queryQuality.charAt(15) - 33, record.getMappingQuality(), numberOfMismatches);
2168+
queryQuality.charAt(Configuration.MINMAPBASE) - 33, record.getMappingQuality(), numberOfMismatches);
21622169
svStructures.svinvrend3 = end;
21632170
getLastSVStructure(svStructures.svrinv3).disc++;
21642171

@@ -2171,7 +2178,7 @@ && abs(start - svStructures.svduprend) <= Configuration.MINSVCDIST * maxReadLeng
21712178
}
21722179
addSV(getLastSVStructure(svStructures.svrinv5), start, end, mateStart,
21732180
mend, readDirNum, totalLengthIncludingSoftClipped, mlen, soft5, maxReadLength/2.0,
2174-
queryQuality.charAt(15) - 33, record.getMappingQuality(), numberOfMismatches);
2181+
queryQuality.charAt(Configuration.MINMAPBASE) - 33, record.getMappingQuality(), numberOfMismatches);
21752182
svStructures.svinvrend5 = end;
21762183
getLastSVStructure(svStructures.svrinv5).disc++;
21772184
}
@@ -2191,7 +2198,7 @@ && abs(start - svStructures.svduprend) <= Configuration.MINSVCDIST * maxReadLeng
21912198
}
21922199
}
21932200
}
2194-
} else if (queryQuality.length() > 15){ // Inter-chr translocation
2201+
} else if (queryQuality.length() > Configuration.MINMAPBASE){ // Inter-chr translocation
21952202
// to be implemented
21962203
final String mchr = getMateReferenceName(record);
21972204
if (record.getStringAttribute(SAMTag.MC.name()) != null
@@ -2212,7 +2219,7 @@ && abs(start - svStructures.svduprend) <= Configuration.MINSVCDIST * maxReadLeng
22122219
int svn = svStructures.svffus.get(mchr).size() - 1;
22132220
addSV(svStructures.svffus.get(mchr).get(svn), start, end, mateStart,
22142221
mend, readDirNum, totalLengthIncludingSoftClipped, 0, soft3, maxReadLength/2.0,
2215-
queryQuality.charAt(15) - 33, record.getMappingQuality(), numberOfMismatches);
2222+
queryQuality.charAt(Configuration.MINMAPBASE) - 33, record.getMappingQuality(), numberOfMismatches);
22162223
svStructures.svfusfend.put(mchr, end);
22172224
svStructures.svffus.get(mchr).get(svn).disc++;
22182225
} else {
@@ -2227,32 +2234,32 @@ && abs(start - svStructures.svduprend) <= Configuration.MINSVCDIST * maxReadLeng
22272234
int svn = svStructures.svrfus.get(mchr).size() - 1;
22282235
addSV(svStructures.svrfus.get(mchr).get(svn), start, end, mateStart,
22292236
mend, readDirNum, totalLengthIncludingSoftClipped, 0, soft5, maxReadLength/2.0,
2230-
queryQuality.charAt(15) - 33, record.getMappingQuality(), numberOfMismatches);
2237+
queryQuality.charAt(Configuration.MINMAPBASE) - 33, record.getMappingQuality(), numberOfMismatches);
22312238
svStructures.svfusrend.put(mchr, end);
22322239
svStructures.svrfus.get(mchr).get(svn).disc++;
22332240
}
2234-
if (!svStructures.svfdel.isEmpty() && start - svStructures.svdelfend <= 25) {
2241+
if (!svStructures.svfdel.isEmpty() && start - svStructures.svdelfend <= Configuration.MINSVPOS) {
22352242
adddisccnt(getLastSVStructure(svStructures.svfdel));
22362243
}
2237-
if (!svStructures.svrdel.isEmpty() && start - svStructures.svdelrend <= 25) {
2244+
if (!svStructures.svrdel.isEmpty() && start - svStructures.svdelrend <= Configuration.MINSVPOS) {
22382245
adddisccnt(getLastSVStructure(svStructures.svrdel));
22392246
}
2240-
if (!svStructures.svfdup.isEmpty() && start - svStructures.svdupfend <= 25) {
2247+
if (!svStructures.svfdup.isEmpty() && start - svStructures.svdupfend <= Configuration.MINSVPOS) {
22412248
adddisccnt(getLastSVStructure(svStructures.svfdup));
22422249
}
2243-
if (!svStructures.svrdup.isEmpty() && start - svStructures.svduprend <= 25) {
2250+
if (!svStructures.svrdup.isEmpty() && start - svStructures.svduprend <= Configuration.MINSVPOS) {
22442251
adddisccnt(getLastSVStructure(svStructures.svrdup));
22452252
}
2246-
if (!svStructures.svfinv5.isEmpty() && start - svStructures.svinvfend5 <= 25) {
2253+
if (!svStructures.svfinv5.isEmpty() && start - svStructures.svinvfend5 <= Configuration.MINSVPOS) {
22472254
adddisccnt(getLastSVStructure(svStructures.svfinv5));
22482255
}
2249-
if (!svStructures.svrinv5.isEmpty() && start - svStructures.svinvrend5 <= 25) {
2256+
if (!svStructures.svrinv5.isEmpty() && start - svStructures.svinvrend5 <= Configuration.MINSVPOS) {
22502257
adddisccnt(getLastSVStructure(svStructures.svrinv5));
22512258
}
2252-
if (!svStructures.svfinv3.isEmpty() && start - svStructures.svinvfend3 <= 25) {
2259+
if (!svStructures.svfinv3.isEmpty() && start - svStructures.svinvfend3 <= Configuration.MINSVPOS) {
22532260
adddisccnt(getLastSVStructure(svStructures.svfinv3));
22542261
}
2255-
if (!svStructures.svrinv3.isEmpty() && start - svStructures.svinvrend3 <= 25) {
2262+
if (!svStructures.svrinv3.isEmpty() && start - svStructures.svinvrend3 <= Configuration.MINSVPOS) {
22562263
adddisccnt(getLastSVStructure(svStructures.svrinv3));
22572264
}
22582265
}

0 commit comments

Comments
 (0)