Skip to content
This repository was archived by the owner on May 22, 2026. It is now read-only.

Commit 7b86e6d

Browse files
authored
Merge pull request #270 from PolinaBevad/fix_issue_267_ref_MR
Fixed issue 267: IUPAC ambiguity codes in reference
2 parents 6618039 + a0f8fbe commit 7b86e6d

4 files changed

Lines changed: 64 additions & 8 deletions

File tree

VarDict

src/main/java/com/astrazeneca/vardict/modules/ToVarsBuilder.java

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import java.util.regex.Pattern;
1616
import java.util.regex.Matcher;
1717
import java.util.*;
18+
import java.util.stream.Collectors;
19+
import java.util.stream.Stream;
1820

1921
import static com.astrazeneca.vardict.data.scopedata.GlobalReadOnlyScope.instance;
2022
import static com.astrazeneca.vardict.Utils.*;
@@ -35,6 +37,21 @@ public class ToVarsBuilder implements Module<RealignedVariationData, AlignedVars
3537
private Map<Integer, Character> ref;
3638
private Double duprate;
3739

40+
// Map of IUPAC ambiguity codes that we can observe in reference.
41+
// By VCF 4.3 specification they aren't allowed and must be reduced to a first alphabetically base.
42+
private Map<String, String> IUPAC_AMBIGUITY_CODES = Stream.of(new String[][] {
43+
{"M","A"},
44+
{"R","A"},
45+
{"W","A"},
46+
{"S","C"},
47+
{"Y","C"},
48+
{"K","G"},
49+
{"V","A"},
50+
{"H","A"},
51+
{"D","A"},
52+
{"B","C"},
53+
}).collect(Collectors.toMap(key -> key[0], key -> key[1]));
54+
3855
public Map<Integer, VariationMap<String, Variation>> getInsertionVariants() {
3956
return insertionVariants;
4057
}
@@ -903,7 +920,7 @@ else if (deletionLength < instance().conf.SVMINLEN) {
903920
vref.shift3 = shift3;
904921
vref.startPosition = startPosition;
905922
vref.endPosition = endPosition;
906-
vref.refallele = refallele;
923+
vref.refallele = validateRefallele(refallele);
907924
vref.varallele = varallele;
908925
vref.genotype = genotype;
909926
vref.totalPosCoverage = totalPosCoverage;
@@ -954,8 +971,8 @@ else if (deletionLength < instance().conf.SVMINLEN) {
954971
vref.highQualityReadsFrequency = roundHalfEven("0.0000", vref.highQualityReadsFrequency);
955972
String referenceBase = ref.containsKey(position) ? ref.get(position).toString() : ""; // $r
956973
//both refallele and varallele are 1 base from reference string
957-
vref.refallele = referenceBase;
958-
vref.varallele = referenceBase;
974+
vref.refallele = validateRefallele(referenceBase);
975+
vref.varallele = validateRefallele(referenceBase);
959976
vref.genotype = referenceBase + "/" + referenceBase;
960977
vref.leftseq = "";
961978
vref.rightseq = "";
@@ -976,6 +993,23 @@ else if (deletionLength < instance().conf.SVMINLEN) {
976993
}
977994
}
978995

996+
/**
997+
* Validate reference allele according to VCF 4.3 specification in case if IUPAC ambiguity codes are present
998+
* in reference.
999+
* @param refallele sequence of reference bases that covers variant
1000+
* @return reference allele sequence where IUPAC ambuguity bases are changed to the one that is
1001+
* first alphabetically.
1002+
*/
1003+
String validateRefallele(String refallele) {
1004+
for (int i = 0; i < refallele.length(); i++) {
1005+
String refBase = substr(refallele, i, 1);
1006+
if (IUPAC_AMBIGUITY_CODES.containsKey(refBase)) {
1007+
refallele = refallele.replaceFirst(refBase, IUPAC_AMBIGUITY_CODES.get(refBase));
1008+
}
1009+
}
1010+
return refallele;
1011+
}
1012+
9791013
/**
9801014
* Microsatellite instability
9811015
* Tandemly repeated short sequence motifs ranging from 1– 6(8 in our case) base pairs are called microsatellites.

src/main/java/com/astrazeneca/vardict/postprocessmodules/SimplePostProcessModule.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ public void accept(Scope<AlignedVarsData> mapScope) {
6565
if (vref.refallele.contains("N")) {
6666
continue;
6767
}
68+
if (vref.refallele.equals(vref.varallele)) {
69+
if (!conf.doPileup) {
70+
continue;
71+
}
72+
}
6873
vref.vartype = vref.varType();
6974
if (!vref.isGoodVar(variantsOnPosition.referenceVariant, vref.vartype, mapScope.splice)) {
7075
if (!conf.doPileup) {

src/test/java/com/astrazeneca/vardict/modules/ToVarsBuilderTest.java

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,7 @@
1111
import org.testng.annotations.BeforeMethod;
1212
import org.testng.annotations.Test;
1313

14-
import java.util.ArrayList;
15-
import java.util.HashMap;
16-
import java.util.List;
17-
import java.util.Map;
14+
import java.util.*;
1815

1916
import static org.testng.Assert.assertEquals;
2017

@@ -132,4 +129,24 @@ public void createVariant() {
132129

133130
assertEquals(var.get(0).toString(), expectedVariant.toString());
134131
}
132+
133+
@Test
134+
public void testValidateRefAllele(){
135+
List<String> alleles = Arrays.asList("A", "C", "G", "T", "N", "M", "R", "W", "S", "Y", "K", "V", "H", "D", "B");
136+
List<String> expected = Arrays.asList("A", "C", "G", "T", "N", "A", "A", "A", "C", "C", "G", "A", "A", "A", "C");
137+
List<String> validatedAlleles = new ArrayList<>();
138+
for (String allele: alleles) {
139+
validatedAlleles.add(toVarsBuilder.validateRefallele(allele));
140+
}
141+
assertEquals(validatedAlleles, expected);
142+
143+
List<String> alleles_complex = Arrays.asList("ANYCGT", "MRACT", "CCGKBG");
144+
List<String> expected_complex = Arrays.asList("ANCCGT", "AAACT", "CCGGCG");
145+
146+
validatedAlleles = new ArrayList<>();
147+
for (String allele: alleles_complex) {
148+
validatedAlleles.add(toVarsBuilder.validateRefallele(allele));
149+
}
150+
assertEquals(validatedAlleles, expected_complex);
151+
}
135152
}

0 commit comments

Comments
 (0)