1515import java .util .regex .Pattern ;
1616import java .util .regex .Matcher ;
1717import java .util .*;
18+ import java .util .stream .Collectors ;
19+ import java .util .stream .Stream ;
1820
1921import static com .astrazeneca .vardict .data .scopedata .GlobalReadOnlyScope .instance ;
2022import static com .astrazeneca .vardict .Utils .*;
@@ -35,6 +37,21 @@ public class ToVarsBuilder implements Module<RealignedVariationData, AlignedVars
3537 private Map <Integer , Character > ref ;
3638 private Double duprate ;
3739
40+ // Map of IUPAC ambiguity codes that we can observe in reference.
41+ // By VCF 4.3 specification they aren't allowed and must be reduced to a first alphabetically base.
42+ private Map <String , String > IUPAC_AMBIGUITY_CODES = Stream .of (new String [][] {
43+ {"M" ,"A" },
44+ {"R" ,"A" },
45+ {"W" ,"A" },
46+ {"S" ,"C" },
47+ {"Y" ,"C" },
48+ {"K" ,"G" },
49+ {"V" ,"A" },
50+ {"H" ,"A" },
51+ {"D" ,"A" },
52+ {"B" ,"C" },
53+ }).collect (Collectors .toMap (key -> key [0 ], key -> key [1 ]));
54+
/**
 * Gives access to the insertion variants held by this builder.
 *
 * @return the {@code insertionVariants} map exactly as stored (no copy is made)
 */
public Map<Integer, VariationMap<String, Variation>> getInsertionVariants() {
    return this.insertionVariants;
}
@@ -903,7 +920,7 @@ else if (deletionLength < instance().conf.SVMINLEN) {
903920 vref .shift3 = shift3 ;
904921 vref .startPosition = startPosition ;
905922 vref .endPosition = endPosition ;
906- vref .refallele = refallele ;
923+ vref .refallele = validateRefallele ( refallele ) ;
907924 vref .varallele = varallele ;
908925 vref .genotype = genotype ;
909926 vref .totalPosCoverage = totalPosCoverage ;
@@ -954,8 +971,8 @@ else if (deletionLength < instance().conf.SVMINLEN) {
954971 vref .highQualityReadsFrequency = roundHalfEven ("0.0000" , vref .highQualityReadsFrequency );
955972 String referenceBase = ref .containsKey (position ) ? ref .get (position ).toString () : "" ; // $r
956973 //both refallele and varallele are 1 base from reference string
957- vref .refallele = referenceBase ;
958- vref .varallele = referenceBase ;
974+ vref .refallele = validateRefallele ( referenceBase ) ;
975+ vref .varallele = validateRefallele ( referenceBase ) ;
959976 vref .genotype = referenceBase + "/" + referenceBase ;
960977 vref .leftseq = "" ;
961978 vref .rightseq = "" ;
@@ -976,6 +993,23 @@ else if (deletionLength < instance().conf.SVMINLEN) {
976993 }
977994 }
978995
996+ /**
997+ * Validate reference allele according to VCF 4.3 specification in case if IUPAC ambiguity codes are present
998+ * in reference.
999+ * @param refallele sequence of reference bases that covers variant
1000+ * @return reference allele sequence where IUPAC ambuguity bases are changed to the one that is
1001+ * first alphabetically.
1002+ */
1003+ String validateRefallele (String refallele ) {
1004+ for (int i = 0 ; i < refallele .length (); i ++) {
1005+ String refBase = substr (refallele , i , 1 );
1006+ if (IUPAC_AMBIGUITY_CODES .containsKey (refBase )) {
1007+ refallele = refallele .replaceFirst (refBase , IUPAC_AMBIGUITY_CODES .get (refBase ));
1008+ }
1009+ }
1010+ return refallele ;
1011+ }
1012+
9791013 /**
9801014 * Microsatellite instability
9811015 * Tandemly repeated short sequence motifs ranging from 1– 6(8 in our case) base pairs are called microsatellites.
0 commit comments