Skip to content

Commit aac46ee

Browse files
Added GVCF mode for VariantContext type determination (#1544)
* Added GVCF mode for VariantContext type determination - Usually, NON_REF alleles are considered SYMBOLIC. Therefore, if a VariantContext contains the alleles `A*,C,<NON_REF>`, the resulting type is MIXED. For GVCF files, however, it is more useful to treat such a record as a SNP. - The default behavior is unchanged; NON_REF alleles are ignored only when true is passed for the optional ignoreNonRef argument to getType() - Added unit tests * Moderate refactoring of VariantContext type determination - This was necessary because the type caching needs to distinguish between ignoreNonRef being true or false - Changed the return type of `determineType` and `determinePolymorphicType` from `void` to `VariantContext.Type`; otherwise multiple code branches would be necessary depending on which caching variable to set - Added a unit test to verify that the cache separation works
1 parent 57c3f03 commit aac46ee

2 files changed

Lines changed: 192 additions & 35 deletions

File tree

src/main/java/htsjdk/variant/variantcontext/VariantContext.java

Lines changed: 51 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,9 @@ public class VariantContext implements HtsRecord, Feature, Serializable {
243243
/** The type (cached for performance reasons) of this context */
244244
protected Type type = null;
245245

246+
/** The type of this context, cached separately if ignoreNonRef is true */
247+
protected Type typeIgnoringNonRef = null;
248+
246249
/** A set of the alleles segregating in this context */
247250
protected final List<Allele> alleles;
248251

@@ -666,10 +669,29 @@ public enum Type {
666669
* @return the type of this VariantContext
667670
**/
668671
public Type getType() {
669-
if ( type == null )
670-
determineType();
672+
return getType(false);
673+
}
671674

672-
return type;
675+
/**
676+
* Determines (if necessary) and returns the type of this variation by examining the alleles it contains.
677+
*
678+
* @param ignoreNonRef If set to true, symbolic NON_REF alleles will not be considered for the type determination,
679+
* which is required for handling GVCF files.
680+
* @return the type of this VariantContext
681+
**/
682+
public Type getType(final boolean ignoreNonRef) {
683+
// Make sure we use the correct cached result
684+
if (ignoreNonRef) {
685+
if (typeIgnoringNonRef == null) {
686+
typeIgnoringNonRef = determineType(ignoreNonRef);
687+
}
688+
return typeIgnoringNonRef;
689+
} else {
690+
if (type == null) {
691+
type = determineType(ignoreNonRef);
692+
}
693+
return type;
694+
}
673695
}
674696

675697
/**
@@ -1430,30 +1452,34 @@ private void validateStop() {
14301452
//
14311453
// ---------------------------------------------------------------------------------------------------------
14321454

1433-
private void determineType() {
1434-
if ( type == null ) {
1435-
switch ( getNAlleles() ) {
1436-
case 0:
1437-
throw new IllegalStateException("Unexpected error: requested type of VariantContext with no alleles!" + this);
1438-
case 1:
1439-
// note that this doesn't require a reference allele. You can be monomorphic independent of having a
1440-
// reference allele
1441-
type = Type.NO_VARIATION;
1442-
break;
1443-
default:
1444-
determinePolymorphicType();
1445-
}
1455+
private Type determineType(final boolean ignoreNonRef) {
1456+
switch ( getNAlleles() ) {
1457+
case 0:
1458+
throw new IllegalStateException("Unexpected error: requested type of VariantContext with no alleles!" + this);
1459+
case 1:
1460+
// note that this doesn't require a reference allele. You can be monomorphic independent of having a
1461+
// reference allele
1462+
return Type.NO_VARIATION;
1463+
default:
1464+
return determinePolymorphicType(ignoreNonRef);
14461465
}
14471466
}
14481467

1449-
private void determinePolymorphicType() {
1450-
type = null;
1468+
private Type determinePolymorphicType(final boolean ignoreNonRef) {
1469+
Type type = null;
1470+
boolean nonRefAlleleFound = false;
14511471

14521472
// do a pairwise comparison of all alleles against the reference allele
14531473
for ( Allele allele : alleles ) {
14541474
if ( allele == REF )
14551475
continue;
14561476

1477+
// If we see a NON_REF allele and need to ignore it, skip this allele, but signal that we have seen one
1478+
if (ignoreNonRef && allele.isNonRefAllele()) {
1479+
nonRefAlleleFound = true;
1480+
continue;
1481+
}
1482+
14571483
// find the type of this allele relative to the reference
14581484
Type biallelicType = typeOfBiallelicVariant(REF, allele);
14591485

@@ -1463,10 +1489,15 @@ private void determinePolymorphicType() {
14631489
}
14641490
// if the type of this allele is different from that of a previous one, assign it the MIXED type and quit
14651491
else if ( biallelicType != type ) {
1466-
type = Type.MIXED;
1467-
return;
1492+
return Type.MIXED;
14681493
}
14691494
}
1495+
// If all alt alleles are NON_REF alleles and ignoreNonRef is true, type will still be null. Therefore, if we
1496+
// have only seen NON_REFs, choose NO_VARIATION
1497+
if (type == null && nonRefAlleleFound) {
1498+
return Type.NO_VARIATION;
1499+
}
1500+
return type;
14701501
}
14711502

14721503
private static Type typeOfBiallelicVariant(Allele ref, Allele allele) {

src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java

Lines changed: 141 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -104,72 +104,198 @@ public void testDetermineTypes() {
104104
Allele TA = Allele.create("TA");
105105
Allele TC = Allele.create("TC");
106106
Allele symbolic = Allele.create("<FOO>");
107+
Allele nonRef = Allele.create(Allele.NON_REF_STRING, false);
107108

108109
// test REF
109110
List<Allele> alleles = Arrays.asList(Tref);
111+
List<Allele> allelesWithNonRef = new ArrayList<>(alleles);
112+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
110113
VariantContext vc = snpBuilder.alleles(alleles).stop(snpLocStop).make();
111-
Assert.assertEquals(vc.getType(), VariantContext.Type.NO_VARIATION);
114+
VariantContext vcWithNonRef = snpBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
115+
VariantContext.Type expectedType = VariantContext.Type.NO_VARIATION;
116+
Assert.assertEquals(vc.getType(), expectedType);
117+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
112118

113119
// test SNPs
114120
alleles = Arrays.asList(Tref, A);
121+
allelesWithNonRef = new ArrayList<>(alleles);
122+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
115123
vc = snpBuilder.alleles(alleles).stop(snpLocStop).make();
116-
Assert.assertEquals(vc.getType(), VariantContext.Type.SNP);
124+
vcWithNonRef = snpBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
125+
expectedType = VariantContext.Type.SNP;
126+
Assert.assertEquals(vc.getType(), expectedType);
127+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
117128

118129
alleles = Arrays.asList(Tref, A, C);
130+
allelesWithNonRef = new ArrayList<>(alleles);
131+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
119132
vc = snpBuilder.alleles(alleles).stop(snpLocStop).make();
120-
Assert.assertEquals(vc.getType(), VariantContext.Type.SNP);
133+
vcWithNonRef = snpBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
134+
expectedType = VariantContext.Type.SNP;
135+
Assert.assertEquals(vc.getType(), expectedType);
136+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
121137

122138
// test MNPs
123139
alleles = Arrays.asList(ACref, TA);
140+
allelesWithNonRef = new ArrayList<>(alleles);
141+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
124142
vc = snpBuilder.alleles(alleles).stop(snpLocStop+1).make();
125-
Assert.assertEquals(vc.getType(), VariantContext.Type.MNP);
143+
vcWithNonRef = snpBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
144+
expectedType = VariantContext.Type.MNP;
145+
Assert.assertEquals(vc.getType(), expectedType);
146+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
126147

127148
alleles = Arrays.asList(ATCref, CAT, Allele.create("GGG"));
149+
allelesWithNonRef = new ArrayList<>(alleles);
150+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
128151
vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make();
129-
Assert.assertEquals(vc.getType(), VariantContext.Type.MNP);
152+
vcWithNonRef = basicBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
153+
expectedType = VariantContext.Type.MNP;
154+
Assert.assertEquals(vc.getType(), expectedType);
155+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
130156

131157
// test INDELs
132158
alleles = Arrays.asList(Aref, ATC);
159+
allelesWithNonRef = new ArrayList<>(alleles);
160+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
133161
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
134-
Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);
162+
vcWithNonRef = basicBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
163+
expectedType = VariantContext.Type.INDEL;
164+
Assert.assertEquals(vc.getType(), expectedType);
165+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
135166

136167
alleles = Arrays.asList(ATCref, A);
168+
allelesWithNonRef = new ArrayList<>(alleles);
169+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
137170
vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make();
138-
Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);
171+
vcWithNonRef = basicBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
172+
expectedType = VariantContext.Type.INDEL;
173+
Assert.assertEquals(vc.getType(), expectedType);
174+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
139175

140176
alleles = Arrays.asList(Tref, TA, TC);
177+
allelesWithNonRef = new ArrayList<>(alleles);
178+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
141179
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
142-
Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);
180+
vcWithNonRef = basicBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
181+
expectedType = VariantContext.Type.INDEL;
182+
Assert.assertEquals(vc.getType(), expectedType);
183+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
143184

144185
alleles = Arrays.asList(ATCref, A, AC);
186+
allelesWithNonRef = new ArrayList<>(alleles);
187+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
145188
vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make();
146-
Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);
189+
vcWithNonRef = basicBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
190+
expectedType = VariantContext.Type.INDEL;
191+
Assert.assertEquals(vc.getType(), expectedType);
192+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
147193

148194
alleles = Arrays.asList(ATCref, A, Allele.create("ATCTC"));
195+
allelesWithNonRef = new ArrayList<>(alleles);
196+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
149197
vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make();
150-
Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);
198+
vcWithNonRef = basicBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
199+
expectedType = VariantContext.Type.INDEL;
200+
Assert.assertEquals(vc.getType(), expectedType);
201+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
151202

152203
// test MIXED
153204
alleles = Arrays.asList(TAref, T, TC);
205+
allelesWithNonRef = new ArrayList<>(alleles);
206+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
154207
vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make();
155-
Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED);
208+
vcWithNonRef = basicBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
209+
expectedType = VariantContext.Type.MIXED;
210+
Assert.assertEquals(vc.getType(), expectedType);
211+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
156212

157213
alleles = Arrays.asList(TAref, T, AC);
214+
allelesWithNonRef = new ArrayList<>(alleles);
215+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
158216
vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make();
159-
Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED);
217+
vcWithNonRef = basicBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
218+
expectedType = VariantContext.Type.MIXED;
219+
Assert.assertEquals(vc.getType(), expectedType);
220+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
160221

161222
alleles = Arrays.asList(ACref, ATC, AT);
223+
allelesWithNonRef = new ArrayList<>(alleles);
224+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
162225
vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make();
163-
Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED);
226+
vcWithNonRef = basicBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
227+
expectedType = VariantContext.Type.MIXED;
228+
Assert.assertEquals(vc.getType(), expectedType);
229+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
164230

165231
alleles = Arrays.asList(Aref, T, symbolic);
232+
allelesWithNonRef = new ArrayList<>(alleles);
233+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
166234
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
167-
Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED);
235+
vcWithNonRef = basicBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
236+
expectedType = VariantContext.Type.MIXED;
237+
Assert.assertEquals(vc.getType(), expectedType);
238+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
168239

169240
// test SYMBOLIC
170241
alleles = Arrays.asList(Tref, symbolic);
242+
allelesWithNonRef = new ArrayList<>(alleles);
243+
allelesWithNonRef.add(Allele.NON_REF_ALLELE);
171244
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
172-
Assert.assertEquals(vc.getType(), VariantContext.Type.SYMBOLIC);
245+
vcWithNonRef = basicBuilder.alleles(allelesWithNonRef).stop(snpLocStop).make();
246+
expectedType = VariantContext.Type.SYMBOLIC;
247+
Assert.assertEquals(vc.getType(), expectedType);
248+
Assert.assertEquals(vcWithNonRef.getType(true), expectedType);
249+
}
250+
251+
@Test
252+
public void testDetermineTypesIgnoringNonRef() {
253+
Allele AT = Allele.create("AT");
254+
Allele TC = Allele.create("TC");
255+
Allele symbolic = Allele.create("<FOO>");
256+
257+
List<Allele> alleles = Arrays.asList(Allele.REF_A, Allele.NON_REF_ALLELE);
258+
VariantContext vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
259+
Assert.assertEquals(vc.getType(false), VariantContext.Type.SYMBOLIC);
260+
261+
alleles = Arrays.asList(Allele.REF_A, Allele.NON_REF_ALLELE, symbolic);
262+
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
263+
Assert.assertEquals(vc.getType(false), VariantContext.Type.SYMBOLIC);
264+
265+
alleles = Arrays.asList(Allele.REF_A, Allele.NON_REF_ALLELE, symbolic);
266+
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
267+
Assert.assertEquals(vc.getType(true), VariantContext.Type.SYMBOLIC);
268+
269+
alleles = Arrays.asList(Allele.REF_A, Allele.NON_REF_ALLELE);
270+
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
271+
Assert.assertEquals(vc.getType(true), VariantContext.Type.NO_VARIATION);
272+
273+
alleles = Arrays.asList(Allele.REF_A, Allele.ALT_C, Allele.NON_REF_ALLELE);
274+
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
275+
Assert.assertEquals(vc.getType(true), VariantContext.Type.SNP);
276+
277+
alleles = Arrays.asList(Allele.REF_A, Allele.NON_REF_ALLELE, Allele.ALT_C);
278+
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
279+
Assert.assertEquals(vc.getType(true), VariantContext.Type.SNP);
280+
281+
alleles = Arrays.asList(Allele.REF_A, AT, Allele.NON_REF_ALLELE);
282+
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
283+
Assert.assertEquals(vc.getType(true), VariantContext.Type.INDEL);
284+
285+
alleles = Arrays.asList(Allele.REF_A, Allele.ALT_C, TC, Allele.NON_REF_ALLELE);
286+
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
287+
Assert.assertEquals(vc.getType(true), VariantContext.Type.MIXED);
288+
289+
alleles = Arrays.asList(Allele.REF_A, Allele.ALT_C, symbolic, Allele.NON_REF_ALLELE);
290+
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
291+
Assert.assertEquals(vc.getType(true), VariantContext.Type.MIXED);
292+
293+
// Ensure that the cached variant type is kept separate for ignoreNonRef being true and false
294+
alleles = Arrays.asList(Allele.REF_A, Allele.ALT_C, Allele.NON_REF_ALLELE);
295+
vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
296+
Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED);
297+
Assert.assertEquals(vc.getType(true), VariantContext.Type.SNP);
298+
Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED);
173299
}
174300

175301
@Test

0 commit comments

Comments
 (0)