Skip to content

Commit 5b45b8b

Browse files
author
Anders Leung
committed
Refactor VariantContext decoding, decode VCs with different version from header on write
1 parent e917800 commit 5b45b8b

8 files changed

Lines changed: 269 additions & 154 deletions

File tree

src/main/java/htsjdk/variant/variantcontext/VariantContext.java

Lines changed: 128 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
package htsjdk.variant.variantcontext;
2727

2828
import htsjdk.beta.plugin.HtsRecord;
29-
import htsjdk.samtools.util.QualityUtil;
3029
import htsjdk.tribble.Feature;
3130
import htsjdk.tribble.TribbleException;
3231
import htsjdk.tribble.util.ParsingUtils;
@@ -39,11 +38,12 @@
3938
import java.util.Collections;
4039
import java.util.Comparator;
4140
import java.util.EnumSet;
42-
import java.util.HashMap;
4341
import java.util.HashSet;
4442
import java.util.List;
4543
import java.util.Map;
4644
import java.util.Set;
45+
import java.util.function.BiConsumer;
46+
import java.util.function.Function;
4747
import java.util.regex.Pattern;
4848
import java.util.stream.Collectors;
4949

@@ -1636,10 +1636,14 @@ public VariantContext fullyDecode(final VCFHeader header, final boolean lenientD
16361636
else {
16371637
// TODO -- warning this is potentially very expensive as it creates copies over and over
16381638
final VariantContextBuilder builder = new VariantContextBuilder(this);
1639-
fullyDecodeInfo(builder, header, lenientDecoding);
1640-
fullyDecodeGenotypes(builder, header);
1641-
builder.fullyDecoded(true);
1642-
return builder.make();
1639+
try {
1640+
decodeInfo(builder, header, lenientDecoding);
1641+
decodeGenotypes(builder, header, lenientDecoding);
1642+
} catch (final NumberFormatException | TribbleException e) {
1643+
// Wrap and rethrow exception with more precise information about VC
1644+
throw new TribbleException("Failed to fully decode VariantContext " + getContig() + ":" + getStart() + " with cause: " + e.getMessage());
1645+
}
1646+
return builder.fullyDecoded(true).make();
16431647
}
16441648
}
16451649

@@ -1651,145 +1655,156 @@ public boolean isFullyDecoded() {
16511655
return fullyDecoded;
16521656
}
16531657

1654-
private final void fullyDecodeInfo(final VariantContextBuilder builder, final VCFHeader header, final boolean lenientDecoding) {
1655-
builder.attributes(fullyDecodeAttributes(getAttributes(), header, lenientDecoding));
1658+
private void decodeInfo(
1659+
final VariantContextBuilder builder,
1660+
final VCFHeader header,
1661+
final boolean lenientDecoding
1662+
) {
1663+
decodeAttributes(builder::attribute, id -> getInfoLine(id, header, lenientDecoding), this.getAttributes(), lenientDecoding);
16561664
}
16571665

1658-
private final Map<String, Object> fullyDecodeAttributes(final Map<String, Object> attributes,
1659-
final VCFHeader header,
1660-
final boolean lenientDecoding) {
1661-
final Map<String, Object> newAttributes = new HashMap<>(10);
1662-
1663-
for ( final Map.Entry<String, Object> attr : attributes.entrySet() ) {
1664-
final String field = attr.getKey();
1665-
1666-
if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) )
1667-
continue; // gross, FT is part of the extended attributes
1668-
1669-
final VCFCompoundHeaderLine format = VariantContextUtils.getMetaDataForField(header, field);
1670-
final Object decoded = decodeValue(field, attr.getValue(), format, header.getVCFHeaderVersion());
1666+
private void decodeGenotypes(
1667+
final VariantContextBuilder builder,
1668+
final VCFHeader header,
1669+
final boolean lenientDecoding
1670+
) {
1671+
final GenotypesContext gc = this.getGenotypes();
1672+
final GenotypesContext newGc = new GenotypesContext(gc.size());
1673+
for (final Genotype g : this.getGenotypes()) {
1674+
final GenotypeBuilder gb = new GenotypeBuilder(g);
1675+
decodeAttributes(gb::attribute, id -> getFormatLine(id, header, lenientDecoding), g.getExtendedAttributes(), lenientDecoding);
1676+
newGc.add(gb.make());
1677+
}
1678+
builder.genotypesNoValidation(newGc);
1679+
}
16711680

1672-
if ( decoded != null &&
1673-
! lenientDecoding
1674-
&& format.getCountType() != VCFHeaderLineCount.UNBOUNDED
1675-
&& format.getType() != VCFHeaderLineType.Flag ) { // we expect exactly the right number of elements
1676-
final int obsSize = decoded instanceof List ? ((List) decoded).size() : 1;
1677-
final int expSize = format.getCount(this);
1678-
if ( obsSize != expSize ) {
1679-
throw new TribbleException.InvalidHeader("Discordant field size detected for field " +
1680-
field + " at " + getContig() + ":" + getStart() + ". Field had " + obsSize + " values " +
1681-
"but the header says this should have " + expSize + " values based on header record " +
1682-
format);
1681+
private void decodeAttributes(
1682+
final BiConsumer<String, Object> put,
1683+
final Function<String, VCFCompoundHeaderLine> metadataGetter,
1684+
final Map<String, Object> attributes,
1685+
final boolean lenientDecoding
1686+
) {
1687+
attributes.forEach((id, value) -> {
1688+
final VCFCompoundHeaderLine line = metadataGetter.apply(id);
1689+
final Object decoded = decodeValue(value, line);
1690+
1691+
if (decoded != null && !lenientDecoding
1692+
&& line.getCountType() != VCFHeaderLineCount.UNBOUNDED
1693+
&& line.getType() != VCFHeaderLineType.Flag
1694+
) {
1695+
// We expect exactly the right number of elements
1696+
final int actualSize = decoded instanceof List ? ((List<?>) decoded).size() : 1;
1697+
final int expectedSize = line.getCount(this);
1698+
if (actualSize != expectedSize) {
1699+
throw new TribbleException(String.format(
1700+
"While decoding field: %s, number of values decoded: %d did not match expected number based on header count: %d",
1701+
id, actualSize, expectedSize
1702+
));
16831703
}
16841704
}
1685-
newAttributes.put(field, decoded);
1686-
}
1687-
1688-
return newAttributes;
1705+
put.accept(id, decoded);
1706+
});
16891707
}
16901708

16911709
private Object decodeValue(
1692-
final String field,
16931710
final Object value,
1694-
final VCFCompoundHeaderLine format,
1695-
final VCFHeaderVersion version
1711+
final VCFCompoundHeaderLine line
16961712
) {
16971713
final boolean percentDecode = version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3);
1698-
if ( value instanceof String ) {
1714+
final VCFHeaderLineType type = line.getType();
1715+
if (value instanceof String) {
16991716
final String string = (String) value;
1700-
if ( field.equals(VCFConstants.GENOTYPE_PL_KEY) )
1701-
return GenotypeLikelihoods.fromPLField(string);
1702-
1703-
if (field.equals(VCFConstants.GENOTYPE_POSTERIORS_KEY)) {
1704-
return decodeGPKey(string, version);
1705-
}
1706-
1707-
if ( string.indexOf(',') != -1 ) {
1717+
if (string.indexOf(',') != -1) {
17081718
final String[] splits = string.split(",");
17091719
final List<Object> values = new ArrayList<>(splits.length);
1710-
for (final String split : splits)
1711-
values.add(decodeOne(field, split, format, percentDecode));
1720+
for (final String split : splits) {
1721+
values.add(decodeOne(split, type, percentDecode));
1722+
}
17121723
return values;
17131724
} else {
1714-
return decodeOne(field, string, format, percentDecode);
1725+
return decodeOne(string, type, percentDecode);
17151726
}
1716-
} else if ( value instanceof List && (((List<?>) value).get(0)) instanceof String ) {
1717-
final List<String> asList = (List<String>)value;
1727+
} else if (value instanceof List && (((List<?>) value).get(0)) instanceof String) {
1728+
final List<String> asList = (List<String>) value;
17181729
final List<Object> values = new ArrayList<>(asList.size());
1719-
for ( final String s : asList )
1720-
values.add(decodeOne(field, s, format, percentDecode));
1730+
for (final String s : asList) {
1731+
values.add(decodeOne(s, type, percentDecode));
1732+
}
17211733
return values;
17221734
} else {
17231735
return value;
17241736
}
1725-
1726-
// allowMissingValuesComparedToHeader
17271737
}
17281738

1729-
private Object decodeOne(final String field, final String string, final VCFCompoundHeaderLine format, final boolean percentDecode) {
1730-
try {
1731-
if ( string.equals(VCFConstants.MISSING_VALUE_v4) )
1732-
return null;
1733-
else {
1734-
switch ( format.getType() ) {
1735-
case Character: return string;
1736-
case Flag:
1737-
final boolean b = Boolean.parseBoolean(string) || string.equals("1");
1738-
if (!b)
1739-
throw new TribbleException("VariantContext FLAG fields " + field + " cannot contain false values"
1740-
+ " as seen at " + getContig() + ":" + getStart());
1741-
return true;
1742-
case String: return percentDecode ? VCFPercentEncodedTextTransformer.percentDecode(string) : string;
1743-
case Integer: return Integer.valueOf(string);
1744-
case Float: return VCFUtils.parseVcfDouble(string);
1745-
default: throw new TribbleException("Unexpected type for field" + field);
1746-
}
1739+
private static Object decodeOne(
1740+
final String string,
1741+
final VCFHeaderLineType type,
1742+
final boolean percentDecode
1743+
) {
1744+
if (string.equals(VCFConstants.MISSING_VALUE_v4))
1745+
return null;
1746+
else {
1747+
switch (type) {
1748+
case Character: return string;
1749+
case Flag:
1750+
final boolean b = Boolean.parseBoolean(string) || string.equals("1");
1751+
if (!b)
1752+
throw new TribbleException("VariantContext FLAG fields cannot contain false values");
1753+
return true;
1754+
case String: return percentDecode ? VCFPercentEncodedTextTransformer.percentDecode(string) : string;
1755+
case Integer: return Integer.valueOf(string);
1756+
case Float: return VCFUtils.parseVcfDouble(string);
1757+
default: throw new TribbleException("Unexpected VCF type " + type);
17471758
}
1748-
} catch (NumberFormatException e) {
1749-
throw new TribbleException("Could not decode field " + field + " with value " + string + " of declared type " + format.getType());
17501759
}
17511760
}
17521761

1753-
private static List<Double> decodeGPKey(final String value, final VCFHeaderVersion version) {
1754-
final String[] splits = value.split(",");
1755-
// We need to special-case GP because there is a discrepancy in the scale used to record
1756-
// its values between pre-4.3 and 4.3+ VCF. Pre-4.3 GP is phred scale encoded while
1757-
// 4.3+ GP is a linear probability, bringing it in line with other standard keys that
1758-
// use the P suffix (c.f. VCF 4.3 spec section 7.2).
1759-
1760-
// Some tools in the wild apparently already use linear scaled GP, so we have to
1761-
// be careful about converting inputs. We check whether GP values are already linear
1762-
// scaled by seeing if the values' sum is approximately equal to 1, like we
1763-
// would expect if the values were linear scale probabilities.
1764-
// c.f. https://sourceforge.net/p/vcftools/mailman/vcftools-spec/thread/CEBCD558.FA29%25browning%40u.washington.edu/
1765-
double sum = 0;
1766-
1767-
final List<Double> rawGPValues = new ArrayList<>(splits.length);
1768-
for (final String s : splits) {
1769-
final double GP = VCFUtils.parseVcfDouble(s);
1770-
rawGPValues.add(GP);
1771-
sum += GP;
1772-
}
1773-
1774-
final boolean wasLinearScale = GeneralUtils.compareDoubles(sum, 1, VCFConstants.VCF_ENCODING_EPSILON) == 0;
1775-
if (!wasLinearScale && version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) {
1776-
rawGPValues.replaceAll(GP -> QualityUtil.getErrorProbabilityFromPhredScore((int) Math.round(GP)));
1777-
}
1778-
return rawGPValues;
1779-
1762+
private static VCFCompoundHeaderLine getInfoLine(
1763+
final String id,
1764+
final VCFHeader header,
1765+
final boolean lenientDecoding
1766+
) {
1767+
return getMetadataLine(id, header, lenientDecoding, false);
17801768
}
17811769

1782-
private void fullyDecodeGenotypes(final VariantContextBuilder builder, final VCFHeader header) {
1783-
final GenotypesContext gc = new GenotypesContext();
1784-
for ( final Genotype g : getGenotypes() ) {
1785-
gc.add(fullyDecodeGenotypes(g, header));
1786-
}
1787-
builder.genotypesNoValidation(gc);
1770+
private static VCFCompoundHeaderLine getFormatLine(
1771+
final String id,
1772+
final VCFHeader header,
1773+
final boolean lenientDecoding
1774+
) {
1775+
return getMetadataLine(id, header, lenientDecoding, true);
17881776
}
17891777

1790-
private final Genotype fullyDecodeGenotypes(final Genotype g, final VCFHeader header) {
1791-
final Map<String, Object> map = fullyDecodeAttributes(g.getExtendedAttributes(), header, true);
1792-
return new GenotypeBuilder(g).attributes(map).make();
1778+
private static VCFCompoundHeaderLine getMetadataLine(
1779+
final String id,
1780+
final VCFHeader header,
1781+
final boolean lenientDecoding,
1782+
final boolean isFormat
1783+
) {
1784+
// Try getting line from header
1785+
VCFCompoundHeaderLine line = isFormat ? header.getFormatHeaderLine(id) : header.getInfoHeaderLine(id);
1786+
if (line == null && !lenientDecoding) {
1787+
throw new TribbleException(String.format(
1788+
"No %s header line was found matching ID: %s",
1789+
isFormat ? "FORMAT" : "INFO", id
1790+
));
1791+
}
1792+
if (line == null) {
1793+
// Try to find a matching standard header line
1794+
line = isFormat
1795+
? VCFStandardHeaderLines.getFormatLine(id, false)
1796+
: VCFStandardHeaderLines.getInfoLine(id, false);
1797+
}
1798+
if (line == null) {
1799+
final String description = "Auto-generated unbounded string "
1800+
+ (isFormat ? "FORMAT" : "INFO")
1801+
+ " header line for " + id;
1802+
// Decode values as strings, and allow an unbounded number of values
1803+
line = isFormat
1804+
? new VCFFormatHeaderLine(id, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, description)
1805+
: new VCFInfoHeaderLine(id, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, description);
1806+
}
1807+
return line;
17931808
}
17941809

17951810
// ---------------------------------------------------------------------------------------------------------

src/main/java/htsjdk/variant/variantcontext/VariantContextUtils.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ public class VariantContextUtils {
6060
return jexl;
6161
});
6262
private final static boolean ASSUME_MISSING_FIELDS_ARE_STRINGS = false;
63-
63+
6464
/**
6565
* Computes the alternate allele frequency at the provided {@link VariantContext} by dividing its "AN" by its "AC".
6666
* @param vc The variant whose alternate allele frequency is computed
@@ -81,7 +81,7 @@ public static double calculateAltAlleleFrequency(final VariantContext vc) {
8181
throw new AssertionError(String.format("Expected a minor allele frequency in the range [0, 1], but got %s. vc=%s", aaf, vc));
8282
return aaf;
8383
}
84-
84+
8585
/**
8686
* Update the attributes of the attributes map given the VariantContext to reflect the
8787
* proper chromosome-based VCF tags
@@ -180,6 +180,11 @@ public static void calculateChromosomeCounts(VariantContextBuilder builder, bool
180180
builder.attributes(calculateChromosomeCounts(vc, new HashMap<>(vc.getAttributes()), removeStaleValues, founderIds));
181181
}
182182

183+
/**
184+
* @deprecated starting after version 2.24.1
185+
* Use {@link VCFHeader#getInfoHeaderLine(String)} or {@link VCFHeader#getFormatHeaderLine(String)} instead
186+
*/
187+
@Deprecated
183188
public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) {
184189
VCFCompoundHeaderLine metaData = header.getFormatHeaderLine(field);
185190
if ( metaData == null ) metaData = header.getInfoHeaderLine(field);

src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
6161
// we have to store the list of strings that make up the header until they're needed
6262
protected VCFHeader header = null;
6363
protected VCFHeaderVersion version = null;
64+
private VCFHeaderVersion contextsVersion = null;
6465

6566
// a mapping of the allele
6667
protected final Map<String, List<Allele>> alleleMap = new HashMap<>(3);
@@ -510,7 +511,11 @@ public VCFHeader setVCFHeader(final VCFHeader newHeader) {
510511
throw new TribbleException("Unrecognized VCF Version Upgrade Policy: " + this.policy);
511512
}
512513

514+
// We check and possibly update the version of the header lines inside the codec here, but the VariantContexts
515+
// themselves may be expensive to decode and update, so we mark them with their original version so that when
516+
// they are decoded, the decoder handles version specific behavior (e.g. percent encoding) correctly
513517
this.version = this.header.getVCFHeaderVersion();
518+
this.contextsVersion = originalVersion;
514519
return this.header;
515520
}
516521

@@ -562,8 +567,8 @@ private VariantContext decodeLine(final String line, final boolean includeGenoty
562567
* @return a variant context object
563568
*/
564569
private VariantContext parseVCFLine(final String[] parts, final boolean includeGenotypes) {
565-
VariantContextBuilder builder = new VariantContextBuilder();
566-
builder.version(version);
570+
final VariantContextBuilder builder = new VariantContextBuilder();
571+
builder.version(contextsVersion);
567572
builder.source(getName());
568573

569574
// increment the line count

0 commit comments

Comments
 (0)