2626package htsjdk .variant .variantcontext ;
2727
2828import htsjdk .beta .plugin .HtsRecord ;
29- import htsjdk .samtools .util .QualityUtil ;
3029import htsjdk .tribble .Feature ;
3130import htsjdk .tribble .TribbleException ;
3231import htsjdk .tribble .util .ParsingUtils ;
3938import java .util .Collections ;
4039import java .util .Comparator ;
4140import java .util .EnumSet ;
42- import java .util .HashMap ;
4341import java .util .HashSet ;
4442import java .util .List ;
4543import java .util .Map ;
4644import java .util .Set ;
45+ import java .util .function .BiConsumer ;
46+ import java .util .function .Function ;
4747import java .util .regex .Pattern ;
4848import java .util .stream .Collectors ;
4949
@@ -1636,10 +1636,14 @@ public VariantContext fullyDecode(final VCFHeader header, final boolean lenientD
16361636 else {
16371637 // TODO -- warning this is potentially very expensive as it creates copies over and over
16381638 final VariantContextBuilder builder = new VariantContextBuilder (this );
1639- fullyDecodeInfo (builder , header , lenientDecoding );
1640- fullyDecodeGenotypes (builder , header );
1641- builder .fullyDecoded (true );
1642- return builder .make ();
1639+ try {
1640+ decodeInfo (builder , header , lenientDecoding );
1641+ decodeGenotypes (builder , header , lenientDecoding );
1642+ } catch (final NumberFormatException | TribbleException e ) {
1643+ // Wrap and rethrow exception with more precise information about VC
1644+ throw new TribbleException ("Failed to fully decode VariantContext " + getContig () + ":" + getStart () + " with cause: " + e .getMessage ());
1645+ }
1646+ return builder .fullyDecoded (true ).make ();
16431647 }
16441648 }
16451649
@@ -1651,145 +1655,156 @@ public boolean isFullyDecoded() {
16511655 return fullyDecoded ;
16521656 }
16531657
1654- private final void fullyDecodeInfo (final VariantContextBuilder builder , final VCFHeader header , final boolean lenientDecoding ) {
1655- builder .attributes (fullyDecodeAttributes (getAttributes (), header , lenientDecoding ));
1658+ private void decodeInfo (
1659+ final VariantContextBuilder builder ,
1660+ final VCFHeader header ,
1661+ final boolean lenientDecoding
1662+ ) {
1663+ decodeAttributes (builder ::attribute , id -> getInfoLine (id , header , lenientDecoding ), this .getAttributes (), lenientDecoding );
16561664 }
16571665
1658- private final Map <String , Object > fullyDecodeAttributes (final Map <String , Object > attributes ,
1659- final VCFHeader header ,
1660- final boolean lenientDecoding ) {
1661- final Map <String , Object > newAttributes = new HashMap <>(10 );
1662-
1663- for ( final Map .Entry <String , Object > attr : attributes .entrySet () ) {
1664- final String field = attr .getKey ();
1665-
1666- if ( field .equals (VCFConstants .GENOTYPE_FILTER_KEY ) )
1667- continue ; // gross, FT is part of the extended attributes
1668-
1669- final VCFCompoundHeaderLine format = VariantContextUtils .getMetaDataForField (header , field );
1670- final Object decoded = decodeValue (field , attr .getValue (), format , header .getVCFHeaderVersion ());
1666+ private void decodeGenotypes (
1667+ final VariantContextBuilder builder ,
1668+ final VCFHeader header ,
1669+ final boolean lenientDecoding
1670+ ) {
1671+ final GenotypesContext gc = this .getGenotypes ();
1672+ final GenotypesContext newGc = new GenotypesContext (gc .size ());
1673+ for (final Genotype g : this .getGenotypes ()) {
1674+ final GenotypeBuilder gb = new GenotypeBuilder (g );
1675+ decodeAttributes (gb ::attribute , id -> getFormatLine (id , header , lenientDecoding ), g .getExtendedAttributes (), lenientDecoding );
1676+ newGc .add (gb .make ());
1677+ }
1678+ builder .genotypesNoValidation (newGc );
1679+ }
16711680
1672- if ( decoded != null &&
1673- ! lenientDecoding
1674- && format .getCountType () != VCFHeaderLineCount .UNBOUNDED
1675- && format .getType () != VCFHeaderLineType .Flag ) { // we expect exactly the right number of elements
1676- final int obsSize = decoded instanceof List ? ((List ) decoded ).size () : 1 ;
1677- final int expSize = format .getCount (this );
1678- if ( obsSize != expSize ) {
1679- throw new TribbleException .InvalidHeader ("Discordant field size detected for field " +
1680- field + " at " + getContig () + ":" + getStart () + ". Field had " + obsSize + " values " +
1681- "but the header says this should have " + expSize + " values based on header record " +
1682- format );
1681+ private void decodeAttributes (
1682+ final BiConsumer <String , Object > put ,
1683+ final Function <String , VCFCompoundHeaderLine > metadataGetter ,
1684+ final Map <String , Object > attributes ,
1685+ final boolean lenientDecoding
1686+ ) {
1687+ attributes .forEach ((id , value ) -> {
1688+ final VCFCompoundHeaderLine line = metadataGetter .apply (id );
1689+ final Object decoded = decodeValue (value , line );
1690+
1691+ if (decoded != null && !lenientDecoding
1692+ && line .getCountType () != VCFHeaderLineCount .UNBOUNDED
1693+ && line .getType () != VCFHeaderLineType .Flag
1694+ ) {
1695+ // We expect exactly the right number of elements
1696+ final int actualSize = decoded instanceof List ? ((List <?>) decoded ).size () : 1 ;
1697+ final int expectedSize = line .getCount (this );
1698+ if (actualSize != expectedSize ) {
1699+ throw new TribbleException (String .format (
1700+ "While decoding field: %s, number of values decoded: %d did not match expected number based on header count: %d" ,
1701+ id , actualSize , expectedSize
1702+ ));
16831703 }
16841704 }
1685- newAttributes .put (field , decoded );
1686- }
1687-
1688- return newAttributes ;
1705+ put .accept (id , decoded );
1706+ });
16891707 }
16901708
16911709 private Object decodeValue (
1692- final String field ,
16931710 final Object value ,
1694- final VCFCompoundHeaderLine format ,
1695- final VCFHeaderVersion version
1711+ final VCFCompoundHeaderLine line
16961712 ) {
16971713 final boolean percentDecode = version .isAtLeastAsRecentAs (VCFHeaderVersion .VCF4_3 );
1698- if ( value instanceof String ) {
1714+ final VCFHeaderLineType type = line .getType ();
1715+ if (value instanceof String ) {
16991716 final String string = (String ) value ;
1700- if ( field .equals (VCFConstants .GENOTYPE_PL_KEY ) )
1701- return GenotypeLikelihoods .fromPLField (string );
1702-
1703- if (field .equals (VCFConstants .GENOTYPE_POSTERIORS_KEY )) {
1704- return decodeGPKey (string , version );
1705- }
1706-
1707- if ( string .indexOf (',' ) != -1 ) {
1717+ if (string .indexOf (',' ) != -1 ) {
17081718 final String [] splits = string .split ("," );
17091719 final List <Object > values = new ArrayList <>(splits .length );
1710- for (final String split : splits )
1711- values .add (decodeOne (field , split , format , percentDecode ));
1720+ for (final String split : splits ) {
1721+ values .add (decodeOne (split , type , percentDecode ));
1722+ }
17121723 return values ;
17131724 } else {
1714- return decodeOne (field , string , format , percentDecode );
1725+ return decodeOne (string , type , percentDecode );
17151726 }
1716- } else if ( value instanceof List && (((List <?>) value ).get (0 )) instanceof String ) {
1717- final List <String > asList = (List <String >)value ;
1727+ } else if (value instanceof List && (((List <?>) value ).get (0 )) instanceof String ) {
1728+ final List <String > asList = (List <String >) value ;
17181729 final List <Object > values = new ArrayList <>(asList .size ());
1719- for ( final String s : asList )
1720- values .add (decodeOne (field , s , format , percentDecode ));
1730+ for (final String s : asList ) {
1731+ values .add (decodeOne (s , type , percentDecode ));
1732+ }
17211733 return values ;
17221734 } else {
17231735 return value ;
17241736 }
1725-
1726- // allowMissingValuesComparedToHeader
17271737 }
17281738
1729- private Object decodeOne (final String field , final String string , final VCFCompoundHeaderLine format , final boolean percentDecode ) {
1730- try {
1731- if ( string .equals (VCFConstants .MISSING_VALUE_v4 ) )
1732- return null ;
1733- else {
1734- switch ( format .getType () ) {
1735- case Character : return string ;
1736- case Flag :
1737- final boolean b = Boolean .parseBoolean (string ) || string .equals ("1" );
1738- if (!b )
1739- throw new TribbleException ("VariantContext FLAG fields " + field + " cannot contain false values"
1740- + " as seen at " + getContig () + ":" + getStart ());
1741- return true ;
1742- case String : return percentDecode ? VCFPercentEncodedTextTransformer .percentDecode (string ) : string ;
1743- case Integer : return Integer .valueOf (string );
1744- case Float : return VCFUtils .parseVcfDouble (string );
1745- default : throw new TribbleException ("Unexpected type for field" + field );
1746- }
1739+ private static Object decodeOne (
1740+ final String string ,
1741+ final VCFHeaderLineType type ,
1742+ final boolean percentDecode
1743+ ) {
1744+ if (string .equals (VCFConstants .MISSING_VALUE_v4 ))
1745+ return null ;
1746+ else {
1747+ switch (type ) {
1748+ case Character : return string ;
1749+ case Flag :
1750+ final boolean b = Boolean .parseBoolean (string ) || string .equals ("1" );
1751+ if (!b )
1752+ throw new TribbleException ("VariantContext FLAG fields cannot contain false values" );
1753+ return true ;
1754+ case String : return percentDecode ? VCFPercentEncodedTextTransformer .percentDecode (string ) : string ;
1755+ case Integer : return Integer .valueOf (string );
1756+ case Float : return VCFUtils .parseVcfDouble (string );
1757+ default : throw new TribbleException ("Unexpected VCF type " + type );
17471758 }
1748- } catch (NumberFormatException e ) {
1749- throw new TribbleException ("Could not decode field " + field + " with value " + string + " of declared type " + format .getType ());
17501759 }
17511760 }
17521761
1753- private static List <Double > decodeGPKey (final String value , final VCFHeaderVersion version ) {
1754- final String [] splits = value .split ("," );
1755- // We need to special-case GP because there is a discrepancy in the scale used to record
1756- // its values between pre-4.3 and 4.3+ VCF. Pre-4.3 GP is phred scale encoded while
1757- // 4.3+ GP is a linear probability, bringing it in line with other standard keys that
1758- // use the P suffix (c.f. VCF 4.3 spec section 7.2).
1759-
1760- // Some tools in the wild apparently already use linear scaled GP, so we have to
1761- // be careful about converting inputs. We check whether GP values are already linear
1762- // scaled by seeing if the values' sum is approximately equal to 1, like we
1763- // would expect if the values were linear scale probabilities.
1764- // c.f. https://sourceforge.net/p/vcftools/mailman/vcftools-spec/thread/CEBCD558.FA29%25browning%40u.washington.edu/
1765- double sum = 0 ;
1766-
1767- final List <Double > rawGPValues = new ArrayList <>(splits .length );
1768- for (final String s : splits ) {
1769- final double GP = VCFUtils .parseVcfDouble (s );
1770- rawGPValues .add (GP );
1771- sum += GP ;
1772- }
1773-
1774- final boolean wasLinearScale = GeneralUtils .compareDoubles (sum , 1 , VCFConstants .VCF_ENCODING_EPSILON ) == 0 ;
1775- if (!wasLinearScale && version .isAtLeastAsRecentAs (VCFHeaderVersion .VCF4_3 )) {
1776- rawGPValues .replaceAll (GP -> QualityUtil .getErrorProbabilityFromPhredScore ((int ) Math .round (GP )));
1777- }
1778- return rawGPValues ;
1779-
1762+ private static VCFCompoundHeaderLine getInfoLine (
1763+ final String id ,
1764+ final VCFHeader header ,
1765+ final boolean lenientDecoding
1766+ ) {
1767+ return getMetadataLine (id , header , lenientDecoding , false );
17801768 }
17811769
1782- private void fullyDecodeGenotypes ( final VariantContextBuilder builder , final VCFHeader header ) {
1783- final GenotypesContext gc = new GenotypesContext ();
1784- for ( final Genotype g : getGenotypes () ) {
1785- gc . add ( fullyDecodeGenotypes ( g , header ));
1786- }
1787- builder . genotypesNoValidation ( gc );
1770+ private static VCFCompoundHeaderLine getFormatLine (
1771+ final String id ,
1772+ final VCFHeader header ,
1773+ final boolean lenientDecoding
1774+ ) {
1775+ return getMetadataLine ( id , header , lenientDecoding , true );
17881776 }
17891777
1790- private final Genotype fullyDecodeGenotypes (final Genotype g , final VCFHeader header ) {
1791- final Map <String , Object > map = fullyDecodeAttributes (g .getExtendedAttributes (), header , true );
1792- return new GenotypeBuilder (g ).attributes (map ).make ();
1778+ private static VCFCompoundHeaderLine getMetadataLine (
1779+ final String id ,
1780+ final VCFHeader header ,
1781+ final boolean lenientDecoding ,
1782+ final boolean isFormat
1783+ ) {
1784+ // Try getting line from header
1785+ VCFCompoundHeaderLine line = isFormat ? header .getFormatHeaderLine (id ) : header .getInfoHeaderLine (id );
1786+ if (line == null && !lenientDecoding ) {
1787+ throw new TribbleException (String .format (
1788+ "No %s header line was found matching ID: %s" ,
1789+ isFormat ? "FORMAT" : "INFO" , id
1790+ ));
1791+ }
1792+ if (line == null ) {
1793+ // Try to find a matching standard header line
1794+ line = isFormat
1795+ ? VCFStandardHeaderLines .getFormatLine (id , false )
1796+ : VCFStandardHeaderLines .getInfoLine (id , false );
1797+ }
1798+ if (line == null ) {
1799+ final String description = "Auto-generated unbounded string "
1800+ + (isFormat ? "FORMAT" : "INFO" )
1801+ + " header line for " + id ;
1802+ // Decode values as strings, and allow an unbounded number of values
1803+ line = isFormat
1804+ ? new VCFFormatHeaderLine (id , VCFHeaderLineCount .UNBOUNDED , VCFHeaderLineType .String , description )
1805+ : new VCFInfoHeaderLine (id , VCFHeaderLineCount .UNBOUNDED , VCFHeaderLineType .String , description );
1806+ }
1807+ return line ;
17931808 }
17941809
17951810 // ---------------------------------------------------------------------------------------------------------
0 commit comments