Skip to content

Commit 1480e6d

Browse files
committed
VCFHeader, VCFCodec and VCFHeaderLine refactoring to enable support for VCF4.3/BCF2.2 and bug fixes.
1 parent 39b18c7 commit 1480e6d

53 files changed

Lines changed: 4289 additions & 1601 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

src/main/java/htsjdk/variant/bcf2/BCF2Codec.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ public class BCF2Codec extends BinaryFeatureCodec<VariantContext> {
6262

6363
/** sizeof a BCF header (+ min/max version). Used when trying to detect whether a stream starts with a BCF header */
6464
public static final int SIZEOF_BCF_HEADER = BCFVersion.MAGIC_HEADER_START.length + 2*Byte.BYTES;
65-
65+
6666
private BCFVersion bcfVersion = null;
6767

6868
private VCFHeader header = null;
@@ -501,10 +501,10 @@ protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String
501501
protected void error(final String message) throws RuntimeException {
502502
throw new TribbleException(String.format("%s, at record %d with position %d:", message, recordNo, pos));
503503
}
504-
504+
505505
/** try to read a BCFVersion from an uncompressed BufferedInputStream.
506506
* The buffer must be large enough to contain {@link #SIZEOF_BCF_HEADER}
507-
*
507+
*
508508
* @param uncompressedBufferedInput the uncompressed input stream
509509
* @return the BCFVersion if it can be decoded, or null if not found.
510510
* @throws IOException
@@ -515,5 +515,5 @@ public static BCFVersion tryReadBCFVersion(final BufferedInputStream uncompresse
515515
uncompressedBufferedInput.reset();
516516
return bcfVersion;
517517
}
518-
518+
519519
}

src/main/java/htsjdk/variant/bcf2/BCF2Utils.java

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@
2727

2828
import htsjdk.samtools.util.FileExtensions;
2929
import htsjdk.tribble.TribbleException;
30-
import htsjdk.variant.vcf.*;
30+
import htsjdk.variant.vcf.VCFConstants;
31+
import htsjdk.variant.vcf.VCFHeader;
32+
import htsjdk.variant.vcf.VCFHeaderLine;
33+
import htsjdk.variant.vcf.VCFIDHeaderLine;
34+
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
3135

3236
import java.io.File;
3337
import java.io.FileNotFoundException;
@@ -93,10 +97,15 @@ public static ArrayList<String> makeDictionary(final VCFHeader header) {
9397
// set up the strings dictionary
9498
for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
9599
if ( line.shouldBeAddedToDictionary() ) {
96-
final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
97-
if ( ! seen.contains(idLine.getID())) {
98-
dict.add(idLine.getID());
99-
seen.add(idLine.getID());
100+
if (!line.isIDHeaderLine()) {
101+
// TODO: how do we ensure that shouldBeAddedToDictionary==true only when isIDHeaderLine==true?
102+
throw new TribbleException(String.format(
103+
"The header line %s cannot be added to the BCF dictionary if its not an ID header line",
104+
line));
105+
}
106+
if ( ! seen.contains(line.getID())) {
107+
dict.add(line.getID());
108+
seen.add(line.getID());
100109
}
101110
}
102111
}
@@ -291,7 +300,7 @@ else if ( o.getClass().isArray() ) {
291300
* Are the elements and their order in the output and input headers consistent so that
292301
* we can write out the raw genotypes block without decoding and recoding it?
293302
*
294-
* If the order of INFO, FILTER, or contrig elements in the output header is different than
303+
* If the order of INFO, FILTER, or contig elements in the output header is different than
295304
* in the input header we must decode the blocks using the input header and then recode them
296305
* based on the new output order.
297306
*
@@ -308,15 +317,15 @@ public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHe
308317
if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) )
309318
return false;
310319

311-
final Iterator<? extends VCFIDHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator();
312-
final Iterator<? extends VCFIDHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator();
320+
final Iterator<VCFSimpleHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator();
321+
final Iterator<VCFSimpleHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator();
313322

314323
while ( inputLinesIt.hasNext() ) {
315324
if ( ! outputLinesIt.hasNext() ) // missing lines in output
316325
return false;
317326

318-
final VCFIDHeaderLine outputLine = outputLinesIt.next();
319-
final VCFIDHeaderLine inputLine = inputLinesIt.next();
327+
final VCFSimpleHeaderLine outputLine = outputLinesIt.next();
328+
final VCFSimpleHeaderLine inputLine = inputLinesIt.next();
320329

321330
if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! inputLine.getID().equals(outputLine.getID()) )
322331
return false;

src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,19 @@
2727

2828
import htsjdk.samtools.SAMSequenceDictionary;
2929
import htsjdk.samtools.util.IOUtil;
30+
import htsjdk.samtools.util.Log;
3031
import htsjdk.samtools.util.RuntimeIOException;
32+
import htsjdk.tribble.TribbleException;
3133
import htsjdk.tribble.index.IndexCreator;
34+
import htsjdk.utils.ValidationUtils;
3235
import htsjdk.variant.variantcontext.VariantContext;
3336
import htsjdk.variant.variantcontext.VariantContextBuilder;
3437
import htsjdk.variant.vcf.VCFConstants;
3538
import htsjdk.variant.vcf.VCFEncoder;
3639
import htsjdk.variant.vcf.VCFHeader;
3740
import htsjdk.variant.vcf.VCFHeaderLine;
3841
import htsjdk.variant.vcf.VCFHeaderVersion;
42+
import htsjdk.variant.vcf.VCFUtils;
3943

4044
import java.io.BufferedWriter;
4145
import java.io.ByteArrayOutputStream;
@@ -45,14 +49,15 @@
4549
import java.io.OutputStreamWriter;
4650
import java.io.Writer;
4751
import java.nio.file.Path;
52+
import java.util.stream.Collectors;
4853

4954
/**
5055
* this class writes VCF files
5156
*/
5257
class VCFWriter extends IndexingVariantContextWriter {
58+
protected final static Log logger = Log.getInstance(VCFWriter.class);
5359

54-
private static final String VERSION_LINE =
55-
VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_2.getFormatString() + "=" + VCFHeaderVersion.VCF4_2.getVersionString();
60+
private static final String DEFAULT_VERSION_LINE = VCFHeader.DEFAULT_VCF_VERSION.toHeaderVersionLine();
5661

5762
// Initialized when the header is written to the output stream
5863
private VCFEncoder vcfEncoder = null;
@@ -164,7 +169,7 @@ public void writeHeader(final VCFHeader header) {
164169
}
165170

166171
public static String getVersionLine() {
167-
return VERSION_LINE;
172+
return DEFAULT_VERSION_LINE;
168173
}
169174

170175
public static VCFHeader writeHeader(VCFHeader header,
@@ -175,12 +180,18 @@ public static VCFHeader writeHeader(VCFHeader header,
175180
try {
176181
rejectVCFV43Headers(header);
177182

178-
// the file format field needs to be written first
183+
// Validate that the file version we're writing is compatible with this header's version.
184+
validateHeaderVersion(header, versionLine);
185+
186+
// The file format field needs to be written first; any file format lines
187+
// embedded in the header are skipped in the loop below
179188
writer.write(versionLine + "\n");
180189

181190
for (final VCFHeaderLine line : header.getMetaDataInSortedOrder() ) {
182-
if ( VCFHeaderVersion.isFormatString(line.getKey()) )
191+
// Skip any fileformat header lines embedded in the header
192+
if ( VCFHeaderVersion.isFormatString(line.getKey()) ) {
183193
continue;
194+
}
184195

185196
writer.write(VCFHeader.METADATA_INDICATOR);
186197
writer.write(line.toString());
@@ -189,14 +200,9 @@ public static VCFHeader writeHeader(VCFHeader header,
189200

190201
// write out the column line
191202
writer.write(VCFHeader.HEADER_INDICATOR);
192-
boolean isFirst = true;
193-
for (final VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) {
194-
if ( isFirst )
195-
isFirst = false; // don't write out a field separator
196-
else
197-
writer.write(VCFConstants.FIELD_SEPARATOR);
198-
writer.write(field.toString());
199-
}
203+
writer.write(header.getHeaderFields().stream()
204+
.map(f -> f.name())
205+
.collect(Collectors.joining(VCFConstants.FIELD_SEPARATOR)).toString());
200206

201207
if ( header.hasGenotypingData() ) {
202208
writer.write(VCFConstants.FIELD_SEPARATOR);
@@ -217,6 +223,33 @@ public static VCFHeader writeHeader(VCFHeader header,
217223
return header;
218224
}
219225

226+
/**
227+
* Given a header and a requested target output version, see if the header's version is compatible with the
228+
* requested version.
229+
* @param header
230+
* @param requestedVersionLine
231+
*/
232+
private static void validateHeaderVersion(final VCFHeader header, final String requestedVersionLine) {
233+
ValidationUtils.nonNull(header);
234+
ValidationUtils.nonNull(requestedVersionLine);
235+
236+
final VCFHeaderVersion vcfCurrentVersion = header.getVCFHeaderVersion();
237+
final VCFHeaderVersion vcfRequestedVersion = VCFHeaderVersion.getHeaderVersion(requestedVersionLine);
238+
if (!vcfCurrentVersion.equals(vcfRequestedVersion)) {
239+
final String message = String.format("Attempting to write a %s VCF header to a %s VCFWriter",
240+
vcfRequestedVersion,
241+
vcfCurrentVersion.getVersionString());
242+
if (!VCFHeaderVersion.versionsAreCompatible(VCFHeaderVersion.getHeaderVersion(requestedVersionLine), vcfCurrentVersion)) {
243+
if (VCFUtils.getStrictVCFVersionValidation()) {
244+
throw new TribbleException(message);
245+
}
246+
}
247+
if (VCFUtils.getVerboseVCFLogging()) {
248+
logger.warn(message);
249+
}
250+
}
251+
}
252+
220253
/**
221254
* attempt to close the VCF file
222255
*/

0 commit comments

Comments
 (0)