Skip to content

Commit db17826

Browse files
committed
Merge branch 'develop' of pd3-github:samtools/bcftools into develop
2 parents 0c238d1 + 0b249f0 commit db17826

7 files changed

Lines changed: 98 additions & 38 deletions

File tree

NEWS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@ Changes affecting specific commands:
66

77
- New `-s, --samples` to print per-sample HWE probability, geometric mean
88

9+
* bcftools conveert
10+
11+
- Make --gvcf2vcf work around malformed gVCF records where INFO/END overlaps
12+
the next record
13+
914
* bcftools mpileup
1015

1116
- Remove unused experimental INFO/MIN_PL_SUM annotation

test/convert.gvcf.2.out

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##reference=file://some/path/human_g1k_v37.fasta
4+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
5+
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the region described in this record">
6+
##contig=<ID=22,length=450>
7+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT smpl
8+
22 10 . C . . . . GT 0/0
9+
22 11 . T . . . . GT 0/0
10+
22 12 . T . . . . GT 0/0
11+
22 13 . G . . . . GT 0/0
12+
22 14 . G . . . . GT 0/0
13+
22 15 . C . . . . GT 0/0
14+
22 16 . C . . . . GT 0/0
15+
22 17 . A . . . . GT 0/0
16+
22 18 . A . . . . GT 0/0
17+
22 19 . G . . . . GT 0/0
18+
22 20 . T . . . . GT 0/0
19+
22 21 . C T . . END=26 GT 0/0
20+
22 27 . C T . . END=42 GT 0/0

test/convert.gvcf.2.vcf

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
##fileformat=VCFv4.2
2+
##reference=file://some/path/human_g1k_v37.fasta
3+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
4+
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the region described in this record">
5+
##contig=<ID=22,length=450>
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT smpl
7+
22 10 . C . . . END=40 GT 0/0
8+
22 21 . C T . . END=26 GT 0/0
9+
22 27 . C T . . END=42 GT 0/0

test/convert.gvcf.out

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
##reference=file://some/path/human_g1k_v37.fasta
44
##FORMAT=<ID=GQX,Number=1,Type=Integer,Description="Minimum of {Genotype quality assuming variant position,Genotype quality assuming non-variant position}">
55
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6-
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality">
6+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
77
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Filtered basecall depth used for site genotyping">
88
##FORMAT=<ID=DPF,Number=1,Type=Integer,Description="Basecalls filtered from input prior to site genotyping">
99
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed. For indels this value only includes reads which confidently support each allele (posterior prob 0.999 or higher that read contains indicated allele vs all other intersecting indel alleles)">
@@ -57,7 +57,7 @@
5757
##FORMAT=<ID=OPL,Number=.,Type=Integer,Description="Original PL value before ploidy correction">
5858
##INFO=<ID=phastCons,Number=0,Type=Flag,Description="overlaps a phastCons element">
5959
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele, ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/technical/reference/ancestral_alignments/README">
60-
##INFO=<ID=AF,Number=A,Type=String,Description="1000 Genomes Allele Frequency based on AC/AN; Format: Allele:AlleleFrequency">
60+
##INFO=<ID=AF,Number=A,Type=Float,Description="1000 Genomes Allele Frequency based on AC/AN; Format: Allele:AlleleFrequency">
6161
##INFO=<ID=AMR_AF,Number=A,Type=String,Description="1000 Genomes Allele Frequency for samples from AMR population based on AC/AN; Format: Allele:AlleleFrequency">
6262
##INFO=<ID=ASN_AF,Number=A,Type=String,Description="1000 Genomes Allele Frequency for samples from ASN population based on AC/AN; Format: Allele:AlleleFrequency">
6363
##INFO=<ID=AFR_AF,Number=A,Type=String,Description="1000 Genomes Allele Frequency for samples from AFR population based on AC/AN; Format: Allele:AlleleFrequency">

test/convert.gvcf.vcf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
##reference=file://some/path/human_g1k_v37.fasta
44
##FORMAT=<ID=GQX,Number=1,Type=Integer,Description="Minimum of {Genotype quality assuming variant position,Genotype quality assuming non-variant position}">
55
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6-
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality">
6+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
77
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Filtered basecall depth used for site genotyping">
88
##FORMAT=<ID=DPF,Number=1,Type=Integer,Description="Basecalls filtered from input prior to site genotyping">
99
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed. For indels this value only includes reads which confidently support each allele (posterior prob 0.999 or higher that read contains indicated allele vs all other intersecting indel alleles)">
@@ -57,7 +57,7 @@
5757
##FORMAT=<ID=OPL,Number=.,Type=Integer,Description="Original PL value before ploidy correction">
5858
##INFO=<ID=phastCons,Number=0,Type=Flag,Description="overlaps a phastCons element">
5959
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele, ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/technical/reference/ancestral_alignments/README">
60-
##INFO=<ID=AF,Number=A,Type=String,Description="1000 Genomes Allele Frequency based on AC/AN; Format: Allele:AlleleFrequency">
60+
##INFO=<ID=AF,Number=A,Type=Float,Description="1000 Genomes Allele Frequency based on AC/AN; Format: Allele:AlleleFrequency">
6161
##INFO=<ID=AMR_AF,Number=A,Type=String,Description="1000 Genomes Allele Frequency for samples from AMR population based on AC/AN; Format: Allele:AlleleFrequency">
6262
##INFO=<ID=ASN_AF,Number=A,Type=String,Description="1000 Genomes Allele Frequency for samples from ASN population based on AC/AN; Format: Allele:AlleleFrequency">
6363
##INFO=<ID=AFR_AF,Number=A,Type=String,Description="1000 Genomes Allele Frequency for samples from AFR population based on AC/AN; Format: Allele:AlleleFrequency">

test/test.pl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,7 @@
949949
run_test(\&test_vcf_convert_hs2vcf,$opts,h=>'convert.hs.gt.hap',s=>'convert.hs.gt.samples',out=>'convert.gt.noHead.vcf',args=>'--hapsample2vcf');
950950
run_test(\&test_vcf_convert_hs2vcf,$opts,h=>'convert.hs.gt.ids.hap',s=>'convert.hs.gt.samples',out=>'convert.gt.noHead.ids.vcf',args=>'--vcf-ids --hapsample2vcf');
951951
run_test(\&test_vcf_convert_gvcf,$opts,in=>'convert.gvcf',out=>'convert.gvcf.out',fa=>'gvcf.fa',args=>'--gvcf2vcf -i\'FILTER="PASS"\'');
952+
run_test(\&test_vcf_convert_gvcf,$opts,in=>'convert.gvcf.2',out=>'convert.gvcf.2.out',fa=>'gvcf.fa',args=>'--gvcf2vcf');
952953
run_test(\&test_vcf_convert_tsv2vcf,$opts,in=>'convert.23andme',out=>'convert.23andme.vcf',args=>'-c ID,CHROM,POS,AA -s SAMPLE1',fai=>'23andme');
953954
run_test(\&test_vcf_convert_tsv2vcf,$opts,in=>'convert.tsv',out=>'convert.tsv.vcf',args=>'-c -,CHROM,POS,REF,ALT',fai=>'23andme');
954955
run_test(\&test_vcf_consensus,$opts,in=>'consensus.gvcf-missing.1',out=>'consensus.gvcf-missing.1.out',fa=>'consensus.gvcf-missing.fa',args=>'--missing N');

vcfconvert.c

Lines changed: 59 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* vcfconvert.c -- convert between VCF/BCF and related formats.
22
3-
Copyright (C) 2013-2025 Genome Research Ltd.
3+
Copyright (C) 2013-2026 Genome Research Ltd.
44
55
Author: Petr Danecek <pd3@sanger.ac.uk>
66
@@ -44,6 +44,7 @@ THE SOFTWARE. */
4444
#include "filter.h"
4545
#include "convert.h"
4646
#include "tsv2vcf.h"
47+
#include "vcfbuf.h"
4748

4849
// Logic of the filters: include or exclude sites which match the filters?
4950
#define FLT_INCLUDE 1
@@ -52,6 +53,7 @@ THE SOFTWARE. */
5253
typedef struct _args_t args_t;
5354
struct _args_t
5455
{
56+
vcfbuf_t *vcfbuf;
5557
faidx_t *ref;
5658
filter_t *filter;
5759
char *filter_str;
@@ -64,7 +66,8 @@ struct _args_t
6466
int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing, written;
6567
} n;
6668
kstring_t str;
67-
int32_t *gts;
69+
int32_t *gts, *itmp;
70+
int nitmp;
6871
float *flt;
6972
int rev_als, output_vcf_ids, hap2dip, gen_3N6;
7073
int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
@@ -1512,6 +1515,50 @@ static void vcf_to_vcf(args_t *args)
15121515
if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
15131516
}
15141517

1518+
static bcf1_t *gvcf_next_line(args_t *args, int *end1)
1519+
{
1520+
*end1 = 0;
1521+
while ( vcfbuf_nsites(args->vcfbuf)!=2 && bcf_sr_next_line(args->files) )
1522+
{
1523+
bcf1_t *rec = bcf_sr_get_line(args->files,0);
1524+
args->files->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec);
1525+
}
1526+
bcf1_t *rec = vcfbuf_flush(args->vcfbuf, 0);
1527+
if ( !rec ) return NULL;
1528+
1529+
// is it a gVCF record?
1530+
// - ALT must be one of ., <*>, <X>, <NON_REF>
1531+
// - INFO/END must be present
1532+
int i, gallele = -1;
1533+
if ( rec->n_allele==1 ) gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present)
1534+
else if ( rec->d.allele[1][0]=='<' )
1535+
{
1536+
for (i=1; i<rec->n_allele; i++)
1537+
{
1538+
if ( rec->d.allele[i][1]=='*' && rec->d.allele[i][2]=='>' && rec->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF
1539+
if ( rec->d.allele[i][1]=='X' && rec->d.allele[i][2]=='>' && rec->d.allele[i][3]=='\0' ) { gallele = i; break; } // old mpileup gVCF
1540+
if ( strcmp(rec->d.allele[i],"<NON_REF>")==0 ) { gallele = i; break; } // GATK gVCF
1541+
}
1542+
}
1543+
if ( gallele<0 ) return rec;
1544+
1545+
int nend = bcf_get_info_int32(args->header,rec,"END",&args->itmp,&args->nitmp);
1546+
if ( nend!=1 ) return rec;
1547+
*end1 = args->itmp[0];
1548+
1549+
bcf1_t *peek = vcfbuf_peek(args->vcfbuf, 0);
1550+
if ( peek && rec->rid==peek->rid && (*end1)-1>=peek->pos )
1551+
{
1552+
static int warned = 0;
1553+
if ( !warned )
1554+
{
1555+
hts_log_warning("Malformed gVCF: INFO/END at %s:%"PRIhts_pos" overlaps the next record",bcf_seqname(args->header,rec),rec->pos+1);
1556+
warned = 1;
1557+
}
1558+
*end1 = rec->pos < peek->pos ? peek->pos : 0;
1559+
}
1560+
return rec;
1561+
}
15151562
static void gvcf_to_vcf(args_t *args)
15161563
{
15171564
if ( !args->ref_fname ) error("--gvcf2vcf requires the --fasta-ref option\n");
@@ -1529,15 +1576,16 @@ static void gvcf_to_vcf(args_t *args)
15291576
bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
15301577
if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
15311578
if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1532-
if ( init_index2(out_fh,hdr,args->outfname,&args->index_fn,
1533-
args->write_index)<0 )
1579+
if ( init_index2(out_fh,hdr,args->outfname,&args->index_fn,args->write_index)<0 )
15341580
error("Error: failed to initialise index for %s\n",args->outfname);
15351581

1536-
int32_t *itmp = NULL, nitmp = 0;
1582+
args->vcfbuf = vcfbuf_init(hdr, 0);
1583+
vcfbuf_set(args->vcfbuf,VCFBUF_DUMMY,1);
15371584

1538-
while ( bcf_sr_next_line(args->files) )
1585+
int end1 = 0;
1586+
bcf1_t *line;
1587+
while ( (line=gvcf_next_line(args, &end1)) )
15391588
{
1540-
bcf1_t *line = bcf_sr_get_line(args->files,0);
15411589
if ( args->filter )
15421590
{
15431591
int pass = filter_test(args->filter, line, NULL);
@@ -1549,39 +1597,15 @@ static void gvcf_to_vcf(args_t *args)
15491597
}
15501598
}
15511599

1552-
// check if alleles compatible with being a gVCF record
1553-
// ALT must be one of ., <*>, <X>, <NON_REF>
1554-
// check for INFO/END is below
1555-
int i, gallele = -1;
1556-
if (line->n_allele==1)
1557-
gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present)
1558-
else if ( line->d.allele[1][0]=='<' )
1559-
{
1560-
for (i=1; i<line->n_allele; i++)
1561-
{
1562-
if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF
1563-
if ( line->d.allele[i][1]=='X' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // old mpileup gVCF
1564-
if ( strcmp(line->d.allele[i],"<NON_REF>")==0 ) { gallele = i; break; } // GATK gVCF
1565-
}
1566-
}
1567-
1568-
// no gVCF compatible alleles
1569-
if (gallele<0)
1600+
if ( !end1 )
15701601
{
15711602
if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
15721603
continue;
15731604
}
15741605

1575-
int nend = bcf_get_info_int32(hdr,line,"END",&itmp,&nitmp);
1576-
if ( nend!=1 )
1577-
{
1578-
// No INFO/END => not gVCF record
1579-
if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
1580-
continue;
1581-
}
15821606
bcf_update_info_int32(hdr,line,"END",NULL,0);
15831607
int pos, len;
1584-
for (pos=line->pos; pos<itmp[0]; pos++)
1608+
for (pos=line->pos; pos<end1; pos++)
15851609
{
15861610
line->pos = pos;
15871611
char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len);
@@ -1593,7 +1617,7 @@ static void gvcf_to_vcf(args_t *args)
15931617
}
15941618
}
15951619
if ( args->files->errnum ) error("Error: %s\n", bcf_sr_strerror(args->files->errnum));
1596-
free(itmp);
1620+
free(args->itmp);
15971621
if ( args->write_index )
15981622
{
15991623
if ( bcf_idx_save(out_fh)<0 )
@@ -1604,6 +1628,7 @@ static void gvcf_to_vcf(args_t *args)
16041628
free(args->index_fn);
16051629
}
16061630
if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname);
1631+
vcfbuf_destroy(args->vcfbuf);
16071632
}
16081633

16091634
static void usage(void)

0 commit comments

Comments
 (0)