diff --git a/htslib/vcf.h b/htslib/vcf.h index 45a7adc16..92f0c2bd4 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -216,7 +216,8 @@ typedef struct { float qual; // QUAL uint32_t n_info:16, n_allele:16; uint32_t n_fmt:8, n_sample:24; - kstring_t shared, indiv; + kstring_t shared; + kstring_t indiv; // Per sample data. unpacked & VCF_UN_FMT => VCF / BCF bcf_dec_t d; // lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack() int max_unpack; // Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed int unpacked; // remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work @@ -389,6 +390,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). #define BCF_UN_FMT 8 // unpack format and each sample #define BCF_UN_IND BCF_UN_FMT // a synonymo of BCF_UN_FMT #define BCF_UN_ALL (BCF_UN_SHR|BCF_UN_FMT) // everything + #define VCF_UN_FMT 16 // indiv.s is VCF string HTSLIB_EXPORT int bcf_unpack(bcf1_t *b, int which); diff --git a/test/test.pl b/test/test.pl index 2c4a1b1f4..601025ceb 100755 --- a/test/test.pl +++ b/test/test.pl @@ -822,7 +822,7 @@ sub test_vcf_various test_cmd($opts, %args, out => "formatcols.vcf", cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatcols.vcf"); test_cmd($opts, %args, out => "noroundtrip-out.vcf", - cmd => "$$opts{bin}/htsfile -c $$opts{path}/noroundtrip.vcf"); + cmd => "$$opts{path}/test_view -b $$opts{path}/noroundtrip.vcf | $$opts{bin}/htsfile -c -"); test_cmd($opts, %args, out => "formatmissing-out.vcf", cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatmissing.vcf"); } diff --git a/vcf.c b/vcf.c index 62f7741aa..66a1f26d4 100644 --- a/vcf.c +++ b/vcf.c @@ -78,6 +78,7 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N #define BCF_IS_64BIT (1<<30) +static int vcf_parse_format_partial(bcf1_t *v); static char *find_chrom_header_line(char *s) { @@ -1439,6 +1440,11 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt); int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) { + if (rec->unpacked & VCF_UN_FMT) { + if (vcf_parse_format_partial(rec) < 0) + return -1; + } + if ( !hdr->keep_samples ) return 0; if ( !bcf_hdr_nsamples(hdr) ) { @@ -1734,6 +1740,16 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) if ( h->dirty ) { if (bcf_hdr_sync(h) < 0) return -1; } + + if ( hfp->format.format == vcf || hfp->format.format == text_format ) + return vcf_write(hfp,h,v); + + if (v->unpacked & VCF_UN_FMT) { + // slow, but round trip test + if (vcf_parse_format_partial(v) < 0) + return -1; + } + if ( bcf_hdr_nsamples(h)!=v->n_sample ) { hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", @@ -1741,9 +1757,6 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) return -1; } - if ( hfp->format.format == vcf || hfp->format.format == text_format ) - return vcf_write(hfp,h,v); - if ( v->errcode ) { // vcf_parse1() encountered a new contig or tag, undeclared in the @@ -2569,6 +2582,45 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p return 0; } +// The indiv.s kstring is the textual VCF representation for FORMAT +// column and all the subsequent SAMPLE columns. +// +// We also need the header, and this cannot be passed in as bcf_unpack calls +// this and it doesn't have the header available. +// So we also cache a pointer to the header in the first bytes of the +// kstring too. +// +// This packing ensures the data is still in kstring format and amenable +// to freeing / reuse. I.e.: +// +// s.s p q s.l s.m +// | | | | | +// ......... +// +// Returns 0 on success, +// <0 on failure. +static int vcf_parse_format_partial(bcf1_t *v) { + if (!(v->unpacked & VCF_UN_FMT)) + return 0; + kstring_t s = v->indiv; + const bcf_hdr_t *h = *(const bcf_hdr_t **)s.s; + char *p = s.s + sizeof(const bcf_hdr_t *); + char *q = p + strlen(p); + + v->indiv.s = NULL; + v->indiv.l = v->indiv.m = 0; + + int ret; + if ((ret = vcf_parse_format(&s, h, v, p, q) == 0)) { + v->unpacked &= ~VCF_UN_FMT; + free(s.s); + return ret; + } else { + v->indiv = s; + return ret; + } +} + static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) { // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has // been already printed, but will enable tools like vcfcheck to proceed. @@ -2890,7 +2942,29 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } if ( v->max_unpack && !(v->max_unpack>>3) ) goto end; } else if (i == 8) {// FORMAT - return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2; + // Consider complete copy of s, obtained via ks_release, + // and cache of p/q pointers. This is then a generalised + // parse delay that works for any max_unpack value. + + kstring_t *iv = &v->indiv; + ks_clear(iv); + if (kputsn((char *)&h, sizeof(&h), iv) < 0 || + kputsn(p, s->s + s->l - p, iv) < 0) + goto err; + + v->unpacked |= VCF_UN_FMT; + + // We can't accurately judge n_sample and some things may check + // this without doing an explicit bcf_unpack call, but we + // set it to bcf_hdr_nsamples(h) as a starting point. + // (vcf_parse_format validates this and it's an error if it + // mismatches, so this is initial value is incorrect if the + // data itself is incorrect, and we'll spot that on an explict + // bcf_unpack call.) + v->n_sample = bcf_hdr_nsamples(h); + + return 0; + //return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2; } } @@ -3002,6 +3076,10 @@ int bcf_unpack(bcf1_t *b, int which) ptr = bcf_unpack_info_core1(ptr, &d->info[i]); b->unpacked |= BCF_UN_INFO; } + if ((which & BCF_UN_FMT) && (b->unpacked & VCF_UN_FMT)) { + if (vcf_parse_format_partial(b) < 0) + return -1; + } if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT ptr = (uint8_t*)b->indiv.s; hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt); @@ -3016,7 +3094,8 @@ int bcf_unpack(bcf1_t *b, int which) int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) { int i; - bcf_unpack((bcf1_t*)v, BCF_UN_ALL); + bcf_unpack((bcf1_t*)v, (v->unpacked & VCF_UN_FMT) + ? BCF_UN_SHR : BCF_UN_ALL); kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM kputc('\t', s); kputll(v->pos + 1, s); // POS kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID @@ -3075,45 +3154,54 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) if ( first ) kputc('.', s); } else kputc('.', s); // FORMAT and individual information - if (v->n_sample) - { - int i,j; - if ( v->n_fmt) + if (v->unpacked & VCF_UN_FMT) { + size_t l = strlen(v->indiv.s + sizeof(bcf_hdr_t *)); + kputc('\t', s); + kputsn(v->indiv.s + sizeof(bcf_hdr_t *), l, s); + kputc('\t', s); + kputsn(v->indiv.s + sizeof(bcf_hdr_t *) + l+1, + v->indiv.l - (sizeof(bcf_hdr_t *) + l+1), s); + } else { + if (v->n_sample) { - int gt_i = -1; - bcf_fmt_t *fmt = v->d.fmt; - int first = 1; - for (i = 0; i < (int)v->n_fmt; ++i) { - if ( !fmt[i].p ) continue; - kputc(!first ? ':' : '\t', s); first = 0; - if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) ) - { - hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h,(bcf1_t*)v), v->pos+1); - abort(); - } - kputs(h->id[BCF_DT_ID][fmt[i].id].key, s); - if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i; - } - if ( first ) kputs("\t.", s); - for (j = 0; j < v->n_sample; ++j) { - kputc('\t', s); - first = 1; + int i,j; + if ( v->n_fmt) + { + int gt_i = -1; + bcf_fmt_t *fmt = v->d.fmt; + int first = 1; for (i = 0; i < (int)v->n_fmt; ++i) { - bcf_fmt_t *f = &fmt[i]; - if ( !f->p ) continue; - if (!first) kputc(':', s); - first = 0; - if (gt_i == i) - bcf_format_gt(f,j,s); - else - bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size); + if ( !fmt[i].p ) continue; + kputc(!first ? ':' : '\t', s); first = 0; + if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) ) + { + hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h,(bcf1_t*)v), v->pos+1); + abort(); + } + kputs(h->id[BCF_DT_ID][fmt[i].id].key, s); + if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i; + } + if ( first ) kputs("\t.", s); + for (j = 0; j < v->n_sample; ++j) { + kputc('\t', s); + first = 1; + for (i = 0; i < (int)v->n_fmt; ++i) { + bcf_fmt_t *f = &fmt[i]; + if ( !f->p ) continue; + if (!first) kputc(':', s); + first = 0; + if (gt_i == i) + bcf_format_gt(f,j,s); + else + bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size); + } + if ( first ) kputc('.', s); } - if ( first ) kputc('.', s); } + else + for (j=0; j<=v->n_sample; j++) + kputs("\t.", s); } - else - for (j=0; j<=v->n_sample; j++) - kputs("\t.", s); } kputc('\n', s); return 0; @@ -3824,6 +3912,11 @@ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap) { + if (v->unpacked & VCF_UN_FMT) { + if (vcf_parse_format_partial(v) < 0) + return -1; + } + kstring_t ind; ind.s = 0; ind.l = ind.m = 0; if (n) {