Skip to content

Commit f730b7c

Browse files
authored
Merge pull request #76 from tfenne/fix-genotype-parsing-fallback
Fix per-sample genotype parsing to fall back from PL to GL to GT
2 parents 3f0d01f + 8ef3290 commit f730b7c

4 files changed

Lines changed: 216 additions & 42 deletions

File tree

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,13 @@ add_executable(VerifyBamID ${SOURCE_FILES})
6969

7070
target_link_libraries(VerifyBamID statgen Vcf ${HTS_LIBRARIES} samtools ${ZLIB} ${BZIP2} ${LZMA} ${CURLLIB})
7171

72+
add_executable(TestReadVcf TestReadVcf.cpp SVDcalculator.cpp)
73+
target_link_libraries(TestReadVcf statgen Vcf ${HTS_LIBRARIES} samtools ${ZLIB} ${BZIP2} ${LZMA} ${CURLLIB})
74+
7275
enable_testing()
76+
add_test(NAME testReadVcf
77+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
78+
COMMAND TestReadVcf resource/test/test_readvcf.vcf)
7379
add_test(NAME myTest1
7480
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
7581
COMMAND VerifyBamID --DisableSanityCheck --BamFile resource/test/test.bam --SVDPrefix resource/test/hapmap_3.3.b37.dat --Reference resource/test/chr20.fa.gz --NumPC 2)

SVDcalculator.cpp

Lines changed: 53 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -79,74 +79,85 @@ int SVDcalculator::ReadVcf(const std::string &VcfPath,
7979
refAllele=pMarker->sRef[0];
8080
altAllele=pMarker->asAlts[0][0];
8181

82-
int PLidx = pMarker->asFormatKeys.Find("PL");
83-
int PLGLGTflag = 0;//0 for PL, 1 for GL, 2 for GT
84-
if (PLidx < 0) {
85-
PLidx = pMarker->asFormatKeys.Find("GL");
86-
if (PLidx >= 0) PLGLGTflag = 1;//found GL
87-
else {
88-
PLidx = pMarker->asFormatKeys.Find("GT");
89-
if(PLidx >= 0) PLGLGTflag = 2;//found GT
90-
else throw VcfFileException("Cannot recognize GT, GL or PL key in FORMAT field");
91-
}
82+
// Look up FORMAT field indices for genotype data. A marker's FORMAT
83+
// may contain any combination of PL, GL, and GT. Individual samples
84+
// may have missing values for some fields, so we try each per-sample
85+
// in priority order: PL > GL > GT.
86+
int idxPL = pMarker->asFormatKeys.Find("PL");
87+
int idxGL = pMarker->asFormatKeys.Find("GL");
88+
int idxGT = pMarker->asFormatKeys.Find("GT");
89+
if (idxPL < 0 && idxGL < 0 && idxGT < 0) {
90+
throw VcfFileException("Cannot recognize GT, GL or PL key in FORMAT field");
9291
}
9392
int formatLength = pMarker->asFormatKeys.Length();
94-
int idx11 = 0, idx12 = 1, idx22 = 2;
95-
9693
StringArray phred;
9794

98-
long phred11 = 0;
99-
long phred12 = 0;
100-
long phred22 = 0;
10195
std::vector<char> perMarkerGeno(nSamples,-1);
10296
int nMissingGenoSamples = 0;
10397
for (int i = 0; i < nSamples; i++)//for each individual
10498
{
10599
if(nMarkers==0) Samples.push_back(pVcf->getSampleID(i).c_str());
106-
if(PLGLGTflag==0)//found PL
107-
{
108-
phred.ReplaceTokens(pMarker->asSampleValues[PLidx + i * formatLength], ",");
109-
if (phred.Length() == 3 && phred[0] != ".") {
110-
phred11 = phred[idx11].AsInteger();
111-
phred12 = phred[idx12].AsInteger();
112-
phred22 = phred[idx22].AsInteger();
100+
101+
// Phred-scaled likelihoods for the three diploid genotypes:
102+
// phred11 = P(data|0/0), phred12 = P(data|0/1), phred22 = P(data|1/1)
103+
long phred11 = 0;
104+
long phred12 = 0;
105+
long phred22 = 0;
106+
bool parsed = false;
107+
108+
// Try PL: phred-scaled genotype likelihoods (3 comma-separated ints)
109+
if (!parsed && idxPL >= 0) {
110+
phred.ReplaceTokens(pMarker->asSampleValues[idxPL + i * formatLength], ",");
111+
if (phred.Length() == 3 && phred[0] != "." && phred[1] != "." && phred[2] != ".") {
112+
phred11 = phred[0].AsInteger();
113+
phred12 = phred[1].AsInteger();
114+
phred22 = phred[2].AsInteger();
115+
parsed = true;
113116
}
114-
else nMissingGenoSamples++;
115117
}
116-
else if(PLGLGTflag == 1)//found GL
117-
{
118-
phred.ReplaceTokens(pMarker->asSampleValues[PLidx + i * formatLength], ",");
119-
if (phred.Length() == 3 && phred[0] != ".") {
118+
// Try GL: log10 genotype likelihoods (3 comma-separated floats),
119+
// converted to phred scale
120+
if (!parsed && idxGL >= 0) {
121+
phred.ReplaceTokens(pMarker->asSampleValues[idxGL + i * formatLength], ",");
122+
if (phred.Length() == 3 && phred[0] != "." && phred[1] != "." && phred[2] != ".") {
120123
phred11 =
121-
static_cast<int>(-10. * phred[idx11].AsDouble());
124+
static_cast<int>(-10. * phred[0].AsDouble());
122125
phred12 =
123-
static_cast<int>(-10. * phred[idx12].AsDouble());
126+
static_cast<int>(-10. * phred[1].AsDouble());
124127
phred22 =
125-
static_cast<int>(-10. * phred[idx22].AsDouble());
128+
static_cast<int>(-10. * phred[2].AsDouble());
129+
parsed = true;
126130
}
127-
else nMissingGenoSamples++;
128131
}
129-
else//found GT
130-
{
131-
phred.ReplaceTokens(pMarker->asSampleValues[PLidx + i * formatLength], "|/");
132-
if (phred.Length() == 3 && phred[0] != ".") {
132+
// Try GT: hard-call genotype (two allele indices separated by
133+
// / or |). Synthesize high-confidence phred scores from the
134+
// hard call since no likelihoods are available.
135+
if (!parsed && idxGT >= 0) {
136+
StringArray alleles;
137+
alleles.ReplaceTokens(pMarker->asSampleValues[idxGT + i * formatLength], "|/");
138+
if (alleles.Length() == 2 && alleles[0] != "." && alleles[1] != ".") {
133139
long geno =
134-
phred[0].AsInteger() + phred[1].AsInteger();
135-
if (geno == 0) {
140+
alleles[0].AsInteger() + alleles[1].AsInteger();
141+
if (geno == 0) { // 0/0: homozygous ref
136142
phred11 = 0;
137143
phred12 = 30;
138144
phred22 = 50;
139-
} else if (geno == 1) {
145+
} else if (geno == 1) { // 0/1: heterozygous
140146
phred11 = 50;
141147
phred12 = 0;
142148
phred22 = 50;
143-
} else {
149+
} else { // 1/1: homozygous alt
144150
phred11 = 50;
145151
phred12 = 30;
146152
phred22 = 0;
147153
}
154+
parsed = true;
148155
}
149-
else nMissingGenoSamples++;
156+
}
157+
158+
if (!parsed) {
159+
nMissingGenoSamples++;
160+
continue;
150161
}
151162

152163
if ((phred11 < 0) || (phred12 < 0) || (phred22 < 0)) {
@@ -156,8 +167,8 @@ int SVDcalculator::ReadVcf(const std::string &VcfPath,
156167
if (phred11 > maxPhred) phred11 = maxPhred;
157168
if (phred12 > maxPhred) phred12 = maxPhred;
158169
if (phred22 > maxPhred) phred22 = maxPhred;
159-
//
160-
// printf("phred scores are %f, %f, %f;\tphred11/12/22 %d, %d, %d\n", phred[idx11].AsDouble(), phred[idx12].AsDouble(), phred[idx22].AsDouble(),phred11,phred12,phred22);
170+
171+
// Pick the most likely genotype (lowest phred = highest likelihood)
161172
int minGeno = -1;
162173
long minPhred = maxPhred;
163174
if(phred11 < minPhred)

TestReadVcf.cpp

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
/// Integration test for SVDcalculator::ReadVcf genotype parsing.
2+
///
3+
/// Reads a test VCF that encodes expected AF and N_MISSING in each row's INFO
4+
/// field, calls ReadVcf, then validates the genotype matrix produces matching
5+
/// values. This exercises PL/GL/GT parsing, per-sample fallback, partial-missing
6+
/// handling, and phased genotypes.
7+
8+
#include "SVDcalculator.h"
9+
#include <cmath>
10+
#include <cstdlib>
11+
#include <fstream>
12+
#include <iostream>
13+
#include <sstream>
14+
#include <string>
15+
#include <vector>
16+
17+
struct Expected {
18+
std::string testDesc;
19+
double af;
20+
int nMissing;
21+
};
22+
23+
/// Parse EXPECTED_AF, EXPECTED_N_MISSING, and TESTDESC from the INFO column of
24+
/// each data line in the VCF. Only includes rows with FILTER=PASS (matching
25+
/// ReadVcf's filtering behavior).
26+
static std::vector<Expected> parseExpected(const std::string& vcfPath) {
27+
std::vector<Expected> result;
28+
std::ifstream fin(vcfPath);
29+
std::string line;
30+
31+
while (std::getline(fin, line)) {
32+
if (line.empty() || line[0] == '#') continue;
33+
34+
// Split line by tab
35+
std::vector<std::string> cols;
36+
std::istringstream iss(line);
37+
std::string col;
38+
while (std::getline(iss, col, '\t')) cols.push_back(col);
39+
if (cols.size() < 8) continue;
40+
41+
// Only include PASS rows (ReadVcf skips non-PASS)
42+
if (cols[6] != "PASS") continue;
43+
44+
Expected ev;
45+
ev.testDesc = "unknown";
46+
ev.af = -1.0;
47+
ev.nMissing = -1;
48+
49+
// Parse semicolon-delimited INFO field
50+
std::istringstream infoStream(cols[7]);
51+
std::string field;
52+
while (std::getline(infoStream, field, ';')) {
53+
size_t eq = field.find('=');
54+
if (eq == std::string::npos) continue;
55+
std::string key = field.substr(0, eq);
56+
std::string val = field.substr(eq + 1);
57+
if (key == "TESTDESC") ev.testDesc = val;
58+
else if (key == "EXPECTED_AF") ev.af = std::stod(val);
59+
else if (key == "EXPECTED_N_MISSING") ev.nMissing = std::stoi(val);
60+
}
61+
62+
result.push_back(ev);
63+
}
64+
return result;
65+
}
66+
67+
int main(int argc, char** argv) {
68+
if (argc != 2) {
69+
std::cerr << "Usage: " << argv[0] << " <test.vcf>" << std::endl;
70+
return 1;
71+
}
72+
std::string vcfPath = argv[1];
73+
74+
// Run ReadVcf
75+
SVDcalculator calc;
76+
std::vector<std::vector<char>> genotype;
77+
int nSamples = 0, nMarkers = 0;
78+
std::unordered_set<std::string> includeChr;
79+
calc.ReadVcf(vcfPath, genotype, nSamples, nMarkers, includeChr);
80+
81+
// Parse expected values from VCF INFO fields
82+
std::vector<Expected> expected = parseExpected(vcfPath);
83+
84+
if (nMarkers != static_cast<int>(expected.size())) {
85+
std::cerr << "FAIL: expected " << expected.size()
86+
<< " markers but ReadVcf returned " << nMarkers << std::endl;
87+
return 1;
88+
}
89+
90+
int failures = 0;
91+
for (int m = 0; m < nMarkers; m++) {
92+
const auto& geno = genotype[m];
93+
const auto& exp = expected[m];
94+
95+
// Compute observed N_MISSING and AF from the genotype vector
96+
int nMissing = 0;
97+
int sumGeno = 0;
98+
int nNonMissing = 0;
99+
for (int s = 0; s < nSamples; s++) {
100+
if (geno[s] < 0) {
101+
nMissing++;
102+
} else {
103+
sumGeno += geno[s];
104+
nNonMissing++;
105+
}
106+
}
107+
double af = (nNonMissing > 0)
108+
? static_cast<double>(sumGeno) / (2.0 * nNonMissing)
109+
: 0.0;
110+
111+
bool ok = true;
112+
if (nMissing != exp.nMissing) {
113+
std::cerr << "FAIL [" << exp.testDesc << "]: N_MISSING expected "
114+
<< exp.nMissing << " got " << nMissing << std::endl;
115+
ok = false;
116+
}
117+
if (std::fabs(af - exp.af) > 0.001) {
118+
std::cerr << "FAIL [" << exp.testDesc << "]: AF expected "
119+
<< exp.af << " got " << af << std::endl;
120+
ok = false;
121+
}
122+
123+
if (ok) {
124+
std::cerr << "PASS [" << exp.testDesc << "]" << std::endl;
125+
} else {
126+
failures++;
127+
}
128+
}
129+
130+
if (failures > 0) {
131+
std::cerr << failures << " of " << nMarkers << " test(s) FAILED" << std::endl;
132+
return 1;
133+
}
134+
135+
std::cerr << "All " << nMarkers << " tests passed" << std::endl;
136+
return 0;
137+
}

resource/test/test_readvcf.vcf

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
##fileformat=VCFv4.1
2+
##INFO=<ID=TESTDESC,Number=1,Type=String,Description="Description of what this VCF row tests">
3+
##INFO=<ID=EXPECTED_AF,Number=1,Type=Float,Description="Expected alt allele frequency computed from correctly parsed genotypes">
4+
##INFO=<ID=EXPECTED_N_MISSING,Number=1,Type=Integer,Description="Expected number of samples with missing genotypes after parsing">
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods">
7+
##FORMAT=<ID=GL,Number=G,Type=Float,Description="Log10-scaled genotype likelihoods">
8+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S01 S02 S03 S04 S05 S06 S07 S08 S09 S10
9+
20 100000 . A T 60 PASS TESTDESC=PL_only_all_present;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 PL 0,30,50 0,30,50 50,0,50 50,0,50 50,30,0 50,30,0 0,30,50 50,0,50 50,30,0 0,30,50
10+
20 101000 . A T 60 PASS TESTDESC=GL_only_all_present;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GL 0,-3,-5 0,-3,-5 -5,0,-5 -5,0,-5 -5,-3,0 -5,-3,0 0,-3,-5 -5,0,-5 -5,-3,0 0,-3,-5
11+
20 102000 . A T 60 PASS TESTDESC=GT_only_all_present;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GT 0/0 0/0 0/1 0/1 1/1 1/1 0/0 0/1 1/1 0/0
12+
20 103000 . A T 60 PASS TESTDESC=PL_missing_falls_back_to_GT;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GT:PL 0/0:0,30,50 0/0:. 0/1:50,0,50 0/1:. 1/1:50,30,0 1/1:. 0/0:0,30,50 0/1:50,0,50 1/1:50,30,0 0/0:0,30,50
13+
20 104000 . A T 60 PASS TESTDESC=GL_missing_falls_back_to_GT;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GT:GL 0/0:0,-3,-5 0/0:. 0/1:-5,0,-5 0/1:. 1/1:-5,-3,0 1/1:. 0/0:0,-3,-5 0/1:-5,0,-5 1/1:-5,-3,0 0/0:0,-3,-5
14+
20 105000 . A T 60 PASS TESTDESC=PL_GL_GT_cascade_fallback;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GT:PL:GL 0/0:0,30,50:0,-3,-5 0/1:.:-5,0,-5 1/1:.:. 0/0:0,30,50:0,-3,-5 0/1:.:-5,0,-5 1/1:.:. 0/0:0,30,50:0,-3,-5 0/1:.:-5,0,-5 1/1:50,30,0:-5,-3,0 0/0:.:0,-3,-5
15+
20 106000 . A T 60 PASS TESTDESC=PL_partial_missing_falls_back_to_GT;EXPECTED_AF=0.35;EXPECTED_N_MISSING=0 GT:PL 0/0:50,.,. 0/0:0,30,50 0/1:50,0,50 0/0:50,.,30 0/1:50,0,50 1/1:50,30,0 0/0:0,30,50 0/1:50,0,50 1/1:50,30,0 0/0:0,30,50
16+
20 107000 . A T 60 PASS TESTDESC=GL_partial_missing_falls_back_to_GT;EXPECTED_AF=0.35;EXPECTED_N_MISSING=0 GT:GL 0/0:-5,.,. 0/0:0,-3,-5 0/1:-5,0,-5 0/0:-5,.,0 0/1:-5,0,-5 1/1:-5,-3,0 0/0:0,-3,-5 0/1:-5,0,-5 1/1:-5,-3,0 0/0:0,-3,-5
17+
20 108000 . A T 60 PASS TESTDESC=GT_partial_missing_one_allele;EXPECTED_AF=0.5;EXPECTED_N_MISSING=1 GT 0/0 ./1 0/1 0/1 1/1 1/1 0/0 0/1 1/1 0/0
18+
20 109000 . A T 60 PASS TESTDESC=GT_phased_genotypes;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GT 0|0 0|0 0|1 1|0 1|1 1|1 0|0 0|1 1|1 0|0
19+
20 110000 . A T 60 PASS TESTDESC=all_fields_missing_two_samples;EXPECTED_AF=0.4375;EXPECTED_N_MISSING=2 GT:PL:GL 0/0:0,30,50:0,-3,-5 0/1:50,0,50:-5,0,-5 ./.:.:. 1/1:50,30,0:-5,-3,0 0/0:0,30,50:0,-3,-5 ./.:.:. 0/1:50,0,50:-5,0,-5 1/1:50,30,0:-5,-3,0 0/0:0,30,50:0,-3,-5 0/1:50,0,50:-5,0,-5
20+
20 111000 . A T 60 PASS TESTDESC=PL_takes_priority_over_GL;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GT:PL:GL 0/0:0,30,50:-5,0,-5 0/1:50,0,50:0,-3,-5 1/1:50,30,0:-5,-3,0 0/0:0,30,50:-5,0,-5 0/1:50,0,50:0,-3,-5 1/1:50,30,0:-5,-3,0 0/0:0,30,50:-5,0,-5 0/1:50,0,50:0,-3,-5 1/1:50,30,0:-5,-3,0 0/0:0,30,50:0,-3,-5

0 commit comments

Comments
 (0)