Skip to content

Commit 8ef3290

Browse files
committed
Add ReadVcf integration test for genotype parsing
Adds a test VCF with 12 rows covering PL-only, GL-only, GT-only, PL/GL/GT fallback, partial-missing fields (e.g. "50,.,."), phased genotypes, and priority ordering. Each row encodes EXPECTED_AF and EXPECTED_N_MISSING in the INFO field. The test calls ReadVcf directly and validates computed AF and missing counts match expectations.
1 parent 4678dad commit 8ef3290

3 files changed

Lines changed: 163 additions & 0 deletions

File tree

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,13 @@ add_executable(VerifyBamID ${SOURCE_FILES})
6969

7070
target_link_libraries(VerifyBamID statgen Vcf ${HTS_LIBRARIES} samtools ${ZLIB} ${BZIP2} ${LZMA} ${CURLLIB})
7171

72+
add_executable(TestReadVcf TestReadVcf.cpp SVDcalculator.cpp)
73+
target_link_libraries(TestReadVcf statgen Vcf ${HTS_LIBRARIES} samtools ${ZLIB} ${BZIP2} ${LZMA} ${CURLLIB})
74+
7275
enable_testing()
76+
add_test(NAME testReadVcf
77+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
78+
COMMAND TestReadVcf resource/test/test_readvcf.vcf)
7379
add_test(NAME myTest1
7480
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
7581
COMMAND VerifyBamID --DisableSanityCheck --BamFile resource/test/test.bam --SVDPrefix resource/test/hapmap_3.3.b37.dat --Reference resource/test/chr20.fa.gz --NumPC 2)

TestReadVcf.cpp

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
/// Integration test for SVDcalculator::ReadVcf genotype parsing.
2+
///
3+
/// Reads a test VCF that encodes expected AF and N_MISSING in each row's INFO
4+
/// field, calls ReadVcf, then validates the genotype matrix produces matching
5+
/// values. This exercises PL/GL/GT parsing, per-sample fallback, partial-missing
6+
/// handling, and phased genotypes.
7+
8+
#include "SVDcalculator.h"
9+
#include <cmath>
10+
#include <cstdlib>
11+
#include <fstream>
12+
#include <iostream>
13+
#include <sstream>
14+
#include <string>
15+
#include <vector>
16+
17+
struct Expected {
18+
std::string testDesc;
19+
double af;
20+
int nMissing;
21+
};
22+
23+
/// Parse EXPECTED_AF, EXPECTED_N_MISSING, and TESTDESC from the INFO column of
24+
/// each data line in the VCF. Only includes rows with FILTER=PASS (matching
25+
/// ReadVcf's filtering behavior).
26+
static std::vector<Expected> parseExpected(const std::string& vcfPath) {
27+
std::vector<Expected> result;
28+
std::ifstream fin(vcfPath);
29+
std::string line;
30+
31+
while (std::getline(fin, line)) {
32+
if (line.empty() || line[0] == '#') continue;
33+
34+
// Split line by tab
35+
std::vector<std::string> cols;
36+
std::istringstream iss(line);
37+
std::string col;
38+
while (std::getline(iss, col, '\t')) cols.push_back(col);
39+
if (cols.size() < 8) continue;
40+
41+
// Only include PASS rows (ReadVcf skips non-PASS)
42+
if (cols[6] != "PASS") continue;
43+
44+
Expected ev;
45+
ev.testDesc = "unknown";
46+
ev.af = -1.0;
47+
ev.nMissing = -1;
48+
49+
// Parse semicolon-delimited INFO field
50+
std::istringstream infoStream(cols[7]);
51+
std::string field;
52+
while (std::getline(infoStream, field, ';')) {
53+
size_t eq = field.find('=');
54+
if (eq == std::string::npos) continue;
55+
std::string key = field.substr(0, eq);
56+
std::string val = field.substr(eq + 1);
57+
if (key == "TESTDESC") ev.testDesc = val;
58+
else if (key == "EXPECTED_AF") ev.af = std::stod(val);
59+
else if (key == "EXPECTED_N_MISSING") ev.nMissing = std::stoi(val);
60+
}
61+
62+
result.push_back(ev);
63+
}
64+
return result;
65+
}
66+
67+
int main(int argc, char** argv) {
68+
if (argc != 2) {
69+
std::cerr << "Usage: " << argv[0] << " <test.vcf>" << std::endl;
70+
return 1;
71+
}
72+
std::string vcfPath = argv[1];
73+
74+
// Run ReadVcf
75+
SVDcalculator calc;
76+
std::vector<std::vector<char>> genotype;
77+
int nSamples = 0, nMarkers = 0;
78+
std::unordered_set<std::string> includeChr;
79+
calc.ReadVcf(vcfPath, genotype, nSamples, nMarkers, includeChr);
80+
81+
// Parse expected values from VCF INFO fields
82+
std::vector<Expected> expected = parseExpected(vcfPath);
83+
84+
if (nMarkers != static_cast<int>(expected.size())) {
85+
std::cerr << "FAIL: expected " << expected.size()
86+
<< " markers but ReadVcf returned " << nMarkers << std::endl;
87+
return 1;
88+
}
89+
90+
int failures = 0;
91+
for (int m = 0; m < nMarkers; m++) {
92+
const auto& geno = genotype[m];
93+
const auto& exp = expected[m];
94+
95+
// Compute observed N_MISSING and AF from the genotype vector
96+
int nMissing = 0;
97+
int sumGeno = 0;
98+
int nNonMissing = 0;
99+
for (int s = 0; s < nSamples; s++) {
100+
if (geno[s] < 0) {
101+
nMissing++;
102+
} else {
103+
sumGeno += geno[s];
104+
nNonMissing++;
105+
}
106+
}
107+
double af = (nNonMissing > 0)
108+
? static_cast<double>(sumGeno) / (2.0 * nNonMissing)
109+
: 0.0;
110+
111+
bool ok = true;
112+
if (nMissing != exp.nMissing) {
113+
std::cerr << "FAIL [" << exp.testDesc << "]: N_MISSING expected "
114+
<< exp.nMissing << " got " << nMissing << std::endl;
115+
ok = false;
116+
}
117+
if (std::fabs(af - exp.af) > 0.001) {
118+
std::cerr << "FAIL [" << exp.testDesc << "]: AF expected "
119+
<< exp.af << " got " << af << std::endl;
120+
ok = false;
121+
}
122+
123+
if (ok) {
124+
std::cerr << "PASS [" << exp.testDesc << "]" << std::endl;
125+
} else {
126+
failures++;
127+
}
128+
}
129+
130+
if (failures > 0) {
131+
std::cerr << failures << " of " << nMarkers << " test(s) FAILED" << std::endl;
132+
return 1;
133+
}
134+
135+
std::cerr << "All " << nMarkers << " tests passed" << std::endl;
136+
return 0;
137+
}

resource/test/test_readvcf.vcf

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
##fileformat=VCFv4.1
2+
##INFO=<ID=TESTDESC,Number=1,Type=String,Description="Description of what this VCF row tests">
3+
##INFO=<ID=EXPECTED_AF,Number=1,Type=Float,Description="Expected alt allele frequency computed from correctly parsed genotypes">
4+
##INFO=<ID=EXPECTED_N_MISSING,Number=1,Type=Integer,Description="Expected number of samples with missing genotypes after parsing">
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods">
7+
##FORMAT=<ID=GL,Number=G,Type=Float,Description="Log10-scaled genotype likelihoods">
8+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S01 S02 S03 S04 S05 S06 S07 S08 S09 S10
9+
20 100000 . A T 60 PASS TESTDESC=PL_only_all_present;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 PL 0,30,50 0,30,50 50,0,50 50,0,50 50,30,0 50,30,0 0,30,50 50,0,50 50,30,0 0,30,50
10+
20 101000 . A T 60 PASS TESTDESC=GL_only_all_present;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GL 0,-3,-5 0,-3,-5 -5,0,-5 -5,0,-5 -5,-3,0 -5,-3,0 0,-3,-5 -5,0,-5 -5,-3,0 0,-3,-5
11+
20 102000 . A T 60 PASS TESTDESC=GT_only_all_present;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GT 0/0 0/0 0/1 0/1 1/1 1/1 0/0 0/1 1/1 0/0
12+
20 103000 . A T 60 PASS TESTDESC=PL_missing_falls_back_to_GT;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GT:PL 0/0:0,30,50 0/0:. 0/1:50,0,50 0/1:. 1/1:50,30,0 1/1:. 0/0:0,30,50 0/1:50,0,50 1/1:50,30,0 0/0:0,30,50
13+
20 104000 . A T 60 PASS TESTDESC=GL_missing_falls_back_to_GT;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GT:GL 0/0:0,-3,-5 0/0:. 0/1:-5,0,-5 0/1:. 1/1:-5,-3,0 1/1:. 0/0:0,-3,-5 0/1:-5,0,-5 1/1:-5,-3,0 0/0:0,-3,-5
14+
20 105000 . A T 60 PASS TESTDESC=PL_GL_GT_cascade_fallback;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GT:PL:GL 0/0:0,30,50:0,-3,-5 0/1:.:-5,0,-5 1/1:.:. 0/0:0,30,50:0,-3,-5 0/1:.:-5,0,-5 1/1:.:. 0/0:0,30,50:0,-3,-5 0/1:.:-5,0,-5 1/1:50,30,0:-5,-3,0 0/0:.:0,-3,-5
15+
20 106000 . A T 60 PASS TESTDESC=PL_partial_missing_falls_back_to_GT;EXPECTED_AF=0.35;EXPECTED_N_MISSING=0 GT:PL 0/0:50,.,. 0/0:0,30,50 0/1:50,0,50 0/0:50,.,30 0/1:50,0,50 1/1:50,30,0 0/0:0,30,50 0/1:50,0,50 1/1:50,30,0 0/0:0,30,50
16+
20 107000 . A T 60 PASS TESTDESC=GL_partial_missing_falls_back_to_GT;EXPECTED_AF=0.35;EXPECTED_N_MISSING=0 GT:GL 0/0:-5,.,. 0/0:0,-3,-5 0/1:-5,0,-5 0/0:-5,.,0 0/1:-5,0,-5 1/1:-5,-3,0 0/0:0,-3,-5 0/1:-5,0,-5 1/1:-5,-3,0 0/0:0,-3,-5
17+
20 108000 . A T 60 PASS TESTDESC=GT_partial_missing_one_allele;EXPECTED_AF=0.5;EXPECTED_N_MISSING=1 GT 0/0 ./1 0/1 0/1 1/1 1/1 0/0 0/1 1/1 0/0
18+
20 109000 . A T 60 PASS TESTDESC=GT_phased_genotypes;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GT 0|0 0|0 0|1 1|0 1|1 1|1 0|0 0|1 1|1 0|0
19+
20 110000 . A T 60 PASS TESTDESC=all_fields_missing_two_samples;EXPECTED_AF=0.4375;EXPECTED_N_MISSING=2 GT:PL:GL 0/0:0,30,50:0,-3,-5 0/1:50,0,50:-5,0,-5 ./.:.:. 1/1:50,30,0:-5,-3,0 0/0:0,30,50:0,-3,-5 ./.:.:. 0/1:50,0,50:-5,0,-5 1/1:50,30,0:-5,-3,0 0/0:0,30,50:0,-3,-5 0/1:50,0,50:-5,0,-5
20+
20 111000 . A T 60 PASS TESTDESC=PL_takes_priority_over_GL;EXPECTED_AF=0.45;EXPECTED_N_MISSING=0 GT:PL:GL 0/0:0,30,50:-5,0,-5 0/1:50,0,50:0,-3,-5 1/1:50,30,0:-5,-3,0 0/0:0,30,50:-5,0,-5 0/1:50,0,50:0,-3,-5 1/1:50,30,0:-5,-3,0 0/0:0,30,50:-5,0,-5 0/1:50,0,50:0,-3,-5 1/1:50,30,0:-5,-3,0 0/0:0,30,50:0,-3,-5

0 commit comments

Comments
 (0)