Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 45 additions & 5 deletions SVDcalculator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "Eigen/Dense"
#include "libVcf/libVcfFile.h"
#include <Error.h>
#include <algorithm>
#include <fstream>
#include <unordered_map>
#include <unordered_set>
Expand Down Expand Up @@ -201,7 +202,8 @@ int SVDcalculator::ReadVcf(const std::string &VcfPath,

void SVDcalculator::ProcessRefVCF(const std::string &VcfPath,
const std::unordered_set<std::string>& includeChr,
bool skipMinSampleCountCheck)
bool skipMinSampleCountCheck,
int numSVDPCs)
{

std::vector<std::vector<char> > genotype;//markers X samples
Expand Down Expand Up @@ -244,6 +246,33 @@ void SVDcalculator::ProcessRefVCF(const std::string &VcfPath,
Mu.push_back(mu(rowIdx));
}
JacobiSVD<MatrixXf> svd(genoMatrix, ComputeThinU | ComputeThinV);

// Log proportion of variance explained by each principal component.
// Variance explained by PC_i = sigma_i^2 / sum(sigma_j^2) where sigma
// are the singular values. This helps users choose an appropriate --NumPC.
auto singularValues = svd.singularValues();
double totalVariance = 0.0;
for (int i = 0; i < singularValues.size(); ++i) {
totalVariance += (double)singularValues[i] * (double)singularValues[i];
}
notice("SVD completed. Total singular values: %d", (int)singularValues.size());
double cumulative = 0.0;
int numToLog = std::min((int)singularValues.size(), 20);
if (totalVariance <= 0.0) {
warning("Total variance is zero after SVD; skipping variance-explained logging for principal components.");
} else {
for (int i = 0; i < numToLog; ++i) {
double sv = singularValues[i];
double varExplained = (sv * sv) / totalVariance;
cumulative += varExplained;
notice(" PC%d: singular_value=%.4f variance_explained=%.4f (%.2f%%) cumulative=%.4f (%.2f%%)",
i + 1, sv, varExplained, varExplained * 100.0, cumulative, cumulative * 100.0);
}
if ((int)singularValues.size() > numToLog) {
notice(" ... (%d more components not shown)", (int)singularValues.size() - numToLog);
}
}

auto matrixD = svd.singularValues().asDiagonal();
MatrixXf matrixUD = svd.matrixU() * matrixD;//marker X PC
UD.resize(matrixUD.rows(),std::vector<double>(matrixUD.cols(),0.f));
Expand All @@ -260,7 +289,7 @@ void SVDcalculator::ProcessRefVCF(const std::string &VcfPath,
}
}

WriteSVD(VcfPath);
WriteSVD(VcfPath, numSVDPCs);
}

vector<vector<double>> SVDcalculator::GetUDMatrix() {
Expand All @@ -283,7 +312,18 @@ vector<region_t> SVDcalculator::GetBedVec() {
return BedVec;
}

void SVDcalculator::WriteSVD(const std::string &Prefix) {
void SVDcalculator::WriteSVD(const std::string &Prefix, int numSVDPCs) {
// Determine how many PCs to write
int numAvailable = (numMarker > 0 && !UD.empty()) ? (int)UD[0].size() : 0;
int numToWrite;
if (numSVDPCs <= 0) {
numToWrite = numAvailable;
} else {
numToWrite = std::min(numSVDPCs, numAvailable);
}
Comment thread
Griffan marked this conversation as resolved.
notice("Writing SVD output files with %d PCs (of %d available) to prefix: %s",
numToWrite, numAvailable, Prefix.c_str());

std::ofstream fMu(Prefix+".mu");
std::ofstream fUD(Prefix+".UD");
std::ofstream fPC(Prefix+".V");
Expand All @@ -296,14 +336,14 @@ void SVDcalculator::WriteSVD(const std::string &Prefix) {
end=BedVec[i].end;
fMu<<chr+":"+std::to_string(end)<<"\t"<<Mu[i]<<std::endl;
fBed<<chr<<"\t"<<beg<<"\t"<<end<<"\t"<<chooseBed[chr][end].first<<"\t"<<chooseBed[chr][end].second<<std::endl;
for (int j = 0; j < 10/*UD[i].size()*/ ; ++j) {
for (int j = 0; j < numToWrite; ++j) {
fUD<<UD[i][j]<<"\t";
}
fUD<<std::endl;
}
for (int k = 0; k <numIndividual; ++k) {
fPC<<Samples[k]<<"\t";
for (int i = 0; i < 10/*PC[k].size()*/; ++i) {
for (int i = 0; i < numToWrite; ++i) {
fPC<<PC[k][i]<<"\t";
}
fPC<<std::endl;
Expand Down
7 changes: 5 additions & 2 deletions SVDcalculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@ class SVDcalculator {
/// Read a reference VCF, compute SVD, and write .UD, .mu, .bed, .V files.
/// @param includeChr if non-empty, only markers on these chromosomes are used
/// @param skipMinSampleCountCheck if true, < 1000 samples becomes a warning not an error
/// @param numSVDPCs number of PCs to write (0 = all)
void ProcessRefVCF(const std::string& VcfPath,
const std::unordered_set<std::string>& includeChr,
bool skipMinSampleCountCheck = false);
bool skipMinSampleCountCheck = false,
int numSVDPCs = 10);

/// Parse a VCF into a genotype matrix.
/// @param includeChr if non-empty, only markers on these chromosomes are used
Expand All @@ -42,7 +44,8 @@ class SVDcalculator {
std::vector<PCtype> GetMuArray();
BED GetchooseBed();
std::vector<region_t> GetBedVec();
void WriteSVD(const std::string &VcfPath);
/// @param numSVDPCs number of PCs to write (0 = all available)
void WriteSVD(const std::string &VcfPath, int numSVDPCs);
};


Expand Down
8 changes: 7 additions & 1 deletion main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,16 @@ int execute(int argc, char **argv) {
SVDPrefix("Empty");
std::string knownAF("Empty");
std::string RefVCF("Empty");

int numSVDPCs(10);
bool skipMinSampleCountCheck(false);
// Default to human autosomes (both with and without "chr" prefix).
// Override with --IncludeChr for non-human species.
std::string includeChrStr("1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,"
"chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,"
"chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,"
"chr20,chr21,chr22");

std::string fixPC("Empty");
double fixAlpha(-1.), epsilon(1e-8);
bool withinAncestry(false), outputPileup(false), verbose(false),
Expand Down Expand Up @@ -127,6 +130,9 @@ int execute(int argc, char **argv) {
"[String] VCF file from which to extract reference "
"panel's genotype matrix[Required if no SVD files "
"available]")
LONG_INT_PARAM("NumSVDPCs", &numSVDPCs,
"[Int] Number of principal components to write to SVD "
"output files. Set to 0 for all components[default:10]")
LONG_PARAM("SkipMinSampleCountCheck", &skipMinSampleCountCheck,
"[Bool] ADVANCED: skip the minimum sample count check (1000) "
"when generating SVD files. Using fewer than 1000 samples may "
Expand Down Expand Up @@ -216,7 +222,7 @@ int execute(int argc, char **argv) {
}
notice("--IncludeChr: filtering to %d chromosome name(s)", (int)includeChrSet.size());
SVDcalculator calculator;
calculator.ProcessRefVCF(RefVCF, includeChrSet, skipMinSampleCountCheck);
calculator.ProcessRefVCF(RefVCF, includeChrSet, skipMinSampleCountCheck, numSVDPCs);
notice("Success!");
return 0;
}
Expand Down
Loading