diff --git a/SVDcalculator.cpp b/SVDcalculator.cpp index 3d5273e..e3fc8f6 100644 --- a/SVDcalculator.cpp +++ b/SVDcalculator.cpp @@ -2,6 +2,7 @@ #include "Eigen/Dense" #include "libVcf/libVcfFile.h" #include +#include #include #include #include @@ -201,7 +202,8 @@ int SVDcalculator::ReadVcf(const std::string &VcfPath, void SVDcalculator::ProcessRefVCF(const std::string &VcfPath, const std::unordered_set& includeChr, - bool skipMinSampleCountCheck) + bool skipMinSampleCountCheck, + int numSVDPCs) { std::vector > genotype;//markers X samples @@ -244,6 +246,33 @@ void SVDcalculator::ProcessRefVCF(const std::string &VcfPath, Mu.push_back(mu(rowIdx)); } JacobiSVD svd(genoMatrix, ComputeThinU | ComputeThinV); + + // Log proportion of variance explained by each principal component. + // Variance explained by PC_i = sigma_i^2 / sum(sigma_j^2) where sigma + // are the singular values. This helps users choose an appropriate --NumPC. + auto singularValues = svd.singularValues(); + double totalVariance = 0.0; + for (int i = 0; i < singularValues.size(); ++i) { + totalVariance += (double)singularValues[i] * (double)singularValues[i]; + } + notice("SVD completed. Total singular values: %d", (int)singularValues.size()); + double cumulative = 0.0; + int numToLog = std::min((int)singularValues.size(), 20); + if (totalVariance <= 0.0) { + warning("Total variance is zero after SVD; skipping variance-explained logging for principal components."); + } else { + for (int i = 0; i < numToLog; ++i) { + double sv = singularValues[i]; + double varExplained = (sv * sv) / totalVariance; + cumulative += varExplained; + notice(" PC%d: singular_value=%.4f variance_explained=%.4f (%.2f%%) cumulative=%.4f (%.2f%%)", + i + 1, sv, varExplained, varExplained * 100.0, cumulative, cumulative * 100.0); + } + if ((int)singularValues.size() > numToLog) { + notice(" ... (%d more components not shown)", (int)singularValues.size() - numToLog); + } + } + auto matrixD = svd.singularValues().asDiagonal(); MatrixXf matrixUD = svd.matrixU() * matrixD;//marker X PC UD.resize(matrixUD.rows(),std::vector(matrixUD.cols(),0.f)); @@ -260,7 +289,7 @@ void SVDcalculator::ProcessRefVCF(const std::string &VcfPath, } } - WriteSVD(VcfPath); + WriteSVD(VcfPath, numSVDPCs); } vector> SVDcalculator::GetUDMatrix() { @@ -283,7 +312,18 @@ vector SVDcalculator::GetBedVec() { return BedVec; } -void SVDcalculator::WriteSVD(const std::string &Prefix) { +void SVDcalculator::WriteSVD(const std::string &Prefix, int numSVDPCs) { + // Determine how many PCs to write + int numAvailable = (numMarker > 0 && !UD.empty()) ? (int)UD[0].size() : 0; + int numToWrite; + if (numSVDPCs <= 0) { + numToWrite = numAvailable; + } else { + numToWrite = std::min(numSVDPCs, numAvailable); + } + notice("Writing SVD output files with %d PCs (of %d available) to prefix: %s", + numToWrite, numAvailable, Prefix.c_str()); + std::ofstream fMu(Prefix+".mu"); std::ofstream fUD(Prefix+".UD"); std::ofstream fPC(Prefix+".V"); @@ -296,14 +336,14 @@ void SVDcalculator::WriteSVD(const std::string &Prefix) { end=BedVec[i].end; fMu<& includeChr, - bool skipMinSampleCountCheck = false); + bool skipMinSampleCountCheck = false, + int numSVDPCs = 10); /// Parse a VCF into a genotype matrix. /// @param includeChr if non-empty, only markers on these chromosomes are used @@ -42,7 +44,8 @@ class SVDcalculator { std::vector GetMuArray(); BED GetchooseBed(); std::vector GetBedVec(); - void WriteSVD(const std::string &VcfPath); + /// @param numSVDPCs number of PCs to write (0 = all available) + void WriteSVD(const std::string &VcfPath, int numSVDPCs); }; diff --git a/main.cpp b/main.cpp index f000486..30ee482 100644 --- a/main.cpp +++ b/main.cpp @@ -43,6 +43,8 @@ int execute(int argc, char **argv) { SVDPrefix("Empty"); std::string knownAF("Empty"); std::string RefVCF("Empty"); + + int numSVDPCs(10); bool skipMinSampleCountCheck(false); // Default to human autosomes (both with and without "chr" prefix). // Override with --IncludeChr for non-human species. @@ -50,6 +52,7 @@ int execute(int argc, char **argv) { "chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10," "chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19," "chr20,chr21,chr22"); + std::string fixPC("Empty"); double fixAlpha(-1.), epsilon(1e-8); bool withinAncestry(false), outputPileup(false), verbose(false), @@ -127,6 +130,9 @@ int execute(int argc, char **argv) { "[String] VCF file from which to extract reference " "panel's genotype matrix[Required if no SVD files " "available]") + LONG_INT_PARAM("NumSVDPCs", &numSVDPCs, + "[Int] Number of principal components to write to SVD " + "output files. Set to 0 for all components[default:10]") LONG_PARAM("SkipMinSampleCountCheck", &skipMinSampleCountCheck, "[Bool] ADVANCED: skip the minimum sample count check (1000) " "when generating SVD files. Using fewer than 1000 samples may " @@ -216,7 +222,7 @@ int execute(int argc, char **argv) { } notice("--IncludeChr: filtering to %d chromosome name(s)", (int)includeChrSet.size()); SVDcalculator calculator; - calculator.ProcessRefVCF(RefVCF, includeChrSet, skipMinSampleCountCheck); + calculator.ProcessRefVCF(RefVCF, includeChrSet, skipMinSampleCountCheck, numSVDPCs); notice("Success!"); return 0; }