From 3c5fd868571ec9cd5510dca41685dbe7315c06db Mon Sep 17 00:00:00 2001 From: tfenne Date: Thu, 9 Apr 2026 15:40:43 -0600 Subject: [PATCH 1/2] Add --NumSVDPCs and log variance explained per principal component MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WriteSVD() hardcoded writing exactly 10 principal components to the .UD and .V output files. With fewer than 10 individuals (or fewer than 10 components from the SVD), this read past the end of the vectors — undefined behavior that could crash or produce garbage. Add --NumSVDPCs (default: 10, matching prior behavior; set to 0 for all available components). WriteSVD now respects this parameter and caps at the actual number of components available. After SVD computation, log the singular value, proportion of variance explained, and cumulative variance for the first 20 components. This helps users choose an appropriate --NumPC for estimation and verify that their reference panel has meaningful population structure. --- SVDcalculator.cpp | 45 ++++++++++++++++++++++++++++++++++++++++----- SVDcalculator.h | 6 ++++-- main.cpp | 6 +++++- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/SVDcalculator.cpp b/SVDcalculator.cpp index c698ea9..b263af5 100644 --- a/SVDcalculator.cpp +++ b/SVDcalculator.cpp @@ -2,6 +2,7 @@ #include "Eigen/Dense" #include "libVcf/libVcfFile.h" #include +#include #include #include #include @@ -201,7 +202,7 @@ int SVDcalculator::ReadVcf(const std::string &VcfPath, return 0; } -void SVDcalculator::ProcessRefVCF(const std::string &VcfPath) +void SVDcalculator::ProcessRefVCF(const std::string &VcfPath, int numSVDPCs) { std::vector > genotype;//markers X samples @@ -230,6 +231,29 @@ void SVDcalculator::ProcessRefVCF(const std::string &VcfPath) Mu.push_back(mu(rowIdx)); } JacobiSVD svd(genoMatrix, ComputeThinU | ComputeThinV); + + // Log proportion of variance explained by each principal component. + // Variance explained by PC_i = sigma_i^2 / sum(sigma_j^2) where sigma + // are the singular values. This helps users choose an appropriate --NumPC. + auto singularValues = svd.singularValues(); + double totalVariance = 0.0; + for (int i = 0; i < singularValues.size(); ++i) { + totalVariance += (double)singularValues[i] * (double)singularValues[i]; + } + notice("SVD completed. Total singular values: %d", (int)singularValues.size()); + double cumulative = 0.0; + int numToLog = std::min((int)singularValues.size(), 20); + for (int i = 0; i < numToLog; ++i) { + double sv = singularValues[i]; + double varExplained = (sv * sv) / totalVariance; + cumulative += varExplained; + notice(" PC%d: singular_value=%.4f variance_explained=%.4f (%.2f%%) cumulative=%.4f (%.2f%%)", + i + 1, sv, varExplained, varExplained * 100.0, cumulative, cumulative * 100.0); + } + if ((int)singularValues.size() > numToLog) { + notice(" ... (%d more components not shown)", (int)singularValues.size() - numToLog); + } + auto matrixD = svd.singularValues().asDiagonal(); MatrixXf matrixUD = svd.matrixU() * matrixD;//marker X PC UD.resize(matrixUD.rows(),std::vector(matrixUD.cols(),0.f)); @@ -246,7 +270,7 @@ void SVDcalculator::ProcessRefVCF(const std::string &VcfPath) } } - WriteSVD(VcfPath); + WriteSVD(VcfPath, numSVDPCs); } vector> SVDcalculator::GetUDMatrix() { @@ -269,7 +293,18 @@ vector SVDcalculator::GetBedVec() { return BedVec; } -void SVDcalculator::WriteSVD(const std::string &Prefix) { +void SVDcalculator::WriteSVD(const std::string &Prefix, int numSVDPCs) { + // Determine how many PCs to write + int numAvailable = (numMarker > 0 && !UD.empty()) ? (int)UD[0].size() : 0; + int numToWrite; + if (numSVDPCs <= 0) { + numToWrite = numAvailable; + } else { + numToWrite = std::min(numSVDPCs, numAvailable); + } + notice("Writing SVD output files with %d PCs (of %d available) to prefix: %s", + numToWrite, numAvailable, Prefix.c_str()); + std::ofstream fMu(Prefix+".mu"); std::ofstream fUD(Prefix+".UD"); std::ofstream fPC(Prefix+".V"); @@ -282,14 +317,14 @@ void SVDcalculator::WriteSVD(const std::string &Prefix) { end=BedVec[i].end; fMu< >& genotype, int & nSamples, int& nMarkers); @@ -31,7 +32,8 @@ class SVDcalculator { std::vector GetMuArray(); BED GetchooseBed(); std::vector GetBedVec(); - void WriteSVD(const std::string &VcfPath); + /// @param numSVDPCs number of PCs to write (0 = all available) + void WriteSVD(const std::string &VcfPath, int numSVDPCs); }; diff --git a/main.cpp b/main.cpp index daa18a6..ce996cc 100644 --- a/main.cpp +++ b/main.cpp @@ -42,6 +42,7 @@ int execute(int argc, char **argv) { SVDPrefix("Empty"); std::string knownAF("Empty"); std::string RefVCF("Empty"); + int numSVDPCs(10); std::string fixPC("Empty"); double fixAlpha(-1.), epsilon(1e-8); bool withinAncestry(false), outputPileup(false), verbose(false), @@ -119,6 +120,9 @@ int execute(int argc, char **argv) { "[String] VCF file from which to extract reference " "panel's genotype matrix[Required if no SVD files " "available]") + LONG_INT_PARAM("NumSVDPCs", &numSVDPCs, + "[Int] Number of principal components to write to SVD " + "output files. Set to 0 for all components[default:10]") LONG_PARAM_GROUP("Pileup Options", "Arguments for pileup info extraction") LONG_INT_PARAM("min-BQ", &mplp.min_baseQ, "[Int] skip bases with baseQ/BAQ smaller than min-BQ") @@ -186,7 +190,7 @@ int execute(int argc, char **argv) { notice("You may specify --SVDPrefix [RefVCF path](or --UDPath [RefVCF " "path].UD and --MeanPath [RefVCF path].mu) in future use"); SVDcalculator calculator; - calculator.ProcessRefVCF(RefVCF); + calculator.ProcessRefVCF(RefVCF, numSVDPCs); UDPath = RefVCF + ".UD"; MeanPath = RefVCF + ".mu"; BedPath = RefVCF + ".bed"; From a049d45c0d5b388ade9d8270c99b934dfb424e23 Mon Sep 17 00:00:00 2001 From: "Griffan(Fan Zhang)" Date: Sat, 11 Apr 2026 16:04:52 -0700 Subject: [PATCH 2/2] Update SVDcalculator.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- SVDcalculator.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/SVDcalculator.cpp b/SVDcalculator.cpp index 5631b31..e3fc8f6 100644 --- a/SVDcalculator.cpp +++ b/SVDcalculator.cpp @@ -258,15 +258,19 @@ void SVDcalculator::ProcessRefVCF(const std::string &VcfPath, notice("SVD completed. Total singular values: %d", (int)singularValues.size()); double cumulative = 0.0; int numToLog = std::min((int)singularValues.size(), 20); - for (int i = 0; i < numToLog; ++i) { - double sv = singularValues[i]; - double varExplained = (sv * sv) / totalVariance; - cumulative += varExplained; - notice(" PC%d: singular_value=%.4f variance_explained=%.4f (%.2f%%) cumulative=%.4f (%.2f%%)", - i + 1, sv, varExplained, varExplained * 100.0, cumulative, cumulative * 100.0); - } - if ((int)singularValues.size() > numToLog) { - notice(" ... (%d more components not shown)", (int)singularValues.size() - numToLog); + if (totalVariance <= 0.0) { + warning("Total variance is zero after SVD; skipping variance-explained logging for principal components."); + } else { + for (int i = 0; i < numToLog; ++i) { + double sv = singularValues[i]; + double varExplained = (sv * sv) / totalVariance; + cumulative += varExplained; + notice(" PC%d: singular_value=%.4f variance_explained=%.4f (%.2f%%) cumulative=%.4f (%.2f%%)", + i + 1, sv, varExplained, varExplained * 100.0, cumulative, cumulative * 100.0); + } + if ((int)singularValues.size() > numToLog) { + notice(" ... (%d more components not shown)", (int)singularValues.size() - numToLog); + } } auto matrixD = svd.singularValues().asDiagonal();