Skip to content

Commit 7281baf

Browse files
Add --comp-bias-corr-scale
1 parent d89fcec commit 7281baf

20 files changed

Lines changed: 58 additions & 24 deletions

src/alignment/Alignment.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ Alignment::Alignment(const std::string &querySeqDB, const std::string &targetSeq
2323
covThr(par.covThr), canCovThr(par.covThr), covMode(par.covMode), seqIdMode(par.seqIdMode), evalThr(par.evalThr), seqIdThr(par.seqIdThr),
2424
alnLenThr(par.alnLenThr), includeIdentity(par.includeIdentity), addBacktrace(par.addBacktrace), realign(par.realign), scoreBias(par.scoreBias), realignScoreBias(par.realignScoreBias), realignMaxSeqs(par.realignMaxSeqs),
2525
threads(static_cast<unsigned int>(par.threads)), compressed(par.compressed), outDB(outDB), outDBIndex(outDBIndex),
26-
maxSeqLen(par.maxSeqLen), compBiasCorrection(par.compBiasCorrection), altAlignment(par.altAlignment), alignmentOutputMode(par.alignmentOutputMode),
26+
maxSeqLen(par.maxSeqLen), compBiasCorrection(par.compBiasCorrection), compBiasCorrectionScale(par.compBiasCorrectionScale), altAlignment(par.altAlignment), alignmentOutputMode(par.alignmentOutputMode),
2727
maxAccept(static_cast<unsigned int>(par.maxAccept)), maxReject(static_cast<unsigned int>(par.maxRejected)), wrappedScoring(par.wrappedScoring),
2828
lcaAlign(lcaAlign), qdbr(NULL), qDbrIdx(NULL), tdbr(NULL), tDbrIdx(NULL) {
2929
unsigned int alignmentMode = par.alignmentMode;
@@ -292,15 +292,15 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, con
292292

293293
std::vector<Matcher::result_t> swResults;
294294
swResults.reserve(300);
295-
Matcher matcher(querySeqType, targetSeqType, maxMatcherSeqLen, m, &evaluer, compBiasCorrection, gapOpen, gapExtend, correlationScoreWeight, zdrop);
295+
Matcher matcher(querySeqType, targetSeqType, maxMatcherSeqLen, m, &evaluer, compBiasCorrection, compBiasCorrectionScale, gapOpen, gapExtend, correlationScoreWeight, zdrop);
296296

297297
std::vector<Matcher::result_t> swRealignResults;
298298
Matcher *realigner = NULL;
299299
if (realign == true) {
300300
swRealignResults.reserve(300);
301301
realigner = &matcher;
302302
if (realign_m != NULL) {
303-
realigner = new Matcher(querySeqType, targetSeqType, maxMatcherSeqLen, realign_m, &evaluer, compBiasCorrection, gapOpen, gapExtend, 0.0, zdrop);
303+
realigner = new Matcher(querySeqType, targetSeqType, maxMatcherSeqLen, realign_m, &evaluer, compBiasCorrection, compBiasCorrectionScale, gapOpen, gapExtend, 0.0, zdrop);
304304
}
305305
}
306306

src/alignment/Alignment.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ class Alignment {
8484
int querySeqType;
8585
int targetSeqType;
8686
bool compBiasCorrection;
87+
float compBiasCorrectionScale;
8788

8889
int altAlignment;
8990
int alignmentOutputMode;

src/alignment/Matcher.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88

99
Matcher::Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m, EvalueComputation * evaluer,
10-
bool aaBiasCorrection, int gapOpen, int gapExtend, float correlationScoreWeight, int zdrop)
10+
bool aaBiasCorrection, float aaBiasCorrectionScale, int gapOpen, int gapExtend, float correlationScoreWeight, int zdrop)
1111
: gapOpen(gapOpen), gapExtend(gapExtend), correlationScoreWeight(correlationScoreWeight), m(m), evaluer(evaluer), tinySubMat(NULL) {
1212
setSubstitutionMatrix(m);
1313

@@ -16,7 +16,8 @@ Matcher::Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix
1616
aligner = NULL;
1717
} else {
1818
nuclaligner = NULL;
19-
aligner = new SmithWaterman(maxSeqLen, m->alphabetSize, aaBiasCorrection, targetSeqType);
19+
aligner = new SmithWaterman(maxSeqLen, m->alphabetSize, aaBiasCorrection,
20+
aaBiasCorrectionScale, targetSeqType);
2021
}
2122
//std::cout << "lambda=" << lambdaLog2 << " logKLog2=" << logKLog2 << std::endl;
2223
}

src/alignment/Matcher.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,9 +143,9 @@ class Matcher{
143143
};
144144

145145
Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m,
146-
EvalueComputation * evaluer, bool aaBiasCorrection,
146+
EvalueComputation * evaluer, bool aaBiasCorrection, float aaBiasCorrectionScale,
147147
int gapOpen, int gapExtend, float correlationScoreWeight,
148-
int zdrop = 40);
148+
int zdrop);
149149

150150
~Matcher();
151151

src/alignment/StripedSmithWaterman.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,10 @@
3232
#include "Debug.h"
3333
#include <iostream>
3434

35-
SmithWaterman::SmithWaterman(size_t maxSequenceLength, int aaSize, bool aaBiasCorrection, int targetSeqType) {
35+
SmithWaterman::SmithWaterman(size_t maxSequenceLength, int aaSize, bool aaBiasCorrection,
36+
float aaBiasCorrectionScale, int targetSeqType) {
3637
maxSequenceLength += 1;
38+
this->aaBiasCorrectionScale = aaBiasCorrectionScale;
3739
this->aaBiasCorrection = aaBiasCorrection;
3840

3941
int segmentSize = (maxSequenceLength+7)/8;
@@ -1169,7 +1171,7 @@ void SmithWaterman::ssw_init(const Sequence* q,
11691171
int32_t compositionBias = 0;
11701172
bool isProfile = Parameters::isEqualDbtype(q->getSequenceType(), Parameters::DBTYPE_HMM_PROFILE);
11711173
if (!isProfile && aaBiasCorrection) {
1172-
SubstitutionMatrix::calcLocalAaBiasCorrection(m, q->numSequence, q->L, tmp_composition_bias);
1174+
SubstitutionMatrix::calcLocalAaBiasCorrection(m, q->numSequence, q->L, tmp_composition_bias, aaBiasCorrectionScale);
11731175
for (int i =0; i < q->L; i++) {
11741176
profile->composition_bias[i] = (int8_t) (tmp_composition_bias[i] < 0.0)? tmp_composition_bias[i] - 0.5: tmp_composition_bias[i] + 0.5;
11751177
compositionBias = (compositionBias < profile->composition_bias[i]) ? compositionBias : profile->composition_bias[i];

src/alignment/StripedSmithWaterman.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ typedef struct {
7272
class SmithWaterman{
7373
public:
7474

75-
SmithWaterman(size_t maxSequenceLength, int aaSize, bool aaBiasCorrection, int targetSeqType);
75+
SmithWaterman(size_t maxSequenceLength, int aaSize, bool aaBiasCorrection,
76+
float aaBiasCorrectionScale, int targetSeqType);
7677
~SmithWaterman();
7778

7879
// TODO: private or public?
@@ -388,6 +389,7 @@ class SmithWaterman{
388389
int8_t * scorePerCol;
389390
short * profile_word_linear_data;
390391
bool aaBiasCorrection;
392+
float aaBiasCorrectionScale;
391393

392394
template <typename T, size_t Elements>
393395
void createConsensProfile(simd_int *profile, const int8_t *consens_sequence, const int32_t query_length, const int32_t offset);

src/commons/Parameters.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ Parameters::Parameters():
5757
PARAM_SUB_MAT(PARAM_SUB_MAT_ID, "--sub-mat", "Substitution matrix", "Substitution matrix file", typeid(MultiParam<NuclAA<std::string>>), (void *) &scoringMatrixFile, "", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
5858
PARAM_SEED_SUB_MAT(PARAM_SEED_SUB_MAT_ID, "--seed-sub-mat", "Seed substitution matrix", "Substitution matrix file for k-mer generation", typeid(MultiParam<NuclAA<std::string>>), (void *) &seedScoringMatrixFile, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
5959
PARAM_NO_COMP_BIAS_CORR(PARAM_NO_COMP_BIAS_CORR_ID, "--comp-bias-corr", "Compositional bias", "Correct for locally biased amino acid composition (range 0-1)", typeid(int), (void *) &compBiasCorrection, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
60+
PARAM_NO_COMP_BIAS_CORR_SCALE(PARAM_NO_COMP_BIAS_CORR_SCALE_ID, "--comp-bias-corr-scale", "Compositional bias", "Correct for locally biased amino acid composition (range 0-1)", typeid(float), (void *) &compBiasCorrectionScale, "^0(\\.[0-9]+)?|^1(\\.0+)?$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
61+
6062
PARAM_SPACED_KMER_MODE(PARAM_SPACED_KMER_MODE_ID, "--spaced-kmer-mode", "Spaced k-mers", "0: use consecutive positions in k-mers; 1: use spaced k-mers", typeid(int), (void *) &spacedKmer, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
6163
PARAM_REMOVE_TMP_FILES(PARAM_REMOVE_TMP_FILES_ID, "--remove-tmp-files", "Remove temporary files", "Delete temporary files", typeid(bool), (void *) &removeTmpFiles, "", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
6264
PARAM_INCLUDE_IDENTITY(PARAM_INCLUDE_IDENTITY_ID, "--add-self-matches", "Include identical seq. id.", "Artificially add entries of queries with themselves (for clustering)", typeid(bool), (void *) &includeIdentity, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
@@ -326,6 +328,8 @@ Parameters::Parameters():
326328
alignall.push_back(&PARAM_COV_MODE);
327329
alignall.push_back(&PARAM_MAX_SEQ_LEN);
328330
alignall.push_back(&PARAM_NO_COMP_BIAS_CORR);
331+
alignall.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
332+
329333
// alignall.push_back(&PARAM_REALIGN);
330334
// alignall.push_back(&PARAM_MAX_REJECTED);
331335
// alignall.push_back(&PARAM_MAX_ACCEPT);
@@ -356,6 +360,8 @@ Parameters::Parameters():
356360
align.push_back(&PARAM_COV_MODE);
357361
align.push_back(&PARAM_MAX_SEQ_LEN);
358362
align.push_back(&PARAM_NO_COMP_BIAS_CORR);
363+
align.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
364+
359365
align.push_back(&PARAM_MAX_REJECTED);
360366
align.push_back(&PARAM_MAX_ACCEPT);
361367
align.push_back(&PARAM_INCLUDE_IDENTITY);
@@ -389,6 +395,7 @@ Parameters::Parameters():
389395
prefilter.push_back(&PARAM_C);
390396
prefilter.push_back(&PARAM_COV_MODE);
391397
prefilter.push_back(&PARAM_NO_COMP_BIAS_CORR);
398+
prefilter.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
392399
prefilter.push_back(&PARAM_DIAGONAL_SCORING);
393400
prefilter.push_back(&PARAM_EXACT_KMER_MATCHING);
394401
prefilter.push_back(&PARAM_MASK_RESIDUES);
@@ -412,6 +419,7 @@ Parameters::Parameters():
412419
ungappedprefilter.push_back(&PARAM_E);
413420
ungappedprefilter.push_back(&PARAM_COV_MODE);
414421
ungappedprefilter.push_back(&PARAM_NO_COMP_BIAS_CORR);
422+
ungappedprefilter.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
415423
ungappedprefilter.push_back(&PARAM_MIN_DIAG_SCORE);
416424
ungappedprefilter.push_back(&PARAM_MAX_SEQS);
417425
ungappedprefilter.push_back(&PARAM_THREADS);
@@ -490,6 +498,7 @@ Parameters::Parameters():
490498
result2profile.push_back(&PARAM_MASK_PROFILE);
491499
result2profile.push_back(&PARAM_E_PROFILE);
492500
result2profile.push_back(&PARAM_NO_COMP_BIAS_CORR);
501+
result2profile.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
493502
result2profile.push_back(&PARAM_WG);
494503
result2profile.push_back(&PARAM_ALLOW_DELETION);
495504
result2profile.push_back(&PARAM_FILTER_MSA);
@@ -547,6 +556,7 @@ Parameters::Parameters():
547556
result2msa.push_back(&PARAM_GAP_EXTEND);
548557
result2msa.push_back(&PARAM_ALLOW_DELETION);
549558
result2msa.push_back(&PARAM_NO_COMP_BIAS_CORR);
559+
result2msa.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
550560
result2msa.push_back(&PARAM_MSA_FORMAT_MODE);
551561
result2msa.push_back(&PARAM_SUMMARY_PREFIX);
552562
result2msa.push_back(&PARAM_SKIP_QUERY);
@@ -574,6 +584,7 @@ Parameters::Parameters():
574584
filterresult.push_back(&PARAM_GAP_OPEN);
575585
filterresult.push_back(&PARAM_GAP_EXTEND);
576586
filterresult.push_back(&PARAM_NO_COMP_BIAS_CORR);
587+
filterresult.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
577588
filterresult.push_back(&PARAM_ALLOW_DELETION);
578589
filterresult.push_back(&PARAM_FILTER_MIN_ENABLE);
579590
filterresult.push_back(&PARAM_FILTER_MAX_SEQ_ID);
@@ -601,6 +612,7 @@ Parameters::Parameters():
601612
msa2profile.push_back(&PARAM_PCA);
602613
msa2profile.push_back(&PARAM_PCB);
603614
msa2profile.push_back(&PARAM_NO_COMP_BIAS_CORR);
615+
msa2profile.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
604616
msa2profile.push_back(&PARAM_WG);
605617
msa2profile.push_back(&PARAM_FILTER_MSA);
606618
msa2profile.push_back(&PARAM_FILTER_MIN_ENABLE);
@@ -621,6 +633,7 @@ Parameters::Parameters():
621633
profile2pssm.push_back(&PARAM_SUB_MAT);
622634
profile2pssm.push_back(&PARAM_MAX_SEQ_LEN);
623635
profile2pssm.push_back(&PARAM_NO_COMP_BIAS_CORR);
636+
profile2pssm.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
624637
profile2pssm.push_back(&PARAM_DB_OUTPUT);
625638
profile2pssm.push_back(&PARAM_THREADS);
626639
profile2pssm.push_back(&PARAM_COMPRESSED);
@@ -697,6 +710,7 @@ Parameters::Parameters():
697710
indexdb.push_back(&PARAM_K);
698711
indexdb.push_back(&PARAM_ALPH_SIZE);
699712
indexdb.push_back(&PARAM_NO_COMP_BIAS_CORR);
713+
indexdb.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
700714
indexdb.push_back(&PARAM_MAX_SEQ_LEN);
701715
indexdb.push_back(&PARAM_MAX_SEQS);
702716
indexdb.push_back(&PARAM_MASK_RESIDUES);
@@ -1089,6 +1103,7 @@ Parameters::Parameters():
10891103
expandaln.push_back(&PARAM_MAX_SEQ_LEN);
10901104
expandaln.push_back(&PARAM_SCORE_BIAS);
10911105
expandaln.push_back(&PARAM_NO_COMP_BIAS_CORR);
1106+
expandaln.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
10921107
expandaln.push_back(&PARAM_E);
10931108
expandaln.push_back(&PARAM_MIN_SEQ_ID);
10941109
// expandaln.push_back(&PARAM_MIN_SEQ_ID);
@@ -1119,6 +1134,7 @@ Parameters::Parameters():
11191134
expand2profile.push_back(&PARAM_MAX_SEQ_LEN);
11201135
expand2profile.push_back(&PARAM_SCORE_BIAS);
11211136
expand2profile.push_back(&PARAM_NO_COMP_BIAS_CORR);
1137+
expand2profile.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
11221138
expand2profile.push_back(&PARAM_E_PROFILE);
11231139
// expand2profile.push_back(&PARAM_E);
11241140
// expand2profile.push_back(&PARAM_MIN_SEQ_ID);
@@ -2185,6 +2201,7 @@ void Parameters::setDefaults() {
21852201

21862202
#endif
21872203
compBiasCorrection = 1;
2204+
compBiasCorrectionScale = 1.0;
21882205
diagonalScoring = true;
21892206
exactKmerMatching = 0;
21902207
maskMode = 1;

src/commons/Parameters.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,8 @@ class Parameters {
376376
MultiParam<SeqProf<int>> kmerScore; // kmer score for the prefilter
377377
MultiParam<NuclAA<int>> alphabetSize; // alphabet size for the prefilter
378378
int compBiasCorrection; // Aminoacid composiont correction
379+
float compBiasCorrectionScale; // Aminoacid composiont correction scale factor
380+
379381
bool diagonalScoring; // switch diagonal scoring
380382
int exactKmerMatching; // only exact k-mer matching
381383
int maskMode; // mask low complex areas
@@ -715,6 +717,7 @@ class Parameters {
715717
PARAMETER(PARAM_SUB_MAT)
716718
PARAMETER(PARAM_SEED_SUB_MAT)
717719
PARAMETER(PARAM_NO_COMP_BIAS_CORR)
720+
PARAMETER(PARAM_NO_COMP_BIAS_CORR_SCALE)
718721
PARAMETER(PARAM_SPACED_KMER_MODE)
719722
PARAMETER(PARAM_REMOVE_TMP_FILES)
720723
PARAMETER(PARAM_INCLUDE_IDENTITY)

src/commons/SubstitutionMatrix.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ bool SubstitutionMatrix::estimateLambdaAndBackground(const double **scoreMatrix,
9292
void SubstitutionMatrix::calcLocalAaBiasCorrection(const BaseMatrix *m,
9393
const unsigned char *int_sequence,
9494
const int N,
95-
float *compositionBias) {
95+
float *compositionBias,
96+
float scale) {
9697
const int windowSize = 40;
9798
for (int i = 0; i < N; i++) {
9899
const int minPos = std::max(0, (i - windowSize / 2));
@@ -115,7 +116,7 @@ void SubstitutionMatrix::calcLocalAaBiasCorrection(const BaseMatrix *m,
115116
for (int a = 0; a < m->alphabetSize; a++) {
116117
deltaS_i += m->pBack[a] * static_cast<float>(subMat[a]);
117118
}
118-
compositionBias[i] = deltaS_i;
119+
compositionBias[i] = scale * deltaS_i;
119120
// std::cout << i << " " << compositionBias[i] << std::endl;
120121
}
121122
}

src/commons/SubstitutionMatrix.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class SubstitutionMatrix : public BaseMatrix {
2222

2323
virtual double getBackgroundProb(size_t aa_index) { return pBack[aa_index]; }
2424

25-
static void calcLocalAaBiasCorrection(const BaseMatrix *m ,const unsigned char *int_sequence, const int N, float *compositionBias);
25+
static void calcLocalAaBiasCorrection(const BaseMatrix *m ,const unsigned char *int_sequence, const int N, float *compositionBias, float scale);
2626
static void calcProfileProfileLocalAaBiasCorrection(short *profileScores,
2727
const size_t profileAASize,
2828
const int N,

0 commit comments

Comments
 (0)