Skip to content

Commit 18c78fb

Browse files
committed
shared genome threshold updated
1 parent b8e1dd3 commit 18c78fb

4 files changed

Lines changed: 103 additions & 18 deletions

File tree

src/cgi/core_genome_identity.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,13 @@ int main(int argc, char** argv)
115115

116116
std::cerr << "INFO, skch::main, parallel_for execution finished" << std::endl;
117117

118+
std::unordered_map <std::string, uint64_t> genomeLengths; // name of genome -> length
119+
cgi::computeGenomeLengths(parameters, genomeLengths);
120+
118121
//report output in file
119-
cgi::outputCGI (parameters, finalResults, fileName);
122+
cgi::outputCGI (parameters, genomeLengths, finalResults, fileName);
120123

121124
//report output as matrix
122125
if (parameters.matrixOutput)
123-
cgi::outputPhylip (parameters, finalResults, fileName);
126+
cgi::outputPhylip (parameters, genomeLengths, finalResults, fileName);
124127
}

src/cgi/include/computeCoreIdentity.hpp

Lines changed: 82 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,14 @@
1111
#include <unordered_map>
1212
#include <fstream>
1313
#include <omp.h>
14+
#include <zlib.h>
1415

1516
//Own includes
1617
#include "map/include/base_types.hpp"
1718
#include "cgi/include/cgid_types.hpp"
1819

1920
//External includes
21+
#include "common/kseq.h"
2022
#include "common/prettyprint.hpp"
2123

2224
namespace cgi
@@ -39,6 +41,52 @@ namespace cgi
3941
}
4042
}
4143

44+
/**
45+
* @brief compute genome lengths in reference and query genome set
46+
* @param[out] genomeLengths
47+
*/
48+
void computeGenomeLengths(skch::Parameters &parameters, std::unordered_map <std::string, uint64_t> &genomeLengths)
49+
{
50+
for(auto &e : parameters.querySequences)
51+
{
52+
//Open the file using kseq
53+
FILE *file = fopen(e.c_str(), "r");
54+
gzFile fp = gzdopen(fileno(file), "r");
55+
kseq_t *seq = kseq_init(fp);
56+
int l; uint64_t genomeLen = 0;
57+
58+
while ((l = kseq_read(seq)) >= 0)
59+
genomeLen = genomeLen + (uint64_t)strlen(seq->seq.s);
60+
61+
genomeLengths[e] = genomeLen;
62+
63+
kseq_destroy(seq);
64+
gzclose(fp); //close the file handler
65+
fclose(file);
66+
}
67+
68+
for(auto &e : parameters.refSequences)
69+
{
70+
if( genomeLengths.find(e) == genomeLengths.end() )
71+
{
72+
//Open the file using kseq
73+
FILE *file = fopen(e.c_str(), "r");
74+
gzFile fp = gzdopen(fileno(file), "r");
75+
kseq_t *seq = kseq_init(fp);
76+
int l; uint64_t genomeLen = 0;
77+
78+
while ((l = kseq_read(seq)) >= 0)
79+
genomeLen = genomeLen + (uint64_t)strlen(seq->seq.s);
80+
81+
genomeLengths[e] = genomeLen;
82+
83+
kseq_destroy(seq);
84+
gzclose(fp); //close the file handler
85+
fclose(file);
86+
}
87+
}
88+
}
89+
4290
/**
4391
* @brief output blast tabular mappings for visualization
4492
* @param[in] parameters algorithm parameters
@@ -242,10 +290,12 @@ namespace cgi
242290
/**
243291
* @brief output FastANI results to file
244292
* @param[in] parameters algorithm parameters
293+
* @param[in] genomeLengths
245294
* @param[in] CGI_ResultsVector results
246295
* @param[in] fileName file name where results will be reported
247296
*/
248297
void outputCGI(skch::Parameters &parameters,
298+
std::unordered_map <std::string, uint64_t> &genomeLengths,
249299
std::vector<cgi::CGI_Results> &CGI_ResultsVector,
250300
std::string &fileName)
251301
{
@@ -257,10 +307,22 @@ namespace cgi
257307
//Report results
258308
for(auto &e : CGI_ResultsVector)
259309
{
260-
if(e.countSeq >= parameters.minFragments)
310+
std::string qryGenome = parameters.querySequences[e.qryGenomeId];
311+
std::string refGenome = parameters.refSequences[e.refGenomeId];
312+
313+
assert(genomeLengths.find(qryGenome) != genomeLengths.end());
314+
assert(genomeLengths.find(refGenome) != genomeLengths.end());
315+
316+
uint64_t queryGenomeLength = genomeLengths[qryGenome];
317+
uint64_t refGenomeLength = genomeLengths[refGenome];
318+
uint64_t minGenomeLength = std::min(queryGenomeLength, refGenomeLength);
319+
uint64_t sharedLength = e.countSeq * parameters.minReadLength;
320+
321+
//Checking if shared genome is above a certain fraction of genome length
322+
if(sharedLength >= minGenomeLength * parameters.minFraction)
261323
{
262-
outstrm << parameters.querySequences[e.qryGenomeId]
263-
<< "\t" << parameters.refSequences[e.refGenomeId]
324+
outstrm << qryGenome
325+
<< "\t" << refGenome
264326
<< "\t" << e.identity
265327
<< "\t" << e.countSeq
266328
<< "\t" << e.totalQueryFragments
@@ -274,10 +336,12 @@ namespace cgi
274336
/**
275337
* @brief output FastANI results as lower triangular matrix
276338
* @param[in] parameters algorithm parameters
339+
* @param[in] genomeLengths
277340
* @param[in] CGI_ResultsVector results
278341
* @param[in] fileName file name where results will be reported
279342
*/
280343
void outputPhylip(skch::Parameters &parameters,
344+
std::unordered_map <std::string, uint64_t> &genomeLengths,
281345
std::vector<cgi::CGI_Results> &CGI_ResultsVector,
282346
std::string &fileName)
283347
{
@@ -313,10 +377,22 @@ namespace cgi
313377
//transform FastANI results into 3-tuples
314378
for(auto &e : CGI_ResultsVector)
315379
{
316-
if(e.countSeq >= parameters.minFragments)
380+
std::string qryGenome = parameters.querySequences[e.qryGenomeId];
381+
std::string refGenome = parameters.refSequences[e.refGenomeId];
382+
383+
assert(genomeLengths.find(qryGenome) != genomeLengths.end());
384+
assert(genomeLengths.find(refGenome) != genomeLengths.end());
385+
386+
uint64_t queryGenomeLength = genomeLengths[qryGenome];
387+
uint64_t refGenomeLength = genomeLengths[refGenome];
388+
uint64_t minGenomeLength = std::min(queryGenomeLength, refGenomeLength);
389+
uint64_t sharedLength = e.countSeq * parameters.minReadLength;
390+
391+
//Checking if shared genome is above a certain fraction of genome length
392+
if(sharedLength >= minGenomeLength * parameters.minFraction)
317393
{
318-
int qGenome = genome2Int [ parameters.querySequences[e.qryGenomeId] ];
319-
int rGenome = genome2Int [ parameters.refSequences[e.refGenomeId] ];
394+
int qGenome = genome2Int [ qryGenome ];
395+
int rGenome = genome2Int [ refGenome ];
320396

321397
if (qGenome != rGenome) //ignore if both genomes are same
322398
{

src/map/include/map_parameters.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ namespace skch
2323
int kmerSize; //kmer size for sketching
2424
int windowSize; //window size used for sketching
2525
int minReadLength; //minimum read length which code maps
26-
int minFragments; //minimum fragment mappings for trusting ANI value
26+
float minFraction; //minimum genome fraction for trusting ANI value
2727
int threads; //thread count
2828
int alphabetSize; //alphabet size
2929
uint64_t referenceSize; //Approximate reference size

src/map/include/parseCmdArgs.hpp

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <iostream>
1111
#include <string>
1212
#include <fstream>
13+
#include <cassert>
1314

1415
//Own includes
1516
#include "map/include/map_parameters.hpp"
@@ -57,7 +58,7 @@ Example usage: \n\
5758

5859
cmd.defineOption("fragLen", "fragment length [default : 3,000]", ArgvParser::OptionRequiresValue);
5960

60-
cmd.defineOption("minFrag", "minimum matched fragments for trusting ANI [default : 50]", ArgvParser::OptionRequiresValue);
61+
cmd.defineOption("minFraction", "minimum fraction of genome that must be shared for trusting ANI. If reference and query genome size differ, smaller one among the two is considered. [default : 0.2]", ArgvParser::OptionRequiresValue);
6162

6263
cmd.defineOption("visualize", "output mappings for visualization, can be enabled for single genome to single genome comparison only [disabled by default]");
6364

@@ -167,20 +168,24 @@ Example usage: \n\
167168
if (cmd.foundOption("version"))
168169
{
169170
std::cerr << "version 1.2\n\n";
170-
exit(1);
171+
exit(0);
171172
}
172173
if (result != ArgvParser::NoParserError)
173174
{
174175
std::cerr << cmd.parseErrorDescription(result) << "\n";
175-
exit(1);
176+
177+
if (result == ArgvParser::ParserHelpRequested)
178+
exit(0);
179+
else
180+
exit(1);
176181
}
177182
else if (!cmd.foundOption("ref") && !cmd.foundOption("refList"))
178-
{
183+
{
179184
std::cerr << "Provide reference file (s)\n";
180185
exit(1);
181186
}
182187
else if (!cmd.foundOption("query") && !cmd.foundOption("queryList"))
183-
{
188+
{
184189
std::cerr << "Provide query file (s)\n";
185190
exit(1);
186191
}
@@ -296,14 +301,15 @@ Example usage: \n\
296301
else
297302
parameters.minReadLength = 3000;
298303

299-
if(cmd.foundOption("minFrag"))
304+
if(cmd.foundOption("minFraction"))
300305
{
301-
str << cmd.optionValue("minFrag");
302-
str >> parameters.minFragments;
306+
str << cmd.optionValue("minFraction");
307+
str >> parameters.minFraction;
303308
str.clear();
309+
assert(parameters.minFraction >= 0.0 && parameters.minFraction <= 1.0);
304310
}
305311
else
306-
parameters.minFragments = 50;
312+
parameters.minFraction = 0.2;
307313

308314
parameters.percentageIdentity = 80;
309315

0 commit comments

Comments
 (0)