#include <unordered_map>
#include <fstream>
#include <stdexcept>
#include <omp.h>
#include <zlib.h>
1415
1516// Own includes
1617#include " map/include/base_types.hpp"
1718#include " cgi/include/cgid_types.hpp"
1819
1920// External includes
21+ #include " common/kseq.h"
2022#include " common/prettyprint.hpp"
2123
2224namespace cgi
@@ -39,6 +41,52 @@ namespace cgi
3941 }
4042 }
4143
44+ /* *
45+ * @brief compute genome lengths in reference and query genome set
46+ * @param[out] genomeLengths
47+ */
48+ void computeGenomeLengths (skch::Parameters ¶meters, std::unordered_map <std::string, uint64_t > &genomeLengths)
49+ {
50+ for (auto &e : parameters.querySequences )
51+ {
52+ // Open the file using kseq
53+ FILE *file = fopen (e.c_str (), " r" );
54+ gzFile fp = gzdopen (fileno (file), " r" );
55+ kseq_t *seq = kseq_init (fp);
56+ int l; uint64_t genomeLen = 0 ;
57+
58+ while ((l = kseq_read (seq)) >= 0 )
59+ genomeLen = genomeLen + (uint64_t )strlen (seq->seq .s );
60+
61+ genomeLengths[e] = genomeLen;
62+
63+ kseq_destroy (seq);
64+ gzclose (fp); // close the file handler
65+ fclose (file);
66+ }
67+
68+ for (auto &e : parameters.refSequences )
69+ {
70+ if ( genomeLengths.find (e) == genomeLengths.end () )
71+ {
72+ // Open the file using kseq
73+ FILE *file = fopen (e.c_str (), " r" );
74+ gzFile fp = gzdopen (fileno (file), " r" );
75+ kseq_t *seq = kseq_init (fp);
76+ int l; uint64_t genomeLen = 0 ;
77+
78+ while ((l = kseq_read (seq)) >= 0 )
79+ genomeLen = genomeLen + (uint64_t )strlen (seq->seq .s );
80+
81+ genomeLengths[e] = genomeLen;
82+
83+ kseq_destroy (seq);
84+ gzclose (fp); // close the file handler
85+ fclose (file);
86+ }
87+ }
88+ }
89+
4290 /* *
4391 * @brief output blast tabular mappings for visualization
4492 * @param[in] parameters algorithm parameters
@@ -242,10 +290,12 @@ namespace cgi
242290 /* *
243291 * @brief output FastANI results to file
244292 * @param[in] parameters algorithm parameters
293+ * @param[in] genomeLengths
245294 * @param[in] CGI_ResultsVector results
246295 * @param[in] fileName file name where results will be reported
247296 */
248297 void outputCGI (skch::Parameters ¶meters,
298+ std::unordered_map <std::string, uint64_t > &genomeLengths,
249299 std::vector<cgi::CGI_Results> &CGI_ResultsVector,
250300 std::string &fileName)
251301 {
@@ -257,10 +307,22 @@ namespace cgi
257307 // Report results
258308 for (auto &e : CGI_ResultsVector)
259309 {
260- if (e.countSeq >= parameters.minFragments )
310+ std::string qryGenome = parameters.querySequences [e.qryGenomeId ];
311+ std::string refGenome = parameters.refSequences [e.refGenomeId ];
312+
313+ assert (genomeLengths.find (qryGenome) != genomeLengths.end ());
314+ assert (genomeLengths.find (refGenome) != genomeLengths.end ());
315+
316+ uint64_t queryGenomeLength = genomeLengths[qryGenome];
317+ uint64_t refGenomeLength = genomeLengths[refGenome];
318+ uint64_t minGenomeLength = std::min (queryGenomeLength, refGenomeLength);
319+ uint64_t sharedLength = e.countSeq * parameters.minReadLength ;
320+
321+ // Checking if shared genome is above a certain fraction of genome length
322+ if (sharedLength >= minGenomeLength * parameters.minFraction )
261323 {
262- outstrm << parameters. querySequences [e. qryGenomeId ]
263- << " \t " << parameters. refSequences [e. refGenomeId ]
324+ outstrm << qryGenome
325+ << " \t " << refGenome
264326 << " \t " << e.identity
265327 << " \t " << e.countSeq
266328 << " \t " << e.totalQueryFragments
@@ -274,10 +336,12 @@ namespace cgi
274336 /* *
275337 * @brief output FastANI results as lower triangular matrix
276338 * @param[in] parameters algorithm parameters
339+ * @param[in] genomeLengths
277340 * @param[in] CGI_ResultsVector results
278341 * @param[in] fileName file name where results will be reported
279342 */
280343 void outputPhylip (skch::Parameters ¶meters,
344+ std::unordered_map <std::string, uint64_t > &genomeLengths,
281345 std::vector<cgi::CGI_Results> &CGI_ResultsVector,
282346 std::string &fileName)
283347 {
@@ -313,10 +377,22 @@ namespace cgi
313377 // transform FastANI results into 3-tuples
314378 for (auto &e : CGI_ResultsVector)
315379 {
316- if (e.countSeq >= parameters.minFragments )
380+ std::string qryGenome = parameters.querySequences [e.qryGenomeId ];
381+ std::string refGenome = parameters.refSequences [e.refGenomeId ];
382+
383+ assert (genomeLengths.find (qryGenome) != genomeLengths.end ());
384+ assert (genomeLengths.find (refGenome) != genomeLengths.end ());
385+
386+ uint64_t queryGenomeLength = genomeLengths[qryGenome];
387+ uint64_t refGenomeLength = genomeLengths[refGenome];
388+ uint64_t minGenomeLength = std::min (queryGenomeLength, refGenomeLength);
389+ uint64_t sharedLength = e.countSeq * parameters.minReadLength ;
390+
391+ // Checking if shared genome is above a certain fraction of genome length
392+ if (sharedLength >= minGenomeLength * parameters.minFraction )
317393 {
318- int qGenome = genome2Int [ parameters. querySequences [e. qryGenomeId ] ];
319- int rGenome = genome2Int [ parameters. refSequences [e. refGenomeId ] ];
394+ int qGenome = genome2Int [ qryGenome ];
395+ int rGenome = genome2Int [ refGenome ];
320396
321397 if (qGenome != rGenome) // ignore if both genomes are same
322398 {
0 commit comments