From 1e7e0dda392e6c85c81d96f3a299549ae413eac3 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Thu, 16 Jan 2025 16:17:03 +0100 Subject: [PATCH 01/11] Do not write empty output files --- include/valik/search/env_var_pack.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/valik/search/env_var_pack.hpp b/include/valik/search/env_var_pack.hpp index 7ddc5d70..6f54bf89 100644 --- a/include/valik/search/env_var_pack.hpp +++ b/include/valik/search/env_var_pack.hpp @@ -18,7 +18,10 @@ struct env_var_pack { std::filesystem::path tmp_path; std::string stellar_exec{"stellar"}; +<<<<<<< HEAD std::string merge_exec{"cat"}; +======= +>>>>>>> 20a0e6c (Do not write empty output files) env_var_pack() { From b9d94ce4a0b0b20ec29a6d5712f0fc8f2074ad52 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Mon, 20 Jan 2025 13:16:20 +0100 Subject: [PATCH 02/11] Fix environment variable --- include/valik/search/env_var_pack.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/valik/search/env_var_pack.hpp b/include/valik/search/env_var_pack.hpp index 6f54bf89..7ddc5d70 100644 --- a/include/valik/search/env_var_pack.hpp +++ b/include/valik/search/env_var_pack.hpp @@ -18,10 +18,7 @@ struct env_var_pack { std::filesystem::path tmp_path; std::string stellar_exec{"stellar"}; -<<<<<<< HEAD std::string merge_exec{"cat"}; -======= ->>>>>>> 20a0e6c (Do not write empty output files) env_var_pack() { From c4a1343a0af0ae678cf31d207486de4a0835f863 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Tue, 4 Feb 2025 15:23:23 +0100 Subject: [PATCH 03/11] Adapt threshold for each pattern --- include/valik/search/local_prefilter.hpp | 40 +++++++++++++++++++----- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/include/valik/search/local_prefilter.hpp b/include/valik/search/local_prefilter.hpp index b4599f58..d8d65307 100644 --- a/include/valik/search/local_prefilter.hpp +++ b/include/valik/search/local_prefilter.hpp @@ -54,6 +54,11 @@ struct pattern_bounds size_t begin_position; size_t end_position; size_t threshold; + + size_t minimiser_count() const + { + return end_position - begin_position; + } }; /** @@ -89,9 +94,7 @@ pattern_bounds make_pattern_bounds(size_t const & begin, assert(end_it != window_span_begin.begin()); pattern.end_position = end_it - window_span_begin.begin(); - size_t const minimiser_count = pattern.end_position - pattern.begin_position; - - pattern.threshold = thresholder.get(minimiser_count); + pattern.threshold = thresholder.get(pattern.minimiser_count()); return pattern; } @@ -114,15 +117,36 @@ void find_pattern_bins(pattern_bounds const & pattern, for (size_t i = pattern.begin_position; i < pattern.end_position; i++) total_counts += counting_table[i]; - for (size_t current_bin = 0; current_bin < total_counts.size(); current_bin++) + + std::unordered_set pattern_hits; + + bool max_threshold{false}; + uint8_t correction{0}; + while (true) { - auto &&count = total_counts[current_bin]; - if (count >= pattern.threshold) + for (size_t current_bin = 0; current_bin < total_counts.size(); current_bin++) + { + auto &&count = total_counts[current_bin]; + if (count >= (pattern.threshold + correction)) + { + // the result is a union of results from all patterns of a read + sequence_hits.insert(current_bin); + } + } + if (pattern.threshold + correction >= pattern.minimiser_count()) + max_threshold = true; + if (pattern_hits.size() < std::max((size_t) 4, (size_t) std::round(bin_count / 4.0)) || + max_threshold) + break; + else { - // the result is a union of results from all patterns of a read - sequence_hits.insert(current_bin); + pattern_hits.clear(); + correction += std::max((size_t) 1, (size_t) std::round(pattern.threshold * 0.1 * correction)); } } + + for (auto bin : pattern_hits) + sequence_hits.insert(bin); } /** From b32410ceeb85ffb67c6de8a893370f2792a2cdcf Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Tue, 4 Feb 2025 16:33:03 +0100 Subject: [PATCH 04/11] Cleaner output log --- include/valik/search/search_local.hpp | 5 ++++- src/argument_parsing/search.cpp | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/valik/search/search_local.hpp b/include/valik/search/search_local.hpp index 60959b9c..7200a64f 100644 --- a/include/valik/search/search_local.hpp +++ b/include/valik/search/search_local.hpp @@ -132,7 +132,10 @@ bool search_local(search_arguments & arguments, search_time_statistics & time_st std::cout.precision(3); std::cout << "\n-----------Search parameters-----------\n"; - std::cout << "kmer size " << std::to_string(arguments.shape_size) << '\n'; + if (arguments.shape_size == arguments.shape_weight) + std::cout << "kmer size " << std::to_string(arguments.shape_size) << '\n'; + else + std::cout << "kmer shape " << std::to_string(arguments.shape.to_string()) << '\n'; std::cout << "window size " << std::to_string(arguments.window_size) << '\n'; switch (arguments.search_type) { diff --git a/src/argument_parsing/search.cpp b/src/argument_parsing/search.cpp index d3418bb5..601f17dc 100644 --- a/src/argument_parsing/search.cpp +++ b/src/argument_parsing/search.cpp @@ -346,7 +346,7 @@ void run_search(sharg::parser & parser) { arguments.search_type = search_kind::LEMMA; if (arguments.threshold < lemma_thresh) - std::cerr << "[Warning] chosen threshold is less than the k-mer lemma threshold. Ignore this warning if this was deliberate."; + std::cerr << "[Warning] The chosen threshold is less than the k-mer lemma threshold. Ignore this warning if this was deliberate."; } } if (arguments.stellar_only) From 2365c092a0b39a1c957df1ac7269d331ab5744e4 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Tue, 4 Feb 2025 16:59:11 +0100 Subject: [PATCH 05/11] Fix adaptive seeding --- include/valik/search/local_prefilter.hpp | 4 ++-- include/valik/search/search_local.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/valik/search/local_prefilter.hpp b/include/valik/search/local_prefilter.hpp index d8d65307..e37e843d 100644 --- a/include/valik/search/local_prefilter.hpp +++ b/include/valik/search/local_prefilter.hpp @@ -130,10 +130,10 @@ void find_pattern_bins(pattern_bounds const & pattern, if (count >= (pattern.threshold + correction)) { // the result is a union of results from all patterns of a read - sequence_hits.insert(current_bin); + pattern_hits.insert(current_bin); } } - if (pattern.threshold + correction >= pattern.minimiser_count()) + if ((pattern.threshold + correction) >= pattern.minimiser_count()) max_threshold = true; if (pattern_hits.size() < std::max((size_t) 4, (size_t) std::round(bin_count / 4.0)) || max_threshold) diff --git a/include/valik/search/search_local.hpp b/include/valik/search/search_local.hpp index 7200a64f..9d27a5ce 100644 --- a/include/valik/search/search_local.hpp +++ b/include/valik/search/search_local.hpp @@ -135,7 +135,7 @@ bool search_local(search_arguments & arguments, search_time_statistics & time_st if (arguments.shape_size == arguments.shape_weight) std::cout << "kmer size " << std::to_string(arguments.shape_size) << '\n'; else - std::cout << "kmer shape " << std::to_string(arguments.shape.to_string()) << '\n'; + std::cout << "kmer shape " << arguments.shape.to_string() << '\n'; std::cout << "window size " << std::to_string(arguments.window_size) << '\n'; switch (arguments.search_type) { From b0855d4e0fe6cbd4a68a66b3f30c669e36381cb7 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Thu, 6 Feb 2025 11:45:31 +0100 Subject: [PATCH 06/11] Try to adapt threshold based on sample of patterns --- include/valik/search/local_prefilter.hpp | 117 +++++++++++++++++++---- include/valik/shared.hpp | 1 + src/argument_parsing/search.cpp | 5 + 3 files changed, 107 insertions(+), 16 deletions(-) diff --git a/include/valik/search/local_prefilter.hpp b/include/valik/search/local_prefilter.hpp index e37e843d..3b0de6b5 100644 --- a/include/valik/search/local_prefilter.hpp +++ b/include/valik/search/local_prefilter.hpp @@ -13,6 +13,40 @@ namespace valik { +/** + * @brief Function that samples patterns on a query. + * + * @param read_len Length of query. + * @param pattern_size Length of pattern. + * @param query_every Every nth potential match is considered. + * @param callback Functor that corrects the threshold based on matching k-mer counts. + * @return Lower quartile of threshold correction. + */ +template +constexpr double sample_begin_positions(size_t const read_len, uint64_t const pattern_size, uint8_t const query_every, functor_t && callback) +{ + assert(read_len >= pattern_size); + + size_t first_pos{pattern_size}; + if (read_len < query_every + pattern_size) + first_pos = 0; // start from beginning when short query + + size_t corrected_pattern_count{0u}; + double total_correction{0}; + for (size_t pos = first_pos; pos <= read_len - pattern_size; pos = pos + query_every * pattern_size * 2) + { + auto correction = callback(pos); + if (correction > 0) + { + corrected_pattern_count++; + total_correction += correction; + } + } + + return 1 + total_correction / (double) (corrected_pattern_count * 2); +} + + /** * @brief Function that finds the begin positions of all pattern of a query. * @@ -33,10 +67,10 @@ constexpr void pattern_begin_positions(size_t const read_len, uint64_t const pat assert(read_len >= pattern_size); size_t last_begin{0u}; - for (size_t i = 0; i <= read_len - pattern_size; i = i + query_every) + for (size_t pos = 0; pos <= read_len - pattern_size; pos = pos + query_every) { - callback(i); - last_begin = i; + callback(pos); + last_begin = pos; } if (last_begin < read_len - pattern_size) @@ -99,18 +133,17 @@ pattern_bounds make_pattern_bounds(size_t const & begin, } /** - * @brief Function that for a single pattern counts matching k-mers and returns bins that exceed the threshold. + * @brief Function that for a single pattern counts matching k-mers and corrects the threshold to avoid too many spuriously matching bins. * * @param pattern Slice of a query record that is being considered. * @param bin_count Number of bins in the IBF. * @param counting_table Rows: minimisers of the query. Columns: bins of the IBF. - * @param sequence_hits Bins that likely contain a match for the pattern (IN-OUT parameter). + * @return Threshold correction that avoids too many spurious matches. */ template -void find_pattern_bins(pattern_bounds const & pattern, - size_t const & bin_count, - binning_bitvector_t const & counting_table, - std::unordered_set & sequence_hits) +double find_dynamic_threshold_correction(pattern_bounds const & pattern, + size_t const & bin_count, + binning_bitvector_t const & counting_table) { // counting vector for the current pattern seqan3::counting_vector total_counts(bin_count, 0); @@ -121,19 +154,19 @@ void find_pattern_bins(pattern_bounds const & pattern, std::unordered_set pattern_hits; bool max_threshold{false}; - uint8_t correction{0}; + uint8_t correction_count{0}; while (true) { for (size_t current_bin = 0; current_bin < total_counts.size(); current_bin++) { auto &&count = total_counts[current_bin]; - if (count >= (pattern.threshold + correction)) + if (count >= (pattern.threshold + correction_count)) { // the result is a union of results from all patterns of a read pattern_hits.insert(current_bin); } } - if ((pattern.threshold + correction) >= pattern.minimiser_count()) + if ((pattern.threshold + correction_count) >= pattern.minimiser_count()) max_threshold = true; if (pattern_hits.size() < std::max((size_t) 4, (size_t) std::round(bin_count / 4.0)) || max_threshold) @@ -141,12 +174,51 @@ void find_pattern_bins(pattern_bounds const & pattern, else { pattern_hits.clear(); - correction += std::max((size_t) 1, (size_t) std::round(pattern.threshold * 0.1 * correction)); + correction_count += std::max((size_t) 1, (size_t) std::round(pattern.threshold * 0.1 * correction_count)); } } - for (auto bin : pattern_hits) - sequence_hits.insert(bin); + return (double) correction_count / (double) pattern.threshold; +} + + +/** + * @brief Function that for a single pattern counts matching k-mers and returns bins that exceed the threshold. + * + * @param pattern Slice of a query record that is being considered. + * @param correction Threshold correction determined from a sample of patterns. + * @param bin_count Number of bins in the IBF. + * @param counting_table Rows: minimisers of the query. Columns: bins of the IBF. + * @param sequence_hits Bins that likely contain a match for the pattern (IN-OUT parameter). + */ +template +void find_pattern_bins(pattern_bounds const & pattern, + double const & correction_coef, + size_t const & bin_count, + binning_bitvector_t const & counting_table, + std::unordered_set & sequence_hits) +{ + // counting vector for the current pattern + seqan3::counting_vector total_counts(bin_count, 0); + + for (size_t i = pattern.begin_position; i < pattern.end_position; i++) + total_counts += counting_table[i]; + + for (size_t current_bin = 0; current_bin < total_counts.size(); current_bin++) + { + auto &&count = total_counts[current_bin]; + if (current_bin == 0) + { + seqan3::debug_stream << "Threshold was " << pattern.threshold << '\n'; + seqan3::debug_stream << "New threshold " << std::to_string(pattern.threshold * correction_coef) << '\n'; + } + + if (count >= (pattern.threshold * correction_coef)) + { + // the result is a union of results from all patterns of a read + sequence_hits.insert(current_bin); + } + } } /** @@ -223,10 +295,23 @@ void local_prefilter( minimiser.clear(); std::unordered_set sequence_hits{}; + double threshold_correction{1}; + if (!arguments.static_threshold) + { + threshold_correction = sample_begin_positions(seq.size(), arguments.pattern_size, arguments.query_every, [&](size_t const begin) -> double + { + pattern_bounds const pattern = make_pattern_bounds(begin, arguments, window_span_begin, thresholder); + return find_dynamic_threshold_correction(pattern, bin_count, counting_table); + }); + } + + if (threshold_correction > 1.0000001) + seqan3::debug_stream << "Correct threshold by " << threshold_correction << '\n'; + pattern_begin_positions(seq.size(), arguments.pattern_size, arguments.query_every, [&](size_t const begin) { pattern_bounds const pattern = make_pattern_bounds(begin, arguments, window_span_begin, thresholder); - find_pattern_bins(pattern, bin_count, counting_table, sequence_hits); + find_pattern_bins(pattern, threshold_correction, bin_count, counting_table, sequence_hits); }); result_cb(record, sequence_hits); diff --git a/include/valik/shared.hpp b/include/valik/shared.hpp index da173659..e8323548 100644 --- a/include/valik/shared.hpp +++ b/include/valik/shared.hpp @@ -188,6 +188,7 @@ struct search_arguments final : public minimiser_threshold_arguments, search_pro bool keep_best_repeats{false}; double best_bin_entropy_cutoff{0.25}; bool keep_all_repeats{false}; + bool static_threshold{false}; bool stellar_only{false}; size_t cart_max_capacity{1000}; diff --git a/src/argument_parsing/search.cpp b/src/argument_parsing/search.cpp index 601f17dc..209cd3ba 100644 --- a/src/argument_parsing/search.cpp +++ b/src/argument_parsing/search.cpp @@ -106,6 +106,11 @@ void init_search_parser(sharg::parser & parser, search_arguments & arguments) .long_id = "keep-all-repeats", .description = "Do not filter out query matches from repeat regions. This may significantly increase the runtime.", .advanced = true}); + parser.add_flag(arguments.static_threshold, + sharg::config{.short_id = '\0', + .long_id = "static-threshold", + .description = "Do not correct threshold to avoid many spuriously matching bins.", + .advanced = true}); parser.add_option(arguments.seg_count_in, sharg::config{.short_id = 'n', .long_id = "seg-count", From 93a1ba69581722d6e7019fd19b2d8688fdbb12eb Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Thu, 6 Feb 2025 17:12:18 +0100 Subject: [PATCH 07/11] Adapt threshold by mean --- include/valik/search/local_prefilter.hpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/valik/search/local_prefilter.hpp b/include/valik/search/local_prefilter.hpp index 3b0de6b5..bdafa897 100644 --- a/include/valik/search/local_prefilter.hpp +++ b/include/valik/search/local_prefilter.hpp @@ -43,7 +43,7 @@ constexpr double sample_begin_positions(size_t const read_len, uint64_t const pa } } - return 1 + total_correction / (double) (corrected_pattern_count * 2); + return 1 + total_correction / (double) std::max(corrected_pattern_count, (size_t) 1); } @@ -162,7 +162,6 @@ double find_dynamic_threshold_correction(pattern_bounds const & pattern, auto &&count = total_counts[current_bin]; if (count >= (pattern.threshold + correction_count)) { - // the result is a union of results from all patterns of a read pattern_hits.insert(current_bin); } } @@ -174,6 +173,7 @@ double find_dynamic_threshold_correction(pattern_bounds const & pattern, else { pattern_hits.clear(); + // increase threshold in 10% increments or by at least 1 to find lowest threshold that is not ubiquitous correction_count += std::max((size_t) 1, (size_t) std::round(pattern.threshold * 0.1 * correction_count)); } } @@ -209,8 +209,11 @@ void find_pattern_bins(pattern_bounds const & pattern, auto &&count = total_counts[current_bin]; if (current_bin == 0) { - seqan3::debug_stream << "Threshold was " << pattern.threshold << '\n'; - seqan3::debug_stream << "New threshold " << std::to_string(pattern.threshold * correction_coef) << '\n'; + if (std::round(pattern.threshold * correction_coef) > pattern.threshold) + { + seqan3::debug_stream << "Threshold was " << pattern.threshold << '\n'; + seqan3::debug_stream << "New threshold " << std::to_string((size_t) std::round(pattern.threshold * correction_coef)) << '\n'; + } } if (count >= (pattern.threshold * correction_coef)) From 4c1dcbca4f7a10b83d61ec3671410f9408c6f90e Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Fri, 7 Feb 2025 11:20:41 +0100 Subject: [PATCH 08/11] Remove debugging --- include/valik/search/local_prefilter.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/valik/search/local_prefilter.hpp b/include/valik/search/local_prefilter.hpp index bdafa897..38f4138f 100644 --- a/include/valik/search/local_prefilter.hpp +++ b/include/valik/search/local_prefilter.hpp @@ -207,6 +207,7 @@ void find_pattern_bins(pattern_bounds const & pattern, for (size_t current_bin = 0; current_bin < total_counts.size(); current_bin++) { auto &&count = total_counts[current_bin]; + /* if (current_bin == 0) { if (std::round(pattern.threshold * correction_coef) > pattern.threshold) @@ -215,7 +216,7 @@ void find_pattern_bins(pattern_bounds const & pattern, seqan3::debug_stream << "New threshold " << std::to_string((size_t) std::round(pattern.threshold * correction_coef)) << '\n'; } } - + */ if (count >= (pattern.threshold * correction_coef)) { // the result is a union of results from all patterns of a read From 0da204f70e66bcb3e608ee41f9682a1e311af29f Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Sun, 9 Feb 2025 18:54:30 +0100 Subject: [PATCH 09/11] Make tests with --static-threshold --- test/cli/valik_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/cli/valik_test.cpp b/test/cli/valik_test.cpp index 8fb9ce24..31e2cb0b 100644 --- a/test/cli/valik_test.cpp +++ b/test/cli/valik_test.cpp @@ -350,7 +350,7 @@ TEST_P(valik_search_clusters, search) "--error-rate ", std::to_string(error_rate), "--index ", ibf_path(number_of_bins, window_size), "--query ", data("query.fq"), - "--threads 1", "--very-verbose", + "--threads 1", "--very-verbose", "--static-threshold", "--cart-max-capacity 3", "--max-queued-carts 10", "--without-parameter-tuning"); @@ -399,7 +399,7 @@ TEST_P(valik_search_segments, search) "--error-rate ", std::to_string(error_rate), "--index ", ibf_path(segment_overlap, number_of_bins, window_size), "--query ", data("single_query.fasta"), - "--threads 1", "--very-verbose", + "--threads 1", "--very-verbose", "--static-threshold", "--ref-meta", segment_metadata_path(segment_overlap, number_of_bins), "--cart-max-capacity 3", "--max-queued-carts 10", From cf0a9d11f21be8f205b66d6d5b88f15e03052a8b Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Sun, 9 Feb 2025 19:03:24 +0100 Subject: [PATCH 10/11] Remove all debugging --- include/valik/search/local_prefilter.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/valik/search/local_prefilter.hpp b/include/valik/search/local_prefilter.hpp index 38f4138f..1f6c4e27 100644 --- a/include/valik/search/local_prefilter.hpp +++ b/include/valik/search/local_prefilter.hpp @@ -309,9 +309,10 @@ void local_prefilter( }); } + /* if (threshold_correction > 1.0000001) seqan3::debug_stream << "Correct threshold by " << threshold_correction << '\n'; - + */ pattern_begin_positions(seq.size(), arguments.pattern_size, arguments.query_every, [&](size_t const begin) { pattern_bounds const pattern = make_pattern_bounds(begin, arguments, window_span_begin, thresholder); From de0a2baa40b848c48f806653065274922574a6fb Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Fri, 28 Feb 2025 18:31:16 +0100 Subject: [PATCH 11/11] Sample more patterns --- include/valik/search/local_prefilter.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/valik/search/local_prefilter.hpp b/include/valik/search/local_prefilter.hpp index 1f6c4e27..a09eb1c5 100644 --- a/include/valik/search/local_prefilter.hpp +++ b/include/valik/search/local_prefilter.hpp @@ -33,7 +33,7 @@ constexpr double sample_begin_positions(size_t const read_len, uint64_t const pa size_t corrected_pattern_count{0u}; double total_correction{0}; - for (size_t pos = first_pos; pos <= read_len - pattern_size; pos = pos + query_every * pattern_size * 2) + for (size_t pos = first_pos; pos <= read_len - pattern_size; pos = pos + query_every * pattern_size) { auto correction = callback(pos); if (correction > 0) @@ -309,10 +309,8 @@ void local_prefilter( }); } - /* if (threshold_correction > 1.0000001) seqan3::debug_stream << "Correct threshold by " << threshold_correction << '\n'; - */ pattern_begin_positions(seq.size(), arguments.pattern_size, arguments.query_every, [&](size_t const begin) { pattern_bounds const pattern = make_pattern_bounds(begin, arguments, window_span_begin, thresholder);