From 4d15e9e16a89eb2792f0fbad9015ff19a2222156 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Mon, 2 Jun 2025 17:29:29 +0200 Subject: [PATCH] Count unique k-mers --- include/valik/build/index_factory.hpp | 7 ++++--- include/valik/shared.hpp | 4 ++-- include/valik/split/metadata.hpp | 4 ++-- src/argument_parsing/build.cpp | 13 ++++++++++++- src/prepare/compute_bin_size.cpp | 24 ++++++++++++++---------- 5 files changed, 34 insertions(+), 18 deletions(-) diff --git a/include/valik/build/index_factory.hpp b/include/valik/build/index_factory.hpp index 56e9e7bd..f8cb97b8 100644 --- a/include/valik/build/index_factory.hpp +++ b/include/valik/build/index_factory.hpp @@ -69,14 +69,15 @@ class index_factory std::vector header_paths = parse_bin_paths(*arguments, "header"); std::string shape_string{}; uint64_t window_size{}; - size_t count{}; + uint64_t distinct_count{}; + uint64_t unique_count{}; uint64_t bin_size{}; entropy_ranking.reserve(header_paths.size()); for (auto && [file_name, bin_number] : seqan3::views::zip(header_paths, std::views::iota(0u))) { std::ifstream file_stream{file_name}; - file_stream >> shape_string >> window_size >> count >> bin_size; - entropy_map.emplace_back(std::make_pair((size_t) bin_number, (double) count / (double) bin_size)); + file_stream >> shape_string >> window_size >> unique_count >> distinct_count >> bin_size; + entropy_map.emplace_back(std::make_pair((size_t) bin_number, (double) unique_count / (double) bin_size)); } std::ranges::sort(entropy_map.begin(), entropy_map.end(), [](const std::pair &a, const std::pair &b) diff --git a/include/valik/shared.hpp b/include/valik/shared.hpp index e67d09fd..47cfd566 100644 --- a/include/valik/shared.hpp +++ b/include/valik/shared.hpp @@ -93,8 +93,8 @@ struct build_arguments final : public split_arguments bool manual_parameters{false}; bool input_is_minimiser{false}; - uint8_t kmer_count_min_cutoff{2}; - uint8_t kmer_count_max_cutoff{64}; + uint8_t kmer_count_min_cutoff{0}; + uint8_t kmer_count_max_cutoff{254}; bool use_filesize_dependent_cutoff{false}; std::filesystem::path ref_meta_path{}; diff --git a/include/valik/split/metadata.hpp b/include/valik/split/metadata.hpp index 7de569cf..c7ad3eb3 100644 --- a/include/valik/split/metadata.hpp +++ b/include/valik/split/metadata.hpp @@ -559,8 +559,8 @@ struct metadata segment_stats seg = segments[seg_id]; out_str << seg_id << '\t'; for (size_t ind : seg.seq_vec) - out_str << ind << '\t'; - out_str << seg.start << '\t' << seg.len << '\n'; + out_str << ind << ';'; + out_str << '\t' << seg.start << '\t' << seg.len << '\n'; } out_str << "$\n"; diff --git a/src/argument_parsing/build.cpp b/src/argument_parsing/build.cpp index 533e398a..08246652 100644 --- a/src/argument_parsing/build.cpp +++ b/src/argument_parsing/build.cpp @@ -116,7 +116,7 @@ void init_build_parser(sharg::parser & parser, build_arguments & arguments) .long_id = "kmer-count-min", .description = "Only store k-mers with at least (>=) x occurrences. " "Mutually exclusive with --use-filesize-dependent-cutoff.", - .validator = sharg::arithmetic_range_validator{0, 254}}); + .validator = sharg::arithmetic_range_validator{1, 254}}); parser.add_option(arguments.kmer_count_max_cutoff, sharg::config{.short_id = '\0', .long_id = "kmer-count-max", @@ -224,6 +224,8 @@ void run_build(sharg::parser & parser) std::cout << "database size " << meta.total_len << "bp\n"; std::cout << "segment count " << meta.seg_count << '\n'; std::cout << "segment len " << std::to_string((uint64_t) std::round(meta.total_len / (double) meta.seg_count)) << "bp\n"; + std::cout << "\n-----------Reference segments-----------\n"; + std::cout << meta.to_string(); } @@ -277,6 +279,15 @@ void run_build(sharg::parser & parser) arguments.window_size = arguments.kmer_size; } + if (parser.is_option_set("kmer-count-min") || + parser.is_option_set("kmer-count-max")) + { + if (arguments.kmer_count_min_cutoff > arguments.kmer_count_max_cutoff) + throw sharg::parser_error{"Set --kmer-count-min <= --kmer-count-max."}; + if (!arguments.input_is_minimiser) + seqan3::debug_stream << "[Warning] Arguments --kmer-count-min and --kmer-count-max are only compatible with minimisers (add --fast).\n"; + } + if (arguments.kmer_size > arguments.window_size) { throw sharg::parser_error{"The k-mer size cannot be bigger than the window size."}; diff --git a/src/prepare/compute_bin_size.cpp b/src/prepare/compute_bin_size.cpp index 1c717a93..ef8dee73 100644 --- a/src/prepare/compute_bin_size.cpp +++ b/src/prepare/compute_bin_size.cpp @@ -70,22 +70,22 @@ void compute_minimiser(valik::build_arguments const & arguments) distinct_minimisers.insert(hash); }); - uint64_t count{}; - + uint64_t distinct_kmer_count{0}; + uint64_t unique_kmer_count{0}; { - //!TODO: apply k-mer count cutoffs in metagenome search + //!TODO: apply k-mer count cutoffs in metagenome search and count unique k-mers std::ofstream outfile{minimiser_file, std::ios::binary}; for (auto && hash : distinct_minimisers) { outfile.write(reinterpret_cast(&hash), sizeof(hash)); - ++count; + ++distinct_kmer_count; } } { std::ofstream headerfile{header_file}; headerfile << arguments.shape.to_string() << '\t' << std::to_string(arguments.window_size) << '\t' << - count << '\t' << seq_size << '\n'; + unique_kmer_count << '\t' << distinct_kmer_count << '\t' << seq_size << '\n'; } std::filesystem::remove(progress_file); @@ -145,15 +145,18 @@ void compute_minimiser(valik::build_arguments const & arguments) minimiser_table[value] = std::min(254u, minimiser_table[value] + 1); } - uint64_t count{}; + uint64_t distinct_kmer_count{0}; + uint64_t unique_kmer_count{0}; { std::ofstream outfile{minimiser_file, std::ios::binary}; for (auto && [hash, occurrences] : minimiser_table) { if (occurrences >= arguments.kmer_count_min_cutoff && occurrences <= arguments.kmer_count_max_cutoff) { + if (occurrences == 1) + ++unique_kmer_count; outfile.write(reinterpret_cast(&hash), sizeof(hash)); - ++count; + ++distinct_kmer_count; } } } @@ -161,7 +164,7 @@ void compute_minimiser(valik::build_arguments const & arguments) { std::ofstream headerfile{header_file}; headerfile << arguments.shape.to_string() << '\t' << std::to_string(arguments.window_size) << '\t' << - count << '\t' << seg.len << '\n'; + unique_kmer_count << '\t' << distinct_kmer_count << '\t' << seg.len << '\n'; } std::filesystem::remove(progress_file); @@ -244,12 +247,13 @@ size_t kmer_count_from_minimiser_files(std::vector const & minimise std::string shape_string{}; uint64_t window_size{}; - size_t max_count{}; + uint64_t max_unique{}; + uint64_t max_count{}; uint64_t bin_size{}; biggest_file.replace_extension("header"); std::ifstream file_stream{biggest_file}; - file_stream >> shape_string >> window_size >> max_count >> bin_size; + file_stream >> shape_string >> window_size >> max_unique >> max_count >> bin_size; return max_count; }